In [None]:
# Mount Google Drive at /content/drive; the CSV files used below are read from
# and written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 필요한 라이브러리 임포트
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import tensorflow as tf

In [None]:
# Load the 2020-2023 training data, discarding the 'tm_dt' column right after reading
train_df = pd.read_csv(
    '/content/drive/MyDrive/Epoch 공모전/2020_2023_최종데이터.csv'
).drop(columns=['tm_dt'])

In [None]:
# One-hot encode the categorical address columns
categorical_cols = ['address_city', 'address_gu', 'sub_address']
train_df_encoded = pd.get_dummies(train_df, columns=categorical_cols)

In [None]:
# Separate the target ('call_count') from the feature columns
y = train_df_encoded['call_count']
X = train_df_encoded.drop(columns='call_count')

In [None]:
# Hold out 20% of the rows as a test set (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features: learn the scaling statistics from the training
# split only, then apply that same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# XGBoost regressor configured to run on the GPU (device='cuda')
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    device='cuda',
    random_state=42,
)

In [None]:
# Candidate values explored during XGBoost hyperparameter tuning
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    n_estimators=[100, 200, 300],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
)

In [None]:
# Ask TensorFlow which GPUs it can see and report whether any were found
device_name = tf.config.list_physical_devices('GPU')
gpu_status = f"GPU is available: {device_name}" if device_name else "GPU is not available"
print(gpu_status)

GPU is available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
# Exhaustive grid search over param_grid with 5-fold cross-validation,
# scoring each candidate by negative mean squared error
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=1,  # presumably kept at 1 so workers do not contend for the single GPU — confirm
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [None]:
# Report the best hyperparameter combination found by the grid search
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

Best parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}


In [None]:
# Predict on the test split with the best estimator from the grid search,
# rounding the predictions to whole numbers
best_model = grid_search.best_estimator_
y_pred_xgb = best_model.predict(X_test_scaled).round().astype(int)

In [None]:
# Fit a random forest regressor (100 trees, fixed seed) on the scaled training data
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train_scaled, y_train)

In [None]:
# Random forest predictions on the test split, rounded to whole numbers
y_pred_rf = rf_model.predict(X_test_scaled).round().astype(int)

In [None]:
# Ensemble XGBoost and random forest by averaging their raw (unrounded)
# predictions, then round once at the very end.
# Averaging the already-rounded y_pred_xgb / y_pred_rf would round twice: each
# model's fractional information is discarded before the average, and every
# x.5 average is then pushed through NumPy's round-half-to-even rule.
y_pred_ensemble = (best_model.predict(X_test_scaled) + rf_model.predict(X_test_scaled)) / 2
y_pred_ensemble = y_pred_ensemble.round().astype(int)  # convert to integers

In [None]:
# Evaluate the ensemble: RMSE is the square root of the mean squared error
mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)
rmse_ensemble = mse_ensemble ** 0.5
print(f'Ensemble RMSE: {rmse_ensemble}')

Ensemble RMSE: 1.620342435963617


In [None]:
# Load the 2024 data and drop 'tm_dt', mirroring what was done to train_df
data_2024 = pd.read_csv(
    '/content/drive/MyDrive/Epoch 공모전/2024_최종데이터.csv'
).drop(columns=['tm_dt'])

# One-hot encode the same categorical columns that were encoded for training
data_2024_encoded = pd.get_dummies(
    data_2024,
    columns=['address_city', 'address_gu', 'sub_address'],
)

# Align the 2024 feature matrix with the training features in a single step:
# columns present in X but absent here are added and filled with 0, columns
# not present in X are dropped, and the column order follows X.columns.
# reindex replaces the column-by-column insertion loop, which fragments the
# frame (pandas PerformanceWarning) when many dummy columns are missing.
data_2024_encoded = data_2024_encoded.reindex(columns=X.columns, fill_value=0)

# Scale the 2024 features with the scaler fitted on the training split
X_2024_scaled = scaler.transform(data_2024_encoded)

# Predict with the tuned XGBoost model and round to whole numbers.
# NOTE(review): only best_model is used here, whereas the RMSE printed for the
# test split is for the XGBoost + random forest average — confirm this is intended.
y_pred_2024 = best_model.predict(X_2024_scaled).round().astype(int)

# Store the predictions as a new column on the 2024 frame
data_2024['predicted_call_count'] = y_pred_2024

In [None]:
# Preview the first few predicted values
data_2024['predicted_call_count'].head()

Unnamed: 0,predicted_call_count
0,1
1,2
2,2
3,1
4,1


In [None]:
# Read the EUC-KR encoded CSV that the predictions will be written into
result_df = pd.read_csv(
    '/content/drive/MyDrive/Epoch 공모전/test_call119.csv',
    encoding='euc-kr',
)

In [None]:
# Copy the 2024 predictions into result_df by row position.
# Assigning the Series directly aligns on index labels, so a row-count or
# index mismatch between the two frames would silently leave NaN (or drop
# values) in 'call_count'; fail loudly instead.
# NOTE(review): assumes result_df rows are in the same order as data_2024 — confirm.
if len(result_df) != len(data_2024):
    raise ValueError(
        f"Row count mismatch: result_df has {len(result_df)} rows, "
        f"data_2024 has {len(data_2024)} rows"
    )
result_df['call_count'] = data_2024['predicted_call_count'].to_numpy()

In [None]:
# Preview the first rows of result_df, including the filled-in 'call_count' column
result_df.head()

Unnamed: 0,TM,address_city,address_gu,sub_address,STN,ta_max,ta_min,ta_max_min,hm_min,hm_max,ws_max,ws_ins_max,rn_day,call_count
0,20240501,부산광역시,강서구,대저2동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,1
1,20240501,부산광역시,강서구,생곡동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,2
2,20240501,부산광역시,강서구,송정동,937,16.9,9.9,7.0,55.3,93.9,4.5,9.7,0.0,2
3,20240501,부산광역시,강서구,신호동,950,16.6,11.4,5.2,48.1,84.6,6.4,13.5,0.0,1
4,20240501,부산광역시,금정구,구서동,940,16.9,10.2,6.7,46.8,91.3,3.3,8.7,0.0,1


In [None]:
# Save result_df to Drive as an EUC-KR encoded CSV, without the index column
result_df.to_csv(
    '/content/drive/MyDrive/Epoch 공모전/250028.csv',
    encoding='euc-kr',
    index=False,
)