In [None]:
# Install pinned versions of scikit-learn and xgboost so the environment is reproducible.
!pip install -U scikit-learn==1.3.2 xgboost==1.7.6



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor

In [None]:
from google.colab import drive

# Mount Google Drive so the CSV stored there can be read from this runtime.
drive.mount('/content/drive')

# Load the training data (edit the path below to match your own Drive layout).
TRAIN_CSV_PATH = '/content/drive/MyDrive/기상청/최종작업/2020_2023_최종데이터.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Display the loaded training frame (the rendered output is truncated by pandas).
df

Unnamed: 0,tm,address_city,address_gu,sub_address,stn,ta_max,ta_min,ta_max_min,hm_min,hm_max,...,노년층_생활인구,세대수,사고,사망,부상,공휴일,평균 풍속(m/s),풍정합(100m),tm_dt,평균 지면온도(°C)
0,20200501,부산광역시,강서구,대저2동,904,23.7,16.6,7.1,63.2,90.3,...,30948,54994,1,0,1,0,5.8,5001.0,2020-05-01,26.1
1,20200501,부산광역시,사상구,엄궁동,904,23.7,16.6,7.1,63.2,90.3,...,51369,97038,3,0,4,0,5.8,5001.0,2020-05-01,26.1
2,20200501,부산광역시,사상구,학장동,904,23.7,16.6,7.1,63.2,90.3,...,51369,97038,3,0,4,0,5.8,5001.0,2020-05-01,26.1
3,20200501,부산광역시,사하구,감천동,159,22.7,16.7,6.0,63.7,87.5,...,69622,138962,2,0,3,0,5.8,5001.0,2020-05-01,26.1
4,20200501,부산광역시,사하구,다대동,950,23.6,15.4,8.2,61.9,93.8,...,69622,138962,2,0,3,0,5.8,5001.0,2020-05-01,26.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42919,20231031,부산광역시,동래구,온천동,940,24.5,10.0,14.5,20.4,94.0,...,72161,119841,2,0,2,0,3.1,2720.0,2023-10-31,21.4
42920,20231031,부산광역시,부산진구,당감동,938,23.7,10.4,13.3,20.5,91.9,...,104283,181565,0,0,0,0,3.1,2720.0,2023-10-31,21.4
42921,20231031,부산광역시,부산진구,부암동,938,23.7,10.4,13.3,20.5,91.9,...,104283,181565,0,0,0,0,3.1,2720.0,2023-10-31,21.4
42922,20231031,부산광역시,남구,감만동,942,24.0,12.0,12.0,23.0,88.0,...,73044,118390,2,0,2,0,3.1,2720.0,2023-10-31,21.4


In [None]:
# 3. Drop tm_dt when present; the calendar features below are derived from `tm`.
df = df.drop(columns=['tm_dt'], errors='ignore')

# 4. Log-transformed target (log1p keeps zero counts finite).
df['log_call_count'] = np.log1p(df['call_count'])

# 5. Calendar features derived from the YYYYMMDD integer in `tm`.
tm_dates = pd.to_datetime(df['tm'], format='%Y%m%d')
df['year'] = df['tm'] // 10000
df['month'] = (df['tm'] % 10000) // 100
df['day'] = df['tm'] % 100
df['weekday'] = tm_dates.dt.weekday
df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)

# Holiday-adjacency flags.
# The frame holds several rows per date (one per sub_address), so a row-wise
# shift(+/-1) would mostly read the *same* day's holiday flag. Instead, collapse
# the flag to one value per date and look up the calendar-adjacent date.
# Dates that are absent from the data (e.g. the day after the last row) count as 0.
# NOTE: inference data must derive these two flags the same way.
holiday_by_date = df['공휴일'].groupby(tm_dates).max()
one_day = pd.Timedelta(days=1)
df['is_before_holiday'] = (tm_dates + one_day).map(holiday_by_date).fillna(0).astype(int)
df['is_after_holiday'] = (tm_dates - one_day).map(holiday_by_date).fillna(0).astype(int)

# 6. Integer-encode the city name; the fitted `encoder` is kept for reuse on new data.
encoder = LabelEncoder()
df['address_city'] = encoder.fit_transform(df['address_city'])

# 7. Mean target encoding (the key columns are kept in df, not dropped).
# These means are computed over every row of df; any hold-out split made
# afterwards should re-derive them from its training rows only, otherwise the
# hold-out targets leak into the features.
for key_col in ('address_gu', 'sub_address'):
    df[f'{key_col}_mean_target'] = df.groupby(key_col)['call_count'].transform('mean')

In [None]:
# 8. Assemble the training inputs (only the columns the model should see).
drop_cols = ['tm', 'call_count', 'log_call_count', 'address_gu', 'sub_address']
target_enc_cols = {
    'address_gu': 'address_gu_mean_target',
    'sub_address': 'sub_address_mean_target',
}

# 9. Train/validation split.
# Pick the hold-out row positions *before* building the target-encoded features.
# train_test_split's shuffle depends only on the number of rows and random_state,
# so this selects exactly the same rows as splitting X and y directly would.
train_pos, val_pos = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Refit the mean target encodings on the training rows only. Encoding with means
# taken over all rows would let each validation row's own target leak into its
# features and make the validation score optimistic.
features = df.drop(columns=drop_cols)
fit_rows = df.iloc[train_pos]
fallback_mean = fit_rows['call_count'].mean()  # for groups absent from the training rows
for key_col, enc_col in target_enc_cols.items():
    fold_means = fit_rows.groupby(key_col)['call_count'].mean()
    features[enc_col] = df[key_col].map(fold_means).fillna(fallback_mean)

# Cast to float32 to reduce memory use.
X = features.astype('float32')
y = df['log_call_count'].astype('float32')

X_train, X_val = X.iloc[train_pos], X.iloc[val_pos]
y_train, y_val = y.iloc[train_pos], y.iloc[val_pos]


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest hyperparameters, gathered in one place.
rf_params = dict(
    n_estimators=300,
    max_depth=8,
    max_features=0.8,   # counterpart of XGBoost's colsample_bytree
    max_samples=0.8,    # counterpart of XGBoost's subsample (scikit-learn >= 0.22)
    random_state=42,
    n_jobs=-1,          # parallel training (optional)
)
model = RandomForestRegressor(**rf_params)

# Fit on the training split; the fitted estimator is the cell's displayed value.
model.fit(X_train, y_train)

In [None]:
# # 10. Model training (XGBoost variant, currently commented out)
# model = XGBRegressor(
#     n_estimators=300,
#     max_depth=6,
#     learning_rate=0.05,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     tree_method='hist',
#     random_state=42,
#     early_stopping_rounds=30
# )

# model.fit(
#     X_train, y_train,
#     eval_set=[(X_val, y_val)],
#     verbose=False
# )

In [None]:
from sklearn.metrics import mean_squared_error

# 11. Evaluation: undo the log1p transform so the error is on the original count scale.
preds = model.predict(X_val)
preds_real, y_val_real = np.expm1(preds), np.expm1(y_val)
mse = mean_squared_error(y_val_real, preds_real)
rmse = np.sqrt(mse)
print(f"✅ RMSE: {rmse:.2f}")

# 12. Collect the hold-out predictions, rounded to whole counts, in a DataFrame
#     (nothing is written to disk in this cell).
pred_df = pd.DataFrame({'예측값': preds_real.round().astype(int)})

✅ RMSE: 1.43


In [None]:
# Load the 2024 feature table that predictions will be generated for.
VALID_CSV_PATH = '/content/drive/MyDrive/기상청/최종작업/2024_최종데이터.csv'
valid_df = pd.read_csv(VALID_CSV_PATH)

In [None]:
# Preview the first rows of the 2024 frame.
valid_df.head()

Unnamed: 0,tm,address_city,address_gu,sub_address,stn,ta_max,ta_min,ta_max_min,hm_min,hm_max,...,부상,공휴일,평균 풍속(m/s),풍정합(100m),평균 현지기압(hPa),최고 해면기압(hPa),최저 해면기압(hPa),평균 해면기압(hPa),tm_dt,평균 지면온도(°C)
0,20240501,부산광역시,강서구,대저2동,904,18.5,11.1,7.4,42.5,82.5,...,2,0,3.7,3213,1003.7,1017.1,1007.7,1012.0,2024-05-01,15.9
1,20240501,부산광역시,북구,금곡동,941,18.4,10.6,7.8,42.4,82.3,...,1,0,3.7,3213,1003.7,1017.1,1007.7,1012.0,2024-05-01,15.9
2,20240501,부산광역시,북구,화명동,941,18.4,10.6,7.8,42.4,82.3,...,1,0,3.7,3213,1003.7,1017.1,1007.7,1012.0,2024-05-01,15.9
3,20240501,부산광역시,사상구,감전동,904,18.5,11.1,7.4,42.5,82.5,...,2,0,3.7,3213,1003.7,1017.1,1007.7,1012.0,2024-05-01,15.9
4,20240501,부산광역시,사상구,괘법동,904,18.5,11.1,7.4,42.5,82.5,...,2,0,3.7,3213,1003.7,1017.1,1007.7,1012.0,2024-05-01,15.9


In [None]:
# 2. Drop tm_dt when present.
valid_df = valid_df.drop(columns=['tm_dt'], errors='ignore')

# 3. Calendar features derived from the YYYYMMDD integer in `tm`.
valid_dates = pd.to_datetime(valid_df['tm'], format='%Y%m%d')
valid_df['year'] = valid_df['tm'] // 10000
valid_df['month'] = (valid_df['tm'] % 10000) // 100
valid_df['day'] = valid_df['tm'] % 100
valid_df['weekday'] = valid_dates.dt.weekday
valid_df['is_weekend'] = valid_df['weekday'].isin([5, 6]).astype(int)

# Holiday-adjacency flags.
# The frame holds several rows per date, so a row-wise shift(+/-1) would mostly
# read the *same* day's holiday flag. Collapse the flag to one value per date and
# look up the calendar-adjacent date; dates absent from the data count as 0.
# NOTE: the training data must derive these two flags the same way.
holiday_by_date = valid_df['공휴일'].groupby(valid_dates).max()
one_day = pd.Timedelta(days=1)
valid_df['is_before_holiday'] = (valid_dates + one_day).map(holiday_by_date).fillna(0).astype(int)
valid_df['is_after_holiday'] = (valid_dates - one_day).map(holiday_by_date).fillna(0).astype(int)

# 4. Encode address_city with the codes of the already-fitted encoder.
# Labels the encoder never saw get one extra code past the known classes. The
# lookup is built from encoder.classes_ without modifying the fitted encoder.
city_codes = {label: code for code, label in enumerate(encoder.classes_)}
unknown_code = city_codes.get('Unknown', len(city_codes))
valid_df['address_city'] = (
    valid_df['address_city'].map(city_codes).fillna(unknown_code).astype(int)
)

# 5. Mean target encoding from the training frame `df` (handles a missing key column).
# Categories unseen in training fall back to the mean of the per-group means.
for key_col in ('address_gu', 'sub_address'):
    enc_col = f'{key_col}_mean_target'
    if key_col in valid_df.columns:
        mean_map = df.groupby(key_col)['call_count'].mean()
        valid_df[enc_col] = valid_df[key_col].map(mean_map).fillna(mean_map.mean())
    else:
        print(f"⚠️ '{key_col}' 없음 - 평균값 사용")
        valid_df[enc_col] = df['call_count'].mean()

# 6./7. Build the model input.
# address_gu / sub_address are left in valid_df: they identify each row, and
# removing them in place would send a re-run of this cell down the fallback
# branches above. Selecting exactly the training columns (in training order)
# excludes them, `tm`, and any extra columns from the model input.
missing_cols = [col for col in X.columns if col not in valid_df.columns]
if missing_cols:
    # Make schema drift visible instead of zero-filling silently.
    print(f"⚠️ feature columns missing from valid_df, filled with 0.0: {missing_cols}")
X_valid = valid_df.reindex(columns=X.columns, fill_value=0.0).astype('float32')

# 8. Predict on the log1p scale, convert back to counts, round to integers.
preds_log = model.predict(X_valid)
preds = np.expm1(preds_log)
preds_rounded = preds.round().astype(int)

In [None]:
# Store the rounded predictions on the 2024 frame as its call_count column.
valid_df = valid_df.assign(call_count=preds_rounded)

In [None]:
# Preview the first predicted counts.
valid_df['call_count'].head()

Unnamed: 0,call_count
0,1
1,1
2,2
3,1
4,1


In [None]:
# Load the EUC-KR encoded file whose call_count column is to be filled in.
SUBMISSION_TEMPLATE_PATH = '/content/drive/MyDrive/기상청/최종작업/test_call119.csv'
result_df = pd.read_csv(SUBMISSION_TEMPLATE_PATH, encoding='euc-kr')

In [None]:
# Preview the file to be filled in (call_count is empty at this point).
result_df.head()

Unnamed: 0,TM,address_city,address_gu,sub_address,STN,ta_max,ta_min,ta_max_min,hm_min,hm_max,ws_max,ws_ins_max,rn_day,call_count
0,20240501,부산광역시,강서구,대저2동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,
1,20240501,부산광역시,강서구,생곡동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,
2,20240501,부산광역시,강서구,송정동,937,16.9,9.9,7.0,55.3,93.9,4.5,9.7,0.0,
3,20240501,부산광역시,강서구,신호동,950,16.6,11.4,5.2,48.1,84.6,6.4,13.5,0.0,
4,20240501,부산광역시,금정구,구서동,940,16.9,10.2,6.7,46.8,91.3,3.3,8.7,0.0,


In [None]:
# Attach predictions to result_df by key, not by row position.
# The previews of the two frames show different sub_address values at the same
# row positions, so a plain index-aligned assignment would write each prediction
# onto the wrong location. Join on (date, address_gu, sub_address) instead.
pred_key_cols = ['tm', 'address_gu', 'sub_address']
join_cols = ['TM', 'address_gu', 'sub_address']

if set(pred_key_cols).issubset(valid_df.columns):
    pred_keys = valid_df[pred_key_cols]
else:
    # The key columns are no longer on valid_df. Re-read just the keys from the
    # file valid_df was loaded from; this assumes valid_df's rows have not been
    # filtered or reordered since loading (checked by the length assert below).
    pred_keys = pd.read_csv(
        '/content/drive/MyDrive/기상청/최종작업/2024_최종데이터.csv',
        usecols=pred_key_cols,
    )
assert len(pred_keys) == len(valid_df), "prediction keys and predictions differ in length"

keyed_preds = (
    pred_keys
    .assign(call_count=valid_df['call_count'].to_numpy())
    .rename(columns={'tm': 'TM'})
    .drop_duplicates(subset=join_cols)   # guarantees at most one prediction per key
)

# A left join keeps every result_df row in its original order.
# NOTE(review): assumes TM / tm share a dtype and the address strings match exactly.
merged = result_df[join_cols].merge(
    keyed_preds, on=join_cols, how='left', validate='many_to_one'
)
result_df['call_count'] = merged['call_count'].to_numpy()

n_unmatched = int(result_df['call_count'].isna().sum())
if n_unmatched:
    # Surface rows with no prediction rather than filling them with unrelated values.
    print(f"⚠️ {n_unmatched} rows in result_df have no matching prediction; call_count left empty")
else:
    result_df['call_count'] = result_df['call_count'].astype(int)

In [None]:
# Preview the file after call_count has been filled in.
result_df.head()

Unnamed: 0,TM,address_city,address_gu,sub_address,STN,ta_max,ta_min,ta_max_min,hm_min,hm_max,ws_max,ws_ins_max,rn_day,call_count
0,20240501,부산광역시,강서구,대저2동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,1
1,20240501,부산광역시,강서구,생곡동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,1
2,20240501,부산광역시,강서구,송정동,937,16.9,9.9,7.0,55.3,93.9,4.5,9.7,0.0,2
3,20240501,부산광역시,강서구,신호동,950,16.6,11.4,5.2,48.1,84.6,6.4,13.5,0.0,1
4,20240501,부산광역시,금정구,구서동,940,16.9,10.2,6.7,46.8,91.3,3.3,8.7,0.0,1


In [None]:
# Write the filled-in frame to disk without the index, EUC-KR encoded like the input file.
SUBMISSION_CSV = '250028.csv'
result_df.to_csv(SUBMISSION_CSV, index=False, encoding='euc-kr')

In [None]:
# Distribution of the predicted counts, as a quick sanity check.
valid_df['call_count'].value_counts()

Unnamed: 0_level_0,count
call_count,Unnamed: 1_level_1
2,4730
1,3991
3,603
4,119
6,94
5,34
7,30
