### 데이터 불러오기 및 연월 분리

In [9]:
# 📘 1. 데이터 불러오기 및 전처리 (연월 분리)
import pandas as pd

# Load the raw training data; the CSV is read with cp949 encoding.
raw_df = pd.read_csv(
    "C:/Users/m/OneDrive - 계명대학교/25-1학기/sas viya/TRAIN_DATA.csv",
    encoding='cp949',
)

# Split DATA_YM into YEAR (first four characters) and MONTH (the remainder).
ym_text = raw_df['DATA_YM'].astype(str)
raw_df['YEAR'] = ym_text.str.slice(0, 4).astype(int)
raw_df['MONTH'] = ym_text.str.slice(4).astype(int)

# Columns used as inputs for clustering.
cluster_features = [
    'TOTAL_GAS',
    'CMRC_GAS',
    'FAC_NEIGH_2',
    'FAC_STAY',
    'FAC_RETAIL',
    'FAC_CULT_MTG',
    'FAC_MEDI',
    'FAC_LEISURE',
]

# Keep only rows with no missing value in the clustering columns or in
# TOTAL_ELEC; copy so later column assignments do not touch raw_df.
required_cols = [*cluster_features, 'TOTAL_ELEC']
df = raw_df.dropna(subset=required_cols).copy()

### 클러스터링용 변수 정의 + 결측 제거

In [10]:
# 📘 2. 학습/검증 데이터 분리 (25% 검증셋)
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation; the fixed seed keeps the shuffled
# split reproducible between runs.
train_df, val_df = train_test_split(df, test_size=0.25, shuffle=True, random_state=42)

# Report the size of each split.
print(
    f"Train 샘플 수: {len(train_df)}",
    f"Val 샘플 수: {len(val_df)}",
    sep="\n",
)


Train 샘플 수: 19572
Val 샘플 수: 6524


In [11]:
# 📘 3. 클러스터링 (StandardScaler + KMeans)
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardize the clustering features. The scaler is fitted on the training
# split only, so validation rows do not influence the scaling statistics.
scaler = StandardScaler()
X_cluster = scaler.fit_transform(train_df[cluster_features])

# Fit a 3-cluster KMeans on the scaled training features.
kmeans = KMeans(n_clusters=3, random_state=42)

# train_df / val_df come out of train_test_split, i.e. they are derived from
# `df`. Assigning a column onto them in place can raise pandas'
# SettingWithCopyWarning, so rebind each name to the new frame returned by
# assign() instead of mutating the split result.
train_df = train_df.assign(cluster=kmeans.fit_predict(X_cluster))

# Apply the already-fitted scaler and KMeans to the validation split
# (transform / predict only, no refitting).
X_val_cluster = scaler.transform(val_df[cluster_features])
val_df = val_df.assign(cluster=kmeans.predict(X_val_cluster))



In [12]:
# 📘 4. FLAML AutoML - 클러스터별 학습 및 최적 모델/파라미터 출력
from flaml import AutoML
from sklearn.metrics import mean_squared_error

# One summary row per cluster: best estimator, its config, and RMSE figures.
results = []

# The feature list does not depend on the cluster, so build it once:
# every column except the identifier columns, the cluster label, and the
# TOTAL_ELEC target.
drop_cols = ['DATA_YM', 'AREA_ID', 'AREA_NM', 'DIST_CD', 'DIST_NM', 'cluster']
excluded_cols = set(drop_cols) | {'TOTAL_ELEC'}
feature_cols = [col for col in train_df.columns if col not in excluded_cols]

for cluster_id in sorted(train_df['cluster'].unique()):
    print(f"\n[CLUSTER {cluster_id}] AutoML 시작")

    # Restrict both splits to the rows assigned to this cluster.
    train_cluster = train_df[train_df['cluster'] == cluster_id].copy()
    val_cluster = val_df[val_df['cluster'] == cluster_id].copy()

    X_train = train_cluster[feature_cols]
    y_train = train_cluster['TOTAL_ELEC']
    X_val = val_cluster[feature_cols]
    y_val = val_cluster['TOTAL_ELEC']

    # Fit a separate FLAML regression search for this cluster
    # (time_budget=300 per cluster).
    automl = AutoML()
    automl.fit(
        X_train=X_train,
        y_train=y_train,
        task="regression",
        time_budget=300,
        verbose=0
    )

    # RMSE is computed as sqrt(MSE) rather than with `squared=False`, which
    # is deprecated in scikit-learn 1.4 and removed in 1.6.
    y_train_pred = automl.predict(X_train)
    train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5

    if len(val_cluster) > 0:
        y_val_pred = automl.predict(X_val)
        val_rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
    else:
        # No validation rows landed in this cluster: record NaN instead of
        # failing on an empty prediction input.
        val_rmse = float("nan")

    # Gap = validation RMSE minus training RMSE.
    results.append({
        "Cluster": cluster_id,
        "Best_Model": automl.best_estimator,
        "Best_Params": automl.best_config,
        "Train_RMSE": train_rmse,
        "Val_RMSE": val_rmse,
        "Gap": val_rmse - train_rmse
    })


[CLUSTER 0] AutoML 시작

[CLUSTER 1] AutoML 시작

[CLUSTER 2] AutoML 시작


In [13]:
# 📘 5. Print and compare the results
# Collect the per-cluster result rows into a single table.
total_results_df = pd.DataFrame(results)
print(
    "\n📊 클러스터별 최적 모델 및 RMSE 결과",
    total_results_df,
    sep="\n",
)

# Visualization or submission output can be added in later cells.



📊 클러스터별 최적 모델 및 RMSE 결과
   Cluster Best_Model                                        Best_Params  \
0        0   catboost  {'early_stopping_rounds': 11, 'learning_rate':...   
1        1   catboost  {'early_stopping_rounds': 71, 'learning_rate':...   
2        2   catboost  {'early_stopping_rounds': 25, 'learning_rate':...   

   Train_RMSE     Val_RMSE         Gap  
0  309.414505   377.464712   68.050208  
1  367.164875  1111.564556  744.399680  
2  208.616097   352.611077  143.994981  


In [14]:
# 📘 6. Per-cluster train/validation sample counts and validation share
train_labels = train_df['cluster']
val_labels = val_df['cluster']

cluster_summary = []
for cluster_id in sorted(train_labels.unique()):
    # Count the rows of each split that carry this cluster label.
    train_count = int((train_labels == cluster_id).sum())
    val_count = int((val_labels == cluster_id).sum())
    total = train_count + val_count
    # Share of this cluster's rows that ended up in the validation split.
    val_ratio = val_count / total * 100

    row = {
        "Cluster": cluster_id,
        "Train_Samples": train_count,
        "Val_Samples": val_count,
        "Total": total,
        "Val_Percent(%)": round(val_ratio, 2),
    }
    cluster_summary.append(row)

cluster_summary_df = pd.DataFrame(cluster_summary)
print(
    "\n📊 클러스터별 샘플 분포 및 검증 비율",
    cluster_summary_df,
    sep="\n",
)



📊 클러스터별 샘플 분포 및 검증 비율
   Cluster  Train_Samples  Val_Samples  Total  Val_Percent(%)
0        0          16787         5642  22429           25.15
1        1            679          209    888           23.54
2        2           2106          673   2779           24.22


In [16]:
# Display the feature column list that was built in the AutoML step.
feature_cols

['TOTAL_BIDG',
 'FAC_NEIGH_1',
 'FAC_NEIGH_2',
 'FAC_CULT_MTG',
 'FAC_RELG',
 'FAC_RETAIL',
 'FAC_MEDI',
 'FAC_YOSE',
 'FAC_TRAIN',
 'FAC_SPORT',
 'FAC_STAY',
 'FAC_LEISURE',
 'TOTAL_GAS',
 'CMRC_GAS',
 'YEAR',
 'MONTH']

In [17]:
# Display the training split, which now includes the YEAR, MONTH and
# cluster columns added in earlier cells.
train_df

Unnamed: 0,DATA_YM,AREA_ID,AREA_NM,DIST_CD,DIST_NM,TOTAL_BIDG,FAC_NEIGH_1,FAC_NEIGH_2,FAC_CULT_MTG,FAC_RELG,...,FAC_TRAIN,FAC_SPORT,FAC_STAY,FAC_LEISURE,TOTAL_GAS,CMRC_GAS,TOTAL_ELEC,YEAR,MONTH,cluster
16403,202308,9590,내외동,48250,김해시,18.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,120193,52259,435.970,2023,8,0
21554,202207,9394,장한평역_1,11230,동대문구,54.0,23.0,12.0,1.0,1.0,...,0.0,0.0,1.0,0.0,402493,251179,1373.904,2022,7,0
2177,202310,9262,왕십리역_2,11200,성동구,101.0,10.0,17.0,0.0,1.0,...,0.0,0.0,14.0,0.0,336876,255132,575.651,2023,10,0
21716,202310,9880,샘내공원삼거리,41111,수원시 장안구,14.0,5.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,148277,105728,678.968,2023,10,0
1835,202208,9453,명륜역,26260,동래구,130.0,10.0,40.0,0.0,0.0,...,0.0,0.0,6.0,0.0,1257233,1240024,2624.919,2022,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21578,202308,9695,포항시청,47111,포항시 남구,55.0,8.0,26.0,0.0,0.0,...,0.0,0.0,0.0,0.0,91911,34675,409.454,2023,8,0
5390,202312,9243,영등포시장역,11560,영등포구,365.0,212.0,51.0,0.0,0.0,...,0.0,0.0,3.0,1.0,1480424,636248,1043.114,2023,12,0
860,202303,9950,정왕시장,41390,시흥시,279.0,25.0,28.0,0.0,2.0,...,0.0,0.0,0.0,0.0,387372,367318,611.182,2023,3,0
15796,202309,10094,도리섬상점가상권,41273,안산시 단원구,28.0,9.0,10.0,0.0,0.0,...,0.0,0.0,2.0,2.0,283407,146340,2807.412,2023,9,0
