In [None]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the Nanum Gothic TTF with Matplotlib and make it the default plot font.
# NOTE(review): the alias below is 'NanumBarunGothic' while the file is NanumGothic.ttf;
# the alias is only a lookup key, but confirm which typeface is actually intended.
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # path where the .ttf file is stored
    name='NanumBarunGothic')                        # name Matplotlib will know this font by
fm.fontManager.ttflist.insert(0, fe)              # add the entry to the front of Matplotlib's font list
# A single rcParams update sets both the default size and family; a separate
# plt.rc('font', family=...) call would only assign the same family a second time.
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'})
import seaborn as sns
import chardet
# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import lightgbm
from sklearn.linear_model import LinearRegression

import eli5
from eli5.sklearn import PermutationImportance


In [None]:
from functools import partial

def _importance_and_rmse(model, valid, valid_y):
    """Return (feature importances sorted descending, validation RMSE) for one fitted model."""
    ranked = pd.Series(model.feature_importances_, index=list(valid.columns))
    ranked = ranked.sort_values(ascending=False)
    rmse = np.sqrt(mean_squared_error(valid_y, model.predict(valid)))
    return ranked, rmse


def importances(model_RF, model_lgbm, valid, valid_y, name, train_rf_rmse, train_lgbm_rmse):
    """Score both fitted models on the validation set and plot their feature importances.

    Parameters
    ----------
    model_RF, model_lgbm : fitted estimators
        Each must expose ``feature_importances_`` and ``predict``.
    valid : pandas.DataFrame
        Validation features; its column names label the importance bars.
    valid_y : array-like
        Validation targets matching the rows of ``valid``.
    name : object
        Label for this run; shown in the figure title so runs can be told apart.
    train_rf_rmse, train_lgbm_rmse : float
        Training RMSE of each model, computed by the caller.

    Returns
    -------
    pandas.DataFrame
        Rows ``'RF'`` and ``'lgbm'``, columns ``'train'`` and ``'valid'``, holding RMSE values.
        The table is also printed.
    """
    importances_RF, rmse_RF = _importance_and_rmse(model_RF, valid, valid_y)
    importances_lgbm, rmse_lgbm = _importance_and_rmse(model_lgbm, valid, valid_y)

    # Build the table in one step so the columns get a numeric dtype instead of
    # the object dtype produced by filling an empty frame row by row.
    performance = pd.DataFrame(
        {'train': [train_rf_rmse, train_lgbm_rmse], 'valid': [rmse_RF, rmse_lgbm]},
        index=['RF', 'lgbm'],
    )
    print(performance)

    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))
    panels = [
        ("Feature Importances_lgbm", importances_lgbm),
        ("Feature Importances_RF", importances_RF),
    ]
    for ax, (title, ranked) in zip(axs, panels):
        sns.barplot(x=ranked, y=ranked.index, ax=ax)
        ax.set_title(title)
    fig.suptitle(f"Feature importances - {name}")

    plt.show()
    # Release the figure so repeated calls (e.g. a parameter sweep) do not pile up open figures.
    plt.close(fig)

    return performance



def run_model(cols, df, name, n_estimators = 5, split_ym = 202304):
    """Fit a random forest and a LightGBM regressor on a time-based split and report both.

    Rows with ``is_test == 0`` are split on the ``계약년월`` column: values up to and
    including ``split_ym`` form the training set, later values the validation set.
    Both models are fitted on ``cols`` to predict ``target``, then handed to
    ``importances`` for validation scoring and plotting.

    Parameters
    ----------
    cols : list of str
        Feature columns to train on.
    df : pandas.DataFrame
        Must contain ``is_test``, ``계약년월``, ``target`` and every column in ``cols``.
    name : object
        Run label forwarded to ``importances``.
    n_estimators : int, default 5
        Number of trees / boosting rounds for both models.
    split_ym : int, default 202304
        Last ``계약년월`` value that still belongs to the training set.

    Returns
    -------
    The value returned by ``importances`` (the train/valid RMSE table).

    Raises
    ------
    ValueError
        If either side of the split has no rows.
    """
    labelled = df[df['is_test'] == 0]
    contract_ym = labelled['계약년월']
    # Two explicit comparisons rather than a mask and its negation: rows whose
    # contract month is missing match neither and stay out of both splits.
    train_rows = labelled[contract_ym <= split_ym]
    valid_rows = labelled[contract_ym > split_ym]
    if train_rows.empty or valid_rows.empty:
        raise ValueError(
            f"split at {split_ym} leaves {len(train_rows)} training rows and "
            f"{len(valid_rows)} validation rows; both must be non-empty"
        )

    X, y = train_rows[cols], train_rows['target']
    valid, valid_y = valid_rows[cols], valid_rows['target']

    model_RF = RandomForestRegressor(n_estimators=n_estimators, criterion='squared_error', random_state=1, n_jobs=-1)
    model_lgbm = lightgbm.LGBMRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1, verbosity=-1)

    model_RF.fit(X, y)
    model_lgbm.fit(X, y)
    train_rf_rmse = np.sqrt(mean_squared_error(y, model_RF.predict(X)))
    train_lgbm_rmse = np.sqrt(mean_squared_error(y, model_lgbm.predict(X)))

    # Hand the table back so callers can collect and compare runs instead of reading printed output.
    return importances(model_RF, model_lgbm, valid, valid_y, name, train_rf_rmse, train_lgbm_rmse)

In [None]:
# Absolute, machine-specific location of the dataset CSV; adjust when running elsewhere.
total_path = '/data/ephemeral/home/0122_total_data.csv' 

# Load the whole table into memory; later cells filter it on 계약년월 and is_test.
dt = pd.read_csv(total_path)

In [None]:
# Keep only the rows whose 계약년월 value is 201801 or later.
df_2018 = dt[dt['계약년월'] >= 201801]

In [1]:
# Define the candidate feature-column sets here.

# Widest candidate set.
col = [
    'x', '계약년월', '이전가격', '전용면적',
    'mean_지상층수', 'sum_세대수', '층', '부속건축물수',
    '한강거리', 'y', 'bus_count_1000',
    '기준금리', '매매수급동향', '연GDP',
]

# Same as `col` without '부속건축물수', '기준금리' and '연GDP'.
new_col = [
    'x', '계약년월', '이전가격', '전용면적',
    'mean_지상층수', 'sum_세대수', '층',
    '한강거리', 'y', 'bus_count_1000',
    '매매수급동향',
]

# Smallest set: six of the columns from `col`.
min_col = [
    'x', '계약년월', '이전가격',
    '전용면적', 'sum_세대수', '층',
]

# NOTE(review): presumably picked from a correlation analysis - confirm.
corr_col = [
    '이전가격', '전용면적', 'mean_지상층수', 'sum_세대수',
    '한강거리', 'y', 'bus_count_1000',
]

# NOTE(review): '전용면적_filtered' appears in no other list here; verify the loaded data has it.
best_cols = [
    '한강거리',
    'y',
    'mean_지상층수',
    '전용면적_filtered',
    '이전가격',
]

In [None]:
# run_model(candidate columns, data frame, run label, number of estimators)
# The trailing ';' keeps the notebook from echoing whatever the call returns.
run_model(
    cols=best_cols,
    df=df_2018,
    name="best_cols",
    n_estimators=300,
);

In [None]:
# Sweep the ensemble size from 10 to 19, using the size itself as the run label.
for estimator in range(10, 20):
    print(f"estimator's counts : {estimator}")
    run_model(
        cols=best_cols,
        df=df_2018,
        name=estimator,
        n_estimators=estimator,
    )