In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import random
import os
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
%matplotlib inline
def seed_everything(seed):
    """Seed every random source this notebook relies on with one value.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: Integer seed applied to all of the above.
    """
    # Exported as a string because environment values must be text.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(42)  # fix the seed

In [1]:
from sklearn import metrics 
def lg_nrmse(gt, preds):
    """Return the weighted sum of per-target NRMSE over 14 target columns.

    For each of the first 14 columns, the RMSE between ``gt`` and ``preds``
    is divided by the mean absolute ground-truth value of that column.
    The first 8 columns are weighted 1.2 (20% extra weight) and the
    remaining 6 are weighted 1.0.

    Args:
        gt: 2-D array-like of ground-truth targets with at least 14 columns
            (the 'ID' column must already be excluded).
        preds: 2-D array-like of predictions with the same layout as ``gt``.

    Returns:
        The weighted NRMSE score as a float.

    Side effects:
        Prints the NRMSE values of the first 8 columns.
    """
    # Accept DataFrames as well as ndarrays; positional slicing below needs
    # a real array.
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)

    # RMSE is computed directly with NumPy: the `squared=False` argument of
    # sklearn's mean_squared_error is no longer available in recent releases.
    residuals = gt[:, :14] - preds[:, :14]
    rmse = np.sqrt(np.mean(residuals ** 2, axis=0))
    all_nrmse = list(rmse / np.mean(np.abs(gt[:, :14]), axis=0))

    print(all_nrmse[:8])
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

# 데이터 소환

In [34]:
# 일반 데이터 소환
from sklearn.model_selection import train_test_split

# Read the raw competition files from the working directory.
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv("./test.csv")

# Hold out 20% of the labelled rows; the split is reproducible via random_state.
train_df, valid_df = train_test_split(train_df, train_size=0.8, random_state=42)

# Features: the first 57 columns minus the ID and four excluded X columns.
# Note that X_test / y_test come from the hold-out split (valid_df), not from
# test_df.
X_train, X_test = (
    frame.iloc[:, :57].drop(columns=["ID", "X_04", "X_23", "X_47", "X_48"])
    for frame in (train_df, valid_df)
)

# Target: the single column at position 57
# (presumably the first Y column - TODO confirm against train.csv).
y_train, y_test = (frame.iloc[:, 57] for frame in (train_df, valid_df))


In [25]:
# # cross_val_score 돌릴때 데이터 소환
# train_df = pd.read_csv('./train.csv')
# test_df = pd.read_csv("./test.csv")

# X_train = train_df.iloc[:,:57].drop(["ID","X_04","X_23","X_47","X_48"],axis = 1)
# y_train = train_df.iloc[:,57]

In [44]:
# Display the feature frame of the hold-out split (built from valid_df).
X_test

Unnamed: 0,X_01,X_02,X_03,X_05,X_06,X_07,X_08,X_09,X_10,X_11,...,X_45,X_46,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
17963,66.465,103.320,62.57,102.080,67.845,28.91,101.23,260.04,0.0,0.0,...,0.12,1469,12604.63,128.065556,142.927876,143.934663,127.675734,128.842655,134.760060,126.813557
14237,70.544,103.320,78.17,101.929,71.923,28.86,111.90,214.75,0.0,0.0,...,0.12,1469,15321.23,127.643056,133.561871,140.044099,127.529585,126.330578,133.354809,133.543911
2992,68.504,103.320,68.17,103.136,68.864,28.14,108.46,364.07,0.0,0.0,...,0.17,1469,12411.43,136.358098,129.140545,132.645387,126.858297,124.748920,128.367512,121.985188
18587,74.623,103.320,81.37,103.157,74.983,30.13,376.16,188.52,0.0,0.0,...,0.12,1469,14008.93,131.659291,125.397381,148.744196,131.340460,135.328876,144.221821,130.223950
19112,65.445,103.320,64.57,101.986,67.845,27.11,103.94,223.55,0.0,0.0,...,0.16,1469,10530.03,132.419370,129.721226,143.947633,132.284460,126.447689,146.282254,133.663632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33978,69.524,103.321,65.57,103.134,69.884,29.50,128.35,178.23,0.0,0.0,...,0.13,1469,13561.53,137.524754,131.410183,139.238588,135.518782,124.800785,138.075589,135.456371
22572,72.583,103.320,71.17,103.150,72.943,29.66,102.68,171.40,0.0,0.0,...,0.16,1468,13979.33,121.420750,126.243290,136.164087,122.200575,130.297443,133.995617,125.595280
2865,68.504,103.320,69.97,103.139,68.864,27.61,220.20,144.28,0.0,0.0,...,0.11,1469,14287.43,130.814908,128.802586,135.012003,125.067930,123.093868,140.852940,118.918759
22194,67.485,103.321,61.87,103.132,67.845,29.39,112.83,202.16,0.0,0.0,...,0.12,1469,12598.63,135.452397,131.939602,139.432458,124.437475,124.588634,135.540692,123.279461


# RandomizedSearchCV

In [None]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
# grid_search 함수를 만듭니다.
def grid_search(params, random=False,cv = 5,idx=57):
    """Tune an XGBRegressor and print cross-validated and hold-out NRMSE.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. NRMSE here means RMSE divided by the mean absolute value of
    the corresponding ground-truth target.

    Args:
        params: Parameter grid (or distributions) passed to the search object.
        random: If True use RandomizedSearchCV (20 sampled candidates),
            otherwise an exhaustive GridSearchCV. The name shadows the
            ``random`` module inside this function, which does not use it.
        cv: Number of cross-validation folds (or any sklearn ``cv`` value).
        idx: Kept for backward compatibility and ignored. The targets are
            1-D, so there is no column to select; indexing them with
            ``[:, idx]`` raised an error.

    Returns:
        None. Best parameters and both scores are printed.
    """
    reg = XGBRegressor(random_state=42, tree_method='gpu_hist', gpu_id=0, verbosity=0)

    # Score both search flavours with negative RMSE so that best_score_ means
    # the same thing on either path and can be turned into an NRMSE below.
    # (Previously the grid search used negative MSE and the randomized search
    # fell back to the estimator's default score.)
    scoring = 'neg_root_mean_squared_error'
    if random:
        grid_reg = RandomizedSearchCV(reg, params, scoring=scoring, cv=cv, n_iter=20,
                                      n_jobs=-1, random_state=42)
    else:
        grid_reg = GridSearchCV(reg, params, scoring=scoring, cv=cv, n_jobs=-1)

    # Run the search on the training split, early-stopping on the hold-out split.
    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
    # deprecated since xgboost 1.6 and dropped in later releases; move them to
    # the XGBRegressor constructor when upgrading.
    # NOTE(review): the hold-out split also drives early stopping, so the
    # hold-out score printed below is optimistic.
    grid_reg.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='rmse',
                 early_stopping_rounds=30)

    # Report the best parameter combination.
    best_params = grid_reg.best_params_
    print("최상의 매개변수:", best_params)

    # best_score_ is the negated mean CV RMSE on the training folds; flip the
    # sign and normalise by the training target so it is a positive NRMSE.
    best_score = -grid_reg.best_score_ / np.mean(np.abs(np.asarray(y_train)))
    print("훈련 점수: {:.3f}".format(best_score))

    # Predict the hold-out split with the refitted best estimator.
    y_pred = grid_reg.predict(X_test)

    # Take the square root explicitly instead of relying on `squared=False`,
    # which recent scikit-learn releases no longer accept.
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    nrmse_test = rmse_test / np.mean(np.abs(np.asarray(y_test)))
    print('테스트 점수: {:.3f}'.format(nrmse_test))

In [49]:
# Exhaustive grid search (random defaults to False) over n_estimators only.
grid_search(params={'n_estimators':[2, 25, 50, 75, 100]})

[0]	validation_0-rmse:0.70672
[1]	validation_0-rmse:0.55846
[2]	validation_0-rmse:0.46775
[3]	validation_0-rmse:0.41540
[4]	validation_0-rmse:0.38657
[5]	validation_0-rmse:0.37094
[6]	validation_0-rmse:0.36278
[7]	validation_0-rmse:0.35847
[8]	validation_0-rmse:0.35618
[9]	validation_0-rmse:0.35493
[10]	validation_0-rmse:0.35431
[11]	validation_0-rmse:0.35405
[12]	validation_0-rmse:0.35397
[13]	validation_0-rmse:0.35398
[14]	validation_0-rmse:0.35390
[15]	validation_0-rmse:0.35388




[16]	validation_0-rmse:0.35387
[17]	validation_0-rmse:0.35395
[18]	validation_0-rmse:0.35411
[19]	validation_0-rmse:0.35423
[20]	validation_0-rmse:0.35411
[21]	validation_0-rmse:0.35403
[22]	validation_0-rmse:0.35408
[23]	validation_0-rmse:0.35393
[24]	validation_0-rmse:0.35390
최상의 매개변수: {'n_estimators': 25}
훈련 점수: 0.349
테스트 점수: 0.354


In [51]:
# Exhaustive grid search over max_depth and n_estimators jointly
# (7 x 3 = 21 parameter combinations).
grid_search(params={'max_depth':[1, 2, 3, 4, 6, 7, 8], 
                    'n_estimators':[2, 50, 100]})

[0]	validation_0-rmse:0.70701
[1]	validation_0-rmse:0.55944
[2]	validation_0-rmse:0.46922
[3]	validation_0-rmse:0.41675
[4]	validation_0-rmse:0.38769
[5]	validation_0-rmse:0.37212
[6]	validation_0-rmse:0.36381
[7]	validation_0-rmse:0.35928
[8]	validation_0-rmse:0.35683
[9]	validation_0-rmse:0.35521
[10]	validation_0-rmse:0.35426
[11]	validation_0-rmse:0.35352
[12]	validation_0-rmse:0.35306
[13]	validation_0-rmse:0.35278
[14]	validation_0-rmse:0.35261
[15]	validation_0-rmse:0.35241
[16]	validation_0-rmse:0.35221
[17]	validation_0-rmse:0.35207
[18]	validation_0-rmse:0.35191
[19]	validation_0-rmse:0.35182
[20]	validation_0-rmse:0.35176
[21]	validation_0-rmse:0.35156
[22]	validation_0-rmse:0.35158
[23]	validation_0-rmse:0.35147
[24]	validation_0-rmse:0.35140
[25]	validation_0-rmse:0.35138
[26]	validation_0-rmse:0.35136
[27]	validation_0-rmse:0.35136
[28]	validation_0-rmse:0.35130
[29]	validation_0-rmse:0.35127
[30]	validation_0-rmse:0.35123
[31]	validation_0-rmse:0.35119
[32]	validation_0-



[40]	validation_0-rmse:0.35097
[41]	validation_0-rmse:0.35096
[42]	validation_0-rmse:0.35097
[43]	validation_0-rmse:0.35097
[44]	validation_0-rmse:0.35094
[45]	validation_0-rmse:0.35092
[46]	validation_0-rmse:0.35095
[47]	validation_0-rmse:0.35092
[48]	validation_0-rmse:0.35091
[49]	validation_0-rmse:0.35091
최상의 매개변수: {'max_depth': 2, 'n_estimators': 50}
훈련 점수: 0.347
테스트 점수: 0.351


In [65]:
import random
# Randomized search (random=True): grid_search samples candidates from this
# space via RandomizedSearchCV instead of trying every combination.
grid_search(params={'subsample':[0.5, 0.6, 0.7, 0.8, 0.9, 1], 
                    'min_child_weight':[ 4, 5,6,7], 
                    'learning_rate':[0.1,0.05,0.01,0.005,0.001], 
                    'max_depth':[5,6,7,8, None], 
                    'n_estimators':[75, 100,150,200,300,500,1000]}, random=True)

In [None]:
import tensorflow as tf
def my_loss(y_true, y_pred):
    """Custom loss: RMSE normalised by the mean absolute ground-truth value.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Prediction tensor, broadcast-compatible with ``y_true``.

    Returns:
        Scalar tensor ``sqrt(mean((y_true - y_pred)^2)) / mean(|y_true|)``,
        with both means taken over all elements.
    """
    squared_residuals = tf.math.square(y_true - y_pred)
    root_mean_square = tf.math.sqrt(tf.reduce_mean(squared_residuals))
    target_scale = tf.reduce_mean(tf.math.abs(y_true))
    return root_mean_square / target_scale

def my_metric(y_true, y_pred):
    """Metric counterpart of ``my_loss``: RMSE divided by mean(|y_true|).

    This used to be a line-for-line copy of ``my_loss``. It now delegates to
    it so the loss and the reported metric cannot drift apart. It stays a
    separate function so it is reported under its own name.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Prediction tensor, broadcast-compatible with ``y_true``.

    Returns:
        Scalar NRMSE tensor, identical to ``my_loss(y_true, y_pred)``.
    """
    return my_loss(y_true, y_pred)

In [None]:
# Second entry of model.evaluate()'s result for each fold
# (presumably the compiled metric - TODO confirm against model.compile).
loss_fold = []

# NOTE(review): `kfold` is created in a later cell and `model` is not defined
# in the visible cells; both must exist before this cell runs.
# NOTE(review): the same `model` object is fitted in every fold, so weights
# carry over between folds unless the model is rebuilt - confirm intent.
for train, test in kfold.split(X_train):

    # kfold.split yields positional row indices, so rows are selected with
    # .iloc. Label-based .loc breaks when the index is not 0..n-1, e.g. after
    # train_test_split.
    model.fit(X_train.iloc[train], y_train.iloc[train], epochs=20 , batch_size = 64)

    scores = model.evaluate(X_train.iloc[test], y_train.iloc[test], verbose=0)
    loss_fold.append(scores[1])

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
# 5-fold splitter with shuffling. random_state is pinned (42, as elsewhere in
# this notebook) so the fold assignment does not depend on how much of the
# global NumPy RNG state earlier cells consumed.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)