### Import Libraries

In [1]:
import pandas as pd
import random
import os
import numpy as np
from functools import partial
from lightgbm import LGBMRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

In [2]:
def seed_everything(seed):
    """
    @Description: apply one seed to every random source this notebook touches
    @Param: seed, value used for `random`, NumPy's global RNG and the PYTHONHASHSEED variable
    @Return: None
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the seed as their only argument.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [3]:
def dataset_split_X_y(df):
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    # The patterns are anchored with '^': DataFrame.filter uses a regex *search*,
    # so an unanchored 'X' / 'Y' would also select any column that merely
    # contains that letter somewhere in its name.
    xs = df.filter(regex='^X')  # input: X features
    ys = df.filter(regex='^Y')  # output: Y targets
    return xs, ys

In [4]:
def zero_variance(df):
    """
    @Description: find the columns whose variance is exactly zero
    @Param1: df, pandas dataframe
    @Return: list with the names of the zero-variance columns, in column order
    """
    return [name for name in df.columns if df[name].var() == 0]

In [5]:
def get_top_correlation(df, n=10):
    """
    @Description: rank column pairs by absolute correlation
    @Param1: df, pandas dataframe
    @Param2: n, number of pairs to return
    @Return: pandas series indexed by (column, column) pairs, strongest correlation first
    """
    names = df.columns
    # Labels of the diagonal and of one triangle of the correlation matrix:
    # removing them leaves every unordered pair exactly once.
    redundant = {
        (names[row], names[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }
    abs_corr = df.corr().abs().unstack()
    ranked = abs_corr.drop(labels=redundant).sort_values(ascending=False)
    return ranked[0:n]

In [6]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project
    @Params1: gt, pandas dataframe (or 2-D array) with at least 14 target columns
    @Param2: preds, pandas dataframe (or 2-D array) with the same shape as gt
    @Return: nrmse score
    """
    # Score = weighted sum of the per-target NRMSE values;
    # the first eight targets (Y_01 ~ Y_08) carry a 20% higher weight.
    n_targets = 14
    gt_arr = np.asarray(gt, dtype=float)
    pred_arr = np.asarray(preds, dtype=float)
    if gt_arr.ndim != 2 or pred_arr.ndim != 2:
        raise ValueError("gt and preds must be two-dimensional")
    if gt_arr.shape[1] < n_targets or pred_arr.shape[1] < n_targets:
        raise IndexError("gt and preds need at least {} columns".format(n_targets))
    gt_arr = gt_arr[:, :n_targets]
    pred_arr = pred_arr[:, :n_targets]
    if gt_arr.shape != pred_arr.shape:
        raise ValueError("gt and preds must have the same number of rows")

    # RMSE is computed directly with NumPy so the metric does not depend on
    # mean_squared_error's `squared` keyword.
    rmse = np.sqrt(np.mean((gt_arr - pred_arr) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt_arr), axis=0)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:])
    return score

In [51]:
# Training table, split straight away into feature and target frames.
train_x, train_y = dataset_split_X_y(pd.read_csv('data/train.csv'))

# Test table, kept exactly as read from disk.
test_x = pd.read_csv('data/test.csv')

# Specification table for the targets (read further down through its Feature / Min / Max columns).
y_feature_spec_info = pd.read_csv('data/meta/y_feature_spec_info.csv')

In [8]:
# Preview the first rows of the target frame returned by dataset_split_X_y.
train_y.head()

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,2.056,1.456,1.68,10.502,29.632,16.083,4.276,-25.381,-25.529,-22.769,23.792,-25.47,-25.409,-25.304
1,1.446,1.184,1.268,18.507,33.179,16.736,3.229,-26.619,-26.523,-22.574,24.691,-26.253,-26.497,-26.438
2,1.251,0.665,0.782,14.082,31.801,17.08,2.839,-26.238,-26.216,-22.169,24.649,-26.285,-26.215,-26.37
3,1.464,1.079,1.052,16.975,34.503,17.143,3.144,-25.426,-25.079,-21.765,24.913,-25.254,-25.021,-25.345
4,0.983,0.646,0.689,15.047,32.602,17.569,3.138,-25.376,-25.242,-21.072,25.299,-25.072,-25.195,-24.974


In [52]:
# Build one 0/1 indicator column per target: 1 where the training value lies
# outside the [Min, Max] range listed for that target in the spec table, else 0.
train_spec_df = y_feature_spec_info
indicator_columns = []
# zip() walks the three spec columns positionally, so the lookup does not
# depend on the spec table having a 0..n-1 index.
for feature, spec_min, spec_max in zip(train_spec_df['Feature'],
                                       train_spec_df['Min'],
                                       train_spec_df['Max']):
    if feature in train_y.columns:
        out_of_spec = ~train_y[feature].between(spec_min, spec_max)
        indicator_columns.append(out_of_spec.astype(int))
# Concatenate once at the end instead of re-copying the frame on every pass.
df_indicator_y = pd.concat(indicator_columns, axis=1) if indicator_columns else pd.DataFrame()

In [53]:
# Masked copies of the indicator frame: cells that fail the condition become NaN,
# so only in-spec cells (0) survive in normal_data and only out-of-spec cells (1) in spec_data.
normal_data = df_indicator_y.where(df_indicator_y == 0)
spec_data = df_indicator_y.where(df_indicator_y == 1)

In [54]:
# Non-NaN cells per target column of spec_data, i.e. how many rows are flagged out of spec for each target.
spec_data.count()

Y_01    1476
Y_02     558
Y_03     464
Y_04     500
Y_05      91
Y_06      10
Y_07    1822
Y_08      19
Y_09      19
Y_10       5
Y_11       3
Y_12      16
Y_13      15
Y_14      13
dtype: int64

In [55]:
# Split the feature rows by the Y_02 indicator: rows flagged out of spec vs. rows within spec.
# NOTE(review): the variable names say "y_01" but the column used is 'Y_02' — confirm which target is intended.
train_x_spec_y_01 = train_x[spec_data['Y_02'] == 1]
train_x_normal_y_01 = train_x[normal_data['Y_02'] == 0]

In [56]:
# Inspect feature X_01 for the rows currently held in train_x_spec_y_01.
train_x_spec_y_01['X_01']

23       64.425
37       74.623
177      77.682
239      69.524
246      67.485
          ...  
39352    66.465
39378    68.504
39393    68.504
39583    67.485
39605    66.465
Name: X_01, Length: 558, dtype: float64

In [57]:
# For every target, list the features whose range among out-of-spec rows
# extends beyond the range observed among in-spec rows.
# NOTE: train_x_spec_y_01 / train_x_normal_y_01 are rebound on every pass, so
# after this cell they hold the split for the LAST column of train_y.
for k in train_y.columns:
    train_x_spec_y_01 = train_x[spec_data[k] == 1]
    train_x_normal_y_01 = train_x[normal_data[k] == 0]
    print(k)
    # Column-wise extremes computed once per target (vectorised) instead of
    # calling Python's built-in max()/min() on each column; an empty subset
    # yields NaN here, which simply makes the comparisons below False.
    spec_max, spec_min = train_x_spec_y_01.max(), train_x_spec_y_01.min()
    normal_max, normal_min = train_x_normal_y_01.max(), train_x_normal_y_01.min()
    for i in train_x.columns:
        if normal_max[i] < spec_max[i]:
            print('max :' ,i)
        if normal_min[i] > spec_min[i]:
            print('min :' ,i)
    print()




Y_01
min : X_14
min : X_15
min : X_16
min : X_18
max : X_51

Y_02
min : X_49

Y_03
min : X_49

Y_04
max : X_37
max : X_54

Y_05

Y_06

Y_07
min : X_41

Y_08

Y_09

Y_10

Y_11

Y_12

Y_13

Y_14



In [58]:
# Summary statistics of the feature columns at positions 10-19 for the rows in
# train_x_normal_y_01 (whichever target's split was last assigned to that name).
train_x_normal_y_01.iloc[:,10:20].describe()

Unnamed: 0,X_11,X_12,X_13,X_14,X_15,X_16,X_17,X_18,X_19,X_20
count,39594.0,39594.0,39594.0,39594.0,39594.0,39594.0,39594.0,39594.0,39594.0,39594.0
mean,0.000366,4.37323,0.143335,13.372203,13.381916,13.46386,13.51259,13.449264,3.24025,3.184512
std,0.014148,0.021808,0.025332,0.029867,0.029468,0.036741,0.023436,0.029094,0.110484,0.105266
min,0.0,4.27,0.05,13.15,13.23,13.26,13.41,13.26,2.86,2.83
25%,0.0,4.36,0.13,13.35,13.36,13.44,13.5,13.43,3.16,3.1
50%,0.0,4.37,0.14,13.37,13.38,13.47,13.51,13.45,3.22,3.18
75%,0.0,4.39,0.16,13.39,13.41,13.49,13.53,13.47,3.31,3.27
max,0.7,4.49,0.28,13.49,13.5,13.61,13.61,13.57,3.75,3.67


In [66]:
# Same column slice (positions 10-19), now for the rows in train_x_spec_y_01.
train_x_spec_y_01.iloc[:,10:20].describe()

Unnamed: 0,X_11,X_12,X_13,X_14,X_15,X_16,X_17,X_18,X_19,X_20
count,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0
mean,0.001075,4.372688,0.14586,13.370358,13.382348,13.461649,13.513799,13.449713,3.268602,3.215072
std,0.0254,0.020486,0.025122,0.029096,0.029234,0.038131,0.023544,0.028219,0.131336,0.11024
min,0.0,4.31,0.07,13.27,13.26,13.35,13.43,13.35,2.91,2.91
25%,0.0,4.36,0.13,13.35,13.36,13.44,13.5,13.43,3.17,3.12
50%,0.0,4.37,0.14,13.37,13.38,13.46,13.52,13.45,3.25,3.23
75%,0.0,4.38,0.16,13.39,13.4,13.49,13.53,13.46,3.38,3.31
max,0.6,4.44,0.25,13.46,13.45,13.56,13.57,13.53,3.6,3.45


In [67]:
# Summary statistics of the feature columns at positions 20-29 for the rows in train_x_normal_y_01.
train_x_normal_y_01.iloc[:,20:30].describe()

Unnamed: 0,X_21,X_22,X_23,X_24,X_25,X_26,X_27,X_28,X_29,X_30
count,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0
mean,3.173867,3.232196,1.0,2.115659,2.093904,2.090331,2.098221,2.118552,2.173698,1.379079
std,0.106594,0.108809,0.0,0.032451,0.033155,0.0385,0.038003,0.042768,0.046691,0.029953
min,2.83,2.85,1.0,1.83,1.96,1.98,1.99,1.93,2.02,0.57
25%,3.09,3.14,1.0,2.09,2.07,2.06,2.07,2.09,2.14,1.37
50%,3.16,3.23,1.0,2.12,2.09,2.09,2.09,2.12,2.17,1.37
75%,3.25,3.32,1.0,2.14,2.12,2.12,2.12,2.14,2.2,1.38
max,3.68,3.79,1.0,2.35,2.35,2.35,2.35,2.35,2.36,2.11


In [68]:
# Summary statistics of the feature columns at positions 20-29 for the rows in train_x_spec_y_01.
train_x_spec_y_01.iloc[:,20:30].describe()

Unnamed: 0,X_21,X_22,X_23,X_24,X_25,X_26,X_27,X_28,X_29,X_30
count,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0
mean,3.202491,3.266039,1.0,2.11672,2.093244,2.093602,2.100197,2.121541,2.175753,1.373065
std,0.121381,0.115769,0.0,0.031665,0.032717,0.039304,0.04156,0.043679,0.046513,0.053234
min,2.88,3.0,1.0,2.03,2.01,2.01,2.01,2.04,2.09,0.57
25%,3.1,3.18,1.0,2.09,2.07,2.07,2.07,2.09,2.14,1.37
50%,3.185,3.28,1.0,2.12,2.09,2.09,2.1,2.12,2.17,1.37
75%,3.3,3.36,1.0,2.14,2.12,2.12,2.12,2.14,2.2,1.38
max,3.55,3.6,1.0,2.19,2.21,2.32,2.35,2.35,2.34,1.51


In [69]:
# Summary statistics of the feature columns at positions 30-39 for the rows in train_x_normal_y_01.
train_x_normal_y_01.iloc[:,30:40].describe()

Unnamed: 0,X_31,X_32,X_33,X_34,X_35,X_36,X_37,X_38,X_39,X_40
count,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0
mean,1.571179,1.362961,1.595846,12.95025,12.920309,12.94173,12.919113,-15.904671,-15.890222,-16.572931
std,0.074738,0.03006,0.107967,0.044023,0.052232,0.047839,0.052295,0.594186,0.747688,0.343985
min,0.6,0.57,0.61,12.84,12.81,12.84,12.81,-17.09,-17.09,-17.72
25%,1.53,1.35,1.55,12.92,12.87,12.9,12.87,-16.16,-16.16,-16.81
50%,1.55,1.36,1.57,12.96,12.91,12.95,12.91,-15.99,-15.99,-16.64
75%,1.6,1.37,1.61,12.99,12.97,12.98,12.97,-15.75,-15.75,-16.4
max,7.21,2.45,7.81,13.08,13.09,13.09,13.08,32.23,-2.65,-14.8


In [70]:
# Summary statistics of the feature columns at positions 30-39 for the rows in train_x_spec_y_01.
train_x_spec_y_01.iloc[:,30:40].describe()

Unnamed: 0,X_31,X_32,X_33,X_34,X_35,X_36,X_37,X_38,X_39,X_40
count,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0
mean,1.568495,1.360663,1.587151,12.951685,12.922724,12.941631,12.920914,-15.838423,-15.844373,-16.518315
std,0.057227,0.020243,0.112019,0.044521,0.052634,0.047217,0.052147,0.673639,0.670276,0.374343
min,1.5,1.33,0.61,12.85,12.83,12.85,12.81,-16.76,-16.76,-17.36
25%,1.53,1.35,1.55,12.92,12.87,12.9,12.88,-16.14,-16.14,-16.79
50%,1.55,1.36,1.57,12.96,12.925,12.95,12.92,-15.94,-15.96,-16.605
75%,1.59,1.37,1.59,12.99,12.97,12.98,12.97,-15.64,-15.65,-16.29
max,1.88,1.47,2.16,13.04,13.06,13.04,13.03,-2.65,-2.65,-15.34


In [71]:
# Summary statistics of the feature columns at positions 40-49 for the rows in train_x_normal_y_01.
train_x_normal_y_01.iloc[:,40:50].describe()

Unnamed: 0,X_41,X_42,X_43,X_44,X_45,X_46,X_47,X_48,X_49,X_50
count,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0
mean,21.186992,21.059368,21.203696,21.160097,0.15452,1468.276166,1.0,1.0,16678.583539,130.782024
std,0.031099,0.040272,0.047227,0.042166,0.046989,2.120836,0.0,0.0,8552.008319,5.991171
min,20.73,20.79,20.8,20.93,0.0,1457.0,1.0,1.0,3382.63,21.8
25%,21.17,21.03,21.17,21.13,0.12,1469.0,1.0,1.0,13108.63,126.956647
50%,21.19,21.06,21.2,21.16,0.15,1469.0,1.0,1.0,15278.93,130.732472
75%,21.21,21.09,21.24,21.19,0.19,1469.0,1.0,1.0,17564.73,134.54736
max,21.62,21.44,21.41,21.32,0.42,1469.0,1.0,1.0,114563.63,162.619458


In [72]:
# Summary statistics of the feature columns at positions 40-49 for the rows in train_x_spec_y_01.
train_x_spec_y_01.iloc[:,40:50].describe()

Unnamed: 0,X_41,X_42,X_43,X_44,X_45,X_46,X_47,X_48,X_49,X_50
count,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0
mean,21.186953,21.056971,21.204337,21.160771,0.157849,1468.215054,1.0,1.0,16560.053513,130.252055
std,0.033132,0.04135,0.046088,0.042851,0.045367,2.169725,0.0,0.0,10618.710723,5.878046
min,21.08,20.89,21.04,21.0,0.04,1457.0,1.0,1.0,3341.83,113.547627
25%,21.17,21.03,21.17,21.13,0.12,1469.0,1.0,1.0,12175.205,126.111388
50%,21.19,21.06,21.21,21.16,0.16,1469.0,1.0,1.0,14783.88,130.313973
75%,21.21,21.08,21.23,21.2,0.19,1469.0,1.0,1.0,17264.08,133.887299
max,21.51,21.25,21.37,21.24,0.29,1469.0,1.0,1.0,114211.13,150.277661


In [73]:
# Summary statistics over every feature column for the rows in train_x_normal_y_01.
train_x_normal_y_01.describe()

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_10,...,X_47,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
count,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,...,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0,39049.0
mean,68.41161,103.320167,68.81657,1.0,102.337957,70.595443,29.394517,164.448757,225.514783,0.002407,...,1.0,1.0,16678.583539,130.782024,131.459133,138.58734,127.995251,128.017585,137.886895,128.446802
std,2.658364,0.000373,5.154366,0.0,0.548553,2.260763,7.258563,220.62927,66.638349,0.085533,...,0.0,0.0,8552.008319,5.991171,5.941484,6.474068,5.712359,5.441066,6.558856,5.450783
min,56.268,103.32,56.47,1.0,101.774,61.726,14.14,38.46,37.58,0.0,...,1.0,1.0,3382.63,21.8,21.91,23.1,21.33,21.34,22.98,21.41
25%,66.465,103.32,65.07,1.0,101.949,68.864,27.89,105.96,188.63,0.0,...,1.0,1.0,13108.63,126.956647,127.668461,134.471696,124.379934,124.696049,133.751523,125.140644
50%,68.504,103.32,67.27,1.0,102.006,69.884,28.84,115.04,234.58,0.0,...,1.0,1.0,15278.93,130.732472,131.356266,138.524881,128.027021,128.103878,137.901829,128.415753
75%,69.524,103.32,71.77,1.0,103.144,71.923,29.87,132.63,263.96,0.0,...,1.0,1.0,17564.73,134.54736,135.173489,142.704385,131.618765,131.499576,142.07549,131.842556
max,84.82,103.321,89.17,1.0,103.16,87.219,163.86,2387.44,637.49,3.6,...,1.0,1.0,114563.63,162.619458,194.513195,173.438623,152.40663,175.052891,170.15598,155.277538


In [74]:
# Summary statistics over every feature column for the rows in train_x_spec_y_01.
train_x_spec_y_01.describe()

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_10,...,X_47,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
count,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,...,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0
mean,68.442138,103.320149,69.511039,1.0,102.284437,70.72097,30.315323,164.488692,217.187832,0.005376,...,1.0,1.0,16560.053513,130.252055,131.337355,138.605408,127.794313,127.770899,137.731848,128.043417
std,2.485748,0.000356,4.877934,0.0,0.531973,2.19123,11.601906,204.086726,72.760905,0.127,...,0.0,0.0,10618.710723,5.878046,5.412467,6.170294,5.770202,4.909347,6.064247,4.953707
min,58.307,103.32,60.27,1.0,101.815,63.766,24.96,58.38,37.58,0.0,...,1.0,1.0,3341.83,113.547627,108.960072,114.213213,109.122007,113.056164,120.323552,101.781635
25%,66.465,103.32,65.77,1.0,101.93225,68.864,28.1925,107.2775,176.275,0.0,...,1.0,1.0,12175.205,126.111388,127.694327,134.810781,123.699215,124.752235,133.336472,124.771286
50%,68.504,103.32,68.87,1.0,101.998,70.904,28.885,115.565,226.36,0.0,...,1.0,1.0,14783.88,130.313973,131.15773,138.598226,127.528109,127.865219,137.789589,128.311512
75%,69.524,103.32,72.645,1.0,103.13675,71.923,29.9675,128.57,264.9175,0.0,...,1.0,1.0,17264.08,133.887299,134.592182,142.458873,131.542738,130.825727,141.862725,131.575552
max,77.682,103.321,84.07,1.0,103.16,78.042,163.86,2358.07,619.9,3.0,...,1.0,1.0,114211.13,150.277661,147.817028,158.63183,145.078482,144.995846,155.575505,140.942455


In [59]:
def outlier_iqr_upper(tmp, i):
    """
    @Description: flag rows whose value in column i lies above the upper IQR fence (Q3 + 1.5 * IQR)
    @Param1: tmp, pandas dataframe holding column i and an existing 0/1 'X_57' flag column; updated in place
    @Param2: i, name of the column to check
    @Return: tmp, the same dataframe with 'X_57' updated (rows already flagged stay flagged)
    """
    data = tmp[i]

    # Quartiles are computed once and reused for both the report and the fences.
    q25, q75 = np.percentile(data, 25), np.percentile(data, 75)
    print(q75)
    print(q25)
    iqr = q75 - q25

    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    print('변수 명 : ',i)
    print('IQR : ', iqr)
    print('lower bound : ', lower)
    print('upper bound : ', upper)

    # Work on the frame that was passed in rather than on a module-level frame,
    # so the helper can be applied to any dataframe that carries an 'X_57' column.
    flags = np.where(tmp['X_57'] == 1, 1, np.where(data > upper, 1, 0))
    print(flags)

    tmp['X_57'] = flags

    print("tmp['X_57'].value_counts() : ", tmp['X_57'].value_counts())
    return tmp

In [60]:
def outlier_iqr_lower(tmp, i):
    """
    @Description: flag rows whose value in column i lies below the lower IQR fence (Q1 - 1.5 * IQR)
    @Param1: tmp, pandas dataframe holding column i and an existing 0/1 'X_57' flag column; updated in place
    @Param2: i, name of the column to check
    @Return: tmp, the same dataframe with 'X_57' updated (rows already flagged stay flagged)
    """
    data = tmp[i]

    # Quartiles are computed once and reused for both the report and the fences.
    q25, q75 = np.percentile(data, 25), np.percentile(data, 75)
    print(q75)
    print(q25)
    iqr = q75 - q25

    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    print('변수 명 : ',i)
    print('IQR : ', iqr)
    print('lower bound : ', lower)
    print('upper bound : ', upper)

    # Work on the frame that was passed in rather than on a module-level frame,
    # so the helper can be applied to any dataframe that carries an 'X_57' column.
    tmp['X_57'] = np.where(tmp['X_57'] == 1, 1, np.where(data < lower, 1, 0))

    print("tmp['X_57'].value_counts() : ", tmp['X_57'].value_counts())
    return tmp

In [61]:
# Start the outlier flag column at 0 for every training row. A scalar is
# broadcast to all rows, so the result does not depend on index alignment
# between train_x and a separately built frame of zeros.
train_x['X_57'] = 0

In [62]:
# Second attempt (based on corr with y), fixed 2.
# cols_lower: columns checked against the lower IQR fence;
# cols_upper: columns checked against the upper IQR fence.
cols_lower = [
    "X_14", "X_15", "X_16", "X_18",
    "X_41", "X_49",
]
cols_upper = [
    "X_51", "X_37", "X_54",
]


In [18]:
# Second attempt (based on corr with y), fixed 2.
# NOTE: a later cell assigns `cols` again, replacing this list.
cols = [
    "X_14", "X_15",
    "X_17", "X_18", "X_19", "X_20", "X_21", "X_22",
    "X_24", "X_25", "X_26", "X_27", "X_28", "X_29",
    "X_39", "X_40", "X_41", "X_42", "X_43", "X_44", "X_45",
]


In [19]:
# Wider column selection: X_11, X_14-X_22, X_24-X_37 and X_39-X_45.
# NOTE: this rebinds `cols`, replacing the list assigned in the previous cell.
cols = [
    "X_{:02d}".format(num)
    for num in (11, *range(14, 23), *range(24, 38), *range(39, 46))
]

In [63]:
# Mark training rows that exceed the upper IQR fence (Q3 + 1.5 * IQR) of any
# column in cols_upper. Rows already marked 1 keep their flag.
for i in cols_upper:
    q25, q75 = np.percentile(train_x[i], [25, 75])
    iqr = q75 - q25
    cut_off = 1.5 * iqr
    lower, upper = q25 - cut_off, q75 + cut_off
    train_x['X_57'] = np.where((train_x['X_57'] == 1) | (train_x[i] > upper), 1, 0)

    

In [64]:
# Mark training rows that fall below the lower IQR fence (Q1 - 1.5 * IQR) of any
# column in cols_lower. Rows already marked 1 keep their flag.
for i in cols_lower:
    q25, q75 = np.percentile(train_x[i], [25, 75])
    iqr = q75 - q25
    cut_off = 1.5 * iqr
    lower, upper = q25 - cut_off, q75 + cut_off
    train_x['X_57'] = np.where((train_x['X_57'] == 1) | (train_x[i] < lower), 1, 0)
    

In [65]:
# Tally of the 0/1 values currently stored in the X_57 flag column of train_x.
train_x['X_57'].value_counts()

0    38357
1     1250
Name: X_57, dtype: int64

In [66]:

# Start the outlier flag column at 0 for every test row.
test_x['X_57'] = 0

# Flag test rows above the upper IQR fence of any column in cols_upper.
# The fences come from the TRAINING distribution, so X_57 is defined the same
# way for the rows the model is fitted on and the rows it predicts.
# Rows already marked 1 keep their flag.
for i in cols_upper:
    q25, q75 = np.percentile(train_x[i], [25, 75])
    iqr = q75 - q25
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    test_x['X_57'] = np.where((test_x['X_57'] == 1) | (test_x[i] > upper), 1, 0)

    

In [67]:
# Flag test rows below the lower IQR fence of any column in cols_lower.
# The fences come from the TRAINING distribution, so X_57 is defined the same
# way for the rows the model is fitted on and the rows it predicts.
# Rows already marked 1 keep their flag.
for i in cols_lower:
    q25, q75 = np.percentile(train_x[i], [25, 75])
    iqr = q75 - q25
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    test_x['X_57'] = np.where((test_x['X_57'] == 1) | (test_x[i] < lower), 1, 0)
    

In [46]:

# Remove the constant (zero-variance) training features from both frames.
cols_with_zero_variance = zero_variance(train_x)
train_x = train_x.drop(columns=cols_with_zero_variance)
test_x = test_x.drop(columns=cols_with_zero_variance)

# Second member of each of the three most strongly correlated feature pairs.
# The list is only computed here; the drops below are disabled.
highly_correlated = [second for _first, second in get_top_correlation(train_x, 3).index]
#train_x = train_x.drop(highly_correlated, axis = 1)

#test_x = test_x.drop(highly_correlated, axis = 1)
#test_x = test_x.drop('ID', axis=1)

In [68]:
def objective(params):
    """
    @Description: hyperopt objective, the 10-fold cross-validated lg_nrmse of a multi-output LightGBM model
    @Param: params, dict of raw values sampled from the search space
    @Return: mean cross-validated NRMSE (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')
    # Pass real numbers to the estimator: integers for the count-like
    # parameters, floats rounded to three decimals for the rest (the same
    # values the previous string formatting produced, but with numeric type).
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }

    model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **params))

    # make_scorer(greater_is_better=False) negates the metric, so the mean is
    # negated again to report a positive loss.
    scorer = make_scorer(lg_nrmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x, train_y, cv=10, scoring=scorer).mean()
    print("NRMSE Loss {:.5f} params {}".format(loss, params))
    return loss

In [69]:
# Search space handed to hyperopt; every key is read by objective().
space = {
    # NOTE(review): a step of 58 looks unintended (50?) — the recorded log shows
    # n_estimators=1508, above the nominal upper bound of 1500. Confirm.
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 58),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    # NOTE(review): presumably has no effect for a regressor — confirm before keeping it in the search.
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# Record the search history in an explicit Trials object: if the long-running
# search is interrupted, the finished evaluations remain available
# (e.g. through trials.argmin) instead of being lost with the call.
trials = Trials()

best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 200,
            trials = trials,
            rstate=np.random.default_rng(1))

NRMSE Loss 1.94459 params {'n_estimators': 1334, 'max_depth': 47, 'num_leaves': 20, 'min_child_samples': 240, 'colsample_bytree': '0.715', 'subsample': '0.657', 'min_split_gain': '0.655', 'scale_pos_weight': '4.117', 'reg_alpha': '22.649', 'reg_lambda': '55.509', 'learning_rate': '0.042'}
NRMSE Loss 1.94481 params {'n_estimators': 1508, 'max_depth': 13, 'num_leaves': 90, 'min_child_samples': 110, 'colsample_bytree': '0.538', 'subsample': '0.901', 'min_split_gain': '0.652', 'scale_pos_weight': '7.202', 'reg_alpha': '1.693', 'reg_lambda': '75.762', 'learning_rate': '0.159'}
NRMSE Loss 1.95297 params {'n_estimators': 232, 'max_depth': 67, 'num_leaves': 80, 'min_child_samples': 170, 'colsample_bytree': '0.387', 'subsample': '0.902', 'min_split_gain': '0.436', 'scale_pos_weight': '8.879', 'reg_alpha': '86.379', 'reg_lambda': '88.854', 'learning_rate': '0.039'}
NRMSE Loss 1.95513 params {'n_estimators': 580, 'max_depth': 27, 'num_leaves': 60, 'min_child_samples': 120, 'colsample_bytree': '0.

KeyboardInterrupt: 

In [None]:
# fmin reports raw sampled values. Cast the count-like parameters to int and
# round the rest to three decimals, mirroring the conversion objective()
# applies before it builds the model it scores.
int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
best_params = {
    key: (int(value) if key in int_keys else round(float(value), 3))
    for key, value in best.items()
}

model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best_params))
model.fit(train_x, train_y)

# test_x was read straight from the CSV and may carry columns the model was
# never trained on (e.g. an 'ID' column); predict on exactly the training
# columns, in the training order.
preds = model.predict(test_x[train_x.columns])

In [None]:
# Copy the predictions into the sample submission layout and save it.
submit = pd.read_csv('./sample_submission.csv')
for idx, col in enumerate(submit.columns):
    # Every column except 'ID' receives the prediction column one position to its left.
    if col != 'ID':
        submit[col] = preds[:, idx - 1]
submit.to_csv('./submission_3.csv', index=False)