### Import Libraries

In [46]:
import pandas as pd
import random
import os
import numpy as np
from functools import partial
from lightgbm import LGBMRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [47]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [48]:
def dataset_split_X_y(df):    
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    xs = df.filter(regex='X') # Input : X Feature
    ys = df.filter(regex='Y') # Output : Y Feature
    return xs, ys

In [49]:
def zero_variance(df):
    """
    @Description: check for zero_variance
    @Param1: df, pandas dataframe
    @Return: names of the columns with zero variance
    """
    result = []
    for col in df.columns:
        if df[col].var() == 0:
            result.append(col)
    return result

In [50]:
def get_top_correlation(df, n=10):
    """
    @Description: print out top correlated features
    @Param1: df, pandas dataframe
    @Param2: n, number of lines to print 
    @Return: pandas series
    """
    pairs = set()
    for idx1 in range(0, df.shape[1]):
        for idx2 in range(0, idx1+1):
            pairs.add((df.columns[idx1], df.columns[idx2]))
    corr = df.corr().abs().unstack()
    corr = corr.drop(labels=pairs).sort_values(ascending=False)
    return corr[0:n]

In [51]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project
    @Params1: gt, pandas dataframe
    @Param2: preds, pandas dataframe
    @Return: nrmse score
    """
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    preds = pd.DataFrame(preds)
    all_nrmse = []
    for idx in range(0,14):
        rmse = mean_squared_error(gt.iloc[:,idx], preds.iloc[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt.iloc[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

In [52]:
def get_binary_target(df):
    """
    @Description: transform numeric target to binary
    @Param1 df, pandas dataframe
    @Param2 y_range, list of lists with min-max
    @return labels, binary labels
    """
    
    ys = ['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 
          'Y_06', 'Y_07', 'Y_08', 'Y_09', 'Y_10', 
          'Y_11', 'Y_12', 'Y_13', 'Y_14']
    ys_bounds = [[0.2, 2], [0.2, 2.1], [0.2, 2.1], [7, 19], [22, 36.5], [-19.2, 19], 
                 [2.4, 4], [-29.2, -24], [-29.2, -24],[-30.6, -20], [19.6, 26.6], 
                 [-29.2, -24], [-29.2, -24], [-29.2, -24]]
    labels = pd.DataFrame()
    for idx in range(len(ys)):
        y_series = ~df[ys[idx]].between(ys_bounds[idx][0], ys_bounds[idx][1], inclusive='both')
        labels = pd.concat([labels, y_series.astype(int)], axis = 1)
    return labels


In [53]:
train_df = pd.read_csv('data/train.csv')
test_x = pd.read_csv('data/test.csv')
train_x, train_y = dataset_split_X_y(train_df)

cols_with_zero_variance = zero_variance(train_x) # 분산이 0 (통과 여부)
train_x = train_x.drop(cols_with_zero_variance, axis = 1)
test_x = test_x.drop(cols_with_zero_variance, axis = 1)

train_x = train_x.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 (결측치 = 0, 공지사항)
test_x = test_x.drop(['X_10', 'X_11'], axis = 1)

test_x = test_x.drop('ID', axis=1)
y_binary_label = get_binary_target(train_y)


In [54]:

# def get_splitted_data(binary_target, col, train_x_df, train_y_df, test_size = 0.2):
    
#     train = pd.concat([train_x_df, train_y_df[col]], axis = 1) # 학습데이터에 수치형 타겟 칼럼 추가 
#     target = binary_target[col] # 칼럼 이진 데이터 (불량 vs. 정상)
#     X_train, X_test, y_train, y_test = train_test_split(train, target, random_state=1, test_size=test_size, stratify=target)
    
#     # 여기서 X_test, y_test 는 이진 데이터이므로 사용하지 않음
#     # 나눠진 데이터에서 불량/정상 데이터 비율 확인 
#     print("학습 데이터에서의 불량/정상 Ratio : ", sum(y_train ==0) / sum(y_train))
#     print("테스트 데이터에서의 불량/정상 Ratio: ", sum(y_test ==0) / sum(y_test))
    
#     train_numerical_target = X_train[col] # 나눠진 *학습* 데이터에서 수치형 데이터 다시 추출
#     train_feature = X_train.drop([col], axis = 1) # 나눠진 *학습* 데이터에서 수치형 데이터 제거

#     test_numerical_target = X_test[col] # 나눠진 *테스트* 데이터에서 수치형 데이터 다시 추출
#     test_feature = X_test.drop([col], axis = 1) # 나눠진 *테스트* 데이터에서 수치형 데이터 제거
    
#     return train_feature, train_numerical_target, test_feature, test_numerical_target


In [55]:

# train_feature, train_target, test_feature, test_target = get_splitted_data(y_binary_label, 'Y_01', train_x, train_y, test_size=0.2)


In [56]:

def get_splitted_data(binary_target, df, test_size = 0.2):
    # 불량 데이터 비율 보장하기 위한 splitting 방식
    
    list_of_idx = set() # 불량 데이터 (행) 인덱스 추출
    for col_id in binary_target.columns:  
        list_of_idx.update(binary_target[binary_target[col_id] == 1].index)
    
    df_without_id = df.drop('ID', axis=1)
    
    data_defect = df_without_id.loc[list(list_of_idx), :]  # 불량 데이터
    data_normal = df_without_id.drop(data_defect.index) # 정상 데이터
    
    # Shuffle your dataset (normal)
    data_normal = data_normal.sample(frac=1, random_state = 1)
    data_defect = data_defect.sample(frac=1, random_state = 1)
    
    # Define train size
    normal_size = int((1 - test_size) * len(data_normal))
    defect_size = int((1 - test_size) * len(data_defect))

    # Split normal dataset 
    train_normal = data_normal[:normal_size]
    test_normal = data_normal[normal_size:]
    
    # Split defect dataset 
    train_defect = data_defect[:defect_size]
    test_defect = data_defect[defect_size:]
    
    train = pd.concat([train_normal, train_defect], axis = 0)
    test = pd.concat([test_normal, test_defect], axis = 0)
    
    train_x, train_y = dataset_split_X_y(train)
    test_x, test_y = dataset_split_X_y(test)
    
    return train_x, train_y, test_x, test_y

In [57]:


train_feature, train_target, test_feature, test_target = get_splitted_data(y_binary_label, train_df, test_size=0.2)




In [58]:
train_feature.reset_index(inplace=True)
train_target.reset_index(inplace=True)
test_feature.reset_index(inplace=True)
test_target.reset_index(inplace=True)

In [59]:
test_target

Unnamed: 0,index,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,34652,1.386,0.991,0.790,15.729,33.777,17.275,3.125,-26.394,-26.285,-22.732,25.029,-26.350,-26.390,-26.234
1,7348,1.389,0.822,1.048,11.085,30.448,15.794,3.680,-26.422,-26.416,-22.598,23.508,-26.387,-26.531,-26.369
2,25160,0.984,0.778,0.849,16.335,32.390,16.971,2.829,-26.042,-26.158,-21.638,24.594,-26.203,-26.190,-26.078
3,32553,1.308,0.749,0.645,15.410,29.863,16.224,3.163,-27.083,-26.812,-22.488,23.756,-26.834,-26.770,-26.796
4,38549,1.336,0.902,1.130,17.167,34.276,15.893,3.550,-27.149,-26.833,-22.885,23.453,-26.910,-26.950,-27.043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,9940,2.042,1.649,1.348,11.911,32.050,16.518,3.585,-26.368,-26.193,-22.247,24.155,-26.285,-26.480,-26.161
7918,16217,2.278,2.454,2.030,10.951,31.540,17.808,3.023,-25.534,-25.582,-21.260,25.504,-25.600,-25.462,-25.430
7919,10392,1.422,0.949,1.299,8.453,28.597,14.933,4.081,-27.166,-26.912,-23.566,22.535,-27.169,-27.203,-27.060
7920,8615,1.572,1.366,1.168,14.760,33.716,17.515,2.386,-25.607,-25.817,-21.342,25.381,-25.578,-25.507,-25.696


In [60]:
train_feature = train_feature.iloc[:, 1:]
train_target = train_target.iloc[:, 1:]
test_feature = test_feature.iloc[:, 1:]
test_target = test_target.iloc[:, 1:]

In [61]:
test_feature

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_10,...,X_47,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
0,68.504,103.32,64.77,1,103.143,68.864,32.94,116.25,110.72,0.0,...,1,1,10233.03,135.149174,134.105244,145.366876,136.616293,125.860592,142.370633,128.030255
1,68.504,103.32,75.17,1,101.953,73.963,25.84,110.08,209.04,0.0,...,1,1,12305.73,134.015660,133.461048,149.357454,125.936964,138.031231,134.255589,128.465794
2,66.465,103.32,63.07,1,102.022,66.825,29.47,109.31,281.86,0.0,...,1,1,15599.83,119.444501,125.047764,131.281938,117.584929,116.492245,135.838688,123.104511
3,65.445,103.32,67.17,1,102.019,70.904,28.88,117.85,274.87,0.0,...,1,1,18672.03,141.844617,148.533383,150.412004,138.782515,136.802196,149.772618,142.554333
4,65.445,103.32,64.67,1,101.937,69.884,29.67,109.72,173.23,0.0,...,1,1,15522.13,133.649228,129.953167,140.925699,128.949820,133.736790,140.555874,132.495213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,68.504,103.32,67.97,1,103.152,68.864,27.36,119.35,142.84,0.0,...,1,1,25572.83,133.801549,136.006786,135.362367,126.393995,122.958874,141.654744,132.121166
7918,65.445,103.32,64.77,1,101.928,68.864,30.37,98.88,295.09,0.0,...,1,1,17750.13,125.115269,138.201679,136.646623,126.229742,121.117939,136.472821,124.747479
7919,66.465,103.32,73.27,1,101.910,71.923,31.18,113.43,217.49,0.0,...,1,1,18550.43,126.512761,123.219822,134.736719,126.809942,123.285305,143.182301,124.058140
7920,69.524,103.32,70.47,1,101.933,69.884,33.97,119.33,271.29,0.0,...,1,1,12975.83,124.199429,127.733830,138.303499,128.232634,119.337894,136.303196,122.898960


In [62]:
#cols_with_zero_variance = zero_variance(tv_train_x) # 분산이 0 (통과 여부)
#tv_train_x = tv_train_x.drop(cols_with_zero_variance, axis = 1)
#tv_valid_x = tv_train_x.drop(cols_with_zero_variance, axis = 1)

train_feature = train_feature.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 (결측치 = 0, 공지사항)
test_feature = test_feature.drop(['X_10', 'X_11'], axis = 1)

#tv_valid_x = tv_valid_x.drop('ID', axis=1)

In [81]:
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
    }
    
    model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **params))
    
    loss = -cross_val_score(model, train_y, train_x, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
    print("NRMSE Loss {:.5f} params {}".format(loss, params))
    return loss

In [82]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1000, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 50,
            rstate=np.random.default_rng(1))

NRMSE Loss 2.58854 params {'n_estimators': 900, 'max_depth': 47, 'num_leaves': 20, 'min_child_samples': 240, 'colsample_bytree': '0.715', 'subsample': '0.657', 'min_split_gain': '0.655', 'scale_pos_weight': '4.117', 'reg_alpha': '22.649', 'reg_lambda': '55.509', 'learning_rate': '0.042'}
NRMSE Loss 2.71815 params {'n_estimators': 1000, 'max_depth': 13, 'num_leaves': 90, 'min_child_samples': 110, 'colsample_bytree': '0.538', 'subsample': '0.901', 'min_split_gain': '0.652', 'scale_pos_weight': '7.202', 'reg_alpha': '1.693', 'reg_lambda': '75.762', 'learning_rate': '0.159'}
NRMSE Loss 2.57949 params {'n_estimators': 200, 'max_depth': 67, 'num_leaves': 80, 'min_child_samples': 170, 'colsample_bytree': '0.387', 'subsample': '0.902', 'min_split_gain': '0.436', 'scale_pos_weight': '8.879', 'reg_alpha': '86.379', 'reg_lambda': '88.854', 'learning_rate': '0.039'}
  6%|██▊                                           | 3/50 [07:02<1:50:21, 140.89s/trial, best loss: 2.5794933206056334]



KeyboardInterrupt



In [28]:
best = {'n_estimators': 900, 'max_depth': 47, 'num_leaves': 20, 'min_child_samples': 240, 'colsample_bytree': '0.715', 'subsample': '0.657', 'min_split_gain': '0.655', 'scale_pos_weight': '4.117', 'reg_alpha': '22.649', 'reg_lambda': '55.509', 'learning_rate': '0.042'}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
    
loss = -cross_val_score(model, train_target, train_feature, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
print("NRMSE Loss {:.5f} params {}".format(loss, best))


KeyboardInterrupt



In [None]:
print(len(tv_train_x.columns))
print(len(tv_valid_x.columns))

In [90]:
best = {'n_estimators': 900, 'max_depth': 47, 'num_leaves': 20, 'min_child_samples': 240, 'colsample_bytree': '0.715', 'subsample': '0.657', 'min_split_gain': '0.655', 'scale_pos_weight': '4.117', 'reg_alpha': '22.649', 'reg_lambda': '55.509', 'learning_rate': '0.042'}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
model.fit(train_feature, train_target)
preds = model.predict(test_feature)

In [91]:
a = pd.DataFrame(preds)

In [92]:
for i in a.columns:
    print('{}  /  Min : {}    ,   Max : {}'.format(i, min(a[i]), max(a[i])) )

0  /  Min : 1.154606104232035    ,   Max : 1.495434543247299
1  /  Min : 0.855935405935274    ,   Max : 1.194809132018322
2  /  Min : 0.8492868654208032    ,   Max : 1.148255713877774
3  /  Min : 10.734430578532928    ,   Max : 16.74577384875757
4  /  Min : 28.227846311377384    ,   Max : 33.33677397867025
5  /  Min : 2.8165006420289065    ,   Max : 19.042525010530177
6  /  Min : 2.972683196140373    ,   Max : 3.289344342419169
7  /  Min : -26.867131268929853    ,   Max : -25.62532381609436
8  /  Min : -26.828583422961938    ,   Max : -25.65329962927672
9  /  Min : -23.97215933408869    ,   Max : -21.67578312805638
10  /  Min : 23.743177633973918    ,   Max : 25.069971296176
11  /  Min : -26.764471135398395    ,   Max : -25.56694504974986
12  /  Min : -26.782111088818915    ,   Max : -25.580113837642102
13  /  Min : -26.78204687365752    ,   Max : -25.594853535195114


In [94]:
a

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.253628,0.917809,0.925049,13.645019,31.377077,16.390242,3.137287,-26.588899,-26.581720,-23.034813,24.225088,-26.537224,-26.544711,-26.512825
1,1.383685,1.139642,1.036173,14.006568,31.586416,14.803468,3.143829,-26.078688,-26.065890,-22.224363,24.500698,-25.985517,-26.009362,-26.030551
2,1.281098,1.003387,0.969184,13.319544,32.037674,16.735729,3.152385,-26.400764,-26.419480,-22.613995,24.240342,-26.382157,-26.330617,-26.370004
3,1.372201,1.069160,1.052126,12.665445,30.680151,16.343223,3.241957,-26.353382,-26.386639,-22.241651,24.106398,-26.290148,-26.278321,-26.308223
4,1.310064,1.015448,1.010559,14.593074,32.141255,16.820464,3.166319,-26.427430,-26.474987,-22.425594,24.475476,-26.410251,-26.401151,-26.379272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,1.368647,1.084917,1.001295,13.295844,31.154280,16.014930,3.095531,-26.175750,-26.220364,-22.251711,24.286942,-26.145990,-26.155009,-26.130499
7918,1.345020,1.064957,1.035966,12.439491,31.882498,16.724136,3.179389,-26.368690,-26.351999,-22.290521,24.511780,-26.338473,-26.282442,-26.304285
7919,1.336282,1.070830,1.039721,12.310210,29.537848,16.183210,3.170066,-26.341585,-26.385461,-22.515815,24.063156,-26.325829,-26.285388,-26.326577
7920,1.329804,1.031117,0.988537,15.466144,31.471088,12.344689,3.147544,-26.297785,-26.339188,-22.618268,24.174104,-26.225971,-26.220166,-26.279585


In [95]:
submit = pd.read_csv('data/validation_test_submission.csv')
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
#submit.to_csv('data/param_test.csv', index = False)

In [100]:
y_feature_spec_info = pd.read_csv('data/meta/y_feature_spec_info_new.csv')
submit.head()

df_indicator = pd.DataFrame()

for i, k in enumerate(submit.columns):
    if k == 'ID':
        continue
    y_series = ~submit[k].between(y_feature_spec_info['Min'][i-1], y_feature_spec_info['Max'][i-1])
    if i == 1:
        df_indicator = y_series
    else:
        df_indicator = df_indicator + y_series


In [101]:
submit.iloc[:,1:]

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,1.253628,0.917809,0.925049,13.645019,31.377077,16.390242,3.137287,-26.588899,-26.581720,-23.034813,24.225088,-26.537224,-26.544711,-26.512825
1,1.383685,1.139642,1.036173,14.006568,31.586416,14.803468,3.143829,-26.078688,-26.065890,-22.224363,24.500698,-25.985517,-26.009362,-26.030551
2,1.281098,1.003387,0.969184,13.319544,32.037674,16.735729,3.152385,-26.400764,-26.419480,-22.613995,24.240342,-26.382157,-26.330617,-26.370004
3,1.372201,1.069160,1.052126,12.665445,30.680151,16.343223,3.241957,-26.353382,-26.386639,-22.241651,24.106398,-26.290148,-26.278321,-26.308223
4,1.310064,1.015448,1.010559,14.593074,32.141255,16.820464,3.166319,-26.427430,-26.474987,-22.425594,24.475476,-26.410251,-26.401151,-26.379272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,1.368647,1.084917,1.001295,13.295844,31.154280,16.014930,3.095531,-26.175750,-26.220364,-22.251711,24.286942,-26.145990,-26.155009,-26.130499
7918,1.345020,1.064957,1.035966,12.439491,31.882498,16.724136,3.179389,-26.368690,-26.351999,-22.290521,24.511780,-26.338473,-26.282442,-26.304285
7919,1.336282,1.070830,1.039721,12.310210,29.537848,16.183210,3.170066,-26.341585,-26.385461,-22.515815,24.063156,-26.325829,-26.285388,-26.326577
7920,1.329804,1.031117,0.988537,15.466144,31.471088,12.344689,3.147544,-26.297785,-26.339188,-22.618268,24.174104,-26.225971,-26.220166,-26.279585


In [102]:
print(df_indicator.value_counts())
df_indicator[df_indicator==True] = 1
df_indicator[df_indicator==False] = 0

False    7896
True       26
dtype: int64


In [41]:
df_indicator

0       0
1       0
2       0
3       0
4       0
       ..
7917    0
7918    0
7919    0
7920    0
7921    0
Length: 7922, dtype: object

In [103]:
test_feature['X_57'] = df_indicator



In [104]:
test_feature

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_12,...,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56,X_57
0,68.504,103.32,64.77,1,103.143,68.864,32.94,116.25,110.72,4.38,...,1,10233.03,135.149174,134.105244,145.366876,136.616293,125.860592,142.370633,128.030255,0
1,68.504,103.32,75.17,1,101.953,73.963,25.84,110.08,209.04,4.33,...,1,12305.73,134.015660,133.461048,149.357454,125.936964,138.031231,134.255589,128.465794,0
2,66.465,103.32,63.07,1,102.022,66.825,29.47,109.31,281.86,4.35,...,1,15599.83,119.444501,125.047764,131.281938,117.584929,116.492245,135.838688,123.104511,0
3,65.445,103.32,67.17,1,102.019,70.904,28.88,117.85,274.87,4.39,...,1,18672.03,141.844617,148.533383,150.412004,138.782515,136.802196,149.772618,142.554333,0
4,65.445,103.32,64.67,1,101.937,69.884,29.67,109.72,173.23,4.36,...,1,15522.13,133.649228,129.953167,140.925699,128.949820,133.736790,140.555874,132.495213,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,68.504,103.32,67.97,1,103.152,68.864,27.36,119.35,142.84,4.36,...,1,25572.83,133.801549,136.006786,135.362367,126.393995,122.958874,141.654744,132.121166,0
7918,65.445,103.32,64.77,1,101.928,68.864,30.37,98.88,295.09,4.40,...,1,17750.13,125.115269,138.201679,136.646623,126.229742,121.117939,136.472821,124.747479,0
7919,66.465,103.32,73.27,1,101.910,71.923,31.18,113.43,217.49,4.39,...,1,18550.43,126.512761,123.219822,134.736719,126.809942,123.285305,143.182301,124.058140,0
7920,69.524,103.32,70.47,1,101.933,69.884,33.97,119.33,271.29,4.40,...,1,12975.83,124.199429,127.733830,138.303499,128.232634,119.337894,136.303196,122.898960,0


In [105]:
test_feature['X_57'] = test_feature['X_57'].astype('int')

In [106]:
test_feature.dtypes

X_01    float64
X_02    float64
X_03    float64
X_04      int64
X_05    float64
X_06    float64
X_07    float64
X_08    float64
X_09    float64
X_12    float64
X_13    float64
X_14    float64
X_15    float64
X_16    float64
X_17    float64
X_18    float64
X_19    float64
X_20    float64
X_21    float64
X_22    float64
X_23      int64
X_24    float64
X_25    float64
X_26    float64
X_27    float64
X_28    float64
X_29    float64
X_30    float64
X_31    float64
X_32    float64
X_33    float64
X_34    float64
X_35    float64
X_36    float64
X_37    float64
X_38    float64
X_39    float64
X_40    float64
X_41    float64
X_42    float64
X_43    float64
X_44    float64
X_45    float64
X_46      int64
X_47      int64
X_48      int64
X_49    float64
X_50    float64
X_51    float64
X_52    float64
X_53    float64
X_54    float64
X_55    float64
X_56    float64
X_57      int32
dtype: object

In [49]:
test_feature

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_12,...,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56,X_57
0,68.504,103.32,64.77,1,103.143,68.864,32.94,116.25,110.72,4.38,...,1,10233.03,135.149174,134.105244,145.366876,136.616293,125.860592,142.370633,128.030255,0
1,68.504,103.32,75.17,1,101.953,73.963,25.84,110.08,209.04,4.33,...,1,12305.73,134.015660,133.461048,149.357454,125.936964,138.031231,134.255589,128.465794,0
2,66.465,103.32,63.07,1,102.022,66.825,29.47,109.31,281.86,4.35,...,1,15599.83,119.444501,125.047764,131.281938,117.584929,116.492245,135.838688,123.104511,0
3,65.445,103.32,67.17,1,102.019,70.904,28.88,117.85,274.87,4.39,...,1,18672.03,141.844617,148.533383,150.412004,138.782515,136.802196,149.772618,142.554333,0
4,65.445,103.32,64.67,1,101.937,69.884,29.67,109.72,173.23,4.36,...,1,15522.13,133.649228,129.953167,140.925699,128.949820,133.736790,140.555874,132.495213,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,68.504,103.32,67.97,1,103.152,68.864,27.36,119.35,142.84,4.36,...,1,25572.83,133.801549,136.006786,135.362367,126.393995,122.958874,141.654744,132.121166,0
7918,65.445,103.32,64.77,1,101.928,68.864,30.37,98.88,295.09,4.40,...,1,17750.13,125.115269,138.201679,136.646623,126.229742,121.117939,136.472821,124.747479,0
7919,66.465,103.32,73.27,1,101.910,71.923,31.18,113.43,217.49,4.39,...,1,18550.43,126.512761,123.219822,134.736719,126.809942,123.285305,143.182301,124.058140,0
7920,69.524,103.32,70.47,1,101.933,69.884,33.97,119.33,271.29,4.40,...,1,12975.83,124.199429,127.733830,138.303499,128.232634,119.337894,136.303196,122.898960,0


In [50]:
test_target

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,1.386,0.991,0.790,15.729,33.777,17.275,3.125,-26.394,-26.285,-22.732,25.029,-26.350,-26.390,-26.234
1,1.389,0.822,1.048,11.085,30.448,15.794,3.680,-26.422,-26.416,-22.598,23.508,-26.387,-26.531,-26.369
2,0.984,0.778,0.849,16.335,32.390,16.971,2.829,-26.042,-26.158,-21.638,24.594,-26.203,-26.190,-26.078
3,1.308,0.749,0.645,15.410,29.863,16.224,3.163,-27.083,-26.812,-22.488,23.756,-26.834,-26.770,-26.796
4,1.336,0.902,1.130,17.167,34.276,15.893,3.550,-27.149,-26.833,-22.885,23.453,-26.910,-26.950,-27.043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,2.042,1.649,1.348,11.911,32.050,16.518,3.585,-26.368,-26.193,-22.247,24.155,-26.285,-26.480,-26.161
7918,2.278,2.454,2.030,10.951,31.540,17.808,3.023,-25.534,-25.582,-21.260,25.504,-25.600,-25.462,-25.430
7919,1.422,0.949,1.299,8.453,28.597,14.933,4.081,-27.166,-26.912,-23.566,22.535,-27.169,-27.203,-27.060
7920,1.572,1.366,1.168,14.760,33.716,17.515,2.386,-25.607,-25.817,-21.342,25.381,-25.578,-25.507,-25.696


In [60]:
best = {'n_estimators': 900, 'max_depth': 47, 'num_leaves': 20, 'min_child_samples': 240, 'colsample_bytree': '0.715', 'subsample': '0.657', 'min_split_gain': '0.655', 'scale_pos_weight': '4.117', 'reg_alpha': '22.649', 'reg_lambda': '55.509', 'learning_rate': '0.042'}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
    
loss = -cross_val_score(model, submit.iloc[:,1:], test_target, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
print("NRMSE Loss {:.5f} params {}".format(loss, best))

NRMSE Loss 1.87927 params {'n_estimators': 900, 'max_depth': 47, 'num_leaves': 20, 'min_child_samples': 240, 'colsample_bytree': '0.715', 'subsample': '0.657', 'min_split_gain': '0.655', 'scale_pos_weight': '4.117', 'reg_alpha': '22.649', 'reg_lambda': '55.509', 'learning_rate': '0.042'}


In [113]:
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
model.fit(test_feature, test_target)
preds = model.predict(test_feature)

In [116]:
b= pd.DataFrame(preds)

In [117]:
b

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.316435,0.991588,0.995047,14.850051,32.173498,16.493588,3.136791,-26.540139,-26.533582,-22.889096,24.202780,-26.423159,-26.431323,-26.467291
1,1.383256,1.126500,1.017209,13.107511,31.297425,16.419290,3.125470,-26.136169,-26.204127,-22.337806,24.407195,-26.122547,-26.095382,-26.136684
2,1.304580,1.011743,1.017633,13.700655,31.376411,16.516856,3.162507,-26.503798,-26.494975,-22.615982,24.210708,-26.424324,-26.419407,-26.435342
3,1.342964,1.052820,1.013550,13.278053,31.046873,16.467960,3.193613,-26.391369,-26.384055,-22.208247,24.182902,-26.326289,-26.333388,-26.314350
4,1.321002,0.989286,0.998137,14.307567,31.889490,16.498772,3.191602,-26.497967,-26.470532,-22.582583,24.245869,-26.412785,-26.412871,-26.435464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,1.393782,1.113805,1.017489,13.578492,30.869594,16.345977,3.135388,-26.192864,-26.210285,-22.272499,24.249670,-26.131847,-26.118081,-26.146348
7918,1.321585,1.031034,1.031777,11.661096,31.316874,16.877165,3.162542,-26.522273,-26.489288,-22.514508,24.319198,-26.440794,-26.427942,-26.437188
7919,1.324995,1.063633,1.018852,12.264213,30.353612,16.174260,3.111157,-26.279229,-26.322913,-22.398996,24.290252,-26.259364,-26.248024,-26.252236
7920,1.335942,1.060860,1.004451,14.911044,32.413450,15.539666,3.107898,-26.263880,-26.301083,-22.345632,24.372491,-26.210446,-26.211739,-26.212333


In [None]:
submit = submit.drop('ID', axis=1)

In [118]:
lg_nrmse(b, test_target)

1.8846963699388326

In [108]:
submit.iloc[:,1:]

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,1.253628,0.917809,0.925049,13.645019,31.377077,16.390242,3.137287,-26.588899,-26.581720,-23.034813,24.225088,-26.537224,-26.544711,-26.512825
1,1.383685,1.139642,1.036173,14.006568,31.586416,14.803468,3.143829,-26.078688,-26.065890,-22.224363,24.500698,-25.985517,-26.009362,-26.030551
2,1.281098,1.003387,0.969184,13.319544,32.037674,16.735729,3.152385,-26.400764,-26.419480,-22.613995,24.240342,-26.382157,-26.330617,-26.370004
3,1.372201,1.069160,1.052126,12.665445,30.680151,16.343223,3.241957,-26.353382,-26.386639,-22.241651,24.106398,-26.290148,-26.278321,-26.308223
4,1.310064,1.015448,1.010559,14.593074,32.141255,16.820464,3.166319,-26.427430,-26.474987,-22.425594,24.475476,-26.410251,-26.401151,-26.379272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,1.368647,1.084917,1.001295,13.295844,31.154280,16.014930,3.095531,-26.175750,-26.220364,-22.251711,24.286942,-26.145990,-26.155009,-26.130499
7918,1.345020,1.064957,1.035966,12.439491,31.882498,16.724136,3.179389,-26.368690,-26.351999,-22.290521,24.511780,-26.338473,-26.282442,-26.304285
7919,1.336282,1.070830,1.039721,12.310210,29.537848,16.183210,3.170066,-26.341585,-26.385461,-22.515815,24.063156,-26.325829,-26.285388,-26.326577
7920,1.329804,1.031117,0.988537,15.466144,31.471088,12.344689,3.147544,-26.297785,-26.339188,-22.618268,24.174104,-26.225971,-26.220166,-26.279585
