### Import Libraries

In [1]:
import pandas as pd
import random
import os
import numpy as np
from functools import partial
from lightgbm import LGBMRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [3]:
def dataset_split_X_y(df):    
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    xs = df.filter(regex='X') # Input : X Feature
    ys = df.filter(regex='Y') # Output : Y Feature
    return xs, ys

In [4]:
def zero_variance(df):
    """
    @Description: check for zero_variance
    @Param1: df, pandas dataframe
    @Return: names of the columns with zero variance
    """
    result = []
    for col in df.columns:
        if df[col].var() == 0:
            result.append(col)
    return result

In [5]:
def get_top_correlation(df, n=10):
    """
    @Description: print out top correlated features
    @Param1: df, pandas dataframe
    @Param2: n, number of lines to print 
    @Return: pandas series
    """
    pairs = set()
    for idx1 in range(0, df.shape[1]):
        for idx2 in range(0, idx1+1):
            pairs.add((df.columns[idx1], df.columns[idx2]))
    corr = df.corr().abs().unstack()
    corr = corr.drop(labels=pairs).sort_values(ascending=False)
    return corr[0:n]

In [6]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project
    @Params1: gt, pandas dataframe
    @Param2: preds, pandas dataframe
    @Return: nrmse score
    """
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    preds = pd.DataFrame(preds)
    all_nrmse = []
    for idx in range(0,14):
        rmse = mean_squared_error(gt.iloc[:,idx], preds.iloc[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt.iloc[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

In [7]:
train_df = pd.read_csv('data/train.csv')
test_x = pd.read_csv('data/test.csv')
y_feature_spec_info = pd.read_csv('data/meta/y_feature_spec_info.csv')

train_x, train_y = dataset_split_X_y(train_df)

cols_with_zero_variance = zero_variance(train_x) # 분산이 0 (통과 여부)
train_x = train_x.drop(cols_with_zero_variance, axis = 1)
test_x = test_x.drop(cols_with_zero_variance, axis = 1)

train_x = train_x.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 (결측치 = 0, 공지사항)
test_x = test_x.drop(['X_10', 'X_11'], axis = 1)

test_x = test_x.drop('ID', axis=1)

In [8]:
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **params))
    
    loss = -cross_val_score(model, train_y, train_x, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
    print("NRMSE Loss {:.5f} params {}".format(loss, params))
    return loss

In [9]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 20000),
    'reg_lambda': hp.uniform('reg_lambda', 0, 20000),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 200,
            rstate=np.random.default_rng(1))

  0%|                                                                          | 0/200 [00:41<?, ?trial/s, best loss=?]



KeyboardInterrupt



In [None]:
best = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
    
loss = -cross_val_score(model, train_x, train_y, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
print("NRMSE Loss {:.5f} params {}".format(loss, best))

In [78]:
test_x

Unnamed: 0,X_01,X_02,X_03,X_05,X_06,X_07,X_08,X_09,X_12,X_13,...,X_45,X_46,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
0,68.504,103.321,76.67,101.867,73.963,30.51,63.57,239.80,4.37,0.15,...,0.16,1469,17227.63,138.130429,129.460682,141.506570,133.427229,129.711498,133.138096,121.859684
1,67.485,103.320,69.37,101.992,67.845,28.03,116.99,189.23,4.36,0.17,...,0.27,1462,17134.53,136.148839,128.266277,145.911745,131.196417,132.411480,133.629025,124.178623
2,69.524,103.320,68.97,101.884,77.022,29.65,205.68,214.93,4.43,0.20,...,0.14,1469,14860.83,120.447446,119.988804,132.099908,120.450155,130.051708,128.252972,114.475628
3,69.524,103.320,65.87,101.866,73.963,28.15,103.38,180.80,4.39,0.18,...,0.13,1469,15252.53,133.994695,125.069180,147.507669,123.142653,125.963665,139.666592,126.589253
4,73.603,103.321,66.67,101.891,74.983,29.92,71.20,231.93,4.38,0.12,...,0.09,1469,10752.23,137.918202,135.116192,138.600473,127.173033,137.252712,134.411335,124.020016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39603,68.504,103.320,63.97,103.157,68.864,29.49,116.35,284.16,4.35,0.14,...,0.11,1469,62123.53,127.741246,126.494312,139.119905,125.271109,128.284572,140.176945,128.292843
39604,68.504,103.320,61.37,103.137,68.864,32.29,116.28,272.41,4.34,0.16,...,0.19,1469,61844.13,127.767377,124.062809,138.238664,119.879393,127.322529,137.312047,131.570614
39605,69.524,103.320,63.67,103.149,69.884,30.00,113.05,295.54,4.38,0.16,...,0.12,1469,60277.53,128.593640,124.774037,138.659624,123.999571,126.075542,135.656132,127.671108
39606,67.485,103.321,61.77,103.148,67.845,32.05,115.05,267.26,4.37,0.16,...,0.11,1469,60236.73,121.110646,125.471699,134.989984,120.889578,129.296909,132.673977,131.882893


In [34]:

best = {'n_estimators': 1350, 'max_depth': 47, 'num_leaves': 20, 'min_child_samples': 240, 'colsample_bytree': '0.715', 'subsample': '0.657', 'min_split_gain': '0.655', 'scale_pos_weight': '4.117', 'reg_alpha': '4529.838', 'reg_lambda': '11101.877', 'learning_rate': '0.042'}
best = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}

model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
model.fit(train_x, train_y)
preds = model.predict(test_x)

In [52]:
preds = model.predict(test_x)

In [54]:
submit = pd.read_csv('data/sample_submission.csv')
for idx, col in enumerate(submit.columns):
    if col=='ID': 
        continue
    submit[col] = preds[:,idx-1]
#submit.to_csv('data/param_test.csv', index = False)

In [55]:
submit.iloc[:,1:]

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,1.432051,1.260735,1.138461,14.173415,31.537159,16.969034,3.061951,-26.033224,-26.044785,-22.133934,24.599187,-25.962838,-25.959880,-25.963732
1,1.519650,1.222405,1.147717,13.442493,31.100670,16.409455,3.165281,-26.168091,-26.173052,-22.371636,24.287345,-26.091418,-26.115986,-26.147796
2,1.442589,1.130963,1.112844,15.229491,32.498247,16.815731,3.032005,-25.818451,-25.845895,-22.004676,24.754985,-25.845643,-25.795810,-25.826802
3,1.449344,1.156537,1.036375,15.037902,32.402320,17.148886,3.028165,-25.559868,-25.649252,-21.734286,24.950788,-25.594460,-25.576386,-25.604822
4,1.323170,0.993623,0.962414,14.837016,31.814839,16.993405,3.114336,-25.769935,-25.792954,-21.991556,24.755070,-25.650565,-25.666614,-25.707264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39603,1.269251,0.986056,1.036102,13.039392,30.985015,16.745831,3.152048,-26.406107,-26.407885,-22.859235,24.429421,-26.320101,-26.343872,-26.305076
39604,1.233524,0.861514,0.986410,14.413688,31.619093,16.771652,3.182390,-26.364432,-26.354073,-22.816082,24.468589,-26.297126,-26.261742,-26.325169
39605,1.232144,0.910327,0.980275,13.206351,30.964499,16.652172,3.172926,-26.488657,-26.475605,-22.852334,24.257843,-26.442365,-26.396198,-26.412550
39606,1.196108,0.839242,0.921132,13.646885,31.110144,16.820252,3.188380,-26.390333,-26.371361,-22.838428,24.440315,-26.313766,-26.322418,-26.349635


In [56]:
submit.iloc[:,1:]
for i in submit.iloc[:,1:].columns:
    print(max(submit.iloc[:,1:][i]), min(submit.iloc[:,1:][i]))

1.6333274984228525 0.9889961150355959
1.388108822993127 0.7182677753966189
1.3121524568859466 0.6617088585400049
16.56030762677721 10.772416388505842
33.34960136860852 28.991489236839772
18.1407732400426 3.077705198441634
3.4786421080934136 2.585940741213836
-25.401634390083238 -26.986946637254
-25.432314073954466 -26.98822804939489
-21.4960922117876 -24.626283277115146
25.22593776551957 23.63229004043102
-25.33400223974844 -26.922008111711918
-25.346384054329995 -26.93149715326836
-25.35393921404075 -26.944769550983967


In [14]:
df_indicator = pd.DataFrame()
for idx in range(len(ys)):
    y_series = ~train_y[ys[idx]].between(ys_bounds[idx][0], ys_bounds[idx][1])
    df_indicator = pd.concat([df_indicator, y_series.astype(int)], axis = 1)

In [15]:
lst = []
for i in df_indicator.columns:
    lst.append(df_indicator[df_indicator[i] == 1].index)
ans=set()
for i in lst:
    for k in i:
        ans.add(k)
print(len(ans))

ans = list(ans)
ans.sort()
spec_x = train_x.loc[ans, :]

3917


In [153]:
lst = []
for i in df_indicator.columns:
    lst.append(df_indicator[df_indicator[i] == 1].index)
ans=set()
for i in lst:
    for k in i:
        ans.add(k)
print(len(ans))

ans = list(ans)
ans.sort()
spec_x = train_x.loc[ans, :]

3917


In [154]:
lst_spec_x = pd.DataFrame([0 for i in range(len(train_x))])
lst_spec_x.loc[spec_x.index] = 1

In [155]:
train_x['X_57'] = lst_spec_x

In [156]:
train_x_normal = train_x.drop(spec_x.index)
train_y_normal = train_y.drop(spec_x.index)

train_x_spec = train_x.drop(train_x_normal.index)
train_y_spec = train_y.drop(train_y_normal.index)


In [134]:
min_max_lst = pd.DataFrame()
mins = []
maxs = []

In [159]:
spec_x

Unnamed: 0,X_01,X_02,X_03,X_05,X_06,X_07,X_08,X_09,X_12,X_13,...,X_46,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56,X_57
0,70.544,103.320,67.47,101.892,74.983,29.45,62.38,245.71,4.34,0.18,...,1463,9706.03,137.043591,135.359219,147.837968,134.313475,125.605427,136.721425,125.028256,1
6,71.563,103.320,66.07,101.921,73.963,29.30,69.22,237.51,4.37,0.14,...,1469,11111.83,141.235753,132.356259,138.011690,133.733046,142.292843,132.479969,125.146511,1
9,71.563,103.320,68.97,101.990,77.022,28.97,66.88,228.22,4.35,0.14,...,1469,12461.23,133.556149,123.122143,141.628915,123.268843,130.163105,131.452740,114.120251,1
12,71.563,103.320,71.27,101.910,76.002,26.29,98.60,162.44,4.35,0.18,...,1469,16017.63,140.558987,131.504597,141.168291,135.963270,135.482075,130.598690,133.943936,1
14,68.504,103.320,66.07,101.902,72.943,28.14,69.78,239.63,4.39,0.15,...,1469,11288.63,133.464757,131.049343,149.105867,130.468000,135.834301,141.178227,126.327862,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39569,69.524,103.320,62.87,103.136,69.884,30.66,113.99,282.98,4.36,0.15,...,1469,10646.93,130.736523,126.124493,152.441049,126.511465,130.105411,137.517964,135.793003,1
39571,62.386,103.320,61.47,101.969,64.785,29.70,115.75,282.20,4.37,0.12,...,1469,11042.83,126.189549,127.862774,132.837549,130.349073,135.239231,133.047474,118.998982,1
39583,67.485,103.320,65.87,102.036,69.884,29.99,118.21,299.56,4.38,0.15,...,1469,9616.53,131.119595,134.646145,134.475389,123.865919,136.058071,136.967442,133.063221,1
39596,68.504,103.321,63.27,103.153,68.864,30.73,112.71,219.37,4.36,0.15,...,1469,62021.43,134.647397,123.303173,134.512743,120.068983,131.278179,141.322545,130.763279,1


In [152]:
max(train_x['X_01'])

84.82

In [150]:
max(train_x_normal['X_01'])

84.82

In [135]:
for i in train_x_normal.columns:
    print(i, min(train_x_normal[i]), max(train_x_normal[i]))
    mins.append(min(train_x_normal[i]))
    maxs.append(max(train_x_normal[i]))

X_01 56.268 84.82
X_02 103.32 103.321
X_03 56.47 89.17
X_05 101.774 103.16
X_06 61.726 87.219
X_07 14.14 163.86
X_08 38.46 2387.44
X_09 37.58 637.49
X_12 4.27 4.49
X_13 0.05 0.28
X_14 13.16 13.49
X_15 13.24 13.5
X_16 13.27 13.61
X_17 13.41 13.61
X_18 13.27 13.57
X_19 2.86 3.75
X_20 2.83 3.67
X_21 2.83 3.68
X_22 2.85 3.79
X_24 1.83 2.35
X_25 1.96 2.35
X_26 1.98 2.35
X_27 1.99 2.35
X_28 1.93 2.35
X_29 2.02 2.36
X_30 0.57 2.11
X_31 0.6 7.21
X_32 0.57 2.45
X_33 0.61 7.81
X_34 12.84 13.08
X_35 12.81 13.09
X_36 12.84 13.09
X_37 12.81 13.07
X_38 -17.09 32.23
X_39 -17.09 -2.65
X_40 -17.72 -14.8
X_41 20.78 21.62
X_42 20.79 21.44
X_43 20.8 21.41
X_44 20.93 21.32
X_45 0.0 0.42
X_46 1457 1469
X_49 3382.63 114563.63
X_50 21.8 162.619458
X_51 21.91 161.351391
X_52 23.1 173.438623
X_53 21.33 152.40663
X_54 21.34 152.397556
X_55 22.98 170.15598
X_56 21.41 155.277538


In [136]:
min_max_lst['Min'] = mins
min_max_lst['Max'] = maxs

In [32]:
min_max_lst.to_csv('train_spec_info.csv')

In [44]:
xs = test_x.columns
xs_bounds = list(zip(mins, maxs))

In [137]:
xs_bounds

[(56.268, 84.82),
 (103.32, 103.321),
 (56.47, 89.17),
 (101.774, 103.16),
 (61.726, 87.219),
 (14.14, 163.86),
 (38.46, 2387.44),
 (37.58, 637.49),
 (4.27, 4.49),
 (0.05, 0.28),
 (13.16, 13.49),
 (13.24, 13.5),
 (13.27, 13.61),
 (13.41, 13.61),
 (13.27, 13.57),
 (2.86, 3.75),
 (2.83, 3.67),
 (2.83, 3.68),
 (2.85, 3.79),
 (1.83, 2.35),
 (1.96, 2.35),
 (1.98, 2.35),
 (1.99, 2.35),
 (1.93, 2.35),
 (2.02, 2.36),
 (0.57, 2.11),
 (0.6, 7.21),
 (0.57, 2.45),
 (0.61, 7.81),
 (12.84, 13.08),
 (12.81, 13.09),
 (12.84, 13.09),
 (12.81, 13.07),
 (-17.09, 32.23),
 (-17.09, -2.65),
 (-17.72, -14.8),
 (20.78, 21.62),
 (20.79, 21.44),
 (20.8, 21.41),
 (20.93, 21.32),
 (0.0, 0.42),
 (1457, 1469),
 (3382.63, 114563.63),
 (21.8, 162.619458),
 (21.91, 161.351391),
 (23.1, 173.438623),
 (21.33, 152.40663),
 (21.34, 152.397556),
 (22.98, 170.15598),
 (21.41, 155.277538)]

In [131]:
train_x

Unnamed: 0,X_01,X_02,X_03,X_05,X_06,X_07,X_08,X_09,X_12,X_13,...,X_46,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56,X_57
0,70.544,103.320,67.47,101.892,74.983,29.45,62.38,245.71,4.34,0.18,...,1463,9706.03,137.043591,135.359219,147.837968,134.313475,125.605427,136.721425,125.028256,1
1,69.524,103.321,65.17,101.944,72.943,28.73,61.23,233.61,4.38,0.18,...,1463,10423.43,133.736691,135.979817,149.924692,123.630583,127.893337,143.322659,124.877308,0
2,72.583,103.320,64.07,103.153,72.943,28.81,105.77,272.20,4.36,0.15,...,1468,10948.53,132.805112,131.055355,146.814592,128.939070,127.012195,140.395688,122.238232,0
3,71.563,103.320,67.57,101.971,77.022,28.92,115.21,255.36,4.33,0.21,...,1469,15007.03,134.138760,133.239422,139.720132,132.260824,130.723186,147.624829,134.875225,0
4,69.524,103.320,63.57,101.981,70.904,29.68,103.38,241.46,4.35,0.16,...,1469,11051.03,142.728970,136.620022,134.853555,134.760252,125.647793,139.331105,123.272762,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,66.465,103.320,62.27,103.150,66.825,30.20,77.83,298.05,4.36,0.15,...,1469,60630.73,129.965741,130.807148,133.481737,125.273130,121.780933,133.780110,129.029812,0
39603,66.465,103.321,62.77,102.021,66.825,29.21,102.25,270.67,4.40,0.13,...,1458,60763.43,127.633885,120.158764,142.667802,122.465490,122.987209,143.090741,122.811413,0
39604,68.504,103.320,64.67,103.144,68.864,29.96,102.61,198.07,4.38,0.14,...,1459,8813.33,132.501286,136.893025,134.419328,129.115431,130.920147,140.489232,119.166699,0
39605,66.465,103.320,63.67,102.025,67.845,30.30,112.60,275.52,4.33,0.16,...,1469,62222.33,128.189679,121.495930,141.288011,130.141676,125.518825,136.603634,124.525929,1


In [138]:
df_indicator_x2 = pd.DataFrame()
for idx in range(len(xs)):
    x_series = ~train_x[xs[idx]].between(xs_bounds[idx][0], xs_bounds[idx][1])
    df_indicator_x2 = pd.concat([df_indicator_x2, x_series.astype(int)], axis = 1)

In [141]:
df_indicator_x2.value_counts()

X_01  X_02  X_03  X_05  X_06  X_07  X_08  X_09  X_12  X_13  X_14  X_15  X_16  X_17  X_18  X_19  X_20  X_21  X_22  X_24  X_25  X_26  X_27  X_28  X_29  X_30  X_31  X_32  X_33  X_34  X_35  X_36  X_37  X_38  X_39  X_40  X_41  X_42  X_43  X_44  X_45  X_46  X_49  X_50  X_51  X_52  X_53  X_54  X_55  X_56
0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0       39601
                                                                                                                                                                                                                                                                                          1     0     0           1
                                                                                     

In [140]:

k=0
for i in df_indicator_x2.columns:
    k += len(df_indicator_x2[df_indicator_x2[i] == 1].index)
print(k)

9


In [None]:
lst = []
for i in df_indicator_x.columns:
    lst.append(df_indicator_x1[df_indicator_x1[i] == 1].index)
ans=set()
for i in lst:
    for k in i:
        ans.add(k)
print(len(ans))

ans = list(ans)
ans.sort()


In [76]:
df_indicator_x = pd.DataFrame()
for idx in range(len(xs)):
    x_series = ~train_x[xs[idx]].between(xs_bounds[idx][0], xs_bounds[idx][1])
    df_indicator_x = pd.concat([df_indicator_x, x_series.astype(int)], axis = 1)

In [77]:
df_indicator_x = df_indicator_x.loc[:39606]

In [78]:
len(df_indicator_x[df_indicator_x==1])

39607

In [79]:
lst = []
for i in df_indicator_x.columns:
    lst.append(df_indicator_x[df_indicator_x[i] == 1].index)
ans=set()
for i in lst:
    for k in i:
        ans.add(k)
print(len(ans))

ans = list(ans)
ans.sort()
spec_test_x = test_x.loc[ans, :]

58


In [80]:
X_57 = pd.DataFrame([0 for i in range(len(test_x))])
X_57.loc[ans] = 1

In [81]:
X_57.value_counts()


0    39550
1       58
dtype: int64

In [82]:
test_x['X_57'] = X_57

In [10]:
ys = ['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 
      'Y_06', 'Y_07', 'Y_08', 'Y_09', 'Y_10', 
      'Y_11', 'Y_12', 'Y_13', 'Y_14']
ys_bounds = [[0.2, 2], [0.2, 2.1], [0.2, 2.1], 
             [7, 19], [22, 36.5], [-19.2, 19], 
             [2.4, 4], [-29.2, -24], [-29.2, -24],
             [-30.6, -20], [19.6, 26.6], [-29.2, -24],
             [-29.2, -24], [-29.2, -24]]

In [11]:
df_indicator = pd.DataFrame()
for idx in range(len(ys)):
    y_series = ~train_y[ys[idx]].between(ys_bounds[idx][0], ys_bounds[idx][1])
    df_indicator = pd.concat([df_indicator, y_series.astype(int)], axis = 1)

In [12]:
lst = []
for i in df_indicator.columns:
    lst.append(df_indicator[df_indicator[i] == 1].index)
ans=set()
for i in lst:
    for k in i:
        ans.add(k)
print(len(ans))

ans = list(ans)
ans.sort()
spec_x = train_x.loc[ans, :]

3917


In [None]:

submit.iloc[:,1:]
for i in submit.iloc[:,1:].columns:
    print(i,min(submit.loc[spec_x.index,:][i]), max(submit.loc[spec_x.index,:][i]))

In [58]:
y_feature_spec_info = pd.read_csv('data/meta/y_feature_spec_info_new.csv')

In [60]:
submit.head()

df_indicator = pd.DataFrame()

for i, k in enumerate(submit.columns):
    if k == 'ID':
        continue
    submit.head()

    y_series = ~submit.iloc[:,1:][k].between(y_feature_spec_info['Min'][i-1], y_feature_spec_info['Max'][i-1])
    if i == 1:
        df_indicator = y_series
    else:
        df_indicator = df_indicator + y_series


In [61]:
df_indicator[df_indicator==True] = 1
df_indicator[df_indicator==False] = 0

In [62]:
df_indicator.value_counts()

0    39588
1       20
dtype: int64

In [None]:
test_x['X_57'] = df_indicator
test_x['X_57'] = test_x['X_57'].astype('int')


In [20]:
test_x.dtypes

X_01    float64
X_02    float64
X_03    float64
X_05    float64
X_06    float64
X_07    float64
X_08    float64
X_09    float64
X_12    float64
X_13    float64
X_14    float64
X_15    float64
X_16    float64
X_17    float64
X_18    float64
X_19    float64
X_20    float64
X_21    float64
X_22    float64
X_24    float64
X_25    float64
X_26    float64
X_27    float64
X_28    float64
X_29    float64
X_30    float64
X_31    float64
X_32    float64
X_33    float64
X_34    float64
X_35    float64
X_36    float64
X_37    float64
X_38    float64
X_39    float64
X_40    float64
X_41    float64
X_42    float64
X_43    float64
X_44    float64
X_45    float64
X_46      int64
X_49    float64
X_50    float64
X_51    float64
X_52    float64
X_53    float64
X_54    float64
X_55    float64
X_56    float64
X_57    float64
dtype: object

In [83]:
best = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
    

In [106]:
best = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
model.fit(train_x, train_y)
preds = model.predict(test_x)

In [107]:
submit = pd.read_csv('data/sample_submission.csv')
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
submit.to_csv('data/third.csv', index = False)

In [350]:

loss = -cross_val_score(model, train_x, train_y, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
print("NRMSE Loss {:.5f} params {}".format(loss, best))

NRMSE Loss 1.89188 params {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
