### Import Libraries

In [1]:
import pandas as pd
import random
import os
import numpy as np
from functools import partial
from lightgbm import LGBMRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

In [2]:
def seed_everything(seed):
    """
    @Description: seed every random source this notebook touches
    @Param: seed, integer used for Python's `random`, NumPy's global RNG and PYTHONHASHSEED
    @Return: None
    """
    # NOTE(review): PYTHONHASHSEED is only written to the environment here;
    # presumably it affects interpreters started afterwards, not this kernel — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [3]:
def dataset_split_X_y(df):
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    # `filter(regex=...)` performs a search anywhere in the column name, so the
    # patterns are anchored to select only names that START with X / Y.
    xs = df.filter(regex='^X')  # Input : X features
    ys = df.filter(regex='^Y')  # Output : Y features
    return xs, ys

In [4]:
def zero_variance(df):
    """
    @Description: find the constant columns of a dataframe
    @Param1: df, pandas dataframe
    @Return: list of column names whose variance is exactly 0, in column order
    """
    return [name for name in df.columns if df[name].var() == 0]

In [5]:
def get_top_correlation(df, n=10):
    """
    @Description: rank distinct column pairs by absolute correlation
    @Param1: df, pandas dataframe
    @Param2: n, number of top pairs to return
    @Return: pandas series indexed by (column, column) pairs, sorted descending
    """
    names = df.columns
    # Self-pairs and the (later, earlier) half of the symmetric matrix are
    # redundant, so only (earlier, later) pairs survive the drop below.
    redundant = {
        (names[row], names[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }
    abs_corr = df.corr().abs().unstack()
    ranked = abs_corr.drop(labels=redundant).sort_values(ascending=False)
    return ranked[0:n]

In [6]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project (weighted sum of per-target NRMSE)
    @Params1: gt, pandas dataframe (or 2-D array-like) of true values, one column per target
    @Param2: preds, pandas dataframe (or 2-D array-like) of predictions, same shape as gt
    @Return: nrmse score (float); lower is better
    @Raises: ValueError when gt and preds do not have the same shape
    """
    n_targets = 14   # only the first 14 target columns are scored
    n_weighted = 8   # the first 8 of them get a 20% higher weight

    # Work on plain arrays so rows are compared by position; a dataframe slice
    # handed over by cross-validation keeps its original index labels.
    gt_values = pd.DataFrame(gt).to_numpy(dtype=float)[:, :n_targets]
    pred_values = pd.DataFrame(preds).to_numpy(dtype=float)[:, :n_targets]
    if gt_values.shape != pred_values.shape:
        raise ValueError(
            "gt and preds must have the same shape, got {} and {}".format(
                gt_values.shape, pred_values.shape))

    # Per-target RMSE, computed directly instead of through
    # mean_squared_error(..., squared=False).
    rmse = np.sqrt(np.mean((gt_values - pred_values) ** 2, axis=0))
    # Normalise by the mean absolute true value of each target.
    all_nrmse = rmse / np.mean(np.abs(gt_values), axis=0)

    # Sum of the per-target NRMSE, with weight 1.2 on the first 8 targets.
    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:])
    return float(score)

In [26]:
# Read the training CSV and split it straight away: columns matching 'X' become
# the feature frame, columns matching 'Y' the target frame.
train_x, train_y = dataset_split_X_y(pd.read_csv('data/train.csv'))

# The test CSV is kept whole at this point.
test_x = pd.read_csv('data/test.csv')


In [None]:

# Constant training features are removed from both frames.
cols_with_zero_variance = zero_variance(train_x)
train_x = train_x.drop(columns=cols_with_zero_variance)
test_x = test_x.drop(columns=cols_with_zero_variance)

# Take the single most correlated feature pair and remove its second member.
highly_correlated = [pair[1] for pair in get_top_correlation(train_x, 1).index]
train_x = train_x.drop(columns=highly_correlated)

# The test frame additionally loses its 'ID' column.
test_x = test_x.drop(columns=highly_correlated).drop(columns='ID')

In [27]:
def outlier_iqr(tmp, i, flag_col='X_57'):
    """
    @Description: flag rows whose value in column `i` is not strictly inside the
                  1.5 * IQR fence; flags accumulate across calls (a row that is
                  already flagged stays flagged)
    @Param1: tmp, pandas dataframe; it is modified in place
    @Param2: i, name of the column to inspect
    @Param3: flag_col, name of the 0/1 flag column (created when missing)
    @Return: the same dataframe object, with `flag_col` updated
    """
    data = tmp[i]

    q25, q75 = np.percentile(data, 25), np.percentile(data, 75)
    print(q75)
    print(q25)
    iqr = q75 - q25

    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    print('변수 명 : ', i)
    print('IQR : ', iqr)
    print('lower bound : ', lower)
    print('upper bound : ', upper)

    # A row is inside the fence only if lower < value < upper; values equal to
    # a bound (and NaN) count as outliers.
    inside_fence = ((data > lower) & (data < upper)).to_numpy()
    if flag_col in tmp.columns:
        already_flagged = (tmp[flag_col] != 0).to_numpy()
    else:
        already_flagged = np.zeros(len(tmp), dtype=bool)
    tmp[flag_col] = np.where(inside_fence & ~already_flagged, 0, 1)

    # The label names the column that is actually counted.
    print("tmp['{}'].value_counts() : ".format(flag_col), tmp[flag_col].value_counts())
    return tmp

In [28]:
# Every training row starts with outlier flag 0; outlier_iqr later switches
# the flag to 1 for rows outside the IQR fence.
X_57 = pd.DataFrame([0] * train_x.shape[0])
train_x['X_57'] = X_57

In [29]:
# Second attempt (features chosen by correlation with y), "fixed 2".
# Columns whose IQR outliers are folded into the X_57 flag.
# NOTE(review): `cols` is assigned again in later cells; on a top-to-bottom run
# the last assignment wins, while the saved output lists only X_41..X_45.
cols = [f"X_{num}" for num in range(41, 46)]


In [18]:
# Second attempt (features chosen by correlation with y), "fixed 2".
# NOTE(review): one of several competing `cols` definitions in this notebook;
# whichever assignment runs last before the outlier loop is the one used.
cols = [
    f"X_{num}"
    for num in (14, 15, *range(17, 23), *range(24, 30), *range(39, 46))
]


In [19]:
# Widest candidate list for the IQR outlier flag.
# NOTE(review): this is the last assignment to `cols` in file order, so a
# top-to-bottom run uses this list for the outlier loop.
cols = [
    f"X_{num}"
    for num in (11, *range(14, 23), *range(24, 38), *range(39, 46))
]

In [30]:
# Fold the IQR outliers of every column in `cols` into train_x['X_57'].
# outlier_iqr updates the frame it is given and returns that same frame, and it
# keeps flags set by earlier columns, so the flag accumulates over the loop.
for i in cols:
    train_x = outlier_iqr(train_x, i)

    

# Give the test frame the same all-zero flag column before flagging it.
X_57 = [0 for i in range(0, test_x.shape[0])]
X_57 = pd.DataFrame(X_57)
test_x['X_57'] = X_57

# NOTE(review): outlier_iqr derives its bounds from the frame it receives, so
# the test rows are fenced with test-set quartiles rather than the training
# ones — confirm this is intended.
for i in cols:
    test_x = outlier_iqr(test_x, i)


21.21
21.17
변수 명 :  X_41
IQR :  0.03999999999999915
lower bound :  21.110000000000003
upper bound :  21.27
tmp['X_58'].value_counts() :  0    38918
1      689
Name: X_57, dtype: int64
21.09
21.03
변수 명 :  X_42
IQR :  0.05999999999999872
lower bound :  20.940000000000005
upper bound :  21.18
tmp['X_58'].value_counts() :  0    38748
1      859
Name: X_57, dtype: int64
21.24
21.17
변수 명 :  X_43
IQR :  0.06999999999999673
lower bound :  21.065000000000005
upper bound :  21.34499999999999
tmp['X_58'].value_counts() :  0    38629
1      978
Name: X_57, dtype: int64
21.19
21.13
변수 명 :  X_44
IQR :  0.060000000000002274
lower bound :  21.039999999999996
upper bound :  21.280000000000005
tmp['X_58'].value_counts() :  0    38477
1     1130
Name: X_57, dtype: int64
0.19
0.12
변수 명 :  X_45
IQR :  0.07
lower bound :  0.014999999999999986
upper bound :  0.29500000000000004
tmp['X_58'].value_counts() :  0    38449
1     1158
Name: X_57, dtype: int64
21.21
21.17
변수 명 :  X_41
IQR :  0.03999999999999915
low

In [34]:
def objective(params):
    """
    @Description: hyperopt objective; scores one LightGBM configuration with
                  10-fold cross-validation on the global train_x / train_y
    @Param: params, dict of sampled hyper-parameters keyed by LGBMRegressor argument name
    @Return: mean cross-validated lg_nrmse (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain',
                  'scale_pos_weight', 'reg_alpha', 'reg_lambda', 'learning_rate')
    # Integer-valued parameters are cast to int before they reach the
    # estimator. Real-valued ones are kept at three decimals but stay floats,
    # instead of being handed over as formatted strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }

    model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **params))

    # greater_is_better=False makes the scorer return the negated metric, so
    # the sign is flipped back to obtain a loss to minimise.
    scorer = make_scorer(lg_nrmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x, train_y, cv=10, scoring=scorer).mean()
    print("NRMSE Loss {:.5f} params {}".format(loss, params))
    return loss

In [None]:
# Search space: one hyperopt expression per LGBMRegressor argument. The label
# passed to each hp.* call matches its key, which is what `objective` looks up.
space = dict(
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 200 TPE evaluations of `objective`, with a fixed generator for the sampler.
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=200,
    rstate=np.random.default_rng(1),
)

NRMSE Loss 1.94515 params {'n_estimators': 1350, 'max_depth': 47, 'num_leaves': 20, 'min_child_samples': 240, 'colsample_bytree': '0.715', 'subsample': '0.657', 'min_split_gain': '0.655', 'scale_pos_weight': '4.117', 'reg_alpha': '22.649', 'reg_lambda': '55.509', 'learning_rate': '0.042'}
NRMSE Loss 1.94646 params {'n_estimators': 1500, 'max_depth': 13, 'num_leaves': 90, 'min_child_samples': 110, 'colsample_bytree': '0.538', 'subsample': '0.901', 'min_split_gain': '0.652', 'scale_pos_weight': '7.202', 'reg_alpha': '1.693', 'reg_lambda': '75.762', 'learning_rate': '0.159'}
NRMSE Loss 1.95334 params {'n_estimators': 250, 'max_depth': 67, 'num_leaves': 80, 'min_child_samples': 170, 'colsample_bytree': '0.387', 'subsample': '0.902', 'min_split_gain': '0.436', 'scale_pos_weight': '8.879', 'reg_alpha': '86.379', 'reg_lambda': '88.854', 'learning_rate': '0.039'}
NRMSE Loss 1.95479 params {'n_estimators': 600, 'max_depth': 27, 'num_leaves': 60, 'min_child_samples': 120, 'colsample_bytree': '0.

In [None]:
# `best` holds the raw values picked by fmin. `objective` casts the
# integer-valued parameters to int and keeps three decimals on the rest before
# building a model, so the same conversion is applied here: the final model is
# then the configuration that was actually cross-validated.
int_params = {'n_estimators', 'max_depth', 'num_leaves', 'min_child_samples'}
best_params = {
    name: int(value) if name in int_params else round(float(value), 3)
    for name, value in best.items()
}

model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best_params))
model.fit(train_x, train_y)
preds = model.predict(test_x)

In [None]:
submit = pd.read_csv('./sample_submission.csv')

# Every column except 'ID' receives one prediction column, in order. Selecting
# the target columns by name keeps the mapping correct wherever 'ID' sits in
# the file (an index offset of -1 is only right when 'ID' is the first column).
target_cols = [col for col in submit.columns if col != 'ID']
if len(target_cols) != preds.shape[1]:
    raise ValueError(
        "sample submission has {} target columns but preds has {}".format(
            len(target_cols), preds.shape[1]))
submit[target_cols] = preds

submit.to_csv('./submission_3.csv', index = False)