### Import Libraries

In [12]:
import pandas as pd
import random
import os
import numpy as np
from functools import partial
from lightgbm import LGBMRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

In [13]:
def seed_everything(seed):
    """
    @Description: fix the seeds of the random sources used in this notebook
    @Param: seed, value applied to PYTHONHASHSEED, the `random` module and NumPy's global RNG
    @Return: None
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
seed_everything(42) # fix the seed

In [14]:
def dataset_split_X_y(df):
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    # Anchor the patterns at the start of the name: an unanchored 'X' / 'Y' would also
    # select any column that merely contains that letter somewhere in its name.
    xs = df.filter(regex='^X') # Input : X Feature
    ys = df.filter(regex='^Y') # Output : Y Feature
    return xs, ys

In [15]:
def zero_variance(df):
    """
    @Description: find the columns whose variance is exactly zero
    @Param1: df, pandas dataframe
    @Return: list of column names with zero variance, in column order
    """
    return [name for name in df.columns if df[name].var() == 0]

In [16]:
def get_top_correlation(df, n=10):
    """
    @Description: rank column pairs by absolute correlation
    @Param1: df, pandas dataframe
    @Param2: n, number of pairs to return
    @Return: pandas series indexed by column pair, largest absolute correlation first
    """
    names = list(df.columns)
    # Every column paired with itself and with each column before it: these entries are the
    # diagonal and the mirrored half of the correlation matrix, so they are discarded.
    redundant = {
        (later, earlier)
        for pos, later in enumerate(names)
        for earlier in names[:pos + 1]
    }
    abs_corr = df.corr().abs().unstack()
    ranked = abs_corr.drop(labels=redundant).sort_values(ascending=False)
    return ranked[:n]

In [17]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project (weighted sum of per-target NRMSE)
    @Param1: gt, ground truth with at least 14 columns (pandas dataframe or 2-D array)
    @Param2: preds, predictions with the same layout as gt (pandas dataframe or 2-D array)
    @Return: nrmse score; the first 8 targets are weighted 1.2, the remaining 6 are weighted 1.0
    @Raises: IndexError when either input is not 2-D or has fewer than 14 columns
    """
    n_targets, n_weighted = 14, 8
    # Work on plain arrays so the rows are compared by position, not by index label.
    gt_arr = np.asarray(gt, dtype=float)
    pred_arr = np.asarray(preds, dtype=float)
    if gt_arr.ndim != 2 or pred_arr.ndim != 2 or min(gt_arr.shape[1], pred_arr.shape[1]) < n_targets:
        raise IndexError("lg_nrmse expects 2-D inputs with at least {} columns".format(n_targets))
    gt_arr = gt_arr[:, :n_targets]
    pred_arr = pred_arr[:, :n_targets]
    # RMSE is computed with NumPy: the `squared=False` switch of sklearn's mean_squared_error
    # is deprecated and no longer available in recent scikit-learn releases.
    rmse = np.sqrt(np.mean((gt_arr - pred_arr) ** 2, axis=0))
    # Normalise each target's RMSE by the mean absolute value of its ground truth.
    nrmse = rmse / np.mean(np.abs(gt_arr), axis=0)
    score = 1.2 * np.sum(nrmse[:n_weighted]) + 1.0 * np.sum(nrmse[n_weighted:])
    return score

In [71]:
# Read the training table, the test inputs and the per-target spec table.
train_df = pd.read_csv('data/train.csv')
test_x = pd.read_csv('data/test.csv')
y_feature_spec_info = pd.read_csv('data/meta/y_feature_spec_info.csv')

# Separate the training table into input (X) and target (Y) frames.
train_x, train_y = dataset_split_X_y(train_df)

# Columns with zero variance in the training inputs (original note: pass/fail flags)
# are removed from both the training and the test inputs.
cols_with_zero_variance = zero_variance(train_x)
train_x = train_x.drop(columns=cols_with_zero_variance)
test_x = test_x.drop(columns=cols_with_zero_variance)

# X_10 / X_11: many missing values (original note: missing values are stored as 0, per the announcement).
train_x = train_x.drop(columns=['X_10', 'X_11'])
test_x = test_x.drop(columns=['X_10', 'X_11'])

# The identifier column is not a model input.
test_x = test_x.drop(columns='ID')

In [19]:
# Target column names Y_01 ... Y_14.
ys = ['Y_{:02d}'.format(number) for number in range(1, 15)]

# [lower, upper] limits, one pair per target, in the same order as `ys`.
ys_bounds = [
    [0.2, 2],        # Y_01
    [0.2, 2.1],      # Y_02
    [0.2, 2.1],      # Y_03
    [7, 19],         # Y_04
    [22, 36.5],      # Y_05
    [-19.2, 19],     # Y_06
    [2.4, 4],        # Y_07
    [-29.2, -24],    # Y_08
    [-29.2, -24],    # Y_09
    [-30.6, -20],    # Y_10
    [19.6, 26.6],    # Y_11
    [-29.2, -24],    # Y_12
    [-29.2, -24],    # Y_13
    [-29.2, -24],    # Y_14
]

In [20]:
# Per-target flag frame: 1 where the value lies outside its [lower, upper] limits, 0 where it is inside
# (both limits count as inside). Built in a single step instead of growing a frame with
# pd.concat inside a loop, which re-copies every earlier column on each iteration.
df_indicator = pd.DataFrame({
    target: (~train_y[target].between(lower, upper)).astype(int)
    for target, (lower, upper) in zip(ys, ys_bounds)
})


In [103]:
# Keep the flag rows where Y_01 is marked out of limits; reset_index() moves the
# original row labels into an 'index' column.
y01_flagged = df_indicator['Y_01'] == 1
df_indicator_y01 = df_indicator.loc[y01_flagged].reset_index()
print(len(df_indicator_y01))

1476


In [88]:
# Display the spec table that was read from data/meta/y_feature_spec_info.csv.
y_feature_spec_info

Unnamed: 0,Feature,Min,Max
0,Y_01,0.2,2.0
1,Y_02,0.2,2.1
2,Y_03,0.2,2.1
3,Y_04,7.0,19.0
4,Y_05,22.0,36.5
5,Y_06,-19.2,19.0
6,Y_07,2.4,4.0
7,Y_08,-29.2,-24.0
8,Y_09,-29.2,-24.0
9,Y_10,-30.6,-20.0


In [69]:
# Column-wise maximum, then minimum, of the targets over the rows where Y_01 is flagged.
y01_flagged_targets = train_y[df_indicator['Y_01'] == 1]
print(y01_flagged_targets.max(axis=0))
print()
print(y01_flagged_targets.min(axis=0))

Y_01     4.409
Y_02     3.998
Y_03     3.756
Y_04    21.462
Y_05    37.225
Y_06    18.753
Y_07     4.846
Y_08   -24.162
Y_09   -24.160
Y_10   -20.272
Y_11    26.592
Y_12   -24.300
Y_13   -24.177
Y_14   -24.211
dtype: float64

Y_01     0.017
Y_02     0.007
Y_03     0.017
Y_04     5.688
Y_05    21.868
Y_06   -19.963
Y_07     0.685
Y_08   -29.012
Y_09   -29.035
Y_10   -31.119
Y_11    20.536
Y_12   -28.835
Y_13   -28.989
Y_14   -28.996
dtype: float64


In [90]:
# Y_01: the two value ranges used for generating values below and above the limits.
# The numbers match the Y_01 limits (0.2, 2) and the Y_01 min / max printed for the flagged rows.
y_01_min = [0.017, 0.2]
y_01_max = [2.0, 4.409]

# Number of training rows that are not in the flagged Y_01 subset.
y_01_spec_len = train_x.shape[0] - df_indicator_y01.shape[0]
print(y_01_spec_len)

38131


In [172]:
new_data = pd.DataFrame()
half_len = y_01_spec_len // 2

# First half: evenly spaced values starting at y_01_min[0]; the end y_01_min[1] is not reached.
cnt = (y_01_min[1] - y_01_min[0]) / half_len
lst = [y_01_min[0] + cnt * step for step in range(half_len)]

# Remaining values: evenly spaced from y_01_max[0], using a step derived from the same half length.
cnt = (y_01_max[1] - y_01_max[0]) / half_len
lst.extend(y_01_max[0] + cnt * step for step in range(y_01_spec_len - half_len))

# Single-column frame (the column is named 'Y_02'); reset_index() adds an 'index' column.
a = pd.DataFrame({'Y_02': lst}).reset_index()

In [184]:
# Y_01 values of the rows flagged as out of limits; the original row labels stay as the index.
tmp = train_y.loc[df_indicator['Y_01'] == 1, 'Y_01']
print(len(tmp))
# Promote the Series to a one-column DataFrame.
tmp = tmp.to_frame()


1476


In [186]:
# Write the current `tmp` frame to CSV; the index is not written.
tmp.to_csv('data/y_03.csv', index = False)

In [175]:
# Plain-text dump of `tmp` for inspection.
print(tmp)

      index   Y_01
0         0  2.056
1         6  2.140
2         9  2.004
3        60  2.012
4        67  2.472
...     ...    ...
1471  39486  2.010
1472  39527  0.171
1473  39533  2.009
1474  39559  2.052
1475  39569  2.252

[1476 rows x 2 columns]


In [None]:
# NOTE: reset_index() returns a new frame and the result is not stored, so `tmp` is unchanged here.
tmp.reset_index()
# `lst` is a plain Python list when the notebook is run top to bottom, and a list has no
# reset_index(); wrapping it in a DataFrame makes this preview work whether `lst` is still
# the list or has already been converted to a DataFrame.
pd.DataFrame(lst).reset_index()

In [None]:
# Stack the observed Y_01 values in `tmp` on top of the generated values stored in a['Y_02'].
# Writing the concatenated Series back into tmp['Y_01'] cannot add rows: column assignment aligns
# on tmp's existing index, and it raises when the concatenated index contains duplicate labels.
# A new single-column frame with a fresh 0..n-1 index is built instead.
# NOTE(review): re-running this cell appends the generated values again.
tmp = pd.concat(
    [tmp[['Y_01']], a[['Y_02']].rename(columns={'Y_02': 'Y_01'})],
    axis=0,
    ignore_index=True,
)
print(tmp)

In [93]:
# Number of generated values held in `lst`.
print(len(lst))

38131


In [97]:
# Rebind `lst` from a Python list to a DataFrame built from it.
lst=pd.DataFrame(lst)
# Append those rows below the flagged-Y_01 indicator frame.
# NOTE(review): this cell is not idempotent (each run appends the rows again), and the appended
# frame presumably shares no column labels with df_indicator_y01, which would leave NaN-padded
# rows — confirm the intent.
df_indicator_y01 = pd.concat([df_indicator_y01, lst], axis=0)

In [100]:
# Row count of the indicator frame after the append.
print(len(df_indicator_y01))

39607
39607


In [179]:
# Save both frames as CSV; the index is not written.
# NOTE(review): `a` carries a column named 'Y_02' but is written to y_01.csv, while `tmp` is derived
# from train_y['Y_01'] and is written to y_02.csv — confirm the file/column naming is intended.
a.to_csv('data/y_01.csv', index = False)
tmp.to_csv('data/y_02.csv', index = False)

In [22]:
def objective(params):
    """
    @Description: hyperopt objective, the cross-validated lg_nrmse of a multi-output LightGBM model
    @Param: params, dict of hyperparameter values sampled from the search space
    @Return: mean lg_nrmse over the 10 cross-validation folds (lower is better)
    """
    integer_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # hyperopt's quniform yields floats, so the count-like parameters are cast to int.
    model_params = {key: int(params[key]) for key in integer_keys}
    # Continuous parameters are rounded to 3 decimals and kept numeric, rather than being
    # passed to the estimator as formatted strings.
    model_params.update({key: round(float(params[key]), 3) for key in float_keys})

    model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **model_params))

    # make_scorer(..., greater_is_better=False) flips the sign of the metric; the leading minus
    # turns the mean back into a positive loss for hyperopt to minimise.
    scorer = make_scorer(lg_nrmse, greater_is_better=False)
    # NOTE(review): train_y is passed as the model input and train_x as the target. This matches
    # how the model is fitted elsewhere in this notebook, so it is kept — confirm it is intended.
    loss = -cross_val_score(model, train_y, train_x, cv=10, scoring=scorer).mean()
    print("NRMSE Loss {:.5f} params {}".format(loss, model_params))
    return loss

In [23]:
# hyperopt search space: quniform entries are stepped ranges (later cast to int by `objective`),
# uniform entries are continuous ranges, and learning_rate is sampled log-uniformly in [0.01, 0.5].
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# TPE search over `space`, up to 200 evaluations of `objective`, with a fixed random state.
# NOTE(review): no `trials` object is passed, so presumably nothing is retained if the run is
# interrupted before it finishes — confirm whether results should be captured.
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 200,
            rstate=np.random.default_rng(1))

NRMSE Loss 2.59823 params {'n_estimators': 1350, 'max_depth': 47, 'num_leaves': 20, 'min_child_samples': 240, 'colsample_bytree': '0.715', 'subsample': '0.657', 'min_split_gain': '0.655', 'scale_pos_weight': '4.117', 'reg_alpha': '22.649', 'reg_lambda': '55.509', 'learning_rate': '0.042'}
NRMSE Loss 2.75810 params {'n_estimators': 1500, 'max_depth': 13, 'num_leaves': 90, 'min_child_samples': 110, 'colsample_bytree': '0.538', 'subsample': '0.901', 'min_split_gain': '0.652', 'scale_pos_weight': '7.202', 'reg_alpha': '1.693', 'reg_lambda': '75.762', 'learning_rate': '0.159'}
NRMSE Loss 2.58182 params {'n_estimators': 250, 'max_depth': 67, 'num_leaves': 80, 'min_child_samples': 170, 'colsample_bytree': '0.387', 'subsample': '0.902', 'min_split_gain': '0.436', 'scale_pos_weight': '8.879', 'reg_alpha': '86.379', 'reg_lambda': '88.854', 'learning_rate': '0.039'}
NRMSE Loss 2.82193 params {'n_estimators': 600, 'max_depth': 27, 'num_leaves': 60, 'min_child_samples': 120, 'colsample_bytree': '0.

KeyboardInterrupt: 

In [24]:
# Hyperparameters written out by hand (presumably copied from an earlier search run — confirm).
best = {
    'n_estimators': 700,
    'max_depth': 59,
    'num_leaves': 50,
    'min_child_samples': 260,
    'colsample_bytree': '0.450',
    'subsample': '0.447',
    'min_split_gain': '0.645',
    'scale_pos_weight': '7.572',
    'reg_alpha': '92.295',
    'reg_lambda': '46.259',
    'learning_rate': '0.318',
}

# One LightGBM regressor per output column.
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
# NOTE(review): the model is fitted with train_y as input and train_x as target, then predicts
# from train_y — confirm this direction is intended.
preds = model.fit(train_y, train_x).predict(train_y)


In [59]:
# Rebuild the feature column list exactly as it was built for the training inputs.
submit = pd.read_csv('data/train.csv')
submit_x, submix_y = dataset_split_X_y(submit)
cols_with_zero_variance = zero_variance(submit_x) # zero variance (original note: pass/fail flags)
submit_x = submit_x.drop(cols_with_zero_variance, axis = 1)

submit_x = submit_x.drop(['X_10', 'X_11'], axis = 1) # many missing values (original note: missing = 0, per the announcement)

# `submit_x` only holds feature columns (no 'ID'), so prediction column k belongs to feature
# column k. The previous `idx-1` offset wrote the LAST prediction column into the first
# feature and shifted every other column by one.
assert preds.shape[1] == len(submit_x.columns), "preds width does not match the feature columns"
for idx, col in enumerate(submit_x.columns):
    submit[col] = preds[:, idx]

submit.to_csv('data/submission_1.csv', index = False)