## Import

In [15]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor


In [16]:
def seed_everything(seed):
    """Seed the random sources this notebook touches.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both Python's ``random`` module and NumPy's global generator with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(42) # fix the seed

## 정규화 함수

In [17]:
def mean_norm(df_input):
    """Standardise every column of a pandas DataFrame.

    Each column has its own mean subtracted and is divided by its own
    ``std()`` (pandas' default). A column whose standard deviation is 0
    comes out as all-NaN because of the 0/0 division.
    """
    def _standardise(column):
        centred = column - column.mean()
        return centred / column.std()

    return df_input.apply(_standardise, axis=0)
def minmax_norm(df_input):
    """Min-max scale every column of a pandas DataFrame.

    Each column is shifted by its minimum and divided by its (max - min)
    range. A constant column has a range of 0 and therefore comes out as
    all-NaN (0/0).
    """
    col_min = df_input.min()
    col_range = df_input.max() - col_min
    return (df_input - col_min) / col_range
def quantile_norm(df_input):
    """Quantile-normalise a pandas DataFrame across its columns.

    Steps:
      1. sort every column independently;
      2. average the sorted values row-wise, giving one reference value per
         rank (indexed 1..n_rows);
      3. replace every original value by the reference value of its
         within-column rank (ties share the lowest rank, ``method="min"``).
    """
    column_sorted = np.sort(df_input.values, axis=0)
    sorted_frame = pd.DataFrame(column_sorted, index=df_input.index, columns=df_input.columns)

    # Reference value for rank r lives at label r (ranks start at 1).
    rank_means = sorted_frame.mean(axis=1)
    rank_means.index = np.arange(1, len(rank_means) + 1)

    # Long format lets a single Series.map look up every cell's rank at once.
    long_ranks = df_input.rank(method="min").stack().astype(int)
    return long_ranks.map(rank_means).unstack()



In [18]:
from sklearn import metrics 
def lg_nrmse(gt, preds):
    """Weighted sum of per-target normalised RMSE over the first 14 columns.

    For each of the first 14 columns the RMSE between ``gt`` and ``preds`` is
    divided by the mean absolute ground-truth value of that column. The first
    8 columns (Y_01 ~ Y_08 per the original comment) are weighted 1.2, the
    remaining 6 are weighted 1.0.

    The RMSE is computed with NumPy instead of
    ``sklearn.metrics.mean_squared_error(..., squared=False)``: the ``squared``
    argument was deprecated in scikit-learn 1.4 and removed in 1.6, so the
    previous call fails on current releases.

    Parameters
    ----------
    gt, preds : array-like of shape (n_samples, >=14)
        Ground truth and predictions; columns beyond the 14th are ignored.

    Returns
    -------
    numpy.float64
        The weighted NRMSE sum (lower is better).

    Raises
    ------
    IndexError
        If either input is not 2-D or has fewer than 14 columns.
    ValueError
        If the inputs have a different number of rows.
    """
    n_targets = 14   # number of scored target columns
    n_weighted = 8   # leading columns that receive the 1.2 weight

    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)
    for label, arr in (("gt", gt), ("preds", preds)):
        if arr.ndim != 2 or arr.shape[1] < n_targets:
            raise IndexError(
                f"{label} must be 2-D with at least {n_targets} columns, got shape {arr.shape}"
            )
    if gt.shape[0] != preds.shape[0]:
        raise ValueError(
            f"gt and preds must have the same number of rows, got {gt.shape[0]} and {preds.shape[0]}"
        )

    gt = gt[:, :n_targets]
    preds = preds[:, :n_targets]

    # Column-wise RMSE, then normalise by the mean magnitude of each target.
    rmse = np.sqrt(np.mean((gt - preds) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt), axis=0)

    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:n_targets])
    return score

## Data Load
- 평균정규화로 Pandas 데이터 프레임 정규화

In [57]:

# Read the full training table from the working directory; it is split into
# training and validation parts in the next cell.
train_df_1 = pd.read_csv('./train.csv')

In [58]:
from sklearn.model_selection import train_test_split
# 80/20 train/validation split. random_state is pinned (same value passed to
# seed_everything) so that re-running this cell always yields the same split;
# without it the split depends on how far the global NumPy RNG has advanced.
train_df, valid_df = train_test_split(train_df_1, train_size=0.8, random_state=42)

In [59]:
# Standardise the X features. The mean/std come from the training split only
# and are reused for the validation split, so both splits share one scale and
# no validation statistics enter the preprocessing.
train_x_raw = train_df.filter(regex='X')
valid_x_raw = valid_df.filter(regex='X')
x_mean, x_std = train_x_raw.mean(), train_x_raw.std()

# Columns containing NaN after scaling (e.g. zero-variance ones, 0/0) are dropped.
train_x = mean_norm(train_x_raw).dropna(axis=1)  # Input : X features
train_y = train_df.filter(regex='Y')  # Output : Y features

# Apply the training transform and keep exactly the columns the model is fit on.
valid_x = ((valid_x_raw - x_mean) / x_std)[train_x.columns]  # Input : X features
valid_y = valid_df.filter(regex='Y')  # Output : Y features

## xgb Model Fit

In [60]:
import xgboost as xgb

# Wrap an XGBoost regressor in MultiOutputRegressor and fit it on the features
# and targets prepared in the preceding cell.
XGB = MultiOutputRegressor(
    xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.08,
        gamma=0,
        subsample=0.75,
        colsample_bytree=1,
        max_depth=7,
    )
)
XGB.fit(train_x, train_y)

# Score the validation split with lg_nrmse.
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values, valid_preds)

print("------현재 모델", XGB, '-----리더보드 스코어  ', score)
print('Done.')

------현재 모델 MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.08, max_bin=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=7,
  

## Data Load
- 최소-최대정규화로 Pandas 데이터 프레임 정규화

In [61]:
# Min-max scale the X features. The min and range come from the training split
# only and are reused for the validation split, so both splits share one scale
# and no validation statistics enter the preprocessing.
train_x_raw = train_df.filter(regex='X')
valid_x_raw = valid_df.filter(regex='X')
x_min = train_x_raw.min()
x_range = train_x_raw.max() - x_min

# Columns containing NaN after scaling (e.g. constant ones, 0/0) are dropped.
train_x = minmax_norm(train_x_raw).dropna(axis=1)  # Input : X features
train_y = train_df.filter(regex='Y')  # Output : Y features

# Apply the training transform and keep exactly the columns the model is fit on.
valid_x = ((valid_x_raw - x_min) / x_range)[train_x.columns]  # Input : X features
valid_y = valid_df.filter(regex='Y')  # Output : Y features

## xgb Model Fit

In [62]:
import xgboost as xgb

# Wrap an XGBoost regressor in MultiOutputRegressor and fit it on the features
# and targets prepared in the preceding cell.
XGB = MultiOutputRegressor(
    xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.08,
        gamma=0,
        subsample=0.75,
        colsample_bytree=1,
        max_depth=7,
    )
)
XGB.fit(train_x, train_y)

# Score the validation split with lg_nrmse.
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values, valid_preds)

print("------현재 모델", XGB, '-----리더보드 스코어  ', score)
print('Done.')

------현재 모델 MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.08, max_bin=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=7,
  

## Data Load
- quantile 정규화로 Pandas 데이터 프레임 정규화

In [63]:
# Quantile-normalise the X features of each split and drop any column that
# contains NaN afterwards; the Y columns are used as-is.
# NOTE(review): each split is normalised against its own reference distribution
# and has its columns dropped independently - confirm this is intended, since
# the model is fit on train_x and evaluated on valid_x.
train_x, valid_x = (
    quantile_norm(split.filter(regex='X')).dropna(axis=1)  # Input : X features
    for split in (train_df, valid_df)
)
train_y, valid_y = (
    split.filter(regex='Y')  # Output : Y features
    for split in (train_df, valid_df)
)

## xgb Model Fit

In [64]:
import xgboost as xgb

# Wrap an XGBoost regressor in MultiOutputRegressor and fit it on the features
# and targets prepared in the preceding cell.
XGB = MultiOutputRegressor(
    xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.08,
        gamma=0,
        subsample=0.75,
        colsample_bytree=1,
        max_depth=7,
    )
)
XGB.fit(train_x, train_y)

# Score the validation split with lg_nrmse.
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values, valid_preds)

print("------현재 모델", XGB, '-----리더보드 스코어  ', score)
print('Done.')

------현재 모델 MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.08, max_bin=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=7,
  

## Submit

In [None]:
# Load the test table and discard its 'ID' column, keeping the remaining columns.
test_x = pd.read_csv('./test.csv').drop(columns='ID')

In [45]:
# Load the submission template; its non-'ID' columns are overwritten with
# predictions in the next cell.
submit = pd.read_csv('./sample_submission.csv')

In [46]:

# `preds` was never assigned anywhere in this notebook, so this cell raised a
# NameError on a fresh "Restart & Run All". Predict the test set here instead.
# `XGB` and `train_x` are whatever the most recent fit cell produced, i.e. the
# model trained on quantile-normalised features, so the test features get the
# same normalisation and the same column selection.
# NOTE(review): confirm that the quantile-normalised model is the one meant
# for submission.
test_features = quantile_norm(test_x.filter(regex='X'))[train_x.columns]
preds = XGB.predict(test_features)

# Fill the target columns in order, skipping 'ID' wherever it sits in the
# template (the previous idx-1 indexing was only right when 'ID' came first).
target_cols = [col for col in submit.columns if col != 'ID']
for position, col in enumerate(target_cols):
    submit[col] = preds[:, position]
print('Done.')

Done.


In [47]:
# Write the filled-in submission to disk without the DataFrame index column.
submit.to_csv('./submit.csv', index=False)