In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import random
import os
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

%matplotlib inline

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [22]:
from sklearn import metrics 
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(0,14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    print(np.sum(all_nrmse[:8]))
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

# 클리핑 기법

In [23]:
num_cols = ["X_01","X_02","X_03","X_05","X_06","X_07","X_08","X_09","X_10","X_11","X_12","X_13","X_14","X_15","X_16","X_17","X_18","X_19","X_20","X_21","X_22","X_24","X_25","X_26","X_27","X_28","X_29","X_30","X_31","X_32","X_33","X_34","X_55","X_36","X_37","X_38","X_39","X_40","X_41","X_42","X_43","X_44","X_45","X_49","X_50","X_51","X_52","X_53","X_54","X_55","X_56"]
# -----------------------------------
# clipping
# -----------------------------------
# 데이터 읽어오기
from sklearn.model_selection import train_test_split

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv("./test.csv")

train_df, valid_df = train_test_split(train_df, train_size=0.8,random_state=42)

train_x = train_df.filter(regex='X')
train_y = train_df.filter(regex='Y')
test_x = test_df.filter(regex="X")

valid_x = valid_df.filter(regex='X') # Input : X Featrue
valid_y = valid_df.filter(regex='Y') # Output : Y Feature
# -----------------------------------
# 열마다 학습 데이터의 1%, 99% 지점을 확인
p01 = train_x[num_cols].quantile(0.03)
p99 = train_x[num_cols].quantile(0.97)

p01_valid = valid_x[num_cols].quantile(0.03)
p99_vaild = valid_x[num_cols].quantile(0.97)

# 1％점 이하의 값은 1%점으로, 99%점 이상의 값은 99%점으로 클리핑
train_x = train_x[num_cols].clip(p01, p99, axis=1)
test_x = test_x[num_cols].clip(p01, p99, axis=1)
valid_x = valid_x[num_cols].clip(p01_valid, p99_vaild, axis=1)
# -----------------------------------
# 학습후 리더보드 확인

import xgboost as xgb
XGB = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma = 0, subsample=0.75, colsample_bytree = 1, max_depth=7)).fit(train_x, train_y)
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)
print(score)

1.4814818182161709
1.944991409344014


In [6]:
print(score)

1.943863752481239


In [9]:
import xgboost as xgb
gsc = GridSearchCV(
        estimator=xgb.XGBRegressor(seed=42,
                         tree_method='gpu_hist',
                         gpu_id=0),
        param_grid={
                    "learning_rate": [0.08],
                    "n_estimators":[1000],
                    "max_depth": [7],
                    "gamma":[ 0.0],
                    "colsample_bytree":[1],
                    "subsample":[0.75],
                    },
                    cv=3, scoring='neg_mean_squared_error', verbose=0, n_jobs=4)

XGB = MultiOutputRegressor(gsc).fit(train_x, train_y)
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)
print(score)

Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.3min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.5min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.4min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.5min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.4min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.1min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.6min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.2min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.3min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.3min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  8.1min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  7.8min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  7.8min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  7.8min finished


1.94568329526167


# 제출

In [11]:
import xgboost as xgb
gsc = GridSearchCV(
        estimator=xgb.XGBRegressor(seed=42,
                         tree_method='gpu_hist',
                         gpu_id=0),
        param_grid={
                    "learning_rate": [0.008],
                    "n_estimators":[i for i in range(100,1001,50)],
                    "max_depth": [7],
                    "gamma":[ 0.0],
                    "colsample_bytree":[1],
                    "subsample":[0.75],
                    },
                    cv=3, scoring='neg_mean_squared_error', verbose=0, n_jobs=4)

XGB = MultiOutputRegressor(gsc).fit(train_x, train_y)
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)
print(score)

1.931473104227828


In [None]:
import xgboost as xgb
gsc = GridSearchCV(
        estimator=xgb.XGBRegressor(seed=42,
                         tree_method='gpu_hist',
                         gpu_id=0),
        param_grid={
                    "learning_rate": [0.008],
                    "n_estimators":[i for i in range(100,1001,50)],
                    "max_depth": [7],
                    "gamma":[ 0.0],
                    "colsample_bytree":[1],
                    "subsample":[0.75],
                    },
                    cv=3, scoring='neg_mean_squared_error', verbose=0, n_jobs=4)

XGB = MultiOutputRegressor(gsc).fit(train_x, train_y)
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)
print(score)

In [18]:
XGB.get_score

AttributeError: 'MultiOutputRegressor' object has no attribute 'get_score'

In [10]:
preds = XGB.predict(test_x)
submit = pd.read_csv('./sample_submission.csv')
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
print('Done.')
submit.to_csv('./submit_XGBoost_clliping.csv', index=False)

Done.
