# import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn import metrics 

# seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [3]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(0,14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

# 그룹화 및 파생변수 생성

In [4]:
group_dict = {
    "Group_1" : ["X_01", "X_02", "X_05", "X_06"],
    "Group_2" : ["X_03", "X_10", "X_11"],
    "Group_3" : ["X_04", "X_23", "X_47", "X_48"],
    "Group_4" : ["X_07", "X_08", "X_09"],
    "Group_5" : ["X_12"],
    "Group_6" : ["X_13"],
    "Group_7" : ["X_14", "X_15", "X_16", "X_17", "X_18"],
    "Group_8" : ["X_19", "X_20", "X_21", "X_22"],
    "Group_9" : ["X_24", "X_25", "X_26", "X_27", "X_28", "X_29"],
    "Group_10" : ["X_30", "X_31", "X_32", "X_33"],
    "Group_11" : ["X_34", "X_35", "X_36", "X_37"],
    "Group_12" : ["X_38", "X_39", "X_40"],
    "Group_13" : ["X_41", "X_42", "X_43", "X_44"],
    "Group_14" : ["X_45"],
    "Group_15" : ["X_46"],
    "Group_16" : ["X_49"],
    "Group_17" : ["X_50", "X_51", "X_52", "X_53", "X_54", "X_55", "X_56"],
}

In [5]:
# 참고 https://hong-yp-ml-records.tistory.com/80
train_df = pd.read_csv('./train.csv')
# 파생변수 생성전 의미 없는 검사 결과 빼기
train_df = train_df.drop(columns=["X_04", "X_23", "X_47", "X_48","ID"])
# # 파생변수 생성
for i in range(1,18):
    if i != 3 and len(group_dict['Group_%d'%i]) >1 :
        Group_train = train_df[group_dict['Group_%d'%i]]
        train_df['Group_%d mean'%i] = Group_train.mean(axis = 1)
        train_df['Group_%d var'%i] = Group_train.var(axis = 1)
        train_df['Group_%d std'%i] = Group_train.std(axis = 1)
        train_df['Group_%d sum'%i] = Group_train.sum(axis = 1)
        train_df['Group_%d median'%i] = Group_train.median(axis = 1)
train_df = train_df.dropna(axis = 1)
# # 데이터 나누기
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(train_df, train_size=0.8)
train_x = train_df.filter(regex='[XG]') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='[XG]') # Input : X Featrue
valid_y = valid_df.filter(regex='Y') # Output : Y Feature

In [7]:
# MLPRegressor

from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor()
param_grid = {'hidden_layer_sizes': [i for i in range(2,20)],
              'activation': ['LeakyReLU', 'relu'],
              'solver': ['adam'],
              'learning_rate': ['constant'],
              'learning_rate_init': [0.0003],
              'power_t': [0.5],
              'alpha': [0.0001],
              'max_iter': [1000],
              'early_stopping': [True],
              'warm_start': [False]}
mlp_GS = GridSearchCV(mlp, param_grid=param_grid, 
                   cv=10, verbose=True, pre_dispatch='2*n_jobs')
mlp_GS.fit(train_x, train_y)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "c:\Users\user\anaconda3\envs\(lg_aimers)\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\anaconda3\envs\(lg_aimers)\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 641, in fit
    return self._fit(X, y, incremental=False)
  File "c:\Users\user\anaconda3\envs\(lg_aimers)\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 321, in _fit
    self._validate_hyperparameters()
  File "c:\Users\user\anaconda3\envs\(lg_aimers)\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 419, in _validate_hyperparameters
    raise ValueError("The activation '%s' is not supported. Supported "
ValueError: The activation 'LeakyReLU' is not supported. Supported activations are ['identity', 'logistic', 'relu',

GridSearchCV(cv=10, estimator=MLPRegressor(),
             param_grid={'activation': ['LeakyReLU', 'relu'], 'alpha': [0.0001],
                         'early_stopping': [True],
                         'hidden_layer_sizes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                                12, 13, 14, 15, 16, 17, 18,
                                                19],
                         'learning_rate': ['constant'],
                         'learning_rate_init': [0.0003], 'max_iter': [1000],
                         'power_t': [0.5], 'solver': ['adam'],
                         'warm_start': [False]},
             verbose=True)

In [8]:
valid_preds = mlp_GS.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)


In [9]:
score

2.0069135598975882

In [7]:
# https://wooono.tistory.com/97
gsc = GridSearchCV(
        estimator=xgb.XGBRegressor(seed=42,
                         tree_method='gpu_hist',
                         gpu_id=3),
        param_grid={
                    "learning_rate": [0.001, 0.01, 0.08],
                    "n_estimators":[ 500, 600],
                    "max_depth": [5, 7],
                    "min_child_weight": [ 3, 5, 7],
                    "gamma":[ 0.0, 0.1],
                    "colsample_bytree":[0.7, 0.8],
                    "subsample":[0.7, 0.8],
                    },
                    cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)

grid_result = MultiOutputRegressor(gsc).fit(train_x, train_y)


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 16.7min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 39.4min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 68.8min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 92.2min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 16.8min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 39.5min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 69.2min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 93.1min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 16.8min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 39.6min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 69.2min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 92.9min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 11.7min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 29.3min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 54.6min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 74.2min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   24.9s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  7.2min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 20.1min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 40.5min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 55.5min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  8.5min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 22.0min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 42.3min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 57.5min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  9.8min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 25.1min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 47.2min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 63.5min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   23.9s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  5.8min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 17.1min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 35.2min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 48.6min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   24.0s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 17.6min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 36.2min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 49.7min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   28.5s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  6.6min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 18.7min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 38.3min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 52.3min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   23.4s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  5.9min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 17.2min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 35.5min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 49.1min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   23.9s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  5.9min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 17.2min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 35.3min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 48.8min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   24.8s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 17.7min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 36.4min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 49.9min finished


Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   24.7s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 18.1min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 38.0min
[Parallel(n_jobs=4)]: Done 864 out of 864 | elapsed: 52.8min finished




AttributeError: 'MultiOutputRegressor' object has no attribute 'cv_results_'

In [23]:
grid_result.get_params()

{'estimator__cv': 3,
 'estimator__error_score': nan,
 'estimator__estimator__objective': 'reg:squarederror',
 'estimator__estimator__base_score': None,
 'estimator__estimator__booster': None,
 'estimator__estimator__callbacks': None,
 'estimator__estimator__colsample_bylevel': None,
 'estimator__estimator__colsample_bynode': None,
 'estimator__estimator__colsample_bytree': None,
 'estimator__estimator__early_stopping_rounds': None,
 'estimator__estimator__enable_categorical': False,
 'estimator__estimator__eval_metric': None,
 'estimator__estimator__gamma': None,
 'estimator__estimator__gpu_id': 3,
 'estimator__estimator__grow_policy': None,
 'estimator__estimator__importance_type': None,
 'estimator__estimator__interaction_constraints': None,
 'estimator__estimator__learning_rate': None,
 'estimator__estimator__max_bin': None,
 'estimator__estimator__max_cat_to_onehot': None,
 'estimator__estimator__max_delta_step': None,
 'estimator__estimator__max_depth': None,
 'estimator__estimato

In [8]:
scores_df = pd.DataFrame(grid_result.cv_results_)
# 각 파라미터값들에 대한 모델 결과값들이 cv_results_ 객체에 할당됨
scores_df = pd.DataFrame(grid_result.cv_results_)

# score 결과값(ndarray형태로 할당됨) 중 특정 칼럼들만 가져오기 
scores_df[['params', 'mean_test_score', 'rank_test_score', 
           'split0_test_score', 'split1_test_score', 'split2_test_score']]
scores_df

AttributeError: 'MultiOutputRegressor' object has no attribute 'cv_results_'

In [21]:
valid_preds = grid_result.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)
score

1.9367319108563752

In [14]:
submit = pd.read_csv('./sample_submission.csv')

In [18]:
test_x = pd.read_csv('./test.csv')
# 참고 https://hong-yp-ml-records.tistory.com/80
# 파생변수 생성전 의미 없는 검사 결과 빼기
test_x = test_x.drop(columns=["X_04", "X_23", "X_47", "X_48","ID"])
# # 파생변수 생성
for i in range(1,18):
    if i != 3 and len(group_dict['Group_%d'%i]) >1 :
        Group_train = test_x[group_dict['Group_%d'%i]]
        test_x['Group_%d mean'%i] = Group_train.mean(axis = 1)
        test_x['Group_%d var'%i] = Group_train.var(axis = 1)
        test_x['Group_%d std'%i] = Group_train.std(axis = 1)
        test_x['Group_%d sum'%i] = Group_train.sum(axis = 1)
        test_x['Group_%d median'%i] = Group_train.median(axis = 1)
test_x = test_x.dropna(axis = 1)
preds = grid_result.predict(test_x)
print('Done.')

Done.


In [19]:
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
print('Done.')

Done.


In [20]:
submit.to_csv('./submit_XGBoost.csv', index=False)

# 1.946의 성능이 나왔다.
