In [17]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import random
import os

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

%matplotlib inline

In [25]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [42]:
from sklearn import metrics 
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(0,14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

## PCA 1인 경우

In [97]:
x = pd.read_csv("./train.csv").filter(regex="X").values
x = StandardScaler().fit_transform(x)# 표준화

x_test = pd.read_csv("./test.csv").filter(regex="X").values
x_test = StandardScaler().fit_transform(x_test)# 표준화

y = pd.read_csv("./train.csv").filter(regex="Y")

# 학습용
pca = PCA(n_components = 1)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['X_57'])
# 테스트용
pca = PCA(n_components = 1)
principalComponents = pca.fit_transform(x_test)
principalDf_test = pd.DataFrame(data = principalComponents
             , columns = ['X_57'])

train_df = pd.read_csv('./train.csv')
from sklearn.model_selection import train_test_split
train_df = pd.concat([train_df,principalDf], axis=1)

train_df, valid_df = train_test_split(train_df, train_size=0.8,random_state=42)
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='X') # Input : X Featrue
valid_y = valid_df.filter(regex='Y') # Output : Y Feature

import xgboost as xgb
XGB = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma = 0, subsample=0.75, colsample_bytree = 1, max_depth=7)).fit(train_x, train_y)
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)

print("------현재 모델",XGB,'-----리더보드 스코어  ',score)
print('Done.')

------현재 모델 MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.08, max_bin=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=7,
  

In [98]:
print('explained variance ratio :', pca.explained_variance_ratio_)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
print('선택할 차원 수 :', d)

explained variance ratio : [0.11857633]
선택할 차원 수 : 1


## pca 2 일 경우

In [100]:
x = pd.read_csv("./train.csv").filter(regex="X").values
x = StandardScaler().fit_transform(x)# 표준화

x_test = pd.read_csv("./test.csv").filter(regex="X").values
x_test = StandardScaler().fit_transform(x_test)# 표준화

y = pd.read_csv("./train.csv").filter(regex="Y")

# 학습용
pca = PCA(n_components = 2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['X_57', 'X_58'])
# 테스트용
pca = PCA(n_components = 2)
principalComponents = pca.fit_transform(x_test)
principalDf_test = pd.DataFrame(data = principalComponents
             , columns = ['X_57', 'X_58'])
             
train_df = pd.read_csv('./train.csv')
from sklearn.model_selection import train_test_split
train_df = pd.concat([train_df,principalDf], axis=1)

train_df, valid_df = train_test_split(train_df, train_size=0.8,random_state = 42)
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='X') # Input : X Featrue
valid_y = valid_df.filter(regex='Y') # Output : Y Feature
import xgboost as xgb
XGB = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma = 0, subsample=0.75, colsample_bytree = 1, max_depth=7)).fit(train_x, train_y)
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)

print("------현재 모델",XGB,'-----리더보드 스코어  ',score)
print('Done.')

------현재 모델 MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.08, max_bin=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=7,
  

In [101]:
print('explained variance ratio :', pca.explained_variance_ratio_)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
print('선택할 차원 수 :', d)

explained variance ratio : [0.11857633 0.08803231]
선택할 차원 수 : 1


## pca 3 일 경우

In [102]:
x = pd.read_csv("./train.csv").filter(regex="X").values
x = StandardScaler().fit_transform(x)# 표준화

x_test = pd.read_csv("./test.csv").filter(regex="X").values
x_test = StandardScaler().fit_transform(x_test)# 표준화

y = pd.read_csv("./train.csv").filter(regex="Y")

# 학습용
pca = PCA(n_components = 3)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['X_57', 'X_58','X_59'])
# 테스트용
pca = PCA(n_components = 3)
principalComponents = pca.fit_transform(x_test)
principalDf_test = pd.DataFrame(data = principalComponents
             , columns = ['X_57', 'X_58','X_59'])

train_df = pd.read_csv('./train.csv')
from sklearn.model_selection import train_test_split
train_df = pd.concat([train_df,principalDf], axis=1)

train_df, valid_df = train_test_split(train_df, train_size=0.8,random_state=42)
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='X') # Input : X Featrue
valid_y = valid_df.filter(regex='Y') # Output : Y Feature

import xgboost as xgb
XGB = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma = 0, subsample=0.75, colsample_bytree = 1, max_depth=7)).fit(train_x, train_y)
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)

print("------현재 모델",XGB,'-----리더보드 스코어  ',score)
print('Done.')

------현재 모델 MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.08, max_bin=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=7,
  

In [103]:
print('explained variance ratio :', pca.explained_variance_ratio_)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
print('선택할 차원 수 :', d)

explained variance ratio : [0.11857633 0.08803231 0.08303062]
선택할 차원 수 : 1


## pca4

In [104]:
x = pd.read_csv("./train.csv").filter(regex="X").values
x = StandardScaler().fit_transform(x)# 표준화

x_test = pd.read_csv("./test.csv").filter(regex="X").values
x_test = StandardScaler().fit_transform(x_test)# 표준화

y = pd.read_csv("./train.csv").filter(regex="Y")

# 학습용
pca = PCA(n_components = 4)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['X_57', 'X_58','X_59','X_60'])
# 테스트용
pca = PCA(n_components = 4)
principalComponents = pca.fit_transform(x_test)
principalDf_test = pd.DataFrame(data = principalComponents
             , columns = ['X_57', 'X_58','X_59','X_60'])

train_df = pd.read_csv('./train.csv')
from sklearn.model_selection import train_test_split
train_df = pd.concat([train_df,principalDf], axis=1)

train_df, valid_df = train_test_split(train_df, train_size=0.8)
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='X') # Input : X Featrue
valid_y = valid_df.filter(regex='Y') # Output : Y Feature

import xgboost as xgb
XGB = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma = 0, subsample=0.75, colsample_bytree = 1, max_depth=7)).fit(train_x, train_y)
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)

print("------현재 모델",XGB,'-----리더보드 스코어  ',score)
print('Done.')

------현재 모델 MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.08, max_bin=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=7,
  

In [105]:
print('explained variance ratio :', pca.explained_variance_ratio_)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
print('선택할 차원 수 :', d)

explained variance ratio : [0.11857633 0.08803231 0.08303062 0.07579402]
선택할 차원 수 : 1


# pca5

In [113]:
x = pd.read_csv("./train.csv").filter(regex="X").values
x = StandardScaler().fit_transform(x)# 표준화

x_test = pd.read_csv("./test.csv").filter(regex="X").values
x_test = StandardScaler().fit_transform(x_test)# 표준화

y = pd.read_csv("./train.csv").filter(regex="Y")

# 학습용
pca = PCA(n_components = 5)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['X_57', 'X_58','X_59','X_60','X_61'])
# 테스트용
pca = PCA(n_components = 5)
principalComponents = pca.fit_transform(x_test)
principalDf_test = pd.DataFrame(data = principalComponents
             , columns = ['X_57', 'X_58','X_59','X_60','X_61'])

train_df = pd.read_csv('./train.csv')
from sklearn.model_selection import train_test_split
train_df = pd.concat([train_df,principalDf], axis=1)

train_df, valid_df = train_test_split(train_df, train_size=0.8)
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='X') # Input : X Featrue
valid_y = valid_df.filter(regex='Y') # Output : Y Feature

import xgboost as xgb
XGB = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma = 0, subsample=0.75, colsample_bytree = 1, max_depth=7)).fit(train_x, train_y)
valid_preds = XGB.predict(valid_x)
score = lg_nrmse(valid_y.values,valid_preds)

print("------현재 모델",XGB,'-----리더보드 스코어  ',score)
print('Done.')

------현재 모델 MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.08, max_bin=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=7,
  

In [116]:
print('explained variance ratio :', pca.explained_variance_ratio_)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
print('선택할 차원 수 :', d)

if cumsum[-1] >= 0.95:
    d = np.argmax(cumsum >= 0.95) + 1
    print('선택할 차원 수 :', d)
    


explained variance ratio : [0.11857633 0.08803231 0.08303062 0.07579402 0.04433125]
선택할 차원 수 : 1


# pca 반복생성
- 95% 찾을때까지

In [145]:
pca_number = 1
i = 57
columns_X = []
while True:
    columns_X.append("X_%02d"%i)
    x = pd.read_csv("./train.csv").filter(regex="X").values
    x = StandardScaler().fit_transform(x)# 표준화



    y = pd.read_csv("./train.csv").filter(regex="Y")

    # 학습용
    pca = PCA(n_components = pca_number)
    principalComponents = pca.fit_transform(x)
    principalDf = pd.DataFrame(data = principalComponents
                 , columns = columns_X)


    train_df = pd.read_csv('./train.csv')
    from sklearn.model_selection import train_test_split
    train_df = pd.concat([train_df,principalDf], axis=1)

    train_df, valid_df = train_test_split(train_df, train_size=0.8)
    train_x = train_df.filter(regex='X') # Input : X Featrue
    train_y = train_df.filter(regex='Y') # Output : Y Feature

    valid_x = valid_df.filter(regex='X') # Input : X Featrue
    valid_y = valid_df.filter(regex='Y') # Output : Y Feature

    import xgboost as xgb
    XGB = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma = 0, subsample=0.75, colsample_bytree = 1, max_depth=7)).fit(train_x, train_y)
    valid_preds = XGB.predict(valid_x)
    score = lg_nrmse(valid_y.values,valid_preds)

    print("------현재 모델",XGB,'-----리더보드 스코어  ',score)
    print('Done.')
    print('explained variance ratio :', pca.explained_variance_ratio_)
    cumsum = np.cumsum(pca.explained_variance_ratio_)

    x_test = pd.read_csv("./test.csv").filter(regex="X").values
    x_test = StandardScaler().fit_transform(x_test)# 표준화
    # 테스트용
    pca = PCA(n_components = pca_number)
    principalComponents = pca.fit_transform(x_test)
    principalDf_test = pd.DataFrame(data = principalComponents, columns = columns_X)
    x_test = pd.read_csv("./test.csv").filter(regex="X")
    x_test = pd.concat([x_test,principalDf_test], axis=1)
    preds = XGB.predict(x_test)
    submit = pd.read_csv('./sample_submission.csv')
    for idx, col in enumerate(submit.columns):
        if col=='ID':
            continue
        submit[col] = preds[:,idx-1]
    print('Done.')
    submit.to_csv('./submit_XGBoost_pca.csv', index=False)
    pca_number += 1
    i += 1
    if cumsum[-1] >= 0.95:
        d = np.argmax(cumsum >= 0.95) + 1
        print('선택할 차원 수 :', d)
        break


------현재 모델 MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.08, max_bin=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=7,
  