In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn import metrics

from sklearn.model_selection import train_test_split

In [2]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix the seed once, up front, before any stochastic step runs.
seed_everything(42)

In [3]:
def lg_nrmse(gt, preds):
    """Weighted sum of per-target NRMSE over the 14 Y targets.

    For each of the first 14 columns, the RMSE between ``gt`` and ``preds``
    is divided by the mean absolute ground-truth value of that column.
    The first 8 columns (Y_01 ~ Y_08) get a 20% higher weight (1.2); the
    remaining columns are weighted 1.0.

    Parameters
    ----------
    gt : array-like, shape (n_samples, >=14)
        Ground-truth targets; only the first 14 columns are used.
    preds : array-like, same shape as ``gt``
        Predicted targets.

    Returns
    -------
    score : float
        ``1.2 * sum(nrmse[:8]) + 1.0 * sum(nrmse[8:])``.
    all_nrmse : list of float
        The 14 per-column NRMSE values, in column order.

    Raises
    ------
    ValueError
        If ``gt`` and ``preds`` do not have the same shape.
    """
    # Coerce to ndarrays so positional slicing works for DataFrames as well.
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)
    if gt.shape != preds.shape:
        raise ValueError(
            f"gt and preds must have the same shape, got {gt.shape} and {preds.shape}"
        )

    all_nrmse = []
    for idx in range(14):  # one iteration per Y target column
        # RMSE is computed directly with NumPy: the `squared=False` argument of
        # sklearn.metrics.mean_squared_error is no longer accepted by recent
        # scikit-learn releases.
        rmse = np.sqrt(np.mean((gt[:, idx] - preds[:, idx]) ** 2))
        nrmse = rmse / np.mean(np.abs(gt[:, idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:])
    return score, all_nrmse

In [8]:
# Load the training table and hold out 20% of the rows for validation.
df = pd.read_csv('./train.csv')

# Pin random_state (same value as passed to seed_everything) so that re-running
# this cell yields the same partition no matter how much of the global NumPy
# RNG has already been consumed by earlier cells.
train_df, valid_df = train_test_split(df, train_size=0.8, random_state=42)

train_x = train_df.filter(regex='X') # Input : X Feature
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='X') # Input : X Feature
valid_y = valid_df.filter(regex='Y') # Output : Y Feature

# Test inputs: every column of test.csv except the 'ID' column.
test_x = pd.read_csv('./test.csv').drop(columns=['ID'])

In [14]:
# Drop X_04 from BOTH feature frames: the model is fit on train_x and then
# asked to predict on valid_x, so the two must expose the same columns.
train_x = train_x.drop(columns=['X_04'])
valid_x = valid_x.drop(columns=['X_04'])

Unnamed: 0,X_01,X_02,X_03,X_05,X_06,X_07,X_08,X_09,X_10,X_11,...,X_47,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
38194,70.544,103.32,66.17,103.150,70.904,29.56,95.41,158.21,0.0,0.0,...,1,1,12544.13,134.682210,129.947949,139.472643,132.299986,124.762020,133.686459,134.556606
21557,67.485,103.32,65.17,102.048,69.884,29.85,105.58,117.71,0.0,0.0,...,1,1,12624.33,132.298199,138.071385,143.807214,132.119505,125.568508,138.441121,124.334309
4562,70.544,103.32,72.37,103.154,70.904,28.93,116.56,278.62,0.0,0.0,...,1,1,18394.33,125.810010,122.519528,132.610788,130.748144,120.680096,134.852993,124.379109
39428,67.485,103.32,65.57,101.891,68.864,32.07,116.43,234.22,0.0,0.0,...,1,1,26607.53,129.868550,129.021214,141.274553,130.210280,128.719257,143.581967,128.091419
15695,67.485,103.32,65.17,101.945,67.845,31.44,105.68,332.51,0.0,0.0,...,1,1,16703.43,124.206837,130.821041,135.350626,125.720045,131.959202,144.584767,129.742230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,69.524,103.32,71.97,101.870,70.904,48.24,120.52,608.69,0.0,0.0,...,1,1,15585.73,124.925084,124.813091,123.314043,116.561773,119.467594,122.416775,123.261044
11284,68.504,103.32,71.67,101.935,72.943,30.90,119.81,268.32,0.0,0.0,...,1,1,14228.73,126.370267,130.973187,135.857515,133.020071,132.835186,130.027691,126.445255
38158,71.563,103.32,66.77,103.156,71.923,28.67,2362.16,37.58,0.0,0.0,...,1,1,12088.23,137.139912,134.100041,147.350031,130.241667,134.185695,147.163205,133.407581
860,68.504,103.32,70.67,101.966,68.864,29.44,232.11,200.89,0.0,0.0,...,1,1,19974.53,130.648595,127.940746,135.476869,124.291716,122.022510,128.166859,124.283352


In [15]:
# Show the current training feature frame as this cell's output.
train_x

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_10,...,X_47,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
38194,70.544,103.32,66.17,1,103.150,70.904,29.56,95.41,158.21,0.0,...,1,1,12544.13,134.682210,129.947949,139.472643,132.299986,124.762020,133.686459,134.556606
21557,67.485,103.32,65.17,1,102.048,69.884,29.85,105.58,117.71,0.0,...,1,1,12624.33,132.298199,138.071385,143.807214,132.119505,125.568508,138.441121,124.334309
4562,70.544,103.32,72.37,1,103.154,70.904,28.93,116.56,278.62,0.0,...,1,1,18394.33,125.810010,122.519528,132.610788,130.748144,120.680096,134.852993,124.379109
39428,67.485,103.32,65.57,1,101.891,68.864,32.07,116.43,234.22,0.0,...,1,1,26607.53,129.868550,129.021214,141.274553,130.210280,128.719257,143.581967,128.091419
15695,67.485,103.32,65.17,1,101.945,67.845,31.44,105.68,332.51,0.0,...,1,1,16703.43,124.206837,130.821041,135.350626,125.720045,131.959202,144.584767,129.742230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,69.524,103.32,71.97,1,101.870,70.904,48.24,120.52,608.69,0.0,...,1,1,15585.73,124.925084,124.813091,123.314043,116.561773,119.467594,122.416775,123.261044
11284,68.504,103.32,71.67,1,101.935,72.943,30.90,119.81,268.32,0.0,...,1,1,14228.73,126.370267,130.973187,135.857515,133.020071,132.835186,130.027691,126.445255
38158,71.563,103.32,66.77,1,103.156,71.923,28.67,2362.16,37.58,0.0,...,1,1,12088.23,137.139912,134.100041,147.350031,130.241667,134.185695,147.163205,133.407581
860,68.504,103.32,70.67,1,101.966,68.864,29.44,232.11,200.89,0.0,...,1,1,19974.53,130.648595,127.940746,135.476869,124.291716,122.022510,128.166859,124.283352


In [12]:
# Baseline: LinearRegression wrapped in MultiOutputRegressor, fit on the
# training features/targets.
LR = MultiOutputRegressor(LinearRegression())
LR.fit(train_x, train_y)

# Score the hold-out predictions; the (score, per-target NRMSE list) tuple is
# the cell's displayed output.
valid_preds = LR.predict(valid_x)
lg_nrmse(valid_y.values, valid_preds)

(1.9713028094742684,
 [0.25911875134841755,
  0.3577636001547976,
  0.35106896279147065,
  0.19257779398362862,
  0.0799755837749716,
  0.1051559330934357,
  0.1297143978520434,
  0.024663907842191652,
  0.024528935970444848,
  0.03918554560414437,
  0.0337778767028268,
  0.024655584113178777,
  0.024558754879764215,
  0.02454939519476133])

## 우리가 해볼 수 있는 것
- 정규화
  - min-max scaling
  - standardization
- 이상치 제거
  - pcb 하우징 파트 부분
  - 컬럼 별 히스토그램 참고
- y1~y8을 중심으로 더 전처리 하자!
  - 방법은 좀 더 고민...
- 의미없는 컬럼 제거 : 1~4차 통과 여부
- 의미있는 컬럼만으로 학습

In [4]:
# Reload the training table and hold out 20% of the rows for validation.
df = pd.read_csv('./train.csv')

# Pin random_state (same value as passed to seed_everything) so that re-running
# this cell yields the same partition no matter how much of the global NumPy
# RNG has already been consumed by earlier cells.
train_df, valid_df = train_test_split(df, train_size=0.8, random_state=42)

train_x = train_df.filter(regex='X') # Input : X Feature
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='X') # Input : X Feature
valid_y = valid_df.filter(regex='Y') # Output : Y Feature

# Test inputs: every column of test.csv except the 'ID' column.
test_x = pd.read_csv('./test.csv').drop(columns=['ID'])

In [5]:
# Remove columns X_04, X_23, X_47 and X_48 from the training and validation
# features, using one shared list so both frames keep the same column set.
# NOTE(review): presumably these are the pass/fail flag columns judged
# uninformative in the plan notes -- confirm.
unused_cols = ['X_04', 'X_23', 'X_47', 'X_48']
train_x = train_x.drop(columns=unused_cols)
valid_x = valid_x.drop(columns=unused_cols)

In [6]:
# Refit the same baseline (LinearRegression wrapped in MultiOutputRegressor)
# on the current training features.
LR = MultiOutputRegressor(LinearRegression())
LR.fit(train_x, train_y)

# Evaluate on the hold-out set; the (score, per-target NRMSE list) tuple is
# the cell's displayed output.
valid_preds = LR.predict(valid_x)
lg_nrmse(valid_y.values, valid_preds)

(1.9713028094739522,
 [0.259118751348463,
  0.3577636001544468,
  0.35106896279148214,
  0.19257779398361924,
  0.07997558377496794,
  0.10515593309339616,
  0.12971439785200914,
  0.024663907842215702,
  0.024528935970452245,
  0.039185545604199394,
  0.03377787670286758,
  0.024655584113189126,
  0.024558754879735134,
  0.024549395194788704])