In [2]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings, random

from collections import defaultdict
warnings.filterwarnings(action='ignore')

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn import metrics

from catboost import CatBoostRegressor, CatBoostClassifier, Pool

def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score, all_nrmse

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(42) # Seed 고정

pd.set_option('display.max_columns', None)

df = pd.read_csv('train.csv')
df.drop(columns=['X_04', 'X_23', 'X_47', 'X_48'], inplace=True)

train_df, valid_df = train_test_split(df, train_size=0.8)

train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='X') # Input : X Featrue
valid_y = valid_df.filter(regex='Y') # Output : Y Feature

LR = MultiOutputRegressor(LinearRegression()).fit(train_x, train_y)

valid_preds = LR.predict(valid_x)
lg_nrmse(valid_y.values, valid_preds)

(1.9713028094739522,
 [0.259118751348463,
  0.3577636001544468,
  0.35106896279148214,
  0.19257779398361924,
  0.07997558377496794,
  0.10515593309339616,
  0.12971439785200914,
  0.024663907842215702,
  0.024528935970452248,
  0.039185545604199394,
  0.03377787670286758,
  0.024655584113189126,
  0.02455875487973513,
  0.024549395194788704])

In [7]:
valid_y

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
15844,0.964,0.673,0.649,13.844,30.533,15.926,3.433,-26.468,-26.434,-22.634,23.618,-26.451,-26.414,-26.649
15845,1.920,1.712,1.605,15.014,31.939,17.446,2.760,-26.096,-26.004,-21.535,25.162,-26.019,-25.869,-26.203
15846,1.072,0.764,0.621,13.314,31.908,16.626,3.403,-26.278,-26.154,-22.427,24.447,-26.364,-26.174,-26.295
15847,1.783,1.789,1.525,13.313,31.762,16.911,3.326,-26.065,-26.175,-22.145,24.777,-26.194,-26.032,-26.282
15848,1.236,1.281,1.115,14.075,34.559,17.255,3.215,-25.933,-25.820,-21.386,25.165,-25.809,-25.912,-25.774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23760,1.186,0.629,0.825,14.596,31.161,16.413,2.981,-25.767,-25.836,-22.089,24.148,-25.617,-25.583,-25.709
23761,1.913,1.891,1.644,12.178,30.307,17.128,3.302,-25.700,-25.556,-21.898,24.951,-25.705,-25.632,-25.419
23762,1.530,0.997,1.250,16.058,34.174,17.245,2.854,-26.090,-25.837,-21.276,25.014,-25.993,-25.851,-25.638
23763,1.509,1.469,1.216,12.414,31.963,16.565,2.973,-26.058,-26.045,-22.300,24.356,-25.811,-25.888,-25.970


In [6]:
save

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,0.287812,0.223274,0.210012,2.652320,6.218765,3.345688,0.633421,-5.234755,-5.248823,-4.470186,4.896794,-5.236244,-5.231339,-5.221180
1,0.272640,0.223247,0.205668,2.692446,6.161805,3.321978,0.640157,-5.237055,-5.225664,-4.449567,4.864418,-5.226833,-5.231797,-5.215522
2,0.278985,0.229791,0.202808,2.782390,6.241235,3.323639,0.620381,-5.218504,-5.226314,-4.444216,4.896397,-5.207754,-5.212969,-5.209809
3,0.281409,0.236624,0.220249,2.668257,6.328318,3.306379,0.621110,-5.250223,-5.239709,-4.482727,4.877219,-5.225932,-5.215577,-5.224893
4,0.268221,0.217082,0.204465,2.633279,6.165206,3.274084,0.653095,-5.254649,-5.293254,-4.511991,4.787665,-5.265254,-5.270283,-5.250594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,0.276151,0.218732,0.212053,2.537317,6.287407,3.306166,0.638697,-5.279950,-5.276637,-4.455211,4.867304,-5.260975,-5.267983,-5.265386
7918,0.237697,0.173938,0.170928,2.530490,6.221210,3.311265,0.645756,-5.303526,-5.307760,-4.560882,4.827273,-5.290479,-5.289895,-5.310068
7919,0.266140,0.207642,0.211291,2.476795,6.167352,3.321836,0.644560,-5.269671,-5.273396,-4.481491,4.856819,-5.264442,-5.258779,-5.264001
7920,0.270465,0.202975,0.202173,2.702903,6.225205,3.316122,0.644835,-5.299037,-5.315997,-4.538808,4.819384,-5.285631,-5.301393,-5.293712


In [9]:
preds

array([1.39317421, 1.45400962, 1.37468469, ..., 1.34177882, 1.40854818,
       1.43352961])

In [8]:
len(preds)

7921

In [13]:
len(x) 

39600

In [14]:
df = pd.read_csv('train.csv').drop(columns=['X_04', 'X_23', 'X_47', 'X_48']).iloc[:39605]

x = df.filter(regex='X')
y = df.filter(regex='Y')

seed = 42
n_fold = 5

save = pd.DataFrame(np.zeros(valid_y.shape), columns=valid_y.columns)

score = defaultdict(int)
kf = KFold(n_splits=n_fold)
for idx, (train, valid) in enumerate(kf.split(x), 1):
    print("Fold | ", idx)
    train_x, train_y = x.iloc[train], y.iloc[train]
    valid_x, valid_y = x.iloc[valid], y.iloc[valid]
    
    for col in valid_y.columns:
        train_data = Pool(data=train_x, label=train_y[col])
        valid_data = Pool(data=valid_x, label=valid_y[col])

        model = CatBoostRegressor(loss_function='RMSE')
        model.fit(train_data, eval_set=valid_data, use_best_model=True, silent=True)

        preds = model.predict(valid_x)
        save[col] = preds / 5
        gt = valid_y[col].values

        rmse = metrics.mean_squared_error(gt, preds, squared=False)
        nrmse = rmse/np.mean(np.abs(gt))
        score[col] += nrmse / n_fold
        
        # preds = model.predict(test_x)
        # submit[col] += preds / n_fold
# submit.to_csv('catboost_normal_abnormal3.csv', index=False)

Fold |  1
Fold |  2
Fold |  3
Fold |  4
Fold |  5


In [15]:
save

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,0.291218,0.240639,0.218448,2.610437,6.299323,3.321539,0.642452,-5.214801,-5.237262,-4.408750,4.914153,-5.216247,-5.210328,-5.213497
1,0.311372,0.244010,0.228697,2.784386,6.294942,3.303264,0.629588,-5.238929,-5.282087,-4.461900,4.867334,-5.222271,-5.254502,-5.221677
2,0.281533,0.226402,0.210660,2.599292,6.125542,3.300338,0.644140,-5.276766,-5.286689,-4.428460,4.846447,-5.269960,-5.265234,-5.254518
3,0.289401,0.223258,0.213344,2.809146,6.316374,3.304576,0.634490,-5.241361,-5.254365,-4.469731,4.885221,-5.246352,-5.249552,-5.222113
4,0.298412,0.221802,0.219301,2.576434,6.230377,3.301476,0.633332,-5.268992,-5.257408,-4.446929,4.826433,-5.268610,-5.245116,-5.272182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7915,0.257021,0.189374,0.197728,2.547232,6.270170,3.304143,0.639539,-5.302242,-5.300499,-4.532213,4.850091,-5.297080,-5.292897,-5.307886
7916,0.259411,0.181885,0.196774,2.709464,6.156807,3.305292,0.638157,-5.324195,-5.336062,-4.585439,4.874434,-5.311421,-5.291553,-5.314405
7917,0.261109,0.189330,0.199107,2.715033,6.239295,3.304105,0.643579,-5.295864,-5.291086,-4.530146,4.867892,-5.286066,-5.271848,-5.274415
7918,0.262726,0.191572,0.201343,2.528591,6.249412,3.306013,0.643859,-5.283287,-5.277477,-4.520477,4.903118,-5.271308,-5.262870,-5.258598


In [None]:
df.set_index(col, drop=True)

In [24]:
save.index=valid_x.index

In [26]:
save.set_index(save.index, drop=True)

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
31680,0.291218,0.240639,0.218448,2.610437,6.299323,3.321539,0.642452,-5.214801,-5.237262,-4.408750,4.914153,-5.216247,-5.210328,-5.213497
31681,0.311372,0.244010,0.228697,2.784386,6.294942,3.303264,0.629588,-5.238929,-5.282087,-4.461900,4.867334,-5.222271,-5.254502,-5.221677
31682,0.281533,0.226402,0.210660,2.599292,6.125542,3.300338,0.644140,-5.276766,-5.286689,-4.428460,4.846447,-5.269960,-5.265234,-5.254518
31683,0.289401,0.223258,0.213344,2.809146,6.316374,3.304576,0.634490,-5.241361,-5.254365,-4.469731,4.885221,-5.246352,-5.249552,-5.222113
31684,0.298412,0.221802,0.219301,2.576434,6.230377,3.301476,0.633332,-5.268992,-5.257408,-4.446929,4.826433,-5.268610,-5.245116,-5.272182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39595,0.257021,0.189374,0.197728,2.547232,6.270170,3.304143,0.639539,-5.302242,-5.300499,-4.532213,4.850091,-5.297080,-5.292897,-5.307886
39596,0.259411,0.181885,0.196774,2.709464,6.156807,3.305292,0.638157,-5.324195,-5.336062,-4.585439,4.874434,-5.311421,-5.291553,-5.314405
39597,0.261109,0.189330,0.199107,2.715033,6.239295,3.304105,0.643579,-5.295864,-5.291086,-4.530146,4.867892,-5.286066,-5.271848,-5.274415
39598,0.262726,0.191572,0.201343,2.528591,6.249412,3.306013,0.643859,-5.283287,-5.277477,-4.520477,4.903118,-5.271308,-5.262870,-5.258598


In [27]:
save

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
31680,0.291218,0.240639,0.218448,2.610437,6.299323,3.321539,0.642452,-5.214801,-5.237262,-4.408750,4.914153,-5.216247,-5.210328,-5.213497
31681,0.311372,0.244010,0.228697,2.784386,6.294942,3.303264,0.629588,-5.238929,-5.282087,-4.461900,4.867334,-5.222271,-5.254502,-5.221677
31682,0.281533,0.226402,0.210660,2.599292,6.125542,3.300338,0.644140,-5.276766,-5.286689,-4.428460,4.846447,-5.269960,-5.265234,-5.254518
31683,0.289401,0.223258,0.213344,2.809146,6.316374,3.304576,0.634490,-5.241361,-5.254365,-4.469731,4.885221,-5.246352,-5.249552,-5.222113
31684,0.298412,0.221802,0.219301,2.576434,6.230377,3.301476,0.633332,-5.268992,-5.257408,-4.446929,4.826433,-5.268610,-5.245116,-5.272182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39595,0.257021,0.189374,0.197728,2.547232,6.270170,3.304143,0.639539,-5.302242,-5.300499,-4.532213,4.850091,-5.297080,-5.292897,-5.307886
39596,0.259411,0.181885,0.196774,2.709464,6.156807,3.305292,0.638157,-5.324195,-5.336062,-4.585439,4.874434,-5.311421,-5.291553,-5.314405
39597,0.261109,0.189330,0.199107,2.715033,6.239295,3.304105,0.643579,-5.295864,-5.291086,-4.530146,4.867892,-5.286066,-5.271848,-5.274415
39598,0.262726,0.191572,0.201343,2.528591,6.249412,3.306013,0.643859,-5.283287,-5.277477,-4.520477,4.903118,-5.271308,-5.262870,-5.258598


In [16]:
score

defaultdict(int,
            {'Y_01': 0.25616462884327673,
             'Y_02': 0.35700995201719454,
             'Y_03': 0.3493683727917297,
             'Y_04': 0.1909291661896869,
             'Y_05': 0.07992443773893301,
             'Y_06': 0.0921031639887897,
             'Y_07': 0.13016851786763986,
             'Y_08': 0.023861581444816357,
             'Y_09': 0.02367719823715479,
             'Y_10': 0.03836539109793962,
             'Y_11': 0.03337281532123037,
             'Y_12': 0.023852013696540754,
             'Y_13': 0.02380157110154478,
             'Y_14': 0.023798991938739883})

In [17]:
sum([1.2*v if k in train_y.columns[:7] else v for k, v in score.items()])

1.9375314501626666

## Y_01 예측

In [56]:
s_train_df, s_valid_df = train_test_split(save, train_size=0.8)

s_train_x = s_train_df[save.columns[3:]]
s_train_y = valid_y.loc[s_train_df.index, save.columns[0]]

s_valid_x = s_valid_df[save.columns[3:]]
s_valid_y = valid_y.loc[s_valid_df.index, save.columns[0]]

train_data = Pool(data=s_train_x, label=s_train_y)
valid_data = Pool(data=s_valid_x, label=s_valid_y)

model = CatBoostRegressor(loss_function='RMSE')
model.fit(train_data, eval_set=valid_data, use_best_model=True, silent=True)

preds = model.predict(s_valid_x)
gt = s_valid_y

rmse = metrics.mean_squared_error(gt, preds, squared=False)
nrmse = rmse/np.mean(np.abs(gt))
print(nrmse)

0.23852212151160565


## Y_03 예측

In [58]:
s_train_df, s_valid_df = train_test_split(save, train_size=0.8)

s_train_x = s_train_df[[save.columns[0]] + list(save.columns[3:])]
s_train_y = valid_y.loc[s_train_df.index, save.columns[2]]

s_valid_x = s_valid_df[[save.columns[0]] + list(save.columns[3:])]
s_valid_y = valid_y.loc[s_valid_df.index, save.columns[2]]

train_data = Pool(data=s_train_x, label=s_train_y)
valid_data = Pool(data=s_valid_x, label=s_valid_y)

model = CatBoostRegressor(loss_function='RMSE')
model.fit(train_data, eval_set=valid_data, use_best_model=True, silent=True)

preds = model.predict(s_valid_x)
gt = s_valid_y

rmse = metrics.mean_squared_error(gt, preds, squared=False)
nrmse = rmse/np.mean(np.abs(gt))
print(nrmse)

0.34863872195906703


## Y_02 예측

In [None]:
s_train_df, s_valid_df = train_test_split(save, train_size=0.8)

s_train_x = s_train_df[save.columns[4:]]
s_train_y = valid_y.loc[s_train_df.index, save.columns[0]]

s_valid_x = s_valid_df[save.columns[4:]]
s_valid_y = valid_y.loc[s_valid_df.index, save.columns[0]]

train_data = Pool(data=s_train_x, label=s_train_y)
valid_data = Pool(data=s_valid_x, label=s_valid_y)

model = CatBoostRegressor(loss_function='RMSE')
model.fit(train_data, eval_set=valid_data, use_best_model=True, silent=True)

preds = model.predict(s_valid_x)
gt = s_valid_y

rmse = metrics.mean_squared_error(gt, preds, squared=False)
nrmse = rmse/np.mean(np.abs(gt))
print(nrmse)