In [8]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn import metrics

In [2]:
def lg_nrmse(gt, preds):
    """Weighted sum of per-target normalized RMSE over the first 14 columns.

    For each of the 14 target columns the RMSE between ``gt`` and ``preds`` is
    divided by the mean absolute ground-truth value of that column. The first
    8 columns are weighted by 1.2 and the remaining 6 by 1.0.

    Parameters
    ----------
    gt : array-like of shape (n_samples, >=14)
        Ground-truth target values; only columns 0..13 are read.
    preds : array-like of shape (n_samples, >=14)
        Predicted target values, aligned column-for-column with ``gt``.

    Returns
    -------
    score : float
        ``1.2 * sum(nrmse[:8]) + 1.0 * sum(nrmse[8:])``.
    all_nrmse : list of float
        The 14 per-column NRMSE values, in column order.

    Notes
    -----
    The RMSE is computed directly with numpy rather than
    ``sklearn.metrics.mean_squared_error(..., squared=False)``, because the
    ``squared`` argument was deprecated and later removed from scikit-learn.
    A column whose ground truth is all zeros yields a division by zero
    (inf/nan), as there is nothing to normalize by.
    """
    # Accept DataFrames / nested lists as well as ndarrays.
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)

    # Per-target NRMSE for each of the 14 Y columns.
    all_nrmse = []
    for idx in range(14):
        rmse = np.sqrt(np.mean((gt[:, idx] - preds[:, idx]) ** 2))
        nrmse = rmse / np.mean(np.abs(gt[:, idx]))
        all_nrmse.append(nrmse)

    # The first 8 targets carry 20% extra weight.
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:])
    return score, all_nrmse

In [9]:
# Load the training table.
df = pd.read_csv('abnormal_train.csv')

# Hold out 10% of the rows for validation. The seed is fixed so that the split,
# and everything later derived from it (similarity matrix, k-NN scores), is
# identical on every run; without it each re-run produces a different hold-out
# set and previously computed cells silently go stale.
train_df, valid_df = train_test_split(df, train_size=0.9, random_state=42)

# Columns whose name contains 'X' are the inputs, those containing 'Y' the targets.
train_x = train_df.filter(regex='X') # Input : X Feature
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='X') # Input : X Feature
valid_y = valid_df.filter(regex='Y') # Output : Y Feature

# Baseline: one independent linear regression per target column.
LR = MultiOutputRegressor(LinearRegression()).fit(train_x, train_y)

# Score the baseline on the hold-out rows; the cell displays (score, per-target NRMSE).
valid_preds = LR.predict(valid_x)
lg_nrmse(valid_y.values, valid_preds)

(2.99357866876442,
 [0.4075648226642426,
  0.4630912647801847,
  0.4770752171746861,
  0.26903830866071665,
  0.10883679866404056,
  0.3095499459940363,
  0.2308258938179748,
  0.03022904872948916,
  0.03050280710375238,
  0.06650315492142433,
  0.04999405208369747,
  0.030619567988576853,
  0.030468355279367903,
  0.03003717080515546])

In [5]:
# Pairwise cosine similarity between every training row and every validation
# row: rows are labelled by the training index, columns by the validation index.
cosine_matrix = pd.DataFrame(
    data=cosine_similarity(train_x, valid_x),
    index=train_x.index,
    columns=valid_x.index,
)
cosine_matrix

Unnamed: 0,2862,3362,308,98,42,1743,2642,2106,2786,2697,...,788,1012,1420,2409,1152,1056,2056,44,277,1265
980,0.999758,0.999283,0.999858,0.999860,0.999144,0.999170,0.999873,0.997803,0.999926,0.999760,...,0.999477,0.998936,0.999228,0.999856,0.999051,0.999880,0.997269,0.999791,0.999581,0.997972
2159,0.999939,0.999891,0.999940,0.999215,0.999849,0.999838,0.999928,0.999079,0.999290,0.999975,...,0.999903,0.999718,0.999876,0.999914,0.999778,0.999594,0.998700,0.999964,0.999955,0.999226
2508,0.998914,0.998051,0.999159,0.999911,0.997827,0.997866,0.999235,0.995866,0.999908,0.999012,...,0.998377,0.997496,0.997962,0.999163,0.997675,0.999656,0.995146,0.999001,0.998567,0.996137
374,0.999981,0.999945,0.999933,0.999119,0.999903,0.999910,0.999894,0.999231,0.999238,0.999919,...,0.999976,0.999817,0.999930,0.999925,0.999864,0.999467,0.998893,0.999968,0.999997,0.999321
1736,0.999884,0.999786,0.999852,0.999181,0.999746,0.999769,0.999835,0.998979,0.999252,0.999847,...,0.999803,0.999615,0.999781,0.999913,0.999678,0.999469,0.998615,0.999887,0.999895,0.999246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1356,0.999966,0.999817,0.999939,0.999331,0.999756,0.999781,0.999913,0.998937,0.999426,0.999897,...,0.999879,0.999639,0.999799,0.999975,0.999704,0.999572,0.998562,0.999957,0.999942,0.999114
3750,0.999525,0.999876,0.999325,0.997767,0.999925,0.999925,0.999229,0.999881,0.997942,0.999366,...,0.999764,0.999956,0.999901,0.999335,0.999941,0.998383,0.999732,0.999460,0.999717,0.999926
1368,0.999967,0.999962,0.999893,0.998998,0.999924,0.999935,0.999839,0.999328,0.999136,0.999863,...,0.999992,0.999864,0.999947,0.999887,0.999902,0.999351,0.999018,0.999935,0.999994,0.999382
65,0.999955,0.999972,0.999895,0.998989,0.999941,0.999942,0.999852,0.999341,0.999112,0.999896,...,0.999983,0.999867,0.999961,0.999879,0.999906,0.999383,0.999022,0.999941,0.999995,0.999418


In [7]:
# Guard against stale kernel state: the similarity matrix must have been built
# from the current train/validation split, otherwise the lookups below either
# fail or use the wrong rows.
assert cosine_matrix.index.equals(train_y.index) and cosine_matrix.columns.equals(valid_x.index), \
    "cosine_matrix does not match the current split; re-run the cell that builds it"

# Rank the training rows for every validation sample once (most similar first).
# The ranking does not depend on k, so it is computed outside the k loop.
ranked_neighbors = {
    idx: cosine_matrix[idx].sort_values(ascending=False).index
    for idx in valid_x.index
}

for k in [5, 10, 20, 50, 100, 200, 500, 1000, 2000]:
    # Predict each validation sample as the mean Y vector of its k most
    # similar training rows.
    preds = np.array([
        train_y.loc[ranked_neighbors[idx][:k]].mean()
        for idx in valid_x.index
    ])

    print(f"*** {k} ***")
    print(lg_nrmse(valid_y.values, preds))

*** 5 ***
(3.30439198001972, [0.4453244140457251, 0.5560346831423109, 0.5407231725600645, 0.3047879707400246, 0.1194706931830339, 0.29243802133304553, 0.23815986427247046, 0.03577873780190279, 0.03539663732266079, 0.06904213627731076, 0.05422001151258945, 0.035361342739301374, 0.03570318646776212, 0.03540759720580212])
*** 10 ***
(3.205345340282352, [0.4283398783139922, 0.5358438309170583, 0.5203019545001231, 0.30149716051983316, 0.11689121681115651, 0.28450380248676765, 0.22985948789392943, 0.03544575442242228, 0.03493360345778251, 0.0682947834840495, 0.053404381502085485, 0.035000697777720614, 0.03531474567912612, 0.03517742534324918])
*** 20 ***
(3.1128186825779927, [0.4137286179535927, 0.5236898538439009, 0.5038410078054973, 0.2881513056437344, 0.1131864249332261, 0.28218000297456725, 0.22121553470936217, 0.03461506232710386, 0.03394703222318207, 0.0671232685427516, 0.05221346795864506, 0.03408559326730084, 0.0344211768508348, 0.034298771506096386])
*** 50 ***
(3.056938426406481, [

In [None]:
# Guard against stale kernel state: the similarity matrix must have been built
# from the current train/validation split, otherwise the lookups below either
# fail or use the wrong rows.
assert cosine_matrix.index.equals(train_y.index) and cosine_matrix.columns.equals(valid_x.index), \
    "cosine_matrix does not match the current split; re-run the cell that builds it"

# Rank the training rows for every validation sample once (most similar first).
# The ranking does not depend on k, so it is computed outside the k loop.
ranked_neighbors = {
    idx: cosine_matrix[idx].sort_values(ascending=False).index
    for idx in valid_x.index
}

# Finer sweep of the neighbourhood size between 200 and 500.
for k in [200, 250, 300, 350, 400, 450, 500]:
    # Predict each validation sample as the mean Y vector of its k most
    # similar training rows.
    preds = np.array([
        train_y.loc[ranked_neighbors[idx][:k]].mean()
        for idx in valid_x.index
    ])

    print(f"*** {k} ***")
    print(lg_nrmse(valid_y.values, preds))