In [1]:
import numpy as np
import pandas as pd

from scipy.spatial.distance import pdist, squareform

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn import metrics

def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score, all_nrmse

df = pd.read_csv('train.csv')

train_df, valid_df = train_test_split(df, train_size=0.9)

train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

valid_x = valid_df.filter(regex='X') # Input : X Featrue
valid_y = valid_df.filter(regex='Y') # Output : Y Feature

LR = MultiOutputRegressor(LinearRegression()).fit(train_x, train_y)

valid_preds = LR.predict(valid_x)
lg_nrmse(valid_y.values, valid_preds)

(1.9947303849514082,
 [0.25877475566101377,
  0.35947217746674476,
  0.3534679773655478,
  0.19467927083818445,
  0.08093942814929539,
  0.11380086847814155,
  0.13316968809073867,
  0.024531514007744597,
  0.02424453954165752,
  0.04019191990089526,
  0.034329237271388875,
  0.024516242394739443,
  0.02443178689207567,
  0.024413842881758255])

In [7]:
col = 'Y_01'
for n in range(1, 51): 
    train_tsne = TSNE(n_components = n).fit_transform(train_x)
    tsne_train_x = pd.DataFrame(train_tsne, columns=[f'Z_{i}' for i in range(1, n+1)])
    
    valid_tsne = TSNE(n_components = n).fit_transform(valid_x)
    tsne_valid_x = pd.DataFrame(valid_tsne, columns=[f'Z_{i}' for i in range(1, n+1)])
    
    LR = LinearRegression().fit(tsne_train_x, train_y[col])
    valid_preds = LR.predict(tsne_valid_x)
    
    rmse = metrics.mean_squared_error(valid_y[col], valid_preds, squared=False)
    nrmse = rmse/np.mean(np.abs(valid_y[col]))
    
    print(n, nrmse)
    

1 0.2647578236507528
2 0.2641982568365092
3 0.2644923632856626


ValueError: 'n_components' should be inferior to 4 for the barnes_hut algorithm as it relies on quad-tree or oct-tree.

In [None]:
col = 'Y_01'
for n in range(4, 51): 
    train_tsne = TSNE(n_components = n, method='exact').fit_transform(train_x)
    tsne_train_x = pd.DataFrame(train_tsne, columns=[f'Z_{i}' for i in range(1, n+1)])
    
    valid_tsne = TSNE(n_components = n, method='exact').fit_transform(valid_x)
    tsne_valid_x = pd.DataFrame(valid_tsne, columns=[f'Z_{i}' for i in range(1, n+1)])
    
    LR = LinearRegression().fit(tsne_train_x, train_y[col])
    valid_preds = LR.predict(tsne_valid_x)
    
    rmse = metrics.mean_squared_error(valid_y[col], valid_preds, squared=False)
    nrmse = rmse/np.mean(np.abs(valid_y[col]))
    
    print(n, nrmse)
    
