In [66]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from math import sqrt

import warnings


def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts and discards any arguments.

    It is no longer installed over ``warnings.warn``; the definition is kept
    only so that existing references to the name keep working.
    """
    pass


# Silence warnings through the standard warnings filter instead of replacing
# `warnings.warn` with a no-op: the filter also covers warnings raised from
# C extensions, can be undone with `warnings.resetwarnings()`, and leaves the
# standard-library function intact for other code running in the same kernel.
warnings.filterwarnings('ignore')

def metric(y_true, y_pred):
    """Return the normalized absolute error, averaged over target columns.

    For each column (axis 0 is summed over) the score is
    ``sum(|y_true - y_pred|) / sum(y_true)``; the per-column scores are then
    averaged into a single number. One-dimensional inputs yield the score of
    that single target.

    Missing values are skipped in both sums. pandas inputs already behaved
    this way (their ``sum`` ignores NaN), whereas NumPy inputs used to
    propagate NaN into the result; both now behave the same.

    Parameters
    ----------
    y_true, y_pred : numpy.ndarray, pandas.Series or pandas.DataFrame
        Observed and predicted values. They must support ``y_true - y_pred``.

    Returns
    -------
    float
        Mean over columns of the normalized absolute error.
    """
    # The subtraction is done on the original objects so pandas inputs keep
    # their label alignment; only the reductions work on plain float arrays.
    abs_err = np.asarray(np.abs(y_true - y_pred), dtype=float)
    truth = np.asarray(y_true, dtype=float)
    return np.mean(np.nansum(abs_err, axis=0) / np.nansum(truth, axis=0))

In [149]:
train = pd.read_csv('train_scores.csv')
loading = pd.read_csv('loading.csv')

# Ids listed in loading.csv but absent from train_scores.csv, in loading.csv order.
# A set makes each membership check constant-time instead of scanning the whole
# train Id array for every Id.
train_ids = set(train.Id.values)
test_idx = [x for x in loading.Id.values if x not in train_ids]

# List the prediction directory once. Entries starting with 'test' are test-set
# predictions; everything else is treated as a training prediction file.
pred_files = sorted(os.listdir('preds/'))
preds = [x for x in pred_files if not x.startswith('test')]
tests = [x for x in pred_files if x.startswith('test')]

# Later cells pair preds[i] with tests[i] by position, so the two lists must
# have the same length; fail here with a clear message rather than with an
# IndexError further down.
assert len(preds) == len(tests), (
    'preds/ holds {} training prediction files but {} test prediction files'.format(len(preds), len(tests))
)

target_cols= ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']
# Per-target weights of the overall score, in the same order as target_cols.
weights = [ 0.3, 0.175, 0.175, 0.175, 0.175 ]

# Shape of the accumulator used to average the training predictions.
# NOTE(review): presumably (len(train), len(target_cols)) - confirm.
pred_shape = (5877, 5)

# Model names: the prediction file names up to the first '.'.
features = [x.split('.')[0] for x in preds]

SEED=0
NUM_SPLITS = 7

In [150]:
print('Individual model Errors:')
print('='*40)

# Running sum of every model's training predictions; divided by the number of
# models after the loop to obtain the plain average.
avg_arr = np.zeros(pred_shape)
avg_df = pd.DataFrame(avg_arr, columns=target_cols)

for model in preds:
    # Only the training predictions are needed to score a model, so the
    # matching test-prediction file is not read here.
    pred = pd.read_csv('preds/{}'.format(model))

    # Weighted sum of the per-target scores of this model.
    model_score = 0
    for col, w in zip(target_cols, weights):
        model_score += w*metric(train[col], pred[col])
        avg_df[col] += pred[col]

    print('{} : {:.5f}'.format(str(model).split('.')[0], model_score))
    print('='*25)

print('Calculating error after averaging the predictions ...')

avg_df /= len(preds)
avg_score = 0
for col, w in zip(target_cols, weights):
    avg_score += w*metric(train[col], avg_df[col])
print('The average  = {:.5f}'.format(avg_score))
print('='*25)

Individual model Errors:
BayesianRidge : 0.16214
ElasticNet : 0.15840
KNeighborsRegressor : 0.16922
LGBMRegressor : 0.16261
Lars : 0.16081
Lasso : 0.15894
RandomForestRegressor : 0.16673
Ridge : 0.15821
SGDRegressor : 0.16225
SVR_linear : 0.15953
SVR_rbf : 0.15858
XGBRegressor : 0.16372
cat : 0.16098
Calculating error after averaging the predictions ...
The average  = 0.15901


In [109]:
# Second-level (stacking) regressors. The stochastic ones are seeded so that a
# re-run reproduces the same out-of-fold and test predictions.
models = [ LinearRegression(),
           XGBRegressor(max_depth=1, n_estimators=25, random_state=SEED),
           RandomForestRegressor(max_depth=4, n_estimators=200, random_state=SEED),
           ExtraTreesRegressor(max_depth=4, n_estimators=200, random_state=SEED)
         ]

# Read every first-level prediction file once; the frames are reused for all
# model/target combinations instead of being re-read inside the loops.
pred_frames = [pd.read_csv('preds/{}'.format(name)) for name in preds]
test_frames = [pd.read_csv('preds/{}'.format(name)) for name in tests]

# Test files are matched to feature names by position.
assert len(test_frames) == len(features), (
    '{} test prediction files for {} features'.format(len(test_frames), len(features))
)

# One column per (model, target): out-of-fold predictions on the training rows
# and fold-averaged predictions on the test rows.
meta_meta_df = pd.DataFrame()
meta_meta_test = pd.DataFrame( )

for model in models:

    model_name = str(model).split('(')[0]

    print('='*10, model_name, '='*10)
    print()
    model_score = 0

    for target, w in zip(target_cols, weights):

        # Feature matrices for this target: one column per first-level model.
        meta_df = pd.DataFrame({name: frame[target] for name, frame in zip(features, pred_frames)})
        meta_df[target_cols] = train[target_cols]

        meta_test = pd.DataFrame({name: frame[target] for name, frame in zip(features, test_frames)})

        kf = KFold(n_splits=NUM_SPLITS, shuffle=True, random_state=SEED)

        y_oof = np.zeros(meta_df.shape[0])
        y_test = np.zeros((meta_test.shape[0], NUM_SPLITS))

        for f, (train_ind, val_ind) in enumerate(kf.split(meta_df)):
            train_df, val_df = meta_df.iloc[train_ind], meta_df.iloc[val_ind]
            # Rows without a label cannot be fitted on, but they still receive
            # an out-of-fold prediction below.
            train_df = train_df[train_df[target].notnull()]

            model.fit(train_df[features], train_df[target])

            y_oof[val_ind] = model.predict(val_df[features])
            y_test[:, f] = model.predict(meta_test[features])

        column = '{}_{}'.format(model_name, target)
        test_pred = y_test.mean(axis=1)

        meta_df['pred_{}'.format(column)] = y_oof
        meta_test[column] = test_pred

        meta_meta_df[column] = y_oof
        meta_meta_test[column] = test_pred

        # Score only the rows that have a label for this target.
        has_target = meta_df[target].notnull()
        score = metric(meta_df.loc[has_target, target].values,
                       meta_df.loc[has_target, 'pred_{}'.format(column)].values)
        print( target, 'metric', np.round(score, 5))
        model_score += w * score

    print('The final score of the model is = {}'.format(np.round(model_score, 5)))
    print()


age metric 0.142
domain1_var1 metric 0.15105
domain1_var2 metric 0.15131
domain2_var1 metric 0.18164
domain2_var2 metric 0.17628
The final score of the model is = 0.15815


age metric 0.14325
domain1_var1 metric 0.15136
domain1_var2 metric 0.15166
domain2_var1 metric 0.18144
domain2_var2 metric 0.17674
The final score of the model is = 0.15869


age metric 0.14239
domain1_var1 metric 0.15148
domain1_var2 metric 0.15155
domain2_var1 metric 0.18154
domain2_var2 metric 0.17674
The final score of the model is = 0.15845


age metric 0.14195
domain1_var1 metric 0.1511
domain1_var2 metric 0.15139
domain2_var1 metric 0.18142
domain2_var2 metric 0.17651
The final score of the model is = 0.15816



In [151]:
# Out-of-fold predictions of the second-level models: one column per model/target pair.
meta_meta_df

Unnamed: 0,LinearRegression_age,LinearRegression_domain1_var1,LinearRegression_domain1_var2,LinearRegression_domain2_var1,LinearRegression_domain2_var2,XGBRegressor_age,XGBRegressor_domain1_var1,XGBRegressor_domain1_var2,XGBRegressor_domain2_var1,XGBRegressor_domain2_var2,RandomForestRegressor_age,RandomForestRegressor_domain1_var1,RandomForestRegressor_domain1_var2,RandomForestRegressor_domain2_var1,RandomForestRegressor_domain2_var2,ExtraTreesRegressor_age,ExtraTreesRegressor_domain1_var1,ExtraTreesRegressor_domain1_var2,ExtraTreesRegressor_domain2_var1,ExtraTreesRegressor_domain2_var2
0,61.698155,55.909191,59.005186,47.346392,50.117321,62.653419,57.863869,58.985943,49.389858,50.276421,62.200829,57.272067,59.126169,49.840462,49.911878,61.911698,55.870477,58.989835,48.841860,50.147836
1,59.050053,53.611535,59.378580,47.042074,52.202775,57.012112,54.675259,58.679852,47.303204,51.577129,57.929948,54.708888,58.403643,46.536301,51.885611,58.267169,54.454971,58.515642,46.991795,51.871751
2,52.893057,52.172925,59.432077,47.245505,49.322844,54.649014,53.576420,59.242516,48.646725,50.074955,54.057412,53.253153,58.941351,48.891107,49.719908,53.209515,53.194020,59.132519,48.252353,49.859685
3,61.773124,57.520089,59.563219,48.264521,53.420590,62.191853,55.957699,59.479877,48.646725,53.507648,61.801942,56.609329,59.338460,49.307686,52.943928,61.223030,56.513033,59.425009,49.449634,53.141917
4,45.591874,50.848622,59.033413,45.955701,52.020715,43.372608,50.032982,57.917343,44.727661,52.465092,43.298549,50.518845,58.356774,44.976112,51.902485,45.004246,50.582208,58.591621,45.233601,52.454358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5872,33.137019,48.240113,59.569448,44.512720,50.540179,35.224617,47.899273,59.970634,44.649265,50.331303,34.493209,48.935699,59.981538,44.482338,50.465459,34.796594,49.411757,59.816673,44.824133,50.311960
5873,52.273359,49.196332,60.674571,47.599648,52.776683,54.649014,50.071007,60.043056,47.379128,52.416714,54.237612,49.660372,59.845990,47.620340,52.527120,54.084117,49.847569,59.244611,46.817433,52.691845
5874,42.410655,49.763191,59.847840,46.762256,53.298842,41.099533,49.819195,59.453625,45.445995,53.209221,40.537290,51.052345,59.618772,45.448485,53.264907,40.575951,50.011263,59.736794,45.292248,53.254794
5875,53.321854,54.945430,58.502482,48.253109,51.249104,53.274975,54.929890,57.393456,49.240810,52.064804,54.375453,55.088315,58.438924,48.816625,52.554894,54.755271,54.483344,58.388612,48.275309,52.349681


In [152]:
# Fold-averaged test predictions of the second-level models: one column per model/target pair.
meta_meta_test

Unnamed: 0,LinearRegression_age,LinearRegression_domain1_var1,LinearRegression_domain1_var2,LinearRegression_domain2_var1,LinearRegression_domain2_var2,XGBRegressor_age,XGBRegressor_domain1_var1,XGBRegressor_domain1_var2,XGBRegressor_domain2_var1,XGBRegressor_domain2_var2,RandomForestRegressor_age,RandomForestRegressor_domain1_var1,RandomForestRegressor_domain1_var2,RandomForestRegressor_domain2_var1,RandomForestRegressor_domain2_var2,ExtraTreesRegressor_age,ExtraTreesRegressor_domain1_var1,ExtraTreesRegressor_domain1_var2,ExtraTreesRegressor_domain2_var1,ExtraTreesRegressor_domain2_var2
0,57.346748,48.743686,60.010147,49.427853,55.109343,57.723984,48.729517,59.595332,49.482388,55.343074,58.128525,48.844206,59.784538,49.768190,55.918090,58.318726,48.730069,59.776733,49.699855,55.431515
1,64.203994,53.231444,59.345568,49.742896,50.775417,63.484756,53.129129,59.517896,49.738480,50.224243,62.986338,52.856281,59.237472,49.542424,50.114449,62.848044,53.360470,58.906660,49.507302,50.530505
2,37.536597,49.484865,60.095702,44.979600,52.235729,36.948123,49.752775,59.554996,44.967264,53.322168,36.613171,49.769021,59.622642,45.051440,53.297653,36.364661,49.860020,59.596593,45.069223,52.777075
3,50.187257,51.137942,59.861034,47.696430,49.782131,53.013290,51.812508,59.762786,48.093916,49.838542,51.786980,52.056266,59.909813,48.934169,49.570836,51.105449,51.629739,59.833342,48.510131,49.939095
4,52.910851,55.326605,56.530837,46.149420,57.221050,55.111980,55.035415,57.098156,45.436153,55.539588,54.433910,55.307944,56.017662,45.417523,56.248324,54.024007,54.894037,56.841679,45.737073,55.743459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5872,39.648738,50.925330,58.747687,44.986931,53.273382,38.061413,51.743267,59.121572,44.366979,53.558491,38.044970,51.523693,59.250800,44.849846,53.399347,38.273410,50.789288,58.944813,45.030880,53.557481
5873,53.874271,56.264406,58.921954,48.412480,52.207443,55.301570,55.265081,59.246691,48.764062,51.432425,54.774756,55.877811,59.321266,48.445032,51.649199,54.459546,55.905346,59.502746,48.544141,51.630287
5874,40.121138,46.258081,59.295795,45.336036,51.383756,37.616792,45.846445,60.003267,45.430312,50.872007,38.140818,45.718307,59.967990,45.587321,50.897666,39.199821,46.022661,59.756886,45.313708,51.070157
5875,39.235651,50.532692,58.007507,44.916980,56.351516,36.951890,51.812508,59.573356,44.782558,55.225529,37.940463,51.832680,59.332003,45.015324,55.900569,38.825735,51.189816,59.073678,45.079637,55.202918


In [153]:
# Blend weight of each second-level model, keyed by the model name that prefixes
# its columns in `meta_meta_df`. Keying by name keeps every weight attached to
# the right model regardless of column order.
blend_weights = {
    'LinearRegression': 0.5,
    'XGBRegressor': 0.05,
    'RandomForestRegressor': 0.05,
    'ExtraTreesRegressor': 0.4,
}

final_score=0
for col, w in zip(target_cols, weights):
    cols_in_view = ['{}_{}'.format(name, col) for name in blend_weights]
    final = meta_meta_df[cols_in_view].multiply(list(blend_weights.values())).sum(axis=1)
    raw_score = metric(train[col], final)
    # Round for display only; the unrounded value is accumulated so rounding
    # errors do not add up in the final score.
    score = np.round(raw_score, 5)
    final_score += w*raw_score
    print(col, score)

print('Final score = {:.5f}'.format(final_score) )

age 0.14147
domain1_var1 0.151
domain1_var2 0.15132
domain2_var1 0.18137
domain2_var2 0.17625
Final score = 0.15793


In [157]:
# Blend weight of each second-level model, keyed by the model name that prefixes
# its columns in `meta_meta_test`.
blend_weights = {
    'LinearRegression': 0.5,
    'XGBRegressor': 0.05,
    'RandomForestRegressor': 0.05,
    'ExtraTreesRegressor': 0.4,
}

# Ids are attached to predictions by position, so a row-count mismatch would
# silently drop rows or leave NaN predictions.
assert len(test_idx) == len(meta_meta_test), (
    '{} test ids but {} rows of test predictions'.format(len(test_idx), len(meta_meta_test))
)

final_test = pd.DataFrame()
final_test['Id'] = test_idx
for col in target_cols:
    # Columns are named explicitly and looked up in the frame being blended.
    cols_in_view = ['{}_{}'.format(name, col) for name in blend_weights]
    final_test[col] = meta_meta_test[cols_in_view].multiply(list(blend_weights.values())).sum(axis=1)

In [161]:
# Reshape the wide predictions to long format: one row per (Id, target), with
# the target name appended to the Id as "<Id>_<target>", sorted by that key.
submission_columns = ["Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"]
sub_df = (
    final_test[submission_columns]
    .melt(id_vars=["Id"], value_name="Predicted")
    .assign(Id=lambda long_df: long_df["Id"].astype("str") + "_" + long_df["variable"].astype("str"))
    .drop(columns="variable")
    .sort_values("Id")
)
sub_df.head(10)

Unnamed: 0,Id,Predicted
0,10003_age,57.79349
5877,10003_domain1_var1,48.742557
11754,10003_domain1_var2,59.884761
17631,10003_domain2_var1,49.556397
23508,10003_domain2_var2,55.290336
1,10006_age,63.564769
5878,10006_domain1_var1,53.25918
11755,10006_domain1_var2,59.173216
17632,10006_domain2_var1,49.638414
23509,10006_domain2_var2,50.616845


In [162]:
# Write the long-format submission to disk without the DataFrame index.
sub_df.to_csv('final_submission.csv', index=False)