In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor,
                              AdaBoostRegressor, GradientBoostingRegressor)
from sklearn.linear_model import (BayesianRidge, ElasticNet, Lars, Lasso,
                                  Ridge, SGDRegressor)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def ignore_warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing.

    Used as a drop-in replacement for ``warnings.warn``.
    """
    return None


# Monkey-patch: from here on, anything that looks up ``warnings.warn`` at call
# time gets the no-op above, so those warnings are never emitted.
warnings.warn = ignore_warn

def metric(y_true, y_pred):
    """Return the mean, over columns, of sum(|y_true - y_pred|) / sum(y_true).

    Both sums run along axis 0. For 1-D inputs this yields a single ratio;
    for 2-D inputs one ratio per column, which are then averaged.
    """
    abs_error_total = np.sum(np.abs(y_true - y_pred), axis=0)
    true_total = np.sum(y_true, axis=0)
    return np.mean(abs_error_total / true_total)

In [2]:
# Input tables, read from the current working directory.
fnc = pd.read_csv('fnc.csv')
loading = pd.read_csv('loading.csv')
reveal = pd.read_csv('reveal_ID_site2.csv')
sample = pd.read_csv('sample_submission.csv')
train_scores = pd.read_csv('train_scores.csv')

# Columns to be predicted.
target_cols = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']

# Every column after the first one of each table is treated as a feature.
fnc_features = fnc.columns[1:].tolist()
loading_features = loading.columns[1:].tolist()

# IC_20 is excluded from both the loading table and its feature list.
loading_features.remove('IC_20')
loading = loading.drop(columns=['IC_20'])

In [3]:
# Join the two feature tables on their shared "Id" column.
df = pd.merge(fnc, loading, on="Id")

In [4]:
# Seed handed to the K-fold splitter as its random_state.
SEED = 0
# Number of cross-validation folds.
NUM_FOLDS = 7
# Multiplier applied to the FNC feature columns of the train and test frames.
FNC_SCALE = 1 / 500

In [5]:
# Flag every row of the score table so it can be recognised after the join.
train_scores["is_train"] = True
df = pd.merge(df, train_scores, how="left", on="Id")

# After the left join, rows with no entry in train_scores have no True flag;
# those become the test frame, the flagged rows stay as the training frame.
labelled = df["is_train"] == True
test_df = df[~labelled].copy()
df = df[labelled].copy()

In [6]:
# Apply the same FNC_SCALE multiplier to the FNC columns of both frames.
for frame in (df, test_df):
    frame[fnc_features] = frame[fnc_features] * FNC_SCALE

In [7]:
# Add uniform noise drawn from [0, 1) to every target column of the training
# frame (missing targets stay NaN).
# NOTE(review): presumably this dithers discretised target values - confirm.
#
# The noise comes from a dedicated generator seeded with SEED, so the noisy
# targets - and every score computed from them - are identical on each
# Restart & Run All. Columns are processed in target_cols order.
noise_rng = np.random.RandomState(SEED)
for col in target_cols:
    df[col] += noise_rng.uniform(0, 1.0, df.shape[0])

In [8]:
# Per-target hyper-parameters. The entries are positional: params[i] belongs
# to the i-th name in target_cols.
params = [
    # age
    dict(ridge_alpha=0.0040,
         lasso_alpha=0.00009,
         enet_alpha=0.000003,
         sgd_alpha=0.0003,
         lars_n_nonzero_coefs=150),
    # domain1_var1
    dict(ridge_alpha=0.082,
         lasso_alpha=0.035,
         enet_alpha=0.045,
         sgd_alpha=0.0003,
         lars_n_nonzero_coefs=150),
    # domain1_var2
    dict(ridge_alpha=0.24,
         lasso_alpha=0.03,
         enet_alpha=0.00012,
         sgd_alpha=0.0003,
         lars_n_nonzero_coefs=30),
    # domain2_var1
    dict(ridge_alpha=0.05,
         lasso_alpha=0.0003,
         enet_alpha=0.00004,
         sgd_alpha=0.0003,
         lars_n_nonzero_coefs=120),
    # domain2_var2
    dict(ridge_alpha=0.05,
         lasso_alpha=0.040,
         enet_alpha=0.00004,
         sgd_alpha=0.0003,
         lars_n_nonzero_coefs=100),
]

# Settings shared by all targets.
ENET_RATIO = 0.75     # l1_ratio given to ElasticNet
SGD_ALPHA = 0.0003    # alpha given to SGDRegressor
SGD_RATIO = 0.75      # l1_ratio given to SGDRegressor
B_RIDGE_ITER = 3000   # n_iter given to BayesianRidge

In [9]:
# Level-1 training: for every target, fit six linear models with K-fold
# cross-validation, store the out-of-fold predictions in
# df['pred_<Model>_<target>'], store the fold-averaged test predictions in
# test_df['<Model>_<target>'], and print the out-of-fold score.

# The splitter and the feature list are the same for every target and model,
# so they are built once. With an integer random_state, KFold yields the same
# folds on every split() call.
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
features = loading_features + fnc_features

for i, target in enumerate(target_cols):

    print('========={}.{}========='.format(i+1, target))

    target_params = params[i]

    ridge = Ridge(alpha=target_params['ridge_alpha'])
    lasso = Lasso(alpha=target_params['lasso_alpha'])
    enet = ElasticNet(alpha=target_params['enet_alpha'], l1_ratio=ENET_RATIO)
    # random_state pins SGD's data shuffling so its scores are reproducible.
    # NOTE(review): penalty is left at its default; per the scikit-learn docs
    # l1_ratio is only used with penalty='elasticnet' - confirm the intent.
    sgd = SGDRegressor(alpha=SGD_ALPHA, l1_ratio=SGD_RATIO, random_state=SEED)
    lars = Lars(n_nonzero_coefs=target_params['lars_n_nonzero_coefs'])
    # NOTE(review): recent scikit-learn releases rename n_iter to max_iter -
    # verify against the installed version.
    b_ridge = BayesianRidge(n_iter=B_RIDGE_ITER)

    models = [ridge, lasso, enet, sgd, lars, b_ridge]

    # Rows whose target is known; only these are used for fitting and scoring.
    has_target = df[target].notnull()

    for model in models:

        # Class name of the estimator, used to label columns and log lines.
        model_name = type(model).__name__
        oof_col = 'pred_{}_{}'.format(model_name, target)

        y_oof = np.zeros(df.shape[0])
        y_test = np.zeros((test_df.shape[0], NUM_FOLDS))

        for f, (train_ind, val_ind) in enumerate(kf.split(df)):
            train_df, val_df = df.iloc[train_ind], df.iloc[val_ind]
            # Drop training rows with a missing target; validation rows are
            # all predicted, including those without a target.
            train_df = train_df[train_df[target].notnull()]

            model.fit(train_df[features], train_df[target])

            y_oof[val_ind] = model.predict(val_df[features])
            y_test[:, f] = model.predict(test_df[features])

        df[oof_col] = y_oof
        # Test prediction = mean over the per-fold models.
        test_df['{}_{}'.format(model_name, target)] = y_test.mean(axis=1)

        score = metric(df.loc[has_target, target].values,
                       df.loc[has_target, oof_col].values)
        print(model_name, np.round(score, 5))


Ridge 0.14165
Lasso 0.14232
ElasticNet 0.14165
SGDRegressor 0.15251
Lars 0.1485
BayesianRidge 0.15246
Ridge 0.14948
Lasso 0.1504
ElasticNet 0.15039
SGDRegressor 0.15068
Lars 0.15028
BayesianRidge 0.15045
Ridge 0.15001
Lasso 0.15008
ElasticNet 0.15003
SGDRegressor 0.15028
Lars 0.15004
BayesianRidge 0.15009
Ridge 0.17972
Lasso 0.18004
ElasticNet 0.17973
SGDRegressor 0.1808
Lars 0.18074
BayesianRidge 0.18071
Ridge 0.17448
Lasso 0.1761
ElasticNet 0.17453
SGDRegressor 0.17622
Lars 0.17573
BayesianRidge 0.17606


In [10]:
# Write, for every level-1 model, its out-of-fold predictions
# (preds/<Model>.csv) and its test predictions (preds/test_<Model>.csv),
# each with an Id column followed by one column per target.

# to_csv does not create missing folders, so make sure the output directory
# exists before the first write.
os.makedirs('preds', exist_ok=True)

# `models` is the list left over from the last iteration of the training
# loop; only the estimator class names are needed here.
for model in models:

    model_name = type(model).__name__

    # Out-of-fold predictions for the training rows.
    meta_df = df[['Id']].copy()
    for col in target_cols:
        meta_df[col] = df['pred_{}_{}'.format(model_name, col)]

    meta_df.to_csv('preds/{}.csv'.format(model_name), index=False)

    # Fold-averaged predictions for the test rows.
    meta_test_df = test_df[['Id']].copy()
    for col in target_cols:
        meta_test_df[col] = test_df['{}_{}'.format(model_name, col)]

    meta_test_df.to_csv('preds/test_{}.csv'.format(model_name), index=False)

In [None]:
# This notebook is level 1 of the linear models.