In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import Ridge, Lasso, ElasticNet, SGDRegressor, Lars, BayesianRidge
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import RobustScaler, MinMaxScaler

import warnings
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Route every subsequent warnings.warn(...) call to the no-op above, so warnings
# raised through that function are silenced for the rest of the session.
warnings.warn = ignore_warn

def metric(y_true, y_pred):
    """Mean normalized absolute error across target columns.

    For each column (the sum runs over axis 0), the total absolute error is
    divided by the total of the true values; the per-column ratios are then
    averaged into a single score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values, 1-D or 2-D with samples along axis 0.
    y_pred : array-like
        Predictions with the same shape as ``y_true``.

    Returns
    -------
    float
        Mean over columns of ``sum(|y_true - y_pred|) / sum(y_true)``.

    Notes
    -----
    Inputs are converted with ``np.asarray`` so plain lists are accepted and
    pandas objects are compared by position rather than by index label.
    NaNs are not filtered here, so callers must drop missing targets first.
    A column whose true values sum to zero produces inf/nan.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    normalized_abs_error = np.abs(y_true - y_pred).sum(axis=0) / y_true.sum(axis=0)
    return np.mean(normalized_abs_error)

In [10]:
# Raw input tables; the CSV files are read from the current working directory.
fnc = pd.read_csv('fnc.csv')
loading = pd.read_csv('loading.csv')
reveal = pd.read_csv('reveal_ID_site2.csv')
sample = pd.read_csv('sample_submission.csv')
train_scores = pd.read_csv('train_scores.csv')

# Regression targets, in the order used by the per-target parameter list.
target_cols = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']

# Feature names are every column after the first one of each table.
fnc_features = fnc.columns[1:].tolist()
loading_features = loading.columns[1:].tolist()

# IC_20 is excluded from both the loading table and its feature list.
loading = loading.drop(columns=['IC_20'])
loading_features.remove('IC_20')

In [11]:
# Combine the FNC and loading tables into one feature frame, joined on "Id"
# (pandas' default inner join).
df = pd.merge(fnc, loading, on="Id")

In [12]:
SEED = 0             # random_state for the shuffled K-fold split
NUM_FOLDS = 7        # number of cross-validation folds
FNC_SCALE = 1 / 500  # multiplier applied to the FNC feature columns after scaling

In [13]:
# Rescale every column except the first (presumably "Id"): min-max scaling
# first, then robust scaling. Both scalers are fit on the full merged frame,
# i.e. before the train/test split below. `scaler` ends up bound to the
# RobustScaler instance.
for scaler in (MinMaxScaler(), RobustScaler()):
    df.iloc[:, 1:] = scaler.fit_transform(df.iloc[:, 1:])

# Tag rows that have training scores; after the left merge, rows without
# scores carry NaN in "is_train".
train_scores["is_train"] = True
df = df.merge(train_scores, on="Id", how="left")

# Rows flagged as training data stay in `df`; all remaining rows become `test_df`.
has_scores = df["is_train"] == True
test_df = df[~has_scores].copy()
df = df[has_scores].copy()

In [14]:
# Shrink the FNC columns by FNC_SCALE in both frames.
# Not idempotent: re-running this cell applies the factor again.
for frame in (df, test_df):
    frame[fnc_features] *= FNC_SCALE

In [15]:
# Add uniform jitter in [0, 1) to every target column of the training frame.
# NOTE(review): presumably this smooths coarsely quantised target values — confirm intent.
#
# The noise comes from a dedicated generator seeded with SEED, so the jittered
# targets (and everything trained on them) are the same on every fresh run.
# Columns are processed in target_cols order. Missing targets stay NaN.
# Not idempotent: re-running this cell adds another round of noise.
rng = np.random.RandomState(SEED)
for col in target_cols:
    df[col] += rng.uniform(0, 1.0, df.shape[0])

In [16]:
# Per-target hyper-parameters, listed in the same order as target_cols;
# the training loop looks them up by position.
params = [
    dict(  # age
        ridge_alpha=0.0040,
        lasso_alpha=0.00009,
        enet_alpha=0.000003,
        sgd_alpha=0.0003,
        lars_n_nonzero_coefs=150,
    ),
    dict(  # domain1_var1
        ridge_alpha=0.082,
        lasso_alpha=0.035,
        enet_alpha=0.045,
        sgd_alpha=0.0003,
        lars_n_nonzero_coefs=150,
    ),
    dict(  # domain1_var2
        ridge_alpha=0.24,
        lasso_alpha=0.03,
        enet_alpha=0.00012,
        sgd_alpha=0.0003,
        lars_n_nonzero_coefs=30,
    ),
    dict(  # domain2_var1
        ridge_alpha=0.05,
        lasso_alpha=0.0003,
        enet_alpha=0.00004,
        sgd_alpha=0.0003,
        lars_n_nonzero_coefs=120,
    ),
    dict(  # domain2_var2
        ridge_alpha=0.05,
        lasso_alpha=0.040,
        enet_alpha=0.00004,
        sgd_alpha=0.0003,
        lars_n_nonzero_coefs=100,
    ),
]

# Settings shared by all targets.
ENET_RATIO = 0.75    # l1_ratio passed to ElasticNet
SGD_ALPHA = 0.0003   # alpha passed to SGDRegressor
SGD_RATIO = 0.75     # l1_ratio passed to SGDRegressor
B_RIDGE_ITER = 3000  # n_iter passed to BayesianRidge

In [23]:
# Level-1 training. For every target, each base regressor is fitted with K-fold
# cross-validation and two new columns are stored:
#   df['pred_<Model>_<target>']  - out-of-fold predictions on the training rows
#   test_df['<Model>_<target>']  - test predictions averaged over the folds

# Loop invariants: the same feature list and the same seeded fold split are
# used for every target and every model.
features = loading_features + fnc_features
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

for i, target in enumerate(target_cols):

    print('========={}.{}========='.format(i+1, target))

    target_params = params[i]  # hyper-parameters listed for this target

    ridge = Ridge(alpha=target_params['ridge_alpha'])
    lasso = Lasso(alpha=target_params['lasso_alpha'])
    enet = ElasticNet(alpha=target_params['enet_alpha'], l1_ratio=ENET_RATIO)
    # random_state pins SGD's data shuffling so scores are reproducible run to run.
    # NOTE(review): l1_ratio only takes effect with penalty='elasticnet'; with the
    # default penalty it is ignored — confirm whether elastic-net was intended.
    sgd = SGDRegressor(alpha=SGD_ALPHA, l1_ratio=SGD_RATIO, random_state=SEED)
    lars = Lars(n_nonzero_coefs=target_params['lars_n_nonzero_coefs'])
    # NOTE(review): recent scikit-learn releases rename n_iter to max_iter —
    # confirm against the installed version.
    b_ridge = BayesianRidge(n_iter=B_RIDGE_ITER)
    knr = KNeighborsRegressor(n_neighbors=20, n_jobs=-1)

    models = [ridge, lasso, enet, sgd, lars, b_ridge, knr]

    for model in models:
        model_name = type(model).__name__
        oof_col = 'pred_{}_{}'.format(model_name, target)
        test_col = '{}_{}'.format(model_name, target)

        y_oof = np.zeros(df.shape[0])
        y_test = np.zeros((test_df.shape[0], NUM_FOLDS))

        for f, (train_ind, val_ind) in enumerate(kf.split(df)):
            train_df, val_df = df.iloc[train_ind], df.iloc[val_ind]
            # Fit only on rows where this target is known; validation rows are
            # all predicted so every training row gets an out-of-fold value.
            train_df = train_df[train_df[target].notnull()]

            model.fit(train_df[features], train_df[target])

            y_oof[val_ind] = model.predict(val_df[features])
            y_test[:, f] = model.predict(test_df[features])

        df[oof_col] = y_oof
        test_df[test_col] = y_test.mean(axis=1)

        # Score the out-of-fold predictions on rows with a known target only.
        labelled = df[target].notnull()
        score = metric(df.loc[labelled, target].values, df.loc[labelled, oof_col].values)
        print(model_name, np.round(score, 5))


KNeighborsRegressor 0.16602
KNeighborsRegressor 0.1557
KNeighborsRegressor 0.1535
KNeighborsRegressor 0.1848
KNeighborsRegressor 0.18067


In [24]:
# Export the level-1 predictions: one CSV of out-of-fold predictions and one
# CSV of test predictions per model, each with columns Id + target_cols.
# `models` is the list left over from the training loop; only the class names
# are used here, to locate the prediction columns and name the files.

# Create the output directory up front so to_csv does not fail on a fresh checkout.
os.makedirs('preds', exist_ok=True)

for model in models:
    model_name = type(model).__name__

    oof_cols = ['pred_{}_{}'.format(model_name, col) for col in target_cols]
    meta_df = df[['Id'] + oof_cols].copy()
    meta_df.columns = ['Id'] + target_cols  # strip the model prefix from the headers
    meta_df.to_csv('preds/{}.csv'.format(model_name), index=False)

    test_pred_cols = ['{}_{}'.format(model_name, col) for col in target_cols]
    meta_test_df = test_df[['Id'] + test_pred_cols].copy()
    meta_test_df.columns = ['Id'] + target_cols
    meta_test_df.to_csv('preds/test_{}.csv'.format(model_name), index=False)

In [None]:
# This is level 1 of the linear models.