In [None]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb

import warnings
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing: a silent stand-in for ``warnings.warn``."""
    return None


# Swap the no-op in for ``warnings.warn`` so that calls made through
# ``warnings.warn(...)`` after this point emit nothing.
warnings.warn = ignore_warn

def metric(y_true, y_pred):
    """Return the summed absolute error divided by the summed true values.

    Both the absolute differences and the true values are summed along
    axis 0. The ratio of the two sums is then averaged, which for 2-D
    inputs means averaging the per-column ratios; for 1-D inputs the ratio
    is already a single number.
    """
    abs_error_total = np.sum(np.abs(y_true - y_pred), axis=0)
    true_total = np.sum(y_true, axis=0)
    return np.mean(abs_error_total / true_total)

# Create the output folder for the prediction CSVs. exist_ok=True keeps the
# cell safe to re-run when the folder is already there, and avoids depending
# on a shell `mkdir` being available.
os.makedirs('preds', exist_ok=True)

In [None]:
# Read the four competition CSVs: two feature tables, the submission template
# and the training targets.
fnc = pd.read_csv('../input/trends-assessment-prediction/fnc.csv')
loading = pd.read_csv('../input/trends-assessment-prediction/loading.csv')
sample = pd.read_csv('../input/trends-assessment-prediction/sample_submission.csv')
train_scores = pd.read_csv('../input/trends-assessment-prediction/train_scores.csv')

# Columns predicted later, one model set per target.
target_cols = ['age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']

# Feature names are every column after the first one
# (presumably the first column is "Id", the merge key used below -- confirm).
fnc_features = fnc.columns[1:].tolist()
loading_features = loading.columns[1:].tolist()

# IC_20 is excluded from both the table and the feature list.
loading = loading.drop(columns=['IC_20'])
loading_features.remove('IC_20')

# One row per Id holding both feature sets.
df = pd.merge(fnc, loading, on="Id")

In [None]:
SEED = 0             # random_state handed to the K-fold shuffle
NUM_FOLDS = 7        # number of cross-validation folds
FNC_SCALE = 1 / 500  # multiplier applied to the FNC feature columns

In [None]:
# Flag every row of the score table; after the left merge, Ids that have no
# scores end up with NaN in "is_train", which is what separates test from train.
train_scores["is_train"] = True
df = pd.merge(df, train_scores, how="left", on="Id")

has_scores = df["is_train"] == True
test_df = df.loc[~has_scores].copy()
df = df.loc[has_scores].copy()

In [None]:
# Multiply the FNC columns by the same constant in the train and test frames.
for frame in (df, test_df):
    frame[fnc_features] *= FNC_SCALE

In [None]:
# Add uniform noise from [0, 1) to every target column of the training frame
# (presumably to smooth discretised target values -- confirm the intent).
# The noise comes from a dedicated generator seeded with SEED, so the jittered
# targets, and every score computed from them, are the same on each run.
# Draw order follows target_cols: age, domain1_var1, domain1_var2,
# domain2_var1, domain2_var2.
jitter_rng = np.random.RandomState(SEED)
for col in target_cols:
    df[col] += jitter_rng.uniform(0, 1.0, df.shape[0])

In [None]:
# Per-target hyper-parameters, indexed in the same order as the targets:
# age, d1v1, d1v2, d2v1, d2v2. Each entry holds the depth, learning rate and
# number of boosting rounds for the XGBoost, CatBoost and LightGBM models.
params = [
    # age
    {
        'xgb_max_depth': 2, 'xgb_lr': 0.25, 'xgb_n_estimators': 100,
        'cat_max_depth': 4, 'cat_lr': 0.05, 'cat_iterations': 2000,
        'lgb_max_depth': 10, 'lgb_lr': 0.1, 'lgb_n_estimators': 500,
    },
    # d1v1
    {
        'xgb_max_depth': 2, 'xgb_lr': 0.25, 'xgb_n_estimators': 150,
        'cat_max_depth': 2, 'cat_lr': 0.03, 'cat_iterations': 1500,
        'lgb_max_depth': 2, 'lgb_lr': 0.1, 'lgb_n_estimators': 100,
    },
    # d1v2
    {
        'xgb_max_depth': 2, 'xgb_lr': 0.35, 'xgb_n_estimators': 10,
        'cat_max_depth': 2, 'cat_lr': 0.006, 'cat_iterations': 1500,
        'lgb_max_depth': 2, 'lgb_lr': 0.01, 'lgb_n_estimators': 200,
    },
    # d2v1
    {
        'xgb_max_depth': 2, 'xgb_lr': 0.05, 'xgb_n_estimators': 150,
        'cat_max_depth': 2, 'cat_lr': 0.02, 'cat_iterations': 1000,
        'lgb_max_depth': 2, 'lgb_lr': 0.075, 'lgb_n_estimators': 100,
    },
    # d2v2
    {
        'xgb_max_depth': 2, 'xgb_lr': 0.4, 'xgb_n_estimators': 10,
        'cat_max_depth': 2, 'cat_lr': 0.02, 'cat_iterations': 500,
        'lgb_max_depth': 2, 'lgb_lr': 0.025, 'lgb_n_estimators': 200,
    },
]

In [None]:
%%time

# For every target: fit XGBoost, CatBoost and LightGBM with K-fold CV, store
# the out-of-fold predictions in df and the fold-averaged test predictions in
# test_df, then print each model's out-of-fold score.

# The feature list and the fold splitter are the same for every target, so
# they are built once. KFold with an integer random_state produces the same
# shuffled folds on every split() call, so all models and targets share folds.
features = loading_features + fnc_features
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

for i, target in enumerate(target_cols):

    print('========={}.{}========='.format(i+1, target))

    target_params = params[i]  # params is indexed in the same order as target_cols

    xgb = XGBRegressor(
        max_depth=target_params['xgb_max_depth'],
        learning_rate=target_params['xgb_lr'],
        n_estimators=target_params['xgb_n_estimators'],
        tree_method='gpu_hist',
        verbosity=0,  # XGBoost's logging switch is `verbosity`, not `verbose`
    )
    cat = CatBoostRegressor(
        max_depth=target_params['cat_max_depth'],
        learning_rate=target_params['cat_lr'],
        iterations=target_params['cat_iterations'],
        task_type='GPU',
        verbose=0,
    )
    # Named `lgbm` so the `lgb` module alias (`import lightgbm as lgb`) is not overwritten.
    lgbm = LGBMRegressor(
        max_depth=target_params['lgb_max_depth'],
        learning_rate=target_params['lgb_lr'],
        n_estimators=target_params['lgb_n_estimators'],
        device='gpu',
        gpu_platform_id=0,
        gpu_device_id=0,
        verbose=0,
    )

    # Explicit labels for the prediction columns. Parsing str(model) yields an
    # object repr of the form "<... object at 0x...>" for CatBoost, which puts
    # a memory address into the column name. The labels below are the ones
    # the export cell at the end of the notebook looks up.
    models = [('XGBRegressor', xgb), ('cat', cat), ('LGBMRegressor', lgbm)]

    # Rows whose target is known; only these are used for fitting and scoring.
    has_target = df[target].notnull().values

    for model_name, model in models:

        y_oof = np.zeros(df.shape[0])                      # out-of-fold prediction per training row
        y_test = np.zeros((test_df.shape[0], NUM_FOLDS))   # one column of test predictions per fold

        for f, (train_ind, val_ind) in enumerate(kf.split(df, df)):
            train_df, val_df = df.iloc[train_ind], df.iloc[val_ind]
            # Rows with a missing target cannot be fitted; they are still predicted below.
            train_df = train_df[train_df[target].notnull()]

            model.fit(train_df[features].values, train_df[target].values)

            y_oof[val_ind] = model.predict(val_df[features].values)
            y_test[:, f] = model.predict(test_df[features].values)

        df['pred_{}_{}'.format(model_name, target)] = y_oof
        test_df['{}_{}'.format(model_name, target)] = y_test.mean(axis=1)

        # Score only the rows that actually have a target value.
        score = metric(df.loc[has_target, target].values, y_oof[has_target])
        print(model_name, np.round(score, 5))


In [None]:
# Prediction columns whose model label is a raw object repr ("<...>") are
# collected here together with their replacement names, in which the "<...>"
# part is swapped for the short label "cat".
to_be_changed_df = [name for name in df.columns if name.startswith('pred_<')]
new_names_df = [name.split('<')[0] + 'cat' + name.split('>')[-1] for name in to_be_changed_df]

to_be_changed_test = [name for name in test_df.columns if name.startswith('<')]
new_names_test = ['cat' + name.split('>')[-1] for name in to_be_changed_test]

In [None]:
# Pair each old column name with its replacement and rename both frames in place.
df_renames = dict(zip(to_be_changed_df, new_names_df))
test_renames = dict(zip(to_be_changed_test, new_names_test))

df.rename(columns=df_renames, inplace=True)
test_df.rename(columns=test_renames, inplace=True)

In [None]:
# Preview the first rows of the training frame instead of rendering the whole
# (very wide) table.
df.head()

In [None]:
models = ['XGBRegressor', 'cat', 'LGBMRegressor' ]


def _prediction_table(frame, column_template, model_name):
    """Gather one model's per-target prediction columns of ``frame`` into an Id + targets table.

    ``column_template`` is formatted with (model_name, target) to find the
    source column for each target; the output columns are named after the targets.
    """
    table = pd.DataFrame(columns = ['Id'] + target_cols)
    table['Id'] = frame['Id']
    for col in target_cols:
        table[col] = frame[column_template.format(model_name, col)]
    return table


# Write two CSVs per model: out-of-fold predictions for the training rows and
# fold-averaged predictions for the test rows.
for model in models:

    meta_df = _prediction_table(df, 'pred_{}_{}', model)
    meta_df.to_csv('preds/{}.csv'.format(model), index=False)

    meta_test_df = _prediction_table(test_df, '{}_{}', model)
    meta_test_df.to_csv('preds/test_{}.csv'.format(model), index=False)