In [None]:
import sys
!cp ../input/rapids/rapids.0.13.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.6/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.6"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import cupy as cp
import cudf
import cuml 
from cuml.ensemble import RandomForestRegressor


import warnings
def ignore_warn(*args, **kwargs):
    """Accept any arguments, do nothing, and return None."""
    return None

# Replace warnings.warn with the no-op above so calls to it emit nothing.
warnings.warn = ignore_warn

def metric(y_true, y_pred):
    """Mean over columns of sum(|y_true - y_pred|) / sum(y_true).

    Both sums run along axis 0. For 1-D inputs each sum is a scalar, so the
    result is simply the single ratio.

    Parameters
    ----------
    y_true, y_pred : arrays of matching shape supporting `-` and the NumPy
        functions used below.

    Returns
    -------
    The averaged ratio (a scalar).
    """
    abs_error_total = np.sum(np.abs(y_true - y_pred), axis=0)
    true_total = np.sum(y_true, axis=0)
    return np.mean(abs_error_total / true_total)

# Create the output directory for predictions. exist_ok=True keeps the cell
# re-runnable: a second run no longer reports "File exists" like `!mkdir` did.
os.makedirs('preds', exist_ok=True)

In [None]:
# Raw competition tables, read from the Kaggle input directory.
fnc = pd.read_csv('../input/trends-assessment-prediction/fnc.csv')
loading = pd.read_csv('../input/trends-assessment-prediction/loading.csv')
sample = pd.read_csv('../input/trends-assessment-prediction/sample_submission.csv')
train_scores = pd.read_csv('../input/trends-assessment-prediction/train_scores.csv')

# The five regression targets, in the order the rest of the notebook iterates them.
target_cols = [
    'age',
    'domain1_var1',
    'domain1_var2',
    'domain2_var1',
    'domain2_var2',
]

# Every column except the first is treated as a feature
# (the first column is presumably "Id", the merge key used below).
fnc_features = list(fnc.columns[1:])
loading_features = list(loading.columns[1:])

# IC_20 is excluded: dropped from the table and from the feature list.
loading = loading.drop(columns=['IC_20'])
loading_features.remove('IC_20')

# Join both feature tables on Id (default inner join).
df = pd.merge(fnc, loading, on="Id")

In [None]:
# Multiplier applied to every FNC feature column before modelling.
FNC_SCALE = 1 / 500

# Cross-validation setup: number of KFold splits and the shuffle seed.
NUM_FOLDS = 7
SEED = 0

In [None]:
# Tag every row that has an entry in train_scores; after the left join the
# remaining rows carry a missing is_train value and are treated as test rows.
train_scores["is_train"] = True
df = df.merge(train_scores, on="Id", how="left")

labelled_mask = df["is_train"] == True
test_df = df[~labelled_mask].copy()
df = df[labelled_mask].copy()

# Rescale the FNC feature columns in both frames by the same factor.
for frame in (df, test_df):
    frame[fnc_features] = frame[fnc_features] * FNC_SCALE

In [None]:
# Add uniform noise drawn from [0, 1) to every training target.
# The draws come from a generator seeded with SEED so a fresh
# Restart & Run All reproduces the same jittered targets (the original used
# the unseeded global NumPy RNG). Targets are processed in target_cols order.
# NOTE: the cell is still not idempotent -- re-running it without reloading
# df adds the noise a second time.
noise_rng = np.random.RandomState(SEED)
for noisy_target in target_cols:
    df[noisy_target] = df[noisy_target] + noise_rng.uniform(0, 1.0, df.shape[0])

In [None]:
# Cast all feature and target columns of both frames to float32.
features = fnc_features + loading_features + target_cols
for frame in (df, test_df):
    frame[features] = frame[features].astype(np.float32)

In [None]:
# Convert both pandas frames to cudf DataFrames; the names are rebound, so
# from here on df / test_df are cudf objects.
df, test_df = cudf.from_pandas(df), cudf.from_pandas(test_df)

In [None]:
# One random-forest configuration per target, positionally aligned with
# target_cols. All targets share max depth 10; only the tree count varies.
params = [
    {'rf_max_depth': 10, 'rf_n_estimators': n_trees}
    for n_trees in (
        100,  # age
        100,  # d1v1
        250,  # d1v2
        150,  # d2v1
        250,  # d2v2
    )
]

In [None]:
# For every target: build a random forest from params[i], run K-fold CV,
# store out-of-fold predictions on df and fold-averaged predictions on
# test_df, then print the metric on rows whose target is present.
for i, target in enumerate(target_cols):

    print('========={}.{}========='.format(i+1, target))

    target_params = params[i]
    rf = RandomForestRegressor(
        n_estimators=target_params['rf_n_estimators'],
        max_depth=target_params['rf_max_depth'],
    )
    models = [rf]  # kept as a list; the export cell iterates over `models`

    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
    features = loading_features + fnc_features

    for model in models:

        # One OOF slot per training row; one test-prediction column per fold.
        y_oof = np.zeros(df.shape[0])
        y_test = np.zeros((test_df.shape[0], NUM_FOLDS))

        for fold, (trn_idx, val_idx) in enumerate(kf.split(df, df)):
            val_df = df.iloc[val_idx]
            train_df = df.iloc[trn_idx]
            # Fit only on rows where this target is present.
            train_df = train_df[train_df[target].notnull()]

            model.fit(train_df[features], train_df[target])

            y_oof[val_idx] = model.predict(val_df[features])
            y_test[:, fold] = model.predict(test_df[features])

        # Column names are derived from the text before '(' in str(model).
        model_name = str(model).split('(')[0]
        oof_col = 'pred_{}_{}'.format(model_name, target)

        df[oof_col] = y_oof
        test_df['{}_{}'.format(model_name, target)] = y_test.mean(axis=1)

        # Score the OOF predictions on rows with a non-null target.
        labelled = df[df[target].notnull()]
        score = metric(labelled[target].values, labelled[oof_col].values)
        print(model_name, np.round(score, 5))


In [None]:
# Write two CSVs per model: the out-of-fold predictions ("<name>.csv") and
# the fold-averaged test predictions ("test_<name>.csv").
# `models` is whatever list the training cell left behind on its last iteration.
for model in models:

    model_name = str(model).split('(')[0]
    out_columns = ['Id'] + target_cols

    # Out-of-fold predictions for the training rows.
    meta_df = cudf.DataFrame(columns = out_columns)
    meta_df.Id = df.Id
    for col in target_cols:
        oof_col = 'pred_{}_{}'.format(model_name, col)
        meta_df[col] = df[oof_col]

    meta_df.to_csv('{}.csv'.format(model_name), index=False)

    # Fold-averaged predictions for the test rows.
    meta_test_df = cudf.DataFrame(columns = out_columns)
    meta_test_df.Id = test_df.Id
    for col in target_cols:
        test_col = '{}_{}'.format(model_name, col)
        meta_test_df[col] = test_df[test_col]

    meta_test_df.to_csv('test_{}.csv'.format(model_name), index=False)