In [1]:
import numpy as np 
import pandas as pd
import json 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score

from tqdm import tqdm
tqdm.pandas()
from sklearn.linear_model import ElasticNet
import statsmodels.api as sm
from copy import deepcopy

import Utils
import models

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Main table; the next cell merges extra columns into it on its 'log' column.
df = pd.read_excel('Cleaned_data.xlsx')
# 'N-2SD' sheet of the PMT workbook; the next cell reads its 'LOG' column and
# every column whose name matches 'Overperformers'.
df1 = pd.read_excel('PMT_N+2SD+3SD.xlsx', sheet_name='N-2SD')

In [3]:
# --- Attach the "Overperformers" columns of df1 to the main table -----------
# .copy() makes over_perf an independent frame, so adding the 'log' key below
# is a plain column insert rather than a write into a filtered result of df1.
over_perf = df1.filter(regex='Overperformers').copy()
over_perf['log'] = df1['LOG']
#over_perf = over_perf.drop(columns=['Overperformers on levels 1 and 2', 'Overperformers on levels 2 and 3'])
# Missing flags become 0; only rows labelled 0..342 (inclusive) are kept.
# NOTE(review): 342 is a hard-coded cut-off -- presumably the last data row of
# the sheet; confirm against the workbook.
over_perf = over_perf.fillna(0).loc[:342]
df = df.merge(over_perf, on='log', how='left')

# NOTE(review): this cell rebinds `df` and removes 'log', so it cannot be run
# twice without re-running the loading cell first.
df = df.drop(columns=['log'])
df['gender'] = pd.factorize(df['gender'])[0]


# Positional rename: the list must match the merged frame column-for-column.
df.columns = ['age', 'gender', 'ACC_ADD1', 'ACC_ADD2', 'ACC_ADD3', 'ACC_DIV1', 'ACC_DIV2', 'ACC_DIV3', 'ACC_MUL1',
       'ACC_MUL2', 'ACC_MUL3', 'ACC_SUB1', 'ACC_SUB2', 'ACC_SUB3', 'RT_ADD1', 'RT_ADD2', 'RT_ADD3',
       'RT_DIV1', 'RT_DIV2', 'RT_DIV3', 'RT_MUL1', 'RT_MUL2', 'RT_MUL3',
       'RT_SUB1', 'RT_SUB2', 'RT_SUB3', 'ADD1', 'ADD2', 'ADD3', 'DIV1',
       'DIV2', 'DIV3', 'MUL1', 'MUL2', 'MUL3', 'SUB1', 'SUB2',
       'SUB3', 'm_score_bal', 'acc_1_bal', 'acc_2_bal', 'acc_3_bal',
       'acc_4_bal', 'acc_5_bal', 'acc_6_bal', 'rt_1_bal', 'rt_2_bal',
       'rt_3_bal', 'rt_4_bal', 'rt_5_bal', 'rt_6_bal', 'm_score_cl',
       'acc_1_cl', 'acc_2_cl', 'acc_3_cl', 'acc_4_cl', 'acc_5_cl', 'acc_6_cl',
       'rt_1_cl', 'rt_2_cl', 'rt_3_cl', 'rt_4_cl', 'rt_5_cl', 'rt_6_cl',
       'n_sum', 'rt_mean', 'rt_cmt_mean', 'rt_bal', 'rt_cl', 'O_12', 'O_23', 'O_ADD', 'O_DIV','O_MUL', 'O_SUB']


# Recode the two textual flags to 1 and make the columns integer.
# A single .loc[mask, column] assignment is used instead of the chained
# df[col][mask] = 1 form: the chained form writes to an intermediate Series and
# is a silent no-op under pandas Copy-on-Write, after which astype(int) fails
# on the remaining label strings.
for flag_col, flag_label in (('O_12', ' 1+2'), ('O_23', ' 2+3')):
    df.loc[df[flag_col] == flag_label, flag_col] = 1
    df[flag_col] = df[flag_col].astype(int)


# Seed NumPy's global RNG once for the rest of the notebook.
seed = 0xAB0BA
np.random.seed(seed)

In [4]:
# Target identifiers. A column is summed into a target when its name contains
# both 'ACC' and this suffix; the same strings key df_list and the result dicts.
targets = ['1', '2', '3']

In [5]:
# For each target suffix, split the columns in two: the 'ACC' columns carrying
# that suffix are summed row-wise into the target, every other column is kept
# as a feature. df_list maps suffix -> (feature frame, target series).
df_list = {}
for target in targets:
    summed_columns = [col for col in df.columns if 'ACC' in col and target in col]
    valid_columns = [col for col in df.columns if not ('ACC' in col and target in col)]

    # Accumulate from a zero series, one column at a time, in column order.
    target_col = pd.Series(np.zeros(len(df)))
    for col in summed_columns:
        target_col += df[col]

    df_list[target] = (df[valid_columns], target_col)



In [6]:
# Fit one regression per target with the project helper and keep the returned
# result object; later cells read its .params, .rsquared and .summary().
# (The previous version also bound r2 / params here, but never used them.)
results = {}

for name in tqdm(targets):
    X, y = df_list[name]
    results[name] = models.LinRegStatmodels(X, y)

100%|██████████| 3/3 [00:01<00:00,  2.16it/s]


In [12]:
# Display the summary of the fitted result stored for target '3'.
results['3'].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.659
Model:,OLS,Adj. R-squared:,0.644
Method:,Least Squares,F-statistic:,45.07
Date:,"Wed, 29 Nov 2023",Prob (F-statistic):,2.33e-44
Time:,00:36:44,Log-Likelihood:,-190.16
No. Observations:,220,AIC:,400.3
Df Residuals:,210,BIC:,434.3
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.0184,0.456,6.619,0.000,2.119,3.917
ACC_ADD1,-2.9255,0.552,-5.300,0.000,-4.014,-1.837
ACC_ADD2,0.7910,0.314,2.522,0.012,0.173,1.409
ACC_DIV1,-1.0434,0.374,-2.789,0.006,-1.781,-0.306
ACC_DIV2,1.0956,0.204,5.369,0.000,0.693,1.498
ACC_MUL2,1.1735,0.243,4.834,0.000,0.695,1.652
ACC_SUB2,0.8538,0.259,3.295,0.001,0.343,1.365
RT_ADD1,0.2322,0.037,6.322,0.000,0.160,0.305
RT_ADD3,-0.0829,0.013,-6.254,0.000,-0.109,-0.057

0,1,2,3
Omnibus:,1.189,Durbin-Watson:,1.973
Prob(Omnibus):,0.552,Jarque-Bera (JB):,0.862
Skew:,0.099,Prob(JB):,0.65
Kurtosis:,3.234,Cond. No.,228.0


In [7]:
# Map every column name (plus the intercept term 'const') to the set of targets
# whose fitted model has that name among its parameters.
params_frequency = {column: set() for column in df.columns}
params_frequency['const'] = set()

for target_key, fitted in results.items():
    for param_name in fitted.params.index:
        params_frequency[param_name].add(target_key)


In [8]:
# Inspect, for each predictor, the set of targets whose model includes it.
params_frequency

{'age': set(),
 'gender': {'2'},
 'ACC_ADD1': {'2', '3'},
 'ACC_ADD2': {'1', '3'},
 'ACC_ADD3': {'1', '2'},
 'ACC_DIV1': {'2', '3'},
 'ACC_DIV2': {'1', '3'},
 'ACC_DIV3': {'1', '2'},
 'ACC_MUL1': {'2'},
 'ACC_MUL2': {'1', '3'},
 'ACC_MUL3': {'1', '2'},
 'ACC_SUB1': {'2'},
 'ACC_SUB2': {'1', '3'},
 'ACC_SUB3': {'2'},
 'RT_ADD1': {'2', '3'},
 'RT_ADD2': set(),
 'RT_ADD3': {'2', '3'},
 'RT_DIV1': set(),
 'RT_DIV2': set(),
 'RT_DIV3': set(),
 'RT_MUL1': set(),
 'RT_MUL2': {'1'},
 'RT_MUL3': set(),
 'RT_SUB1': set(),
 'RT_SUB2': set(),
 'RT_SUB3': {'3'},
 'ADD1': set(),
 'ADD2': set(),
 'ADD3': set(),
 'DIV1': {'1'},
 'DIV2': set(),
 'DIV3': set(),
 'MUL1': set(),
 'MUL2': {'1'},
 'MUL3': set(),
 'SUB1': set(),
 'SUB2': set(),
 'SUB3': {'2'},
 'm_score_bal': set(),
 'acc_1_bal': set(),
 'acc_2_bal': set(),
 'acc_3_bal': set(),
 'acc_4_bal': set(),
 'acc_5_bal': set(),
 'acc_6_bal': set(),
 'rt_1_bal': set(),
 'rt_2_bal': set(),
 'rt_3_bal': set(),
 'rt_4_bal': set(),
 'rt_5_bal': set(),
 'r

In [None]:

# --- Which predictors ended up in which target's model ----------------------
# A separate dict is built instead of overwriting params_frequency with
# strings, so this cell can be re-run. Set members are sorted before being
# formatted because set iteration order for strings is not stable between
# interpreter runs; the text format ("{'1', '3'}", '' for empty) is unchanged.
params_frequency_json = {}
for name, target_set in params_frequency.items():
    if isinstance(target_set, str):
        # Already stringified by an earlier run that mutated the dict in place.
        params_frequency_json[name] = target_set
    elif target_set:
        members = ", ".join(repr(member) for member in sorted(target_set))
        params_frequency_json[name] = "{" + members + "}"
    else:
        params_frequency_json[name] = ''

with open("params_in_models_new.json", "w") as outfile:
    json.dump(params_frequency_json, outfile, indent=4)


# --- R^2 of each fitted model ------------------------------------------------
r2_dict = {name: fitted.rsquared for name, fitted in results.items()}

with open("r2_LinReg_new.json", "w") as outfile:
    json.dump(r2_dict, outfile, indent=4)


# --- RMSE of each fitted model -----------------------------------------------
# Uses the residual mean squared error. The previous version took mse_total,
# which is computed from the total sum of squares of the target and therefore
# does not depend on how well the model fits.
rmse_dict = {name: float(np.sqrt(fitted.mse_resid)) for name, fitted in results.items()}

with open("rmse_LinReg_new.json", "w") as outfile:
    json.dump(rmse_dict, outfile, indent=4)

In [24]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier

In [25]:
from xgboost import XGBRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score


In [26]:
# Run the project's XGBoost routine once per target. It returns three values,
# stored as a tuple; later cells read index 0 as the parameters and index 1 as
# the RMSE.
result = {}
for name in tqdm(targets):
    op, rmse, model = models.XGBReg(*df_list[name])
    result[name] = (op, rmse, model)

100%|██████████| 3/3 [00:29<00:00,  9.71s/it]


In [29]:
# Inspect the 3-tuple stored for target '3'.
result['3']

((100, 2, 0.3, 0.6),
 0.6458828026619117,
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, eta=0.3, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None))

In [30]:
# Split the per-target result tuples into two dicts keyed by target:
# position 1 holds the RMSE, position 0 the selected parameters.
rmse_dic_xgb = {target_key: entry[1] for target_key, entry in result.items()}
opt_params = {target_key: entry[0] for target_key, entry in result.items()}

In [31]:
# Persist the XGBoost RMSE values and parameters as indented JSON.
for path, payload in (
    ("rmse_XGB_new.json", rmse_dic_xgb),
    ("params_XGB_new.json", opt_params),
):
    with open(path, "w") as outfile:
        json.dump(payload, outfile, indent=4)

Random forest (RF)

In [32]:
# Run the project's random-forest routine once per target. It returns three
# values, stored as a tuple; later cells read index 0 as the parameters and
# index 1 as the RMSE. Note that this rebinds `result`.
result = {}
for name in tqdm(targets):
    op, rmse, model = models.RFReg(*df_list[name])
    result[name] = (op, rmse, model)

  0%|          | 0/3 [00:00<?, ?it/s]

In [33]:
# Split the per-target result tuples into two dicts keyed by target:
# position 1 holds the RMSE, position 0 the selected parameters.
rmse_dic_rf = {target_key: entry[1] for target_key, entry in result.items()}
opt_params = {target_key: entry[0] for target_key, entry in result.items()}

In [35]:
# Persist the random-forest RMSE values and parameters as indented JSON.
for path, payload in (
    ("rmse_RF_new.json", rmse_dic_rf),
    ("params_RF_new.json", opt_params),
):
    with open(path, "w") as outfile:
        json.dump(payload, outfile, indent=4)

Comparison: RF vs. XGBoost


In [36]:
def _read_json(path):
    """Return the parsed content of the JSON file at `path`."""
    with open(path) as file:
        return json.load(file)


# Reload the saved RMSE values and parameters of both model families.
res_rf = _read_json('rmse_RF_new.json')
params_rf = _read_json('params_RF_new.json')

res_gb = _read_json('rmse_XGB_new.json')
params_gb = _read_json('params_XGB_new.json')

In [38]:
# Inspect the RF RMSE values loaded from rmse_RF_new.json.
res_rf

{'1': 0.27264622347353584, '2': 0.4232025548675165, '3': 0.6024865819197409}

In [39]:
def _top_features(model, feature_names, k=5):
    """Return the k (feature name, importance) pairs with the largest importance.

    `model` must expose `feature_importances_`; `feature_names` is paired with
    it positionally. Importances are converted to plain floats so the result
    can be written to JSON. Ties keep their original column order.
    """
    ranked = sorted(
        zip(model.feature_importances_, feature_names),
        key=lambda pair: pair[0],
        reverse=True,
    )
    return [(feature, float(importance)) for importance, feature in ranked[:k]]


# For each target keep the model family with the lower saved RMSE (RF wins a
# tie) and refit it on that target's full data with its saved parameters.
# The chosen parameters are bound to `best_params`, not `opt_params`, so the
# dict that the JSON-dump cells write is not clobbered by this cell.
opt_models = {}
for elem in tqdm(res_rf):
    if res_rf[elem] > res_gb[elem]:
        best_params = params_gb[elem]
        model = XGBRegressor(n_estimators=best_params[0], max_depth=best_params[1],
                             eta=best_params[2], colsample_bytree=best_params[3])
    else:
        best_params = params_rf[elem]
        model = RandomForestRegressor(n_estimators=best_params[0], max_depth=best_params[1],
                                      min_samples_leaf=best_params[2], min_samples_split=best_params[3],
                                      n_jobs=-1)

    X, y = df_list[elem]
    model.fit(X, y)

    opt_models[elem] = model


# Top-5 feature importances per target, tagged with the model family used.
FI5 = {}
for elem, fitted in opt_models.items():
    if isinstance(fitted, XGBRegressor):
        feature_names, family = fitted.get_booster().feature_names, 'GB'
    else:
        feature_names, family = df_list[elem][0].columns, 'RF'
    FI5[elem] = (_top_features(fitted, feature_names), family)

100%|██████████| 3/3 [00:02<00:00,  1.40it/s]


In [41]:
# Write the top-5 feature importances per target as indented JSON.
with open('FeatureImportance5_new.json', 'w') as file:
    file.write(json.dumps(FI5, indent=4))