In [1]:
import numpy as np 
import pandas as pd
import json 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from tqdm import tqdm
from sklearn.linear_model import ElasticNet
import statsmodels.api as sm
from copy import deepcopy

import Utils
import models

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_excel('Cleaned_data.xlsx')
df = df.drop(columns=['log'])
df['gender'] = pd.factorize(df['gender'])[0]


df.columns = ['age', 'gender', 'ACC_ADD1', 'ACC_ADD2', 'ACC_ADD3', 'ACC_DIV1', 'ACC_DIV2', 'ACC_DIV3', 'ACC_MUL1',
       'ACC_MUL2', 'ACC_MUL3', 'ACC_SUB1', 'ACC_SUB2', 'ACC_SUB3', 'RT_ADD1', 'RT_ADD2', 'RT_ADD3',
       'RT_DIV1', 'RT_DIV2', 'RT_DIV3', 'RT_MUL1', 'RT_MUL2', 'RT_MUL3',
       'RT_SUB1', 'RT_SUB2', 'RT_SUB3', 'ADD1', 'ADD2', 'ADD3', 'DIV1',
       'DIV2', 'DIV3', 'MUL1', 'MUL2', 'MUL3', 'SUB1', 'SUB2',
       'SUB3', 'm_score_bal', 'acc_1_bal', 'acc_2_bal', 'acc_3_bal',
       'acc_4_bal', 'acc_5_bal', 'acc_6_bal', 'rt_1_bal', 'rt_2_bal',
       'rt_3_bal', 'rt_4_bal', 'rt_5_bal', 'rt_6_bal', 'm_score_cl',
       'acc_1_cl', 'acc_2_cl', 'acc_3_cl', 'acc_4_cl', 'acc_5_cl', 'acc_6_cl',
       'rt_1_cl', 'rt_2_cl', 'rt_3_cl', 'rt_4_cl', 'rt_5_cl', 'rt_6_cl',
       'n_sum', 'rt_mean', 'rt_cmt_mean', 'rt_bal', 'rt_cl']


seed = 0xAB0BA
np.random.seed(seed)

In [3]:
types = ['ACC_', 'RT_', '']
names = ['ADD', "DIV", "MUL", "SUB"]

for t in types:
    for name in names:
        col_name = t+name
        new_name = 'DIFF_' + col_name

        df[new_name + '21'] = df[col_name+'2'] - df[col_name+'1']
        df[new_name + '32'] = df[col_name+'3'] - df[col_name+'2']
        df[new_name + '31'] = df[col_name+'3'] - df[col_name+'1']



In [4]:
def create_y(df):
    var_names = []
    for i in range(26, 38):
        name = df.iloc[:,i].name
        vals = df.iloc[:,i]

        var_names.append(name)

        globals()[name] = vals

    for i in range(2, 14):
        name = df.iloc[:,i].name
        vals = df.iloc[:,i]

        var_names.append(name)

        globals()[name] = vals


    global ADD 
    global DIV
    global SUB
    global MUL

    global SUM
    
    ADD = ADD1 + ADD2 + ADD3
    DIV = DIV1 + DIV2 + DIV3
    SUB = SUB1 + SUB2 + SUB3
    MUL = MUL1 + MUL2 + MUL3
    
    SUM = ADD + DIV + SUB + MUL

    var_names += ['ADD', 'DIV', "MUL", 'SUB', "SUM"]
    
    return var_names


var_names = create_y(df)

# Statmodels

In [18]:
results = {}

for name in tqdm(var_names):
    
    y = globals()[name]
    
    X = Utils.make_X(df, name)
    
    y_pred, y_test, r2, mse = models.LinRegSklearn(X, y, alpha = 3.15, l1_ratio = 0.1, normalize = False, max_iter = 5000)
    
    results[name] = mse

100%|██████████| 29/29 [00:00<00:00, 51.01it/s]


In [19]:
results

{'ADD1': 10.718606500213202,
 'ADD2': 4.287739120094503,
 'ADD3': 2.8757280246249324,
 'DIV1': 6.636674447555967,
 'DIV2': 3.8129091368806765,
 'DIV3': 1.9540534093153792,
 'MUL1': 8.479028079548169,
 'MUL2': 2.6992911340898105,
 'MUL3': 2.286514027421679,
 'SUB1': 6.143939318523339,
 'SUB2': 3.1668302232292627,
 'SUB3': 2.0025949814391217,
 'ACC_ADD1': 0.10553090292237144,
 'ACC_ADD2': 0.17093849736180955,
 'ACC_ADD3': 0.21766645148978,
 'ACC_DIV1': 0.16363677020316802,
 'ACC_DIV2': 0.24851286131851277,
 'ACC_DIV3': 0.2605392790692473,
 'ACC_MUL1': 0.13637547567801594,
 'ACC_MUL2': 0.2259187997982806,
 'ACC_MUL3': 0.27579770173212576,
 'ACC_SUB1': 0.13047999610681327,
 'ACC_SUB2': 0.2536040303431101,
 'ACC_SUB3': 0.3810692762236598,
 'ADD': 16.64863800190257,
 'DIV': 10.495974709699283,
 'MUL': 11.077272921749131,
 'SUB': 7.500171614619702,
 'SUM': 1.1320211018099344}

In [52]:
y = globals()['DIV']

X = Utils.make_X(df, 'DIV')
res = models.LinRegStatmodels(X, y)

In [53]:
res.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.961
Model:,OLS,Adj. R-squared (uncentered):,0.959
Method:,Least Squares,F-statistic:,572.8
Date:,"Sat, 10 Sep 2022",Prob (F-statistic):,5.28e-143
Time:,20:49:17,Log-Likelihood:,-834.75
No. Observations:,220,AIC:,1687.0
Df Residuals:,211,BIC:,1718.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.6762,0.237,-2.858,0.005,-1.143,-0.210
RT_ADD1,2.8565,0.568,5.030,0.000,1.737,3.976
RT_ADD3,1.5035,0.360,4.175,0.000,0.794,2.213
RT_MUL2,-1.1764,0.373,-3.156,0.002,-1.911,-0.442
ADD1,0.3510,0.095,3.681,0.000,0.163,0.539
ADD3,1.3654,0.262,5.215,0.000,0.849,1.882
MUL1,0.8492,0.093,9.160,0.000,0.666,1.032
m_score_cl,-1.4232,0.580,-2.456,0.015,-2.566,-0.281
DIFF_RT_ADD31,-1.3529,0.253,-5.340,0.000,-1.852,-0.853

0,1,2,3
Omnibus:,176.546,Durbin-Watson:,1.962
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5036.708
Skew:,2.735,Prob(JB):,0.0
Kurtosis:,25.794,Cond. No.,1.15e+17


In [54]:
results = {}

for name in tqdm(var_names):
    X = Utils.make_X(df, name)
    y = globals()[name]
    
    res = models.LinRegStatmodels(X, y)
    
    r2 = res.rsquared
    params = res.params.index
    
    results[name] = res

100%|██████████| 29/29 [00:13<00:00,  2.21it/s]


In [55]:
params_frequency = {}
for name in df.columns:
    params_frequency[name] = set()

params_frequency['const'] = set()

for res in results:
    for param_name in results[res].params.index:
        params_frequency[param_name].add(res)

for name in params_frequency:
    new_input = str(params_frequency[name])
    if new_input == 'set()':
        new_input = ''
    params_frequency[name] = new_input

with open("params_in_models.json", "w") as outfile:
    json.dump(params_frequency, outfile, indent=4)


r2_dict = {}
for name in results:
    r2_dict[name] = results[name].rsquared

with open("r2_LinReg.json", "w") as outfile:
    json.dump(r2_dict, outfile, indent=4)


rmse_dict = {}
for name in results:
    rmse_dict[name] = float(np.sqrt(results[name].mse_total))

with open("rmse_LinReg.json", "w") as outfile:
    json.dump(rmse_dict, outfile, indent=4)

In [56]:
results['MUL2'].summary()

0,1,2,3
Dep. Variable:,MUL2,R-squared (uncentered):,0.945
Model:,OLS,Adj. R-squared (uncentered):,0.942
Method:,Least Squares,F-statistic:,327.8
Date:,"Sat, 10 Sep 2022",Prob (F-statistic):,3.1200000000000004e-125
Time:,20:49:38,Log-Likelihood:,-520.62
No. Observations:,220,AIC:,1063.0
Df Residuals:,209,BIC:,1101.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.1437,0.061,-2.369,0.019,-0.263,-0.024
ACC_MUL1,-5.3426,1.532,-3.488,0.001,-8.362,-2.323
RT_ADD2,-0.2673,0.090,-2.961,0.003,-0.445,-0.089
ADD3,0.2145,0.080,2.672,0.008,0.056,0.373
MUL1,0.1294,0.023,5.706,0.000,0.085,0.174
SUB1,0.1425,0.034,4.164,0.000,0.075,0.210
SUB2,0.1474,0.059,2.491,0.014,0.031,0.264
SUB3,0.2417,0.058,4.135,0.000,0.126,0.357
acc_4_bal,0.0653,0.018,3.623,0.000,0.030,0.101

0,1,2,3
Omnibus:,7.256,Durbin-Watson:,1.701
Prob(Omnibus):,0.027,Jarque-Bera (JB):,8.033
Skew:,0.304,Prob(JB):,0.018
Kurtosis:,3.711,Cond. No.,2.99e+16


In [None]:
# for name in tqdm(results):


#     plt.rc('figure', figsize=(12, 7))
#     #plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}) old approach
#     plt.text(0.01, 0.05, str(results[name].summary()), {'fontsize': 10}, fontproperties = 'monospace') # approach improved by OP -> monospace!
#     plt.axis('off')
#     plt.tight_layout()
#     plt.savefig(name+'.jpg')
#     plt.show()


# ML

In [7]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor


In [9]:

regressors = [KNeighborsRegressor(7),
    ElasticNet(),
    BayesianRidge(),
    SVR(C=1.0, epsilon=0.2),
    GaussianProcessRegressor(),
    DecisionTreeRegressor(),
    AdaBoostRegressor(n_estimators=200),
    GradientBoostingRegressor(n_estimators=200),
    RandomForestRegressor(n_estimators=400),
    MLPRegressor()
]

In [10]:
score = {}
y = globals()['MUL']

X = Utils.make_X(df, 'MUL')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)



for model in tqdm(regressors):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score[model] = mean_squared_error(y_pred,y_test, squared=False)

100%|██████████| 10/10 [00:02<00:00,  4.64it/s]


In [11]:
score

{KNeighborsRegressor(n_neighbors=7): 13.150736392162734,
 ElasticNet(): 11.189444784832082,
 BayesianRidge(): 11.003424112523298,
 SVR(epsilon=0.2): 17.067402587239716,
 GaussianProcessRegressor(): 59.52794098216196,
 DecisionTreeRegressor(): 17.009355714376188,
 AdaBoostRegressor(n_estimators=200): 11.152850855251387,
 GradientBoostingRegressor(n_estimators=200): 10.798462575200707,
 RandomForestRegressor(n_estimators=400): 10.339087347441762,
 MLPRegressor(): 23.705053771459202}

## GB

In [79]:
from xgboost import XGBRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score


In [13]:
result = {}
for name in tqdm(var_names):
    y = globals()[name]
    X = Utils.make_X(df, name)

    op, rmse, model = models.XGBReg(X, y)

    result[name] = (op, rmse, model)

100%|██████████| 29/29 [38:57<00:00, 80.59s/it]


In [15]:
rmse_dic_xgb = {}
opt_params = {}
for elem in result:
    rmse_dic_xgb[elem] = result[elem][1]
    opt_params[elem] = result[elem][0]

In [23]:
with open("rmse_XGB.json", "w") as outfile:
    json.dump(rmse_dic_xgb, outfile, indent=4)

In [24]:
with open("params_XGB.json", "w") as outfile:
    json.dump(opt_params, outfile, indent=4)