In [145]:
%reset -f

In [146]:
import matplotlib.pyplot as plt
plt.style.use('seaborn')

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression

from collections import Counter

import psutil

from colorama import Style, Fore, Back

In [147]:
df = pd.read_csv('data_Regression.csv')
df.head()

Unnamed: 0,Ft_1,Ft_2,Ft_3,Ft_4,Ft_5,Ft_6,Ft_7,Ft_8,Ft_9,Ft_10,y
0,-0.157627,-0.92092,-1.0184,-0.590008,-1.255163,-0.392845,1.803401,0.890018,-0.196411,-0.023246,-107.805388
1,-0.348395,0.801671,-0.402265,-0.405617,-1.077407,0.426597,0.648775,0.907793,0.578962,-1.477395,-136.844678
2,1.085262,0.045359,-0.967618,0.072622,-0.067625,-2.489593,0.841832,0.389132,0.701437,-0.691908,-177.479185
3,1.511375,-0.547199,1.593585,0.421384,-0.334831,-0.08326,0.115761,-0.311841,1.156146,-0.143509,-40.258523
4,0.257329,-0.17458,0.193389,-0.365684,-0.0688,0.737681,-0.023763,-1.816901,1.607556,0.588749,71.770747


In [148]:
data = df.drop(['y'], axis=1)
data = pd.get_dummies(data)

target = df['y']

theCols = data.columns

In [149]:
x, xt, y, yt = train_test_split(data, target)

In [150]:
stdScaler = StandardScaler()

stdScaler.fit(x)

x = stdScaler.transform(x)
xt = stdScaler.transform(xt)

## Without Hyperparameter Tuning

In [151]:
baseModel = ElasticNet(alpha=1.5, l1_ratio=0.1, max_iter=2000)

baseModel.fit(x,y);

In [152]:
yp = baseModel.predict(x)
ytp = baseModel.predict(xt)

In [153]:
testAcc = round(r2_score(y_true=yt, y_pred=ytp), 4)
print('\n\nTest R^2 Score (No Tuning):', testAcc, '\n')



Test R^2 Score (No Tuning): 0.6659 



## With Hyperparameter Tuning

In [154]:
numJobs = psutil.cpu_count(logical=False)

# scrList = [r2_score, mean_absolute_error, mean_squared_error]
scr = r2_score
scr = make_scorer(scr)

In [155]:
alphaListList = np.logspace(-2,2,5)

ratioList = np.linspace(0,1,11)

In [156]:
hyperParams = {'alpha':alphaListList, 'l1_ratio':ratioList}

best_model = GridSearchCV(estimator=baseModel, param_grid=hyperParams, cv=3, scoring=scr, n_jobs=numJobs, verbose=1)

best_model.fit(x, y);

best_model.best_params_

Fitting 3 folds for each of 55 candidates, totalling 165 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  57 tasks      | elapsed:   15.3s
[Parallel(n_jobs=4)]: Done 165 out of 165 | elapsed:   16.8s finished


{'alpha': 0.01, 'l1_ratio': 1.0}

In [157]:
yp = best_model.best_estimator_.predict(x)
ytp = best_model.best_estimator_.predict(xt)

In [158]:
testAcc = round(r2_score(y_true=yt, y_pred=ytp), 4)
print('\n\nTest R^2 Score (With Tuning):', testAcc, '\n')



Test R^2 Score (With Tuning): 1.0 



In [159]:
sdf = pd.DataFrame()
kdf = pd.DataFrame()

sdf['OrigTarget'] = target.round(2)
sdf['BaseModelPred'] = baseModel.predict(data).round(2)
sdf['Error'] = (sdf['OrigTarget'] - sdf['BaseModelPred']).abs()

kdf['OrigTarget'] = target.round(2)
kdf['TunedModelPred'] = best_model.best_estimator_.predict(data).round(2)
kdf['Error'] = (kdf['OrigTarget'] - kdf['TunedModelPred']).abs()

In [160]:
sdf.head(10)

Unnamed: 0,OrigTarget,BaseModelPred,Error
0,-107.81,-47.46,60.35
1,-136.84,-61.4,75.44
2,-177.48,-78.44,99.04
3,-40.26,-17.14,23.12
4,71.77,30.44,41.33
5,-70.86,-28.71,42.15
6,44.54,17.26,27.28
7,214.84,91.47,123.37
8,6.36,0.97,5.39
9,92.05,37.85,54.2


In [161]:
kdf.head(10)

Unnamed: 0,OrigTarget,TunedModelPred,Error
0,-107.81,-110.92,3.11
1,-136.84,-141.2,4.36
2,-177.48,-181.64,4.16
3,-40.26,-42.46,2.2
4,71.77,71.27,0.5
5,-70.86,-73.28,2.42
6,44.54,43.81,0.73
7,214.84,217.05,2.21
8,6.36,4.19,2.17
9,92.05,92.09,0.04
