In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from sklearn import linear_model


In [2]:
df = pd.read_csv('wc-wo-outliers.csv')

In [3]:
y = df['results']
X = df.drop(columns=['results'])

X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.20, random_state=1)

In [4]:
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
pred_train= reg.predict(X_train)

In [5]:
print('Linear Regression ')
print('---------------------')
print('RMSE: '+str(np.sqrt(mean_squared_error(y_train,pred_train))))
print('R Squared: '+str(r2_score(y_train, pred_train)))
print('MSE: '+str(mean_squared_error(y_train, pred_train)))
print('MAE: '+str(mean_absolute_error(y_train, pred_train)))
print('MAPE: '+str(mean_absolute_percentage_error(y_train, pred_train)))

print('---------------------')

pred_test= reg.predict(X_test)

print('RMSE : '+str(np.sqrt(mean_squared_error(y_test,pred_test)))) 
print('R2 : '+str(r2_score(y_test, pred_test)))
print('MSE: '+str(mean_squared_error(y_test, pred_test)))
print('MAE: '+str(mean_absolute_error(y_test, pred_test)))
print('MAPE: '+str(mean_absolute_percentage_error(y_test, pred_test)))

Linear Regression 
---------------------
RMSE: 6.322750364864893
R Squared: 0.5388878434290846
MSE: 39.97717217639914
MAE: 5.0711348407757555
MAPE: 0.9277204828534783
---------------------
RMSE : 7.589107793384392
R2 : 0.2950386733173721
MSE: 57.59455709960772
MAE: 6.294988780232196
MAPE: 0.8948800905643852


# https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

In [6]:
from sklearn.metrics import explained_variance_score

In [7]:
explained_variance_score(y_test, pred_test)

0.29504085230276744

In [8]:
from sklearn.metrics import mean_pinball_loss
mean_pinball_loss(y_test, pred_test, alpha=0.1)

3.1528313658246794

The pinball loss output is a non-negative floating point. The best value is 0.0.

In [9]:
from sklearn.metrics import mean_pinball_loss
mean_pinball_loss(y_test, pred_test, alpha=0.9)

3.1421574144075164

In [10]:
# from sklearn.metrics import d2_pinball_score


In [11]:
# d2_pinball_score(y_test, pred_test)

In [12]:
# from sklearn.metrics import d2_absolute_error_score
# d2_absolute_error_score(y_test, pred_test)

In [44]:
# from scipy.stats import ttest_ind

In [45]:
# import scipy.stats as stats
# from scipy.stats import ttest_ind

https://www.kaggle.com/code/satishgunjal/tutorial-k-fold-cross-validation

In [27]:
import numpy as np
from scipy.stats import ttest_ind

v1 = np.random.normal(size=100)
v2 = np.random.normal(size=100)

res = ttest_ind(v1, v2)

print(res)

Ttest_indResult(statistic=1.1467503109195556, pvalue=0.25286870872889233)


In [28]:
import numpy as np
from scipy.stats import ttest_ind

v1 = y_test
v2 = pred_test

res = ttest_ind(v1, v2)

print(res)

Ttest_indResult(statistic=-0.006640338550430307, pvalue=0.9947246027266792)


In [34]:
from scipy.stats import kstest

v = y_train

res = kstest(v, 'norm')

print(res)

KstestResult(statistic=0.9264024104247022, pvalue=3.9023458533302164e-134)


In [36]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

In [37]:
# Lets split the data into 5 folds.  
# We will use this 'kf'(KFold splitting stratergy) object as input to cross_val_score() method
kf =KFold(n_splits=5, shuffle=True, random_state=42)

cnt = 1
# split()  method generate indices to split data into training and test set.
for train_index, test_index in kf.split(X, y):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')
    cnt += 1



Fold:1, Train set: 118, Test set:30
Fold:2, Train set: 118, Test set:30
Fold:3, Train set: 118, Test set:30
Fold:4, Train set: 119, Test set:29
Fold:5, Train set: 119, Test set:29


In [38]:
def rmse(score):
    rmse = np.sqrt(-score)
    print(f'rmse= {"{:.2f}".format(rmse)}')

In [39]:
score = cross_val_score(linear_model.LinearRegression(), X, y, cv= kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold: {score}')
rmse(score.mean())

Scores for each fold: [-42.90060139 -67.83759025 -55.70044129 -48.72097654 -60.44251809]
rmse= 7.42


In [40]:
score = cross_val_score(tree.DecisionTreeRegressor(random_state= 42), X, y, cv=kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold: {score}')
rmse(score.mean())

Scores for each fold: [-101.9        -107.93333333  -95.83333333  -89.68965517 -118.34482759]
rmse= 10.14


In [41]:
score = cross_val_score(ensemble.RandomForestRegressor(random_state= 42), X, y, cv= kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold are: {score}')
rmse(score.mean())

Scores for each fold are: [-48.77826333 -52.32521333 -57.16118667 -40.13016897 -45.96182759]
rmse= 6.99


In [42]:
max_depth = [1,2,3,4,5,6,7,8,9,10]

for val in max_depth:
    score = cross_val_score(tree.DecisionTreeRegressor(max_depth= val, random_state= 42), X, y, cv= kf, scoring="neg_mean_squared_error")
    print(f'For max depth: {val}')
    rmse(score.mean())

For max depth: 1
rmse= 7.10
For max depth: 2
rmse= 7.26
For max depth: 3
rmse= 7.72
For max depth: 4
rmse= 9.31
For max depth: 5
rmse= 9.60
For max depth: 6
rmse= 9.57
For max depth: 7
rmse= 9.90
For max depth: 8
rmse= 9.68
For max depth: 9
rmse= 9.85
For max depth: 10
rmse= 10.06


In [43]:
estimators = [50, 100, 150, 200, 250, 300, 350]

for count in estimators:
    score = cross_val_score(ensemble.RandomForestRegressor(n_estimators= count, random_state= 42), X, y, cv= kf, scoring="neg_mean_squared_error")
    print(f'For estimators: {count}')
    rmse(score.mean())

For estimators: 50
rmse= 7.08
For estimators: 100
rmse= 6.99
For estimators: 150
rmse= 6.98
For estimators: 200
rmse= 6.96
For estimators: 250
rmse= 6.96
For estimators: 300
rmse= 6.95
For estimators: 350
rmse= 6.93
