In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from sklearn import linear_model


In [2]:
df = pd.read_csv('wc-wo-outliers.csv')

In [3]:
y = df['results']
X = df.drop(columns=['results'])

X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.20, random_state=1)

In [4]:
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
pred_train= reg.predict(X_train)

In [5]:
print('Linear Regression ')
print('---------------------')
print('RMSE: '+str(np.sqrt(mean_squared_error(y_train,pred_train))))
print('R Squared: '+str(r2_score(y_train, pred_train)))
print('MSE: '+str(mean_squared_error(y_train, pred_train)))
print('MAE: '+str(mean_absolute_error(y_train, pred_train)))
print('MAPE: '+str(mean_absolute_percentage_error(y_train, pred_train)))

print('---------------------')

pred_test= reg.predict(X_test)

print('RMSE : '+str(np.sqrt(mean_squared_error(y_test,pred_test)))) 
print('R2 : '+str(r2_score(y_test, pred_test)))
print('MSE: '+str(mean_squared_error(y_test, pred_test)))
print('MAE: '+str(mean_absolute_error(y_test, pred_test)))
print('MAPE: '+str(mean_absolute_percentage_error(y_test, pred_test)))

Linear Regression 
---------------------
RMSE: 6.322750364864893
R Squared: 0.5388878434290846
MSE: 39.97717217639914
MAE: 5.0711348407757555
MAPE: 0.9277204828534783
---------------------
RMSE : 7.589107793384392
R2 : 0.2950386733173721
MSE: 57.59455709960772
MAE: 6.294988780232196
MAPE: 0.8948800905643852


In [31]:
from sklearn.metrics import mean_squared_log_error
print('Mean Squared Log Error: ' +str(mean_squared_log_error( y_test, pred_test, squared=False)))

Mean Squared Log Error: 0.6288239705759868


In [34]:
#display adjusted R-squared
print('Adjusted r^2: ' +str(1 - (1-reg.score(X_train, y_train))*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)))

Adjusted r^2: 0.46584037308121673


The value for R-squared can range from 0 to 1. A value of 0 indicates that the response variable cannot be explained by the predictor variable at all while a value of 1 indicates that the response variable can be perfectly explained without error by the predictor variables.

- AIC stands for (Akaike’s Information Criteria), a metric developped by the Japanese Statistician, Hirotugu Akaike, 1970. The basic idea of AIC is to penalize the inclusion of additional variables to a model. It adds a penalty that increases the error when including additional terms. The lower the AIC, the better the model.
- AICc is a version of AIC corrected for small sample sizes.
- BIC (or Bayesian information criteria) is a variant of AIC with a stronger penalty for including additional variables to the model.
- Mallows Cp: A variant of AIC developed by Colin Mallows.


In [35]:
import statsmodels.api as sm

In [37]:
#add constant to predictor variables
X_train = sm.add_constant(X_train)

#fit regression model
model = sm.OLS(y_train, X_train).fit()

#view AIC of model
print('AIC: '+str(model.aic))

AIC: 804.0899081188967


In [38]:
X_train.head()

Unnamed: 0,const,goals_z,xg_z,crosses_z,boxtouches_z,passes_z,progpasses_z,takeons_z,progruns_z,tackles_z,interceptions_z,clearances_z,blocks_z,aerials_z,fouls_z,fouled_z,nsxg_z
126,1.0,-0.018462,0.184615,0.168462,0.276923,-0.106923,-0.225385,0.138462,0.063846,-0.024615,-0.153846,-0.119231,0.056923,0.285385,-0.036923,-0.293077,0.228462
97,1.0,0.198462,-0.116154,-0.09,-0.013077,0.193077,0.132308,0.010769,0.395385,0.102308,0.802308,-0.154615,-0.013077,0.016154,0.668462,0.585385,-0.03
42,1.0,0.214615,0.523077,-0.176154,0.277692,-0.196923,-0.245385,-0.013846,-0.178462,-0.228462,-0.25,0.096154,0.032308,0.760769,-0.122308,0.119231,0.124615
135,1.0,0.766923,0.643846,-0.265385,0.23,0.353077,-0.015385,0.496154,0.606923,0.446154,-0.096923,-0.128462,0.086923,-0.233846,0.180769,-0.118462,0.404615
28,1.0,0.332308,0.362308,-0.043846,0.316923,0.06,0.019231,0.140769,0.21,0.343846,0.276154,-0.057692,0.112308,-0.053846,0.143077,0.060769,0.192308


In [40]:
new_x = X_train[['goals_z', 'boxtouches_z', 'fouls_z', 'fouled_z', 'blocks_z']]

In [41]:
#add constant to predictor variables
new_x = sm.add_constant(new_x)

#fit regression model
model = sm.OLS(y_train, new_x).fit()

#view AIC of model
print(model.aic)

794.4357164871906


for AIC - the model with a lower AIC value is the better fitting model. 

In [43]:
df.corr()

Unnamed: 0,goals_z,xg_z,crosses_z,boxtouches_z,passes_z,progpasses_z,takeons_z,progruns_z,tackles_z,interceptions_z,clearances_z,blocks_z,aerials_z,fouls_z,fouled_z,nsxg_z,results
goals_z,1.0,0.5632,-0.023642,0.257357,0.281465,0.144265,0.247732,0.228703,0.303571,0.245405,-0.083437,0.186413,0.077414,-0.097406,0.013629,0.343646,-0.686074
xg_z,0.5632,1.0,0.332501,0.615918,0.349206,0.193602,0.227966,0.401422,0.371983,0.273211,-0.275451,0.226644,0.125091,0.010405,0.114389,0.632199,-0.325367
crosses_z,-0.023642,0.332501,1.0,0.647695,0.250075,0.341407,0.030013,0.238411,0.190856,0.216208,-0.18067,0.195745,0.275481,0.085361,0.166262,0.667891,0.008766
boxtouches_z,0.257357,0.615918,0.647695,1.0,0.341549,0.279011,0.059851,0.329907,0.351479,0.294917,-0.386322,0.298506,0.317604,-0.008728,0.084373,0.839111,-0.174968
passes_z,0.281465,0.349206,0.250075,0.341549,1.0,0.749139,0.378908,0.637138,0.760528,0.697863,-0.092854,0.38783,0.023646,0.02869,0.257476,0.502357,-0.129844
progpasses_z,0.144265,0.193602,0.341407,0.279011,0.749139,1.0,0.176248,0.415549,0.512285,0.476149,-0.059196,0.287707,0.063655,0.146764,0.091293,0.437218,-0.076013
takeons_z,0.247732,0.227966,0.030013,0.059851,0.378908,0.176248,1.0,0.539493,0.341033,0.339543,0.037811,0.25411,-0.004019,0.08924,0.331388,0.265547,-0.154032
progruns_z,0.228703,0.401422,0.238411,0.329907,0.637138,0.415549,0.539493,1.0,0.566131,0.463349,-0.014233,0.319688,0.012882,0.083453,0.270851,0.514162,-0.155687
tackles_z,0.303571,0.371983,0.190856,0.351479,0.760528,0.512285,0.341033,0.566131,1.0,0.692089,-0.190389,0.28895,0.066171,-0.054803,0.261507,0.465843,-0.206774
interceptions_z,0.245405,0.273211,0.216208,0.294917,0.697863,0.476149,0.339543,0.463349,0.692089,1.0,-0.076003,0.295889,0.113126,0.042158,0.295886,0.420245,-0.152989


# https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

In [6]:
from sklearn.metrics import explained_variance_score

In [7]:
explained_variance_score(y_test, pred_test)

0.29504085230276744

In [8]:
from sklearn.metrics import mean_pinball_loss
mean_pinball_loss(y_test, pred_test, alpha=0.1)

3.1528313658246794

The pinball loss output is a non-negative floating point. The best value is 0.0.

In [9]:
from sklearn.metrics import mean_pinball_loss
mean_pinball_loss(y_test, pred_test, alpha=0.9)

3.1421574144075164

https://www.kaggle.com/code/satishgunjal/tutorial-k-fold-cross-validation

In [15]:
import numpy as np
from scipy.stats import ttest_ind

v1 = np.random.normal(size=100)
v2 = np.random.normal(size=100)

res = ttest_ind(v1, v2)

print(res)

Ttest_indResult(statistic=-0.75011141092798, pvalue=0.4540780092168577)


In [16]:
import numpy as np
from scipy.stats import ttest_ind

v1 = y_test
v2 = pred_test

res = ttest_ind(v1, v2)

print(res)

Ttest_indResult(statistic=-0.006640338550430307, pvalue=0.9947246027266792)


In [17]:
from scipy.stats import kstest

v = y_train

res = kstest(v, 'norm')

print(res)

KstestResult(statistic=0.9264024104247022, pvalue=3.9023458533302164e-134)


In [18]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

In [19]:
# Lets split the data into 5 folds.  
# We will use this 'kf'(KFold splitting stratergy) object as input to cross_val_score() method
kf =KFold(n_splits=5, shuffle=True, random_state=42)

cnt = 1
# split()  method generate indices to split data into training and test set.
for train_index, test_index in kf.split(X, y):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')
    cnt += 1



Fold:1, Train set: 118, Test set:30
Fold:2, Train set: 118, Test set:30
Fold:3, Train set: 118, Test set:30
Fold:4, Train set: 119, Test set:29
Fold:5, Train set: 119, Test set:29


In [20]:
def rmse(score):
    rmse = np.sqrt(-score)
    print(f'rmse= {"{:.2f}".format(rmse)}')

In [21]:
score = cross_val_score(linear_model.LinearRegression(), X, y, cv= kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold: {score}')
rmse(score.mean())

Scores for each fold: [-42.90060139 -67.83759025 -55.70044129 -48.72097654 -60.44251809]
rmse= 7.42


In [22]:
score = cross_val_score(tree.DecisionTreeRegressor(random_state= 42), X, y, cv=kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold: {score}')
rmse(score.mean())

Scores for each fold: [-101.9        -107.93333333  -95.83333333  -89.68965517 -118.34482759]
rmse= 10.14


In [23]:
score = cross_val_score(ensemble.RandomForestRegressor(random_state= 42), X, y, cv= kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold are: {score}')
rmse(score.mean())

Scores for each fold are: [-48.77826333 -52.32521333 -57.16118667 -40.13016897 -45.96182759]
rmse= 6.99


In [24]:
max_depth = [1,2,3,4,5,6,7,8,9,10]

for val in max_depth:
    score = cross_val_score(tree.DecisionTreeRegressor(max_depth= val, random_state= 42), X, y, cv= kf, scoring="neg_mean_squared_error")
    print(f'For max depth: {val}')
    rmse(score.mean())

For max depth: 1
rmse= 7.10
For max depth: 2
rmse= 7.26
For max depth: 3
rmse= 7.72
For max depth: 4
rmse= 9.31
For max depth: 5
rmse= 9.60
For max depth: 6
rmse= 9.57
For max depth: 7
rmse= 9.90
For max depth: 8
rmse= 9.68
For max depth: 9
rmse= 9.85
For max depth: 10
rmse= 10.06


In [25]:
estimators = [50, 100, 150, 200, 250, 300, 350]

for count in estimators:
    score = cross_val_score(ensemble.RandomForestRegressor(n_estimators= count, random_state= 42), X, y, cv= kf, scoring="neg_mean_squared_error")
    print(f'For estimators: {count}')
    rmse(score.mean())

For estimators: 50
rmse= 7.08
For estimators: 100
rmse= 6.99
For estimators: 150
rmse= 6.98
For estimators: 200
rmse= 6.96
For estimators: 250
rmse= 6.96
For estimators: 300
rmse= 6.95
For estimators: 350
rmse= 6.93
