The same base models that were run before will be run on the dataset with reduced features. This will give an indication of how well this subset of best predictors can predict the target feature. If there is no significant change in model performance, you may suggest that only the best predictors are necessary for modelling therefore reducing data collecting times in subsequent analyses.

## Read in Data

In [None]:
import pandas as pd
x_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
x_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv')

The features identified in forward and backward feature selction are initialized.

In [None]:
models_forward = ['FIFA Rank', 'Caps', 'Titles', 'H_clustering', 'Q_Clean_Sheets%', 'Q_xGF']
models_backward = ['FIFA Rank', 'Manager_Age', 'Titles', 'Months_installed', 'Age', 'Height']

## Models forward

Training and validation sets are reduced to include only those features identified as the best 6 in forward feature selection.

In [None]:
x_train_mf = x_train[models_forward]
x_train_mf.head(3)

Unnamed: 0,FIFA Rank,Caps,Titles,H_clustering,Q_Clean_Sheets%,Q_xGF
0,0.876712,0.366782,0.0,4,0.384615,0.082803
1,0.0,0.621733,0.666667,1,0.769231,1.0
2,0.630137,0.611778,0.333333,4,0.384615,0.235669


In [None]:
x_val_mf = x_val[models_forward]
x_val_mf.head(3)

Unnamed: 0,FIFA Rank,Caps,Titles,H_clustering,Q_Clean_Sheets%,Q_xGF
0,0.753425,0.489959,0.0,4,0.230769,0.261146
1,0.027397,0.329296,0.0,5,0.384615,0.356688
2,0.260274,0.891546,0.333333,3,0.076923,0.573248


In [None]:
pip install pygam

In [None]:
#Linear Regression
from sklearn.linear_model import LinearRegression
linear_regression_model = LinearRegression()
#Ridge
from sklearn.linear_model import Ridge
ridge_model = Ridge()
#Lasso
from sklearn import linear_model
lasso_model= linear_model.Lasso()
#Zip
import statsmodels.api as sm
# Note: ZIP requires both endog and exog to be specified when fitting.
# Here we're just initializing the model without fitting it.
zip_model = None
#Trees
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor()
#Random Forrest
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor()
#SVM - SVR
from sklearn import svm
SVM = svm.SVR()
#Neural network
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
mlp = MLPRegressor()
#Generalized linear model
import statsmodels.api as sm
# Note: GLM requires both endog and exog to be specified when fitting.
# Here we're just initializing the model without fitting it.
glm_model = sm.GLM(endog=y_train, exog=x_train_mf, family=sm.families.Poisson())
#Generalized additive model
from pygam import LinearGAM, s
gam = LinearGAM()

In [None]:
from math import sqrt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import r2_score

The same models that have been previosuly run are run on the two feature subsets. Results are stored in  dataframe.

### GLM

In [None]:
forward_model_eval = []

In [None]:
model='GLM'
# Fit the model
glm_result = glm_model.fit()
# Predict the values for the validation set
predictions = glm_result.predict(x_val_mf)

# Calculate the evaluation metrics
mae = mean_absolute_error(y_val, predictions)
mse = mean_squared_error(y_val, predictions)
rmse = sqrt(mse)
r2 = r2_score(y_val, predictions)
forward_model_eval.append({'Model': model, 'Mean Absolute Error': mae, 'Mean Squared Error': mse, 'RMSE': rmse,'R-squared': r2})

In [None]:
forward_model_eval

[{'Model': 'GLM',
  'Mean Absolute Error': 0.06484794023383723,
  'Mean Squared Error': 0.0049339661545099575,
  'RMSE': 0.07024219639582718,
  'R-squared': 0.8969198227445971}]

### ZIP Model

In [None]:
# Initialize and fit the Zero-Inflated Poisson model
model='ZIP'
zip_model = sm.ZeroInflatedPoisson(endog=y_train, exog=x_train_mf, exog_infl=x_train_mf, inflation='logit')
zip_result = zip_model.fit()

         Current function value: 0.448420
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36


  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


In [None]:
# Add a constant to the predictor features matrix (for the intercept)
X_train = sm.add_constant(x_train_mf)
X_val = sm.add_constant(x_val_mf)

# Initialize and fit the Zero-Inflated Poisson model
zip_model = sm.ZeroInflatedPoisson(endog=y_train, exog=X_train, exog_infl=X_train, inflation='logit')
zip_result = zip_model.fit()

# Predict the values for the validation set
predictions = zip_result.predict(exog=X_val, exog_infl=X_val)

# Calculate the evaluation metrics
mae = mean_absolute_error(y_val, predictions)
mse = mean_squared_error(y_val, predictions)
rmse = sqrt(mse)
r2 = r2_score(y_val, predictions)
forward_model_eval.append({'Model': model, 'Mean Absolute Error': mae, 'Mean Squared Error': mse, 'RMSE': rmse,'R-squared': r2})

         Current function value: 0.414076
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36


  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


In [None]:
forward_model_eval

[{'Model': 'GLM',
  'Mean Absolute Error': 0.06484794023383723,
  'Mean Squared Error': 0.0049339661545099575,
  'RMSE': 0.07024219639582718,
  'R-squared': 0.8969198227445971},
 {'Model': 'ZIP',
  'Mean Absolute Error': 0.2159305212244197,
  'Mean Squared Error': 0.0798474124624528,
  'RMSE': 0.28257284452412057,
  'R-squared': -0.6681681982134249}]

### Rest of models

In [None]:
models = [linear_regression_model, ridge_model, lasso_model, dt_model, rf_model, SVM, mlp, gam]

In [None]:
for model in models:
  model.fit(x_train_mf, y_train)
  predictions = model.predict(x_val_mf)
  type(predictions)
  # Calculate the absolute errors
  errors = abs(predictions - y_val.values)
  # Calculate model evaluation metrics
  mae = mean_absolute_error(y_val, predictions)
  mse = mean_squared_error(y_val, predictions)
  rmse = sqrt(mse)
  r2 = r2_score(y_val, predictions)
  print(f'MODEL: {model}')
  print('Mean Absolute Error: ', mae, ' degrees.')
  print('Mean Squared Error:', mse, ' degrees')
  print('RMSE:', rmse, ' degrees')
  print('R2:', r2)
  forward_model_eval.append({'Model': model, 'Mean Absolute Error': mae, 'Mean Squared Error': mse, 'RMSE': rmse,'R-squared': r2})

  return fit_method(estimator, *args, **kwargs)


MODEL: LinearRegression()
Mean Absolute Error:  0.1804209440816962  degrees.
Mean Squared Error: 0.04318553735548029  degrees
RMSE: 0.20781130228041084  degrees
R2: 0.09776988611814674
MODEL: Ridge()
Mean Absolute Error:  0.21211310030711042  degrees.
Mean Squared Error: 0.06468691641229174  degrees
RMSE: 0.2543362270937661  degrees
R2: -0.35143586337525656
MODEL: Lasso()
Mean Absolute Error:  0.22747566297415242  degrees.
Mean Squared Error: 0.05269135289760787  degrees
RMSE: 0.2295459712075293  degrees
R2: -0.10082514278046428
MODEL: DecisionTreeRegressor()
Mean Absolute Error:  0.19234642497482377  degrees.
Mean Squared Error: 0.08557553089822788  degrees
RMSE: 0.2925329569437055  degrees
R2: -0.7878397657129141
MODEL: RandomForestRegressor()
Mean Absolute Error:  0.17639140651225252  degrees.
Mean Squared Error: 0.04587604770824444  degrees
RMSE: 0.21418694569988256  degrees
R2: 0.0415598766885279
MODEL: SVR()
Mean Absolute Error:  0.24081401878256636  degrees.
Mean Squared Error: 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


MODEL: MLPRegressor()
Mean Absolute Error:  0.14253814967817788  degrees.
Mean Squared Error: 0.022700996706317086  degrees
RMSE: 0.1506684993829735  degrees
R2: 0.5257318978115503
MODEL: LinearGAM(callbacks=[Deviance(), Diffs()], fit_intercept=True, 
   max_iter=100, scale=None, 
   terms=s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + intercept, 
   tol=0.0001, verbose=False)
Mean Absolute Error:  0.2586597994420556  degrees.
Mean Squared Error: 0.08216308234235802  degrees
RMSE: 0.28664103394726653  degrees
R2: -0.7165470589941052


In [None]:
forward_model_eval

[{'Model': 'GLM',
  'Mean Absolute Error': 0.06484794023383723,
  'Mean Squared Error': 0.0049339661545099575,
  'RMSE': 0.07024219639582718,
  'R-squared': 0.8969198227445971},
 {'Model': 'ZIP',
  'Mean Absolute Error': 0.2159305212244197,
  'Mean Squared Error': 0.0798474124624528,
  'RMSE': 0.28257284452412057,
  'R-squared': -0.6681681982134249},
 {'Model': LinearRegression(),
  'Mean Absolute Error': 0.1804209440816962,
  'Mean Squared Error': 0.04318553735548029,
  'RMSE': 0.20781130228041084,
  'R-squared': 0.09776988611814674},
 {'Model': Ridge(),
  'Mean Absolute Error': 0.21211310030711042,
  'Mean Squared Error': 0.06468691641229174,
  'RMSE': 0.2543362270937661,
  'R-squared': -0.35143586337525656},
 {'Model': Lasso(),
  'Mean Absolute Error': 0.22747566297415242,
  'Mean Squared Error': 0.05269135289760787,
  'RMSE': 0.2295459712075293,
  'R-squared': -0.10082514278046428},
 {'Model': DecisionTreeRegressor(),
  'Mean Absolute Error': 0.19234642497482377,
  'Mean Squared Er

In [None]:
forward_model_eval_df = pd.DataFrame(forward_model_eval)
forward_model_eval_df

Unnamed: 0,Model,Mean Absolute Error,Mean Squared Error,RMSE,R-squared
0,GLM,0.064848,0.004934,0.070242,0.89692
1,ZIP,0.215931,0.079847,0.282573,-0.668168
2,LinearRegression(),0.180421,0.043186,0.207811,0.09777
3,Ridge(),0.212113,0.064687,0.254336,-0.351436
4,Lasso(),0.227476,0.052691,0.229546,-0.100825
5,DecisionTreeRegressor(),0.192346,0.085576,0.292533,-0.78784
6,"(DecisionTreeRegressor(max_features=1.0, rando...",0.176391,0.045876,0.214187,0.04156
7,SVR(),0.240814,0.097356,0.312019,-1.03395
8,MLPRegressor(),0.142538,0.022701,0.150668,0.525732
9,"LinearGAM(callbacks=[Deviance(), Diffs()], fit...",0.25866,0.082163,0.286641,-0.716547


###Results

In [None]:
model_names = ['GLM', 'ZIP', 'Linear Regression', 'Ridge Regression', 'Lasso Regression',  'Decision Tree', 'Random forrest', 'SVM', 'Neural Network', 'GAM']
forward_model_eval_df.insert(1, 'model_nice_names', model_names)
forward_model_eval_df

Unnamed: 0,Model,model_nice_names,Mean Absolute Error,Mean Squared Error,RMSE,R-squared
0,GLM,GLM,0.064848,0.004934,0.070242,0.89692
1,ZIP,ZIP,0.215931,0.079847,0.282573,-0.668168
2,LinearRegression(),Linear Regression,0.180421,0.043186,0.207811,0.09777
3,Ridge(),Ridge Regression,0.212113,0.064687,0.254336,-0.351436
4,Lasso(),Lasso Regression,0.227476,0.052691,0.229546,-0.100825
5,DecisionTreeRegressor(),Decision Tree,0.192346,0.085576,0.292533,-0.78784
6,"(DecisionTreeRegressor(max_features=1.0, rando...",Random forrest,0.176391,0.045876,0.214187,0.04156
7,SVR(),SVM,0.240814,0.097356,0.312019,-1.03395
8,MLPRegressor(),Neural Network,0.142538,0.022701,0.150668,0.525732
9,"LinearGAM(callbacks=[Deviance(), Diffs()], fit...",GAM,0.25866,0.082163,0.286641,-0.716547


In [None]:
from google.colab import files
forward_model_eval_df.to_csv('Forward_Selection_Model_Results.csv', index=None)
files.download("Forward_Selection_Model_Results.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Models Backward

Training and validation sets are reduced to include only those features identified as the best 6 in backward feature selection.

In [None]:
x_train_bw = x_train[models_backward]
x_train_bw.head(3)

Unnamed: 0,FIFA Rank,Manager_Age,Titles,Months_installed,Age,Height
0,0.876712,0.482759,0.0,0.086957,0.684229,0.3692
1,0.0,0.655172,0.666667,1.0,0.537804,0.5342
2,0.630137,0.655172,0.333333,0.130435,0.670886,0.5308


In [None]:
x_val_bw = x_val[models_backward]
x_val_bw.head(3)

Unnamed: 0,FIFA Rank,Manager_Age,Titles,Months_installed,Age,Height
0,0.753425,0.896552,0.0,0.442029,0.697229,0.6308
1,0.027397,0.586207,0.0,0.630435,0.263086,0.1386
2,0.260274,0.551724,0.333333,0.297101,0.8156,0.9


In [None]:
backward_model_eval = []

In [None]:
model='GLM'
# Fit the model
glm_result = glm_model.fit()
# Predict the values for the validation set
predictions = glm_result.predict(x_val_bw)

# Calculate the evaluation metrics
mae = mean_absolute_error(y_val, predictions)
mse = mean_squared_error(y_val, predictions)
rmse = sqrt(mse)
r2 = r2_score(y_val, predictions)
backward_model_eval.append({'Model': model, 'Mean Absolute Error': mae, 'Mean Squared Error': mse, 'RMSE': rmse,'R-squared': r2})

In [None]:
# Initialize and fit the Zero-Inflated Poisson model
model='ZIP'
zip_model = sm.ZeroInflatedPoisson(endog=y_train, exog=x_train_bw, exog_infl=x_train_bw, inflation='logit')
zip_result = zip_model.fit()

         Current function value: 0.472363
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36


  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


In [None]:
# Add a constant to the predictor features matrix (for the intercept)
X_train = sm.add_constant(x_train_bw)
X_val = sm.add_constant(x_val_bw)

# Initialize and fit the Zero-Inflated Poisson model
zip_model = sm.ZeroInflatedPoisson(endog=y_train, exog=X_train, exog_infl=X_train, inflation='logit')
zip_result = zip_model.fit()

# Predict the values for the validation set
predictions = zip_result.predict(exog=X_val, exog_infl=X_val)

# Calculate the evaluation metrics
mae = mean_absolute_error(y_val, predictions)
mse = mean_squared_error(y_val, predictions)
rmse = sqrt(mse)
r2 = r2_score(y_val, predictions)
backward_model_eval.append({'Model': model, 'Mean Absolute Error': mae, 'Mean Squared Error': mse, 'RMSE': rmse,'R-squared': r2})

         Current function value: 0.418816
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36


  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


### Rest of models

In [None]:
models = [linear_regression_model, ridge_model, lasso_model, dt_model, rf_model, SVM, mlp, gam]

In [None]:
for model in models:
  model.fit(x_train_bw, y_train)
  predictions = model.predict(x_val_bw)
  type(predictions)
  # Calculate the absolute errors
  errors = abs(predictions - y_val.values)
  # Calculate model evaluation metrics
  mae = mean_absolute_error(y_val, predictions)
  mse = mean_squared_error(y_val, predictions)
  rmse = sqrt(mse)
  r2 = r2_score(y_val, predictions)
  print(f'MODEL: {model}')
  print('Mean Absolute Error: ', mae, ' degrees.')
  print('Mean Squared Error:', mse, ' degrees')
  print('RMSE:', rmse, ' degrees')
  print('R2:', r2)
  backward_model_eval.append({'Model': model, 'Mean Absolute Error': mae, 'Mean Squared Error': mse, 'RMSE': rmse,'R-squared': r2})

MODEL: LinearRegression()
Mean Absolute Error:  0.2661122183290218  degrees.
Mean Squared Error: 0.08011496000694672  degrees
RMSE: 0.2830458620205332  degrees
R2: -0.6737577883012027
MODEL: Ridge()
Mean Absolute Error:  0.0708123533190692  degrees.
Mean Squared Error: 0.012027402080249982  degrees
RMSE: 0.10966951299358442  degrees
R2: 0.7487241096656236
MODEL: Lasso()
Mean Absolute Error:  0.22747566297415242  degrees.
Mean Squared Error: 0.05269135289760787  degrees
RMSE: 0.2295459712075293  degrees
R2: -0.10082514278046428
MODEL: DecisionTreeRegressor()
Mean Absolute Error:  0.21517287680429675  degrees.
Mean Squared Error: 0.08952935739839842  degrees
RMSE: 0.2992145674902852  degrees
R2: -0.870442913710221


  return fit_method(estimator, *args, **kwargs)


MODEL: RandomForestRegressor()
Mean Absolute Error:  0.1908828465928164  degrees.
Mean Squared Error: 0.06756283578875556  degrees
RMSE: 0.25992852053738846  degrees
R2: -0.41151942897230964
MODEL: SVR()
Mean Absolute Error:  0.1303403667164961  degrees.
Mean Squared Error: 0.024750712551606848  degrees
RMSE: 0.15732359184689004  degrees
R2: 0.4829093355889561


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


MODEL: MLPRegressor()
Mean Absolute Error:  0.13020663994051634  degrees.
Mean Squared Error: 0.02265256166060016  degrees
RMSE: 0.1505076797396072  degrees
R2: 0.526743800394893
MODEL: LinearGAM(callbacks=[Deviance(), Diffs()], fit_intercept=True, 
   max_iter=100, scale=None, 
   terms=s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + intercept, 
   tol=0.0001, verbose=False)
Mean Absolute Error:  0.19259022087387748  degrees.
Mean Squared Error: 0.04176539217800591  degrees
RMSE: 0.20436582928172192  degrees
R2: 0.1274394890376268


In [None]:
backward_model_eval_df = pd.DataFrame(backward_model_eval)
model_names = ['GLM', 'ZIP', 'Linear Regression', 'Ridge Regression', 'Lasso Regression',  'Decision Tree', 'Random forrest', 'SVM', 'Neural Network', 'GAM']
backward_model_eval_df.insert(1, 'model_nice_names', model_names)
backward_model_eval_df

Unnamed: 0,Model,model_nice_names,Mean Absolute Error,Mean Squared Error,RMSE,R-squared
0,GLM,GLM,0.220899,0.093748,0.306183,-0.958582
1,ZIP,ZIP,0.042554,0.003745,0.061193,0.921769
2,LinearRegression(),Linear Regression,0.266112,0.080115,0.283046,-0.673758
3,Ridge(),Ridge Regression,0.070812,0.012027,0.10967,0.748724
4,Lasso(),Lasso Regression,0.227476,0.052691,0.229546,-0.100825
5,DecisionTreeRegressor(),Decision Tree,0.215173,0.089529,0.299215,-0.870443
6,"(DecisionTreeRegressor(max_features=1.0, rando...",Random forrest,0.190883,0.067563,0.259929,-0.411519
7,SVR(),SVM,0.13034,0.024751,0.157324,0.482909
8,MLPRegressor(),Neural Network,0.130207,0.022653,0.150508,0.526744
9,"LinearGAM(callbacks=[Deviance(), Diffs()], fit...",GAM,0.19259,0.041765,0.204366,0.127439


In [None]:
forward_model_eval_df

Unnamed: 0,Model,model_nice_names,Mean Absolute Error,Mean Squared Error,RMSE,R-squared
0,GLM,GLM,0.064848,0.004934,0.070242,0.89692
1,ZIP,ZIP,0.215931,0.079847,0.282573,-0.668168
2,LinearRegression(),Linear Regression,0.180421,0.043186,0.207811,0.09777
3,Ridge(),Ridge Regression,0.212113,0.064687,0.254336,-0.351436
4,Lasso(),Lasso Regression,0.227476,0.052691,0.229546,-0.100825
5,DecisionTreeRegressor(),Decision Tree,0.192346,0.085576,0.292533,-0.78784
6,"(DecisionTreeRegressor(max_features=1.0, rando...",Random forrest,0.176391,0.045876,0.214187,0.04156
7,SVR(),SVM,0.240814,0.097356,0.312019,-1.03395
8,MLPRegressor(),Neural Network,0.142538,0.022701,0.150668,0.525732
9,"LinearGAM(callbacks=[Deviance(), Diffs()], fit...",GAM,0.25866,0.082163,0.286641,-0.716547


In [None]:
from google.colab import files
backward_model_eval_df.to_csv('Backward_Selection_Model_Results.csv', index=None)
files.download("Backward_Selection_Model_Results.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>