In [None]:
import os
import statistics
import scipy as sp
import math
import pandas as pd
#pd.get_option("display.max_rows")
#pd.reset_option("display.max_rows")
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as tick
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.outliers_influence import OLSInfluence
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, RegressorMixin
%matplotlib inline

In [None]:
os.chdir(' ') #set proper directory depending where you put that project
#data - data for 2019/2020
#data05 - data for 2017/2018
#data1 - data for 2018/19
#data15 - data for 2020/21
data    = pd.read_csv('danelic2019.csv',sep=';',engine='python')
data05  = pd.read_csv('danelic2017.csv',sep=';',engine='python')
data1   = pd.read_csv('danelic2018.csv',sep=';',engine='python')
#data15  = pd.read_csv('danelic2020.csv',sep=';',engine='python')
# DataFrame.append was removed in pandas 2.0. A single concat stacks the frames
# in the same order as before (2019, 2018, 2017) and resets the index.
# To include 2020/21, read data15 above and add it to the end of this list.
data1   = pd.concat([data, data1, data05], ignore_index=True)
# Display only: the sorted copy is shown as the cell output, data1 itself stays unsorted.
data1.sort_values('player')

In [None]:
# Expand the league column into indicator columns and rename them to
# identifiers that can be typed directly into a model formula.
league_flag_names = {"league_Bundesliga": "isBundesliga",
                     "league_La Liga": "isLaLiga",
                     "league_Premier League": "isPremierLeague",
                     "league_Ligue 1": "isLigue1",
                     "league_Serie A": "isSerieA"}
data1 = pd.get_dummies(data1, columns=['league']).rename(columns=league_flag_names)

# Same indicator expansion for the remaining categorical columns, one at a time.
for categorical_column in ('Season', 'foot'):
    data1 = pd.get_dummies(data1, columns=[categorical_column])

# Row filter: keep players with value above 1000000, more than 5 games,
# and strictly positive age and height.
rows_to_keep = (
    (data1['value'] > 1000000)
    & (data1['games'] > 5)
    & (data1['age'] > 0)
    & (data1['height'] > 0)
)
data1 = data1[rows_to_keep]
data1

In [None]:
#MIDFIELDERS
# position2 labels midfielders in two ways: values starting with 'Midfielder'
# and values starting with 'midfield'. Both groups are collected.
dataMID = data1[data1['position2'].str[:10]=='Midfielder']
dataMID1 = data1[data1['position2'].str[:8]=='midfield']
# DataFrame.append was removed in pandas 2.0; concat keeps the same row order
# ('Midfielder...' rows first, then 'midfield...' rows) and the original index.
dataMID = pd.concat([dataMID, dataMID1])

In [None]:
#originally used for the correlation calculations (results shown in the appendix)

#dataMID['age']=dataMID['age']*dataMID['age']
#dataMID_cor = dataMID.corr()
#show=pd.Series(dataMID_cor['value'])
#pd.set_option('display.max_rows', None)
#show.sort_values(ascending=False)

In [None]:
# Short alias so the regression formulas can be written as ln(...)
def ln(x):
    """Return the natural logarithm of ``x`` (element-wise for array-likes), via ``np.log``."""
    natural_log = np.log(x)
    return natural_log

#Creating a linear regression
# Single definition of the specification; the Chow-test cell reuses model_blueprint,
# so the model fitted here must be built from the same string.
# (The previous text had a stray '+' after tackles_won that only parsed as a unary plus.)
model_blueprint = ('ln(value)~age+goals+CL+passes_completed_short+passes_into_final_third'
                   '+Pts+xG+xGA'
                   '+xg_xa_per90'
                   '+carry_distance+tackles_won'
                   '+isPremierLeague+isLigue1')

# Fixed seed so that a Restart & Run All reproduces the same split.
trainMID, testMID = train_test_split(dataMID, train_size=0.8, random_state=0)

# OLS on the full midfielder sample.
modelMID=smf.ols(model_blueprint,data=dataMID)
resultsMID=modelMID.fit()
resultsMID_params=resultsMID.params

#Creating a robust regression
# RLM is estimated on the design matrices of modelMID, i.e. on all of dataMID.
# NOTE(review): the former `data=trainMID` keyword did not change which rows were
# used (endog/exog are passed positionally), so it was dropped. If a train-only
# fit was intended, build the design matrices from trainMID instead.
modelMIDrobust=sm.RLM(modelMID.endog,modelMID.exog).fit()
# Wrap the robust coefficients in an OLSResults object so that the OLS summary
# and diagnostics can be reused; the covariance factor is the one from the OLS fit.
finalMID1 = sm.regression.linear_model.OLSResults(modelMID,
                                              modelMIDrobust.params,
                                              modelMID.normalized_cov_params)
finalMID1.summary()

In [None]:
#testing
#Breusch-Pagan
# Heteroskedasticity test on the residuals of finalMID1 against its own regressors;
# the returned tuple is shown as the cell output.
bp_residuals = finalMID1.resid
bp_regressors = finalMID1.model.exog
bptestMID = sm.stats.diagnostic.het_breuschpagan(bp_residuals, bp_regressors)
bptestMID

In [None]:
#VIF
# variance_inflation_factor is already imported in the first cell, so no re-import here.
# Names and design matrix are read from the same model object so they cannot drift apart.
vif_model  = finalMID1.model
vif_design = vif_model.exog
vif = pd.DataFrame({
    "features": vif_model.exog_names,
    # one VIF per column of the design matrix (the intercept column included)
    "VIF Factor": [variance_inflation_factor(vif_design, column_index)
                   for column_index in range(vif_design.shape[1])],
})
vif

In [None]:
#Chowtest
# Split the midfielder sample on the Season_201819# indicator column and compare
# the pooled regression (resultsMID) with separate regressions on the two parts.
trainMID1 = dataMID[dataMID['Season_201819#']==0]
trainMID2 = dataMID[dataMID['Season_201819#']==1]

resultsMIDridge1 = smf.ols(model_blueprint,data=trainMID1).fit()
resultsMIDridge2 = smf.ols(model_blueprint,data=trainMID2).fit()

# Number of restrictions / parameters per regression. The pooled model forces
# every coefficient INCLUDING the intercept to be equal across the two parts,
# so the intercept is counted (previously it was subtracted).
JMID  = len(resultsMID.params)
kMID  = len(resultsMIDridge1.params)
# Observations actually used by each fit (rows dropped by the formula for
# missing values are not counted, unlike the raw frame length).
N1MID = int(resultsMIDridge1.nobs)
N2MID = int(resultsMIDridge2.nobs)

RSSdMID  = resultsMID.ssr         # pooled residual sum of squares
RSSbMID  = resultsMIDridge1.ssr   # first sub-sample
RSSnbMID = resultsMIDridge2.ssr   # second sub-sample

dfdenMID = N1MID+N2MID-2*kMID
ChowMID=((RSSdMID-(RSSbMID+RSSnbMID))/JMID)/((RSSbMID+RSSnbMID)/dfdenMID)
# p-value = P(F >= observed) is the survival function; f.cdf returned 1 - p.
pMID=sp.stats.f.sf(ChowMID, JMID, dfdenMID)

print(ChowMID,pMID,JMID)

In [None]:
#cross validation
class SMWrapper(BaseEstimator, RegressorMixin):
    """Scikit-learn style adapter around a statsmodels regression class.

    Parameters
    ----------
    model_class : callable
        A statsmodels model class called as ``model_class(y, X)`` (for example
        ``sm.OLS`` or ``sm.RLM``).
    fit_intercept : bool, default True
        When true, ``sm.add_constant`` is applied to ``X`` in both ``fit`` and
        ``predict``.

    Attributes
    ----------
    model_ : the statsmodels model instance built in ``fit``.
    results_ : the results object returned by ``model_.fit()``.
    """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Fit ``model_class`` on ``(y, X)`` and return ``self``.

        Returning the estimator follows the scikit-learn convention, so calls
        such as ``SMWrapper(...).fit(X, y).predict(X)`` and use inside
        pipelines work.
        """
        if self.fit_intercept:
            # With its default settings add_constant leaves X unchanged when X
            # already contains a constant column (e.g. a formula design matrix).
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        #self.results_ = self.model_.fit_regularized(L1_wt=1, alpha=0.1,start_params=resultsMID_params)
        return self

    def predict(self, X):
        """Return predictions of the fitted results object for ``X``.

        ``X`` receives the same constant handling as in ``fit``.
        """
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)
    
# Cross-validated scores of the wrapped RLM on the design matrices of modelMID.
# cv is not passed, so scikit-learn's default splitter is used; the scores are
# negated RMSE values (scoring='neg_root_mean_squared_error').
cv_estimator = SMWrapper(sm.RLM)
linearcval = cross_val_score(cv_estimator,
                             modelMID.exog,
                             modelMID.endog,
                             scoring='neg_root_mean_squared_error')
# One row per fold, shown as the cell output.
pd.Series(linearcval).to_frame()
#robustcval=cross_val_score(SMWrapper(sm.RLM), modelMID.exog, modelMID.endog, scoring='neg_mean_absolute_error')
#pd.DataFrame(pd.concat([pd.Series(np.transpose(linearcval)),pd.Series(np.transpose(robustcval))],axis=1))

In [None]:
#visualizations
def millions(x, pos):
    """Format a tick value as millions with one decimal place, e.g. 1500000 -> '1.5M'.

    ``x`` is the tick value; ``pos`` is the tick position, which is part of the
    formatter call signature but is not used.
    """
    in_millions = x * 1e-6
    return '{:1.1f}M'.format(in_millions)
formatter = mpl.ticker.FuncFormatter(millions)

# Restrict the sample to rows that are strictly positive in every plotted
# predictor except goals (the goals filter is deliberately left disabled).
# dataMID is reassigned, as before, so later cells see the filtered frame.
positive_only_columns = ['xg_xa_per90',
                         'passes_completed_short',
                         'passes_into_final_third',
                         'carry_distance',
                         'tackles_won']
dataMID = dataMID[(dataMID[positive_only_columns] > 0).all(axis=1)]

# One entry per panel: (grid position, predictor column, colour).
# The grid is filled column by column, matching the numbering corrcoef1..corrcoef6.
panel_specs = [
    ((0, 0), 'goals',                   'g'),
    ((1, 0), 'xg_xa_per90',             'blue'),
    ((2, 0), 'passes_completed_short',  'orange'),
    ((0, 1), 'passes_into_final_third', 'cyan'),
    ((1, 1), 'carry_distance',          'magenta'),
    ((2, 1), 'tackles_won',             'chocolate'),
]

# Pearson correlation of each predictor with value; the individual names are
# kept so that anything referring to corrcoef1..corrcoef6 still works.
corrcoefs = [np.corrcoef(dataMID['value'], dataMID[spec[1]])[0,1] for spec in panel_specs]
corrcoef1, corrcoef2, corrcoef3, corrcoef4, corrcoef5, corrcoef6 = corrcoefs

fig, ax = plt.subplots(3, 2, figsize=(12, 12))

for (grid_position, predictor, colour), correlation in zip(panel_specs, corrcoefs):
    panel = ax[grid_position]
    # scatter plus fitted regression line of value on the predictor
    sns.regplot(ax=panel,
                x=predictor,
                y='value',
                data=dataMID,
                color=colour)
    # y axis in millions instead of raw values
    panel.yaxis.set_major_formatter(formatter)
    # single label instead of two separately positioned annotations
    panel.annotate("r={:.2f}".format(correlation), xy=(0.8,0.85), xycoords="axes fraction")