In [None]:
import os
import statistics
import scipy as sp
import math
import pandas as pd
#pd.get_option("display.max_rows")
#pd.reset_option("display.max_rows")
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as tick
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.outliers_influence import OLSInfluence
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, RegressorMixin
%matplotlib inline

In [None]:
os.chdir(' ') #set proper directory depending where you put that project
#data   - data for 2019/2020
#data05 - data for 2017/2018
#data1  - data for 2018/19 when first read; afterwards the combined frame of all seasons
#data15 - data for 2020/21 (currently disabled)
data    = pd.read_csv('danelic2019.csv',sep=';',engine='python')
data05  = pd.read_csv('danelic2017.csv',sep=';',engine='python')
data1   = pd.read_csv('danelic2018.csv',sep=';',engine='python')
#data15  = pd.read_csv('danelic2020.csv',sep=';',engine='python')

# Stack the seasons in the order 2019/20, 2018/19, 2017/18 and rebuild the index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# To include 2020/21 again, un-comment the read above and add data15 to the list.
data1   = pd.concat([data, data1, data05], ignore_index=True)

# Display only: the sorted copy is shown as the cell output, data1 itself stays unsorted.
data1.sort_values('player')

In [None]:
#adding dummy variables to dataset
# Friendlier names for the league indicator columns produced by get_dummies.
league_flag_names = {
    "league_Bundesliga": "isBundesliga",
    "league_La Liga": "isLaLiga",
    "league_Premier League": "isPremierLeague",
    "league_Ligue 1": "isLigue1",
    "league_Serie A": "isSerieA",
}
# One-hot encode league, Season and foot one after another (same column order as
# three separate calls), renaming the league indicators in between.
data1 = (
    pd.get_dummies(data1, columns=['league'])
      .rename(columns=league_flag_names)
      .pipe(pd.get_dummies, columns=['Season'])
      .pipe(pd.get_dummies, columns=['foot'])
)
#deleting potential outliers that actually contribute nothing
# A row is kept only if it passes every threshold below.
keep_rows = (
    (data1['value'] > 1000000)
    & (data1['games'] > 5)
    & (data1['age'] > 0)
    & (data1['height'] > 0)
)
data1 = data1[keep_rows]
data1

In [None]:
#FORWARDS
# Rows whose position2 starts with 'Forward', followed by rows whose position2
# starts with 'attack'. The original row labels are kept.
dataFWD  = data1[data1['position2'].str[:7]=='Forward']
dataFWD2 = data1[data1['position2'].str[:6]=='attack']
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dataFWD  = pd.concat([dataFWD, dataFWD2])
dataFWD

In [None]:
#originally used for the correlation calculations (results shown in the appendix)

#dataFWD['age']=dataFWD['age']*dataFWD['age']
#dataFWD_cor = dataFWD.corr()
#show=pd.Series(dataFWD_cor['value'])
#pd.set_option('display.max_rows', None)
#show.sort_values(ascending=False)

In [None]:
#for my liking
def ln(x):
    """Return the natural logarithm of ``x`` as computed by ``np.log``.

    The short name exists so the transform can be written as ``ln(value)``
    inside the regression formula string.
    """
    natural_log = np.log(x)
    return natural_log

#Creating a linear regression
# Formula for the forwards model: ln(value) explained by age, performance
# statistics and two league indicators.
# NOTE(review): the pieces below concatenate to "...area++passes...third++isPremier...";
# the doubled '+' presumably parses as a unary plus and is harmless - confirm before tidying.
model_blueprint = ('ln(value)~age+CL+goals+gca'
                   '+Pts+xG+xGA+dribbles_completed'
                   ''
                   '+xg_xa_per90+touches_att_pen_area+'
                   '+passes_into_final_third+'
                   '+isPremierLeague+isLigue1')

# A fixed seed makes the 80/20 split identical on every Restart & Run All.
# Both models below are fitted on the full dataFWD; the split is not used by them.
trainFWD, testFWD = train_test_split(dataFWD, train_size=0.8, random_state=42)
modelFWD=smf.ols(model_blueprint,data=dataFWD)

resultsFWD=modelFWD.fit()
resultsFWD_params=resultsFWD.params

#Creating a robust regression
# The robust model is fitted on the design matrices built by modelFWD (all of dataFWD).
# The former data=trainFWD keyword was dropped: RLM takes its data from endog/exog,
# so the keyword only suggested a training-set fit that never happened.
modelFWDrobust=sm.RLM(modelFWD.endog,modelFWD.exog).fit()

# Wrap the robust coefficients in an OLSResults object so the OLS summary and
# diagnostics can be reused.
# NOTE(review): the covariance passed in is the OLS one from modelFWD, so the
# reported standard errors are not the RLM ones - confirm this is intended.
finalFWD1 = sm.regression.linear_model.OLSResults(modelFWD,
                                              modelFWDrobust.params,
                                              modelFWD.normalized_cov_params)
finalFWD1.summary()


In [None]:
#VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Variance inflation factor for every column of the design matrix behind finalFWD1,
# labelled with the regressor names of modelFWD.
design_matrix = finalFWD1.model.exog
vif = pd.DataFrame({
    "features": modelFWD.exog_names,
    "VIF Factor": [variance_inflation_factor(design_matrix, column_idx)
                   for column_idx in range(design_matrix.shape[1])],
})
vif

In [None]:
#testing
#Breusch-Pagan test, run on the residuals and design matrix of finalFWD1
bp_residuals = finalFWD1.resid
bp_regressors = finalFWD1.model.exog
bptestFWD = sm.stats.diagnostic.het_breuschpagan(bp_residuals, bp_regressors)
bptestFWD

In [None]:
#Chowtest
# Structural-break test: is one pooled regression enough, or do the rows flagged
# by the 'Season_201819#' dummy need their own coefficients?
trainFWD1 = dataFWD[dataFWD['Season_201819#']==0]
trainFWD2 = dataFWD[dataFWD['Season_201819#']==1]

# Separate fits on each sub-sample. Despite "ridge" in the names these are plain OLS fits.
resultsFWDridge1 = smf.ols(model_blueprint,data=trainFWD1).fit()
resultsFWDridge2 = smf.ols(model_blueprint,data=trainFWD2).fit()

# The Chow statistic counts every estimated parameter, intercept included, both as
# the number of restrictions (J) and as k in the denominator degrees of freedom.
JFWD  = len(resultsFWD.params)
kFWD  = len(resultsFWDridge1.params)
# Use the observations actually used by each fit, so rows dropped for missing
# values do not inflate the degrees of freedom.
N1FWD = int(resultsFWDridge1.nobs)
N2FWD = int(resultsFWDridge2.nobs)

RSSdFWD  = resultsFWD.ssr        # pooled (restricted) residual sum of squares
RSSbFWD  = resultsFWDridge1.ssr  # sub-sample with Season_201819# == 0
RSSnbFWD = resultsFWDridge2.ssr  # sub-sample with Season_201819# == 1

dfdenFWD = N1FWD+N2FWD-2*kFWD
ChowFWD=((RSSdFWD-(RSSbFWD+RSSnbFWD))/JFWD)/((RSSbFWD+RSSnbFWD)/dfdenFWD)
# The p-value is the upper tail of the F distribution; cdf would give 1 - p.
pFWD=sp.stats.f.sf(ChowFWD, JFWD, dfdenFWD)

print(ChowFWD,pFWD,JFWD)

In [None]:
#cross validation
class SMWrapper(BaseEstimator, RegressorMixin):
    """A universal sklearn-style wrapper for statsmodels regressors.

    Parameters
    ----------
    model_class : callable
        A statsmodels model class called as ``model_class(y, X)``, e.g. ``sm.OLS``
        or ``sm.RLM``.
    fit_intercept : bool, default True
        When True, ``sm.add_constant`` is applied to ``X`` in both ``fit`` and
        ``predict``.
    """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept
    def fit(self, X, y):
        """Fit ``model_class`` on ``(y, X)`` and return ``self``.

        The unfitted model is stored in ``model_`` and the fitted results in
        ``results_``.
        """
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        # disabled alternative:
        #self.results_ = self.model_.fit_regularized(L1_wt=1, alpha=0.1,start_params=resultsFWD_params)
        # scikit-learn's estimator contract expects fit to return the estimator itself.
        return self
    def predict(self, X):
        """Return the predictions of the fitted results for ``X``."""
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)
    
# Cross-validated negative RMSE on the design matrices of modelFWD.
# NOTE(review): the result is named "linear" but the wrapped estimator is sm.RLM,
# the same one the disabled robustcval line uses - confirm which was intended.
cv_estimator = SMWrapper(sm.RLM)
linearcval = cross_val_score(cv_estimator,
                             modelFWD.exog,
                             modelFWD.endog,
                             scoring='neg_root_mean_squared_error')
pd.Series(np.transpose(linearcval)).to_frame()
#robustcval=cross_val_score(SMWrapper(sm.RLM), modelFWD.exog, modelFWD.endog, scoring='neg_root_mean_squared_error')
#pd.DataFrame(pd.concat([pd.Series(np.transpose(linearcval)),pd.Series(np.transpose(robustcval))],axis=1))

In [None]:
#visualizations
def millions(x, pos):
    """Format a tick value as millions with one decimal, e.g. 1500000 -> '1.5M'.

    ``x`` is the tick value and ``pos`` the tick position; ``pos`` is accepted
    for the formatter callback signature but not used.
    """
    scaled = x * 1e-6
    return '{:1.1f}M'.format(scaled)
# Tick formatter that renders market values in millions.
formatter = mpl.ticker.FuncFormatter(millions)

# Keep only rows with strictly positive values in the plotted statistics.
# The matching filter on 'goals' is left disabled:
#dataFWD = dataFWD[dataFWD['goals']>0]
for positive_col in ('xg_xa_per90',
                     'passes_into_final_third',
                     'touches_att_pen_area',
                     'gca',
                     'dribbles_completed'):
    dataFWD = dataFWD[dataFWD[positive_col]>0]

# One entry per panel: (column plotted against value, subplot position, colour).
panel_specs = [
    ('goals',                   (0,0), 'g'),
    ('xg_xa_per90',             (1,0), 'blue'),
    ('passes_into_final_third', (2,0), 'orange'),
    ('touches_att_pen_area',    (0,1), 'cyan'),
    ('gca',                     (1,1), 'magenta'),
    ('dribbles_completed',      (2,1), 'chocolate'),
]

# Pearson correlation of each statistic with value, in panel_specs order.
panel_corrs = [np.corrcoef(dataFWD['value'], dataFWD[spec[0]])[0,1] for spec in panel_specs]
# The numbered names are kept so any other cell that reads them still works.
corrcoef1, corrcoef2, corrcoef3, corrcoef4, corrcoef5, corrcoef6 = panel_corrs

fig, ax = plt.subplots(3, 2, figsize=(12, 12))

# Draw each regression panel, format its y axis in millions and print r in the corner.
for (stat_col, grid_pos, colour), r_value in zip(panel_specs, panel_corrs):
    panel = ax[grid_pos]
    sns.regplot(ax=panel,
                x=dataFWD[stat_col],
                y=dataFWD['value'],
                data=dataFWD,
                color=colour)
    panel.yaxis.set_major_formatter(formatter)
    panel.annotate("r=",xy=(0.8,0.85), xycoords="axes fraction")
    panel.annotate("{:.2f}".format(r_value),xy=(0.85,0.85), xycoords="axes fraction")