In [144]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [145]:
df = pd.read_csv('../data/player_stats_trends.csv')
tournament_stats = pd.read_csv('../data/tournament_stats_new.csv')
goals = pd.read_csv('../data/goals.csv')
awards = pd.read_csv('../data/awards.csv')

In [146]:
awards['Year'] = awards['Season'].apply(lambda x: float(str(x)[-2:]))
awards = awards.drop(columns=['Season', 'Club/Country'])
awards = awards.groupby('Award').filter(lambda x: len(x) >= 5)
awards = awards[~awards['Award'].isin(["Winner Ballon d'Or", 'Footballer of the Year', 'UEFA Best Player in Europe', "The Best FIFA Men's Player", 'Player of the Year', 'Top goal scorer', 'African Footballer of the Year', 'TM-Player of the season'])]


awards = pd.get_dummies(awards, columns=['Award'], prefix='', prefix_sep='')

for col in awards.columns:
    if col not in ['Player', 'Year']:
        awards[col] = awards[col].astype(int)
awards = awards.groupby(['Player', 'Year']).sum().reset_index()


In [147]:
def goal_fixer(goals):
    try:
        return int(goals)
    except:
        return 0
def run_model(X, y):
    glm = sm.GLM(endog=y, exog= X, family=sm.families.Gaussian())
    X = sm.add_constant(X)
    glm_results = glm.fit()
    return glm_results
def remove_p(X, glm_results, p):
    to_remove = []
    for i in glm_results.pvalues.index:
        if glm_results.pvalues[i] > p:
        # print(i)
            to_remove.append(i)
        elif np.isnan(glm_results.pvalues[i]):
        # print(i)
            to_remove.append(i)
    X = X.drop(columns=to_remove)
    return X

In [148]:
tournament_stats['Year'] = tournament_stats['Season'].apply(lambda x: float(x[-2:]))
tournament_stats = tournament_stats.drop('Season', axis=1)
tournament_stats = tournament_stats.query('Year >= 8 & Year <= 23')
tournament_stats['Goals'] = tournament_stats['Goals'].apply(lambda x: goal_fixer(x))
tournament_stats['Minutes played'] = tournament_stats['Minutes played'].apply(lambda x: goal_fixer(x))

comp_goals = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Goals'].replace(np.nan, 0).reset_index()
comp_goals = comp_goals.rename(columns=lambda x: x + '_Goals')
comp_goals.rename(columns={'Year_Goals': 'Year', 'Player_Goals' : 'Player'}, inplace=True)

comp_mins = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Minutes played'].replace(np.nan, 0).reset_index()
comp_mins = comp_mins.rename(columns=lambda x: x + '_Minutes')
comp_mins.rename(columns={'Year_Minutes': 'Year', 'Player_Minutes' : 'Player'}, inplace=True)

In [149]:
df = df.drop(columns=['P1', 'P2', 'P3', 'P4', 'P5', 'Votes', 'RankPts', 'Percent', 'Voted', 'Season'])

In [150]:
for i in df.columns:
    if i  not in ['Player', 'Nationality', 'Club']:
        df[i] = df[i].astype(float)

In [151]:
# df = df.merge(comp_goals, on=['Year', 'Player'], how='left')
df = df.merge(awards, on=['Year', 'Player'], how='left')
df = df.merge(comp_mins, on=['Year', 'Player'], how='left')


df.fillna(0, inplace=True)

In [152]:
df.set_index(['Player', 'Year'], inplace=True)
X, y = df.drop(columns=['Share', 'Nationality', 'Club', 'Rank', 'Points', 'Month']), df['Share']
glm = sm.GLM(endog=y, exog= X, family=sm.families.Gaussian())
X = sm.add_constant(X)
glm_results = glm.fit()

In [153]:
for i in range(3):
    X = remove_p(X, glm_results, 0.5)
    glm_results = run_model(X, y)
for i in range(2):
    X = remove_p(X, glm_results, 0.2)
    glm_results = run_model(X, y)
for i in range(2):
    X = remove_p(X, glm_results, 0.05)
    glm_results = run_model(X, y)

In [154]:
glm_results.summary()

0,1,2,3
Dep. Variable:,Share,No. Observations:,365.0
Model:,GLM,Df Residuals:,347.0
Model Family:,Gaussian,Df Model:,17.0
Link Function:,Identity,Scale:,0.0035767
Method:,IRLS,Log-Likelihood:,519.4
Date:,"Tue, 02 Jan 2024",Deviance:,1.2411
Time:,10:14:26,Pearson chi2:,1.24
No. Iterations:,3,Pseudo R-squ. (CS):,0.612
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0311,0.006,-4.933,0.000,-0.044,-0.019
Goals,0.0129,0.001,13.600,0.000,0.011,0.015
Assists,0.0050,0.002,2.949,0.003,0.002,0.008
Goals conceded,0.0051,0.001,4.229,0.000,0.003,0.007
UEFA_Super_Cup_appearances,-0.0724,0.029,-2.508,0.012,-0.129,-0.016
UEFA-Cup_Qualifikation_appearances,-1.187e-05,5.1e-06,-2.325,0.020,-2.19e-05,-1.86e-06
UI_Cup_appearances,-1.187e-05,5.1e-06,-2.325,0.020,-2.19e-05,-1.86e-06
Champions League winner,0.0391,0.008,4.651,0.000,0.023,0.056
Copa América winner,0.0519,0.024,2.174,0.030,0.005,0.099


In [155]:
# to_remove = []
# for i in glm_results.pvalues.index:
#     if glm_results.pvalues[i] > 0.05:
#         # print(i)
#         to_remove.append(i)
#     elif np.isnan(glm_results.pvalues[i]):
#         # print(i)
#         to_remove.append(i)
# X = X.drop(columns=to_remove)

In [156]:
# glm = sm.GLM(endog=y, exog= X, family=sm.families.Gaussian())
# X = sm.add_constant(X)
# glm_results = glm.fit()
# glm_results.summary()

In [157]:
def predict_season(num):
    # print(X.iloc[num])
    pred_score = glm_results.predict(X.iloc[num]).values[0]
    print('Predicted Score:', pred_score, 'Actual Score: ', y.iloc[num])
predict_season(300)

Predicted Score: 0.015977790939741325 Actual Score:  0.0004


# Only using trends data

In [158]:
# X, y = df['Trend'], df['Share']
# glm = sm.GLM(endog=y, exog= X, family=sm.families.Gaussian())
# X = sm.add_constant(X)
# glm_results = glm.fit()

In [159]:
# glm_results.summary()