In [227]:
import statsmodels.api as sm 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [228]:
df = pd.read_csv('../data/player_stats_trends.csv')
ballon_dor  = pd.read_csv('../data/BallonDOr_combined.csv')
tournament_stats = pd.read_csv('../data/tournament_stats.csv')

In [229]:
to_drop = []
for i in df.columns:
    if i not in ['Player', 'Season', 'Share']:
        to_drop.append(i)
df = df.drop(columns = to_drop)

In [230]:
awards = pd.read_csv('../data/awards.csv')
awards['Year'] = awards['Season'].apply(lambda x: float(str(x)[-2:]))
awards = awards.drop(columns=['Season', 'Club/Country'])
awards = awards.groupby('Award').filter(lambda x: len(x) >= 5)
awards = awards[~awards['Award'].isin(["Winner Ballon d'Or", 'Footballer of the Year', 'UEFA Best Player in Europe', "The Best FIFA Men's Player", 'Player of the Year', 'Top goal scorer', 'African Footballer of the Year', 'TM-Player of the season'])]
awards = awards.groupby(['Player', 'Year']).count().reset_index()
awards['Award'] = awards['Award'].fillna(0)

In [231]:
def goal_fixer(goals):
    try:
        return int(goals)
    except:
        return 0
def run_model(X, y):
    glm = sm.GLM(endog=y, exog= X, family=sm.families.Gaussian())
    X = sm.add_constant(X)
    glm_results = glm.fit()
    return glm_results
def remove_p(X, glm_results, p):
    to_remove = []
    for i in glm_results.pvalues.index:
        if glm_results.pvalues[i] > p:
            to_remove.append(i)
        elif np.isnan(glm_results.pvalues[i]):
            to_remove.append(i)
    X = X.drop(columns=to_remove)
    return X

In [232]:
df = df.rename(columns={'Season': 'Year'})

In [233]:
tournament_stats['Year'] = tournament_stats['Season'].apply(lambda x: float(x[-2:]))
tournament_stats = tournament_stats.drop('Season', axis=1)
tournament_stats = tournament_stats.query('Year >= 8 & Year <= 23')
tournament_stats['Goals'] = tournament_stats['Goals'].apply(lambda x: goal_fixer(x))
tournament_stats['Minutes played']= tournament_stats['Minutes played'].str.replace("'", '').str.replace('.', '')
for col in tournament_stats.columns:
    if col not in ['Player', 'Competition', 'Club', 'Year']:
        try:
            tournament_stats[col] = tournament_stats[col].str.replace('-', '0').astype(float)
        except:
            pass 

comp_goals = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Goals'].replace(np.nan, 0).reset_index()
comp_goals = comp_goals.rename(columns=lambda x: x + '_Goals')
comp_goals.rename(columns={'Year_Goals': 'Year', 'Player_Goals' : 'Player'}, inplace=True)

comp_ass = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Assists'].replace(np.nan, 0).reset_index()
comp_ass = comp_ass.rename(columns=lambda x: x + '_ass')
comp_ass.rename(columns={'Year_ass': 'Year', 'Player_ass' : 'Player'}, inplace=True)

comp_apps = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Appearances'].replace(np.nan, 0).reset_index()
comp_apps = comp_apps.rename(columns=lambda x: x + '_apps')
comp_apps.rename(columns={'Year_apps': 'Year', 'Player_apps' : 'Player'}, inplace=True)

Goals = []
for i in comp_goals.iterrows():
    rs = 0
    for j in i[1].index:
        if j not in ['Player', 'Year']:
            rs += i[1][j]
    Goals.append(rs)
comp_goals['Goals'] = Goals

Goals = []
for i in comp_ass.iterrows():
    rs = 0
    for j in i[1].index:
        if j not in ['Player', 'Year']:
            rs += i[1][j]
    Goals.append(rs)
comp_ass['Assists'] = Goals

Goals = []
for i in comp_apps.iterrows():
    rs = 0
    for j in i[1].index:
        if j not in ['Player', 'Year']:
            rs += i[1][j]
    Goals.append(rs)
comp_apps['Appearances'] = Goals

# comp_mins = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Minutes played'].replace(np.nan, 0).reset_index()
# comp_mins = comp_mins.rename(columns=lambda x: x + '_Minutes')
# comp_mins.rename(columns={'Year_Minutes': 'Year', 'Player_Minutes' : 'Player'}, inplace=True)

In [234]:
to_drop = []
for i in comp_goals.columns:
    if i not in ['Year', 'Player', 'Goals']:
        to_drop.append(i)
goals = comp_goals.drop(columns=to_drop)

to_drop = []
for i in comp_ass.columns:
    if i not in ['Year', 'Player', 'Assists']:
        to_drop.append(i)
assists = comp_ass.drop(columns=to_drop)

to_drop = []
for i in comp_apps.columns:
    if i not in ['Year', 'Player', 'Appearances']:
        to_drop.append(i)
apps = comp_apps.drop(columns=to_drop)

In [235]:
goals = df.merge(goals, on = ['Player', 'Year'], how = 'left')
goals = goals.merge(apps, on = ['Player', 'Year'], how = 'left')
goals = goals.merge(assists, on = ['Player', 'Year'], how = 'left')
goals = goals.merge(awards, on = ['Player', 'Year'], how = 'left')
goals = goals.fillna(0)

In [236]:
goals = goals.set_index(['Player', 'Year'])

In [237]:
X, y = goals.drop(columns = ['Share']), goals['Share']

In [238]:
glm1 =  sm.GLM(endog=goals['Share'], exog=goals.drop(columns=['Share']), family=sm.families.Gaussian())
X = sm.add_constant(X)
results = glm1.fit()
results.summary()

0,1,2,3
Dep. Variable:,Share,No. Observations:,365.0
Model:,GLM,Df Residuals:,361.0
Model Family:,Gaussian,Df Model:,3.0
Link Function:,Identity,Scale:,0.0045665
Method:,IRLS,Log-Likelihood:,467.59
Date:,"Tue, 02 Jan 2024",Deviance:,1.6485
Time:,13:50:49,Pearson chi2:,1.65
No. Iterations:,3,Pseudo R-squ. (CS):,0.3912
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Goals,0.0028,0.000,10.659,0.000,0.002,0.003
Appearances,-0.0015,0.000,-6.514,0.000,-0.002,-0.001
Assists,0.0022,0.001,3.861,0.000,0.001,0.003
Award,0.0153,0.003,5.798,0.000,0.010,0.020


In [239]:
goals

Unnamed: 0_level_0,Unnamed: 1_level_0,Share,Goals,Appearances,Assists,Award
Player,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cristiano Ronaldo,8.0,0.3097,42.0,49.0,8.0,4.0
Lionel Messi,8.0,0.1951,16.0,40.0,16.0,1.0
Fernando Torres,8.0,0.1243,33.0,46.0,5.0,1.0
Iker Casillas,8.0,0.0924,0.0,0.0,0.0,0.0
Xavi,8.0,0.0674,9.0,54.0,9.0,1.0
...,...,...,...,...,...,...
Jamal Musiala,23.0,0.0000,16.0,47.0,16.0,2.0
Nicolò Barella,23.0,0.0000,9.0,52.0,10.0,2.0
Martin Ødegaard,23.0,0.0000,15.0,45.0,8.0,0.0
Randal Kolo Muani,23.0,0.0000,23.0,46.0,17.0,0.0
