In [339]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

In [340]:
df = pd.read_csv('../data/player_stats_trends.csv')
tournament_stats = pd.read_csv('../data/tournament_stats.csv')
goals = pd.read_csv('../data/goals.csv')
awards = pd.read_csv('../data/awards.csv')

In [341]:
awards['Year'] = awards['Season'].apply(lambda x: float(str(x)[-2:]))
awards = awards.drop(columns=['Season', 'Club/Country'])
awards = awards.groupby('Award').filter(lambda x: len(x) >= 5)
awards = awards[~awards['Award'].isin(["Winner Ballon d'Or", 'Footballer of the Year', 'UEFA Best Player in Europe', "The Best FIFA Men's Player", 'Player of the Year', 'Top goal scorer', 'African Footballer of the Year', 'TM-Player of the season'])]

awards = pd.get_dummies(awards, columns=['Award'], prefix='', prefix_sep='')

for col in awards.columns:
    if col not in ['Player', 'Year']:
        awards[col] = awards[col].astype(int)
awards = awards.groupby(['Player', 'Year']).sum().reset_index()


In [342]:
def goal_fixer(goals):
    try:
        return int(goals)
    except:
        return 0
def run_model(X, y):
    glm = sm.GLM(endog=y, exog= X, family=sm.families.Gaussian())
    X = sm.add_constant(X)
    glm_results = glm.fit()
    return glm_results
def remove_p(X, glm_results, p):
    to_remove = []
    for i in glm_results.pvalues.index:
        if glm_results.pvalues[i] > p:
            to_remove.append(i)
        elif np.isnan(glm_results.pvalues[i]):
            to_remove.append(i)
    X = X.drop(columns=to_remove)
    return X

In [343]:
tournament_stats['Year'] = tournament_stats['Season'].apply(lambda x: float(x[-2:]))
tournament_stats = tournament_stats.drop('Season', axis=1)
tournament_stats = tournament_stats.query('Year >= 8 & Year <= 23')
tournament_stats['Goals'] = tournament_stats['Goals'].apply(lambda x: goal_fixer(x))
tournament_stats['Minutes played']= tournament_stats['Minutes played'].str.replace("'", '').str.replace('.', '')
for col in tournament_stats.columns:
    if col not in ['Player', 'Competition', 'Club', 'Year']:
        try:
            tournament_stats[col] = tournament_stats[col].str.replace('-', '0').astype(float)
        except:
            pass 

comp_goals = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Goals'].replace(np.nan, 0).reset_index()
comp_goals = comp_goals.rename(columns=lambda x: x + '_Goals')
comp_goals.rename(columns={'Year_Goals': 'Year', 'Player_Goals' : 'Player'}, inplace=True)

comp_ass = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Assists'].replace(np.nan, 0).reset_index()
comp_ass = comp_ass.rename(columns=lambda x: x + '_ass')
comp_ass.rename(columns={'Year_ass': 'Year', 'Player_ass' : 'Player'}, inplace=True)

Goals = []
for i in comp_goals.iterrows():
    rs = 0
    for j in i[1].index:
        if j not in ['Player', 'Year']:
            rs += i[1][j]
    Goals.append(rs)
comp_goals['Goals'] = Goals

Goals = []
for i in comp_ass.iterrows():
    rs = 0
    for j in i[1].index:
        if j not in ['Player', 'Year']:
            rs += i[1][j]
    Goals.append(rs)
comp_ass['Assists'] = Goals

# comp_mins = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Minutes played'].replace(np.nan, 0).reset_index()
# comp_mins = comp_mins.rename(columns=lambda x: x + '_Minutes')
# comp_mins.rename(columns={'Year_Minutes': 'Year', 'Player_Minutes' : 'Player'}, inplace=True)

In [344]:
df = df.drop(columns=['P1', 'P2', 'P3', 'P4', 'P5', 'Votes', 'RankPts', 'Percent', 'Voted', 'Season', 'Goals', 'Minutes played', 'Assists'])

In [345]:
for i in df.columns:
    if i  not in ['Player', 'Nationality', 'Club']:
        df[i] = df[i].astype(float)

In [346]:
df = df.merge(comp_goals, on=['Year', 'Player'], how='left')
df = df.merge(awards, on=['Year', 'Player'], how='left')
# df = df.merge(comp_mins, on=['Year', 'Player'], how='left')
df = df.merge(comp_ass, on=['Year', 'Player'], how='left')

dfs = []
for i in range(8, 24):
    scaler = MinMaxScaler()
    if  i == 20:
        continue
    df_temp = df.query('Year == @i')
    df_temp['Goals_Scaled'] = scaler.fit_transform(df_temp['Goals'].values.reshape(-1, 1))
    df_temp['Ass_Scaled'] = scaler.fit_transform(df_temp['Assists'].values.reshape(-1, 1))

    dfs.append(df_temp)
df = pd.concat(dfs)

df.fillna(0, inplace=True)
df = df.groupby(['Player', 'Year']).sum().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['Goals_Scaled'] = scaler.fit_transform(df_temp['Goals'].values.reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['Ass_Scaled'] = scaler.fit_transform(df_temp['Assists'].values.reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['Goals_Scaled'] = scale

In [347]:
df = sm.add_constant(df)

# Tab out after use
df = df.query('Share > 0')
df['Share'] = np.log(df['Share'])

train = df.query('Year < 23')
test = df.query('Year == 23')
train.set_index(['Player', 'Year'], inplace=True)
test.set_index(['Player', 'Year'], inplace=True)

X_train, y_train = train.drop(columns=['Share', 'Nationality', 'Club', 'Rank', 'Points', 'Month']), train['Share']
X_test, y_test = test.drop(columns=['Share', 'Nationality', 'Club', 'Rank', 'Points', 'Month']), test['Share']

glm = sm.GLM(endog=y_train, exog= X_train, family=sm.families.Gaussian())
glm_results = glm.fit()

In [348]:
for i in range(2):
    X_train = remove_p(X_train, glm_results, 0.8)
    glm_results = run_model(X_train, y_train)
for i in range(2):
    X_train = remove_p(X_train, glm_results, 0.65)
    glm_results = run_model(X_train, y_train)
for i in range(2):
    X_train = remove_p(X_train, glm_results, 0.5)
    glm_results = run_model(X_train, y_train)
for i in range(2):
    X_train = remove_p(X_train, glm_results, 0.2)
    glm_results = run_model(X_train, y_train)
for i in range(2):
    X_train = remove_p(X_train, glm_results, 0.05)
    glm_results = run_model(X_train, y_train)

In [349]:
len(set(df['Year']))

15

In [350]:
glm_results.summary()

0,1,2,3
Dep. Variable:,Share,No. Observations:,307.0
Model:,GLM,Df Residuals:,289.0
Model Family:,Gaussian,Df Model:,17.0
Link Function:,Identity,Scale:,1.9391
Method:,IRLS,Log-Likelihood:,-527.99
Date:,"Wed, 03 Jan 2024",Deviance:,560.41
Time:,11:06:52,Pearson chi2:,560.0
No. Iterations:,3,Pseudo R-squ. (CS):,0.4364
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-6.3450,0.211,-30.032,0.000,-6.759,-5.931
Goals conceded,0.1279,0.033,3.909,0.000,0.064,0.192
Trend,1.4184,0.479,2.962,0.003,0.480,2.357
Champions League_Goals,0.1648,0.026,6.266,0.000,0.113,0.216
J1 League_Goals,-1.327e-15,3.51e-16,-3.779,0.000,-2.02e-15,-6.39e-16
Jupiler Pro League playoff Europa League_Goals,-6.129e-15,1.21e-15,-5.064,0.000,-8.5e-15,-3.76e-15
Liga Portugal_Goals,-0.1735,0.054,-3.240,0.001,-0.278,-0.069
Ligapokal_Goals,-1.5467,0.650,-2.379,0.017,-2.821,-0.273
Premier League_Goals,0.0509,0.016,3.093,0.002,0.019,0.083


In [351]:
X_test = X_test[X_train.columns]

In [352]:
preds = glm_results.predict(X_test)
preds = scaler.fit_transform(preds.values.reshape(-1, 1))
y_test = scaler.fit_transform(y_test.values.reshape(-1, 1))

results = []
for i, j, k in zip(X_test.index, y_test, preds):
    results.append([i[0], j[0], k[0]])
results = pd.DataFrame(results, columns=['Player', 'Actual', 'Predicted'])

In [353]:
results.sort_values('Predicted', ascending=False)

Unnamed: 0,Player,Actual,Predicted
15,Lionel Messi,1.0,1.0
22,Yassine Bounou,0.334407,0.898219
4,Erling Haaland,0.860092,0.805825
21,Vinícius Júnior,0.568068,0.527003
19,Rodri,0.590171,0.455884
17,Mohamed Salah,0.372329,0.3527
13,Kylian Mbappé,0.819007,0.344051
10,Kevin De Bruyne,0.672861,0.341536
0,André Onana,0.10195,0.324159
7,Jude Bellingham,0.232457,0.321989


In [354]:
X_test

Unnamed: 0_level_0,Unnamed: 1_level_0,const,Goals conceded,Trend,Champions League_Goals,J1 League_Goals,Jupiler Pro League playoff Europa League_Goals,Liga Portugal_Goals,Ligapokal_Goals,Premier League_Goals,UEFA Cup_Goals,...,Italian Super Cup winner,Olympic medalist,Winner UEFA Nations League,World Cup winner,Coupe de la Ligue_ass,Ligapokal_ass,Premier League_ass,Serie A_ass,UEFA Cup_ass,Assists
Player,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
André Onana,23.0,1.0,11.0,0.523865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Antoine Griezmann,23.0,1.0,0.0,0.075669,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0
Bernardo Silva,23.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,8.0
Bukayo Saka,23.0,1.0,0.0,0.271246,0.0,0.0,0.0,0.0,0.0,14.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,11.0
Erling Haaland,23.0,1.0,0.0,0.552969,12.0,0.0,0.0,0.0,0.0,36.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,9.0
Harry Kane,23.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,30.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0
Joško Gvardiol,23.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jude Bellingham,23.0,1.0,0.0,0.690338,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
Julián Álvarez,23.0,1.0,0.0,0.173458,3.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0
Karim Benzema,23.0,1.0,0.0,0.366705,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
