In [647]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

In [648]:
df = pd.read_csv('../data/player_stats_trends.csv')
tournament_stats = pd.read_csv('../data/tournament_stats.csv')
goals = pd.read_csv('../data/goals.csv')
awards = pd.read_csv('../data/awards.csv')

In [649]:
awards['Year'] = awards['Season'].apply(lambda x: float(str(x)[-2:]))
awards = awards.drop(columns=['Season', 'Club/Country'])
awards = awards.groupby('Award').filter(lambda x: len(x) >= 5)
awards = awards[~awards['Award'].isin(["Winner Ballon d'Or", 'Footballer of the Year', 'UEFA Best Player in Europe', "The Best FIFA Men's Player", 'Player of the Year', 'Top goal scorer', 'African Footballer of the Year', 'TM-Player of the season'])]


awards = pd.get_dummies(awards, columns=['Award'], prefix='', prefix_sep='')

for col in awards.columns:
    if col not in ['Player', 'Year']:
        awards[col] = awards[col].astype(int)
awards = awards.groupby(['Player', 'Year']).sum().reset_index()


In [650]:
def goal_fixer(goals):
    try:
        return int(goals)
    except:
        return 0
def run_model(X, y):
    glm = sm.GLM(endog=y, exog= X, family=sm.families.Gaussian())
    X = sm.add_constant(X)
    glm_results = glm.fit()
    return glm_results
def remove_p(X, glm_results, p):
    to_remove = []
    for i in glm_results.pvalues.index:
        if glm_results.pvalues[i] > p:
            to_remove.append(i)
        elif np.isnan(glm_results.pvalues[i]):
            to_remove.append(i)
    X = X.drop(columns=to_remove)
    return X

In [651]:
tournament_stats['Year'] = tournament_stats['Season'].apply(lambda x: float(x[-2:]))
tournament_stats = tournament_stats.drop('Season', axis=1)
tournament_stats = tournament_stats.query('Year >= 8 & Year <= 23')
tournament_stats['Goals'] = tournament_stats['Goals'].apply(lambda x: goal_fixer(x))
tournament_stats['Minutes played'] = tournament_stats['Minutes played'].apply(lambda x: goal_fixer(x))

comp_goals = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Goals'].replace(np.nan, 0).reset_index()
comp_goals = comp_goals.rename(columns=lambda x: x + '_Goals')
comp_goals.rename(columns={'Year_Goals': 'Year', 'Player_Goals' : 'Player'}, inplace=True)

Goals = []
for i in comp_goals.iterrows():
    rs = 0
    for j in i[1].index:
        if j not in ['Player', 'Year']:
            rs += i[1][j]
    Goals.append(rs)
comp_goals['Goals'] = Goals

comp_mins = tournament_stats.pivot_table(index=['Year', 'Player'], columns='Competition', aggfunc='sum')['Minutes played'].replace(np.nan, 0).reset_index()
comp_mins = comp_mins.rename(columns=lambda x: x + '_Minutes')
comp_mins.rename(columns={'Year_Minutes': 'Year', 'Player_Minutes' : 'Player'}, inplace=True)

In [652]:
df = df.drop(columns=['P1', 'P2', 'P3', 'P4', 'P5', 'Votes', 'RankPts', 'Percent', 'Voted', 'Season', 'Goals', 'Minutes played'])

In [653]:
for i in df.columns:
    if i  not in ['Player', 'Nationality', 'Club']:
        df[i] = df[i].astype(float)

In [654]:
df = df.merge(comp_goals, on=['Year', 'Player'], how='left')
df = df.merge(awards, on=['Year', 'Player'], how='left')
df = df.merge(comp_mins, on=['Year', 'Player'], how='left')
dfs = []
for i in range(8, 24):
    scaler = MinMaxScaler()
    if  i == 20:
        continue
    df_temp = df.query('Year == @i')
    df_temp['Goals_Scaled'] = scaler.fit_transform(df_temp['Goals'].values.reshape(-1, 1))
    dfs.append(df_temp)
df = pd.concat(dfs)

df.fillna(0, inplace=True)
df = df.groupby(['Player', 'Year']).sum().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['Goals_Scaled'] = scaler.fit_transform(df_temp['Goals'].values.reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['Goals_Scaled'] = scaler.fit_transform(df_temp['Goals'].values.reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['Goals_Scaled'] = scale

In [655]:
df = sm.add_constant(df)
train = df.query('Year < 23')
test = df.query('Year == 23')
train.set_index(['Player', 'Year'], inplace=True)
test.set_index(['Player', 'Year'], inplace=True)

X_train, y_train = train.drop(columns=['Share', 'Nationality', 'Club', 'Rank', 'Points', 'Month']), train['Share']
X_test, y_test = test.drop(columns=['Share', 'Nationality', 'Club', 'Rank', 'Points', 'Month']), test['Share']

glm = sm.GLM(endog=y_train, exog= X_train, family=sm.families.Gaussian())
glm_results = glm.fit()

In [656]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,const,Assists,Yellow cards,Sesond yellow cards,Red cards,Goals conceded,Clean sheets,Champions_League_appearances,Champions_League_Qu._appearances,UEFA_Cup_appearances,...,UEFA-Cup Qualifikation_Minutes,UI Cup_Minutes,US Open Cup_Minutes,USLC_Minutes,USLC Playoffs_Minutes,VL Hessen-Nord_Minutes,VL Hessen-Süd_Minutes,Viareggio Cup_Minutes,ÖFB-Cup_Minutes,Goals_Scaled
Player,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alexis Sánchez,15.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,7.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.409836
Alisson,18.0,1.0,0.0,0.0,0.0,0.0,19.0,5.0,12.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
Andrea Pirlo,13.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333
Andrei Arshavin,8.0,1.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714
Andrei Arshavin,9.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zlatan Ibrahimović,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.803922
Zlatan Ibrahimović,15.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.491803
Zlatan Ibrahimović,16.0,1.0,4.0,2.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.844828
Ángel Di María,14.0,1.0,6.0,1.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215686


In [657]:
for i in range(2):
    X = remove_p(X_train, glm_results, 0.8)
    glm_results = run_model(X_train, y_train)
for i in range(2):
    X = remove_p(X_train, glm_results, 0.4)
    glm_results = run_model(X_train, y_train)
for i in range(2):
    X = remove_p(X_train, glm_results, 0.2)
    glm_results = run_model(X_train, y_train)
for i in range(1):
    X = remove_p(X_train, glm_results, 0.05)
    glm_results = run_model(X_train, y_train)

In [658]:
glm_results.predict(X_test.iloc[0, :])

None    0.067707
dtype: float64

In [659]:
scaler = MinMaxScaler()
preds = glm_results.predict(X_test).values
preds = scaler.fit_transform(preds.reshape(-1, 1))
y_test = scaler.fit_transform(y_test.values.reshape(-1, 1))

results = []
for i, j, k in zip(X_test.index, y_test, preds):
    # print(i[0], j, k[0])
    results.append([i[0], j[0], k[0]])
results = pd.DataFrame(results, columns=['Player', 'Actual', 'Predicted'])

In [660]:
results.sort_values('Predicted', ascending=False)

Unnamed: 0,Player,Actual,Predicted
4,Erling Haaland,0.38627,1.0
16,Lionel Messi,1.0,0.877889
22,Robert Lewandowski,0.013061,0.801077
10,Karim Benzema,0.006531,0.786459
27,Yassine Bounou,0.010831,0.658252
26,Vinícius Júnior,0.053042,0.606934
23,Rodri,0.061644,0.546704
11,Kevin De Bruyne,0.108155,0.526779
0,André Onana,0.00223,0.526524
14,Kylian Mbappé,0.292131,0.49325


In [661]:
# to_remove = []
# for i in glm_results.pvalues.index:
#     if glm_results.pvalues[i] > 0.05:
#         # print(i)
#         to_remove.append(i)
#     elif np.isnan(glm_results.pvalues[i]):
#         # print(i)
#         to_remove.append(i)
# X = X.drop(columns=to_remove)

In [662]:
# glm = sm.GLM(endog=y, exog= X, family=sm.families.Gaussian())
# X = sm.add_constant(X)
# glm_results = glm.fit()
# glm_results.summary()

In [663]:
def predict_season(num):
    # print(X.iloc[num])
    pred_score = glm_results.predict(X.iloc[num]).values[0]
    print('Predicted Score:', pred_score, 'Actual Score: ', y.iloc[num])
predict_season(300)

ValueError: shapes (1,9) and (532,) not aligned: 9 (dim 1) != 532 (dim 0)

# Only using trends data

In [None]:
# X, y = df['Trend'], df['Share']
# glm = sm.GLM(endog=y, exog= X, family=sm.families.Gaussian())
# X = sm.add_constant(X)
# glm_results = glm.fit()

In [None]:
# glm_results.summary()