In [1]:
#Importing Libraries

import numpy as np #To handle Mathematical calculations
import matplotlib.pyplot as plt #To plot charts 
%matplotlib inline
import pandas as pd #TO import and manage datasets
import glob
import os
import warnings
# pd.set_option('display.max_columns', None)  
# pd.set_option('display.max_rows', None)  

#Disable warning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
#Load preprocessed dataset
combinedPlayerDataframe = pd.read_csv("../Pre_Processed_Datasets/Preprocessed Player Data.csv") 

In [3]:
combinedPlayerDataframe.head()

Unnamed: 0,Name,Team,Age,Position,Apps,Minutes,Total goals,Total Assists,Shots per game,Key passes per game,...,Passes per game,Pass success percentage,Crosses per game,Long balls per game,Through balls per game,Total Goals,Yellow cards,Red cards,Aerials won per game,Man of the match
0,Arjen Robben,Bayern Munich,26,Midfielder,18,1779,16,6,3.4,1.8,...,34.3,80.2,1.5,2.0,0.2,16,1,0,0.4,8
1,Stefan Kießling,Bayer Leverkusen,26,Forward,33,2924,21,5,3.0,1.6,...,32.5,72.4,0.2,0.4,0.1,21,3,0,2.1,6
2,Zvjezdan Misimovic,Wolfsburg,28,Attacking Midfielder,31,2768,10,13,2.4,3.3,...,51.1,77.5,2.5,5.7,0.5,10,7,0,0.2,8
3,Edin Dzeko,Wolfsburg,24,Forward,33,3003,22,7,4.5,1.1,...,19.5,70.7,0.2,1.1,0.1,22,4,0,0.7,6
4,Claudio Pizarro,Werder Bremen,31,Attacking Midfielder,23,2130,16,2,2.6,1.3,...,24.2,75.5,0.1,0.8,0.0,16,4,0,0.4,2


In [4]:
#Selecting only the needed columns for the forecasting process

allPlayers = combinedPlayerDataframe[['Name', 'Season', 'Rating', 'Age']]
allPlayers = allPlayers[allPlayers['Season'].notnull()].copy()
allPlayers['Season'] = allPlayers['Season'].astype(int)
allPlayers

Unnamed: 0,Name,Season,Rating,Age
0,Arjen Robben,2009,8.18,26
1,Stefan Kießling,2009,7.79,26
2,Zvjezdan Misimovic,2009,7.74,28
3,Edin Dzeko,2009,7.73,24
4,Claudio Pizarro,2009,7.63,31
...,...,...,...,...
17786,Frederic Veseli,2018,6.29,26
17787,Joel Campbell,2018,6.29,27
17788,Francesco Zampano,2018,6.28,25
17789,Massimo Gobbi,2018,6.20,38


In [5]:
#Store a list of all the players
nameList = allPlayers.Name.unique()

#create a dataframe just to store the mean values of player rating's 
meanDf = pd.DataFrame(columns=['Mean'])

#Calculating career growth and rating avergae of every player (Feature engineering)
for name in nameList:
    mean = allPlayers.loc[allPlayers.Name == name]['Rating'].mean()
    length = len(allPlayers.loc[allPlayers.Name == name]['Rating'])
    growth = allPlayers.loc[allPlayers.Name == name]['Rating'].tolist()[length-1] - allPlayers.loc[allPlayers.Name == name]['Rating'].tolist()[0]
    meanDf.set_value(name, 'Name', name)
    meanDf.set_value(name, 'Mean', mean)
    meanDf.set_value(name, 'Growth', growth)
    
#Merging all the data together   
allPlayers = pd.merge(allPlayers, meanDf, on=['Name'])

#Converting column type
allPlayers['Mean'] = allPlayers['Mean'].astype(np.float64)

In [6]:
allPlayers.loc[allPlayers.Name == 'Lionel Messi']

Unnamed: 0,Name,Season,Rating,Age,Mean,Growth
5242,Lionel Messi,2009,8.67,23,8.641,-0.19
5243,Lionel Messi,2010,8.76,24,8.641,-0.19
5244,Lionel Messi,2011,8.88,25,8.641,-0.19
5245,Lionel Messi,2012,8.83,26,8.641,-0.19
5246,Lionel Messi,2013,8.34,27,8.641,-0.19
5247,Lionel Messi,2014,8.84,28,8.641,-0.19
5248,Lionel Messi,2015,8.46,29,8.641,-0.19
5249,Lionel Messi,2016,8.47,30,8.641,-0.19
5250,Lionel Messi,2017,8.68,31,8.641,-0.19
5251,Lionel Messi,2018,8.48,32,8.641,-0.19


In [7]:
#Another portion of feature engineering

from pandas import Series
from pandas import DataFrame
from pandas import concat

allPlayers = allPlayers.sort_values(['Season', 'Name'])

#Creating 2 new features for every player by combining both the last season rating and the difference between that and the
#~previous season rating

transformedPlayerDf = allPlayers.copy()
transformedPlayerDf['Last_Season_Rating'] = transformedPlayerDf.groupby(['Name'])['Rating'].shift()
transformedPlayerDf['Last_Season_Diff'] = transformedPlayerDf.groupby('Name')['Last_Season_Rating'].transform(Series.diff)
transformedPlayerDf = transformedPlayerDf.dropna()
transformedPlayerDf.head()

Unnamed: 0,Name,Season,Rating,Age,Mean,Growth,Last_Season_Rating,Last_Season_Diff
10247,Abdoulrazak Boukari,2010,6.84,24,6.84,0.09,6.93,0.18
15778,Alessandro Matri,2010,6.79,26,6.71375,-0.58,6.92,-0.06
1643,Anderson,2010,6.95,23,6.806667,0.79,6.16,-0.68
992,Andrea Barzagli,2010,7.27,30,6.971111,-0.08,6.86,0.16
15003,Andrea Ranocchia,2010,7.16,23,7.157778,-0.22,7.48,0.17


In [8]:
#Forecasting using linear regression

from statistics import mean 
from sklearn import linear_model 

from sklearn.model_selection import cross_val_score
from sklearn import model_selection

accuracy = []

#For every season using previous season data to forecast current season rating
for season in range(2015,2019):
    
    training_set = transformedPlayerDf[transformedPlayerDf['Season'] == season-1]
    testing_set = transformedPlayerDf[transformedPlayerDf['Season'] == season]

    X_train = training_set.drop(['Rating', 'Name', 'Season'], axis=1)
    y_train = training_set['Rating'].values
    X_test = testing_set.drop(['Rating', 'Name', 'Season'], axis=1)
    y_test = testing_set['Rating'].values

    lin_reg = linear_model.LinearRegression(n_jobs=-1) 
    
    lin_reg.fit(X_train, y_train)

    #using k-folds cross validation for calculating forecast accuracy
    scores = cross_val_score(lin_reg, X_test, y_test, cv=10)
    accuracy.append(scores.mean())
     
print('Average accuracy for forecasting player ratings between 2015-2018: ' + "{:.2f}".format(mean(accuracy)*100) + "%")
# accuracy

Average accuracy for forecasting player ratings between 2015-2018: 66.52%


In [9]:
#Forecasting player ratings between 2019-2022

for season in range(2017, 2021):
    
    previousPlayers = allPlayers.loc[allPlayers.Season == season].copy()
    currentPlayers = allPlayers.loc[allPlayers.Season == season+1].copy()
    futurePlayers = currentPlayers.copy()

    futurePlayers['Season'] = currentPlayers['Season'].values+1
    futurePlayers['Age'] = currentPlayers['Age'].values+1

    futurePlayers.drop(['Rating'], axis=1)

    li = []
    li.append(previousPlayers)
    li.append(currentPlayers)
    li.append(futurePlayers)

    combinedDf = pd.concat(li)
    
    combinedDf = combinedDf.sort_values(['Season', 'Name'])

    transformedPlayerDf = combinedDf.copy()
    transformedPlayerDf['Last_Season_Rating'] = transformedPlayerDf.groupby(['Name'])['Rating'].shift()
    transformedPlayerDf['Last_Season_Diff'] = transformedPlayerDf.groupby('Name')['Last_Season_Rating'].transform(Series.diff)
    transformedPlayerDf = transformedPlayerDf.dropna()
    
    transformedPlayerDf = transformedPlayerDf.loc[transformedPlayerDf['Season'] == season+2]
    
    currentSeasonDf = transformedPlayerDf.drop(['Rating', 'Name', 'Season'], axis=1)
    y_pred = lin_reg.predict(currentSeasonDf)

    #Attaching the predicted values to the original DF so that the cycle can continue 
    #~until the forecasting range comes to an end 
    ratings = ["{:.2f}".format(value) for value in y_pred.tolist()]
    transformedPlayerDf['Rating'] = ratings
    transformedPlayerDf = transformedPlayerDf[['Name', 'Season', 'Rating', 'Age', 'Mean', 'Growth']]
    allPlayers = allPlayers.append(transformedPlayerDf)
    allPlayers = allPlayers[allPlayers['Rating'].notnull()].copy()
    allPlayers['Rating'] = allPlayers['Rating'].astype(float)


In [10]:
allPlayers.loc[allPlayers.Name == 'Kylian Mbappé']

Unnamed: 0,Name,Season,Rating,Age,Mean,Growth
12143,Kylian Mbappé,2016,7.22,18,7.586667,0.78
12144,Kylian Mbappé,2017,7.54,19,7.586667,0.78
12145,Kylian Mbappé,2018,8.0,20,7.586667,0.78
12145,Kylian Mbappé,2019,7.78,21,7.586667,0.78
12145,Kylian Mbappé,2020,7.69,22,7.586667,0.78
12145,Kylian Mbappé,2021,7.72,23,7.586667,0.78
12145,Kylian Mbappé,2022,7.73,24,7.586667,0.78


In [11]:
allPlayers.loc[(allPlayers.Season == 2018) & ((allPlayers.Age == 24) | (allPlayers.Age == 25))]

Unnamed: 0,Name,Season,Rating,Age,Mean,Growth
12366,Abdoulaye Touré,2018,6.85,25,6.895000,-0.09
2101,Abdul Rahman Baba,2018,6.90,24,6.975000,-0.13
4129,Adam Maher,2018,7.47,25,7.162500,0.74
15213,Adam Masina,2018,6.66,25,6.872500,0.21
3508,Adam Zrelák,2018,6.42,25,6.420000,0.00
...,...,...,...,...,...,...
3114,Yussuf Poulsen,2018,7.27,25,6.986667,0.26
3513,Yuya Kubo,2018,6.31,25,6.310000,0.00
17612,Ádám Nagy,2018,6.38,24,6.460000,-0.16
8683,Álvaro Medrán,2018,6.53,25,6.496667,0.01


In [12]:
#write the DF to a csv file
allPlayers.to_csv('../Pre_Processed_Datasets/Forecasted Ratings 2019-2022.csv', index = False)

In [13]:
# saving the model
import pickle
with open('../Models/model_forecasting_lr.pkl','wb') as file:
    pickle.dump(lin_reg, file)