In [8]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import statsmodels.api as sm
import statsmodels.formula.api as smf  

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree, export_text
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error,confusion_matrix, classification_report
from sklearn.model_selection import KFold

In [9]:
p18 = pd.read_csv('players_18.csv', encoding='latin-1') #players_18.csv file, some of the cases we encoded in latin1
p19 = pd.read_csv('players_19.csv', encoding='latin-1') #players_19.csv file, some of the cases we encoded in latin1
m_results = pd.read_csv('match_result.csv', encoding='latin-1') #match_results.csv file, some of the cases we encoded in latin1

In [10]:
#dropping unneeded variables from the player_18 and player_19 csv files first

#list of variables of interest (voi) that we want to keep for our models
voi_player = ['short_name', 'age', 'height_cm', 'weight_kg', 'club', 'wage_eur', 'player_positions',
      'pace' ,'shooting', 'passing', 'dribbling', 'defending', 'physic']

columns_list_18 = p18.columns.values.tolist() #list of all columns
columns_list_19 = p19.columns.values.tolist()

for i in range(len(columns_list_18)): #iterate through list of columns
    if columns_list_18[i] not in voi_player: #if column is not in our variables of interest list
        p18.pop(columns_list_18[i]) #remove it from the df
        
for i in range(len(columns_list_19)):
    if columns_list_19[i] not in voi_player:
        p19.pop(columns_list_19[i])

In [11]:
p18['short_name'] = p18['short_name'].astype('category') #setting to correct variable types
p18['club'] = p18['club'].astype('category') #setting to correct variable types
p19['short_name'] = p19['short_name'].astype('category') #setting to correct variable types
p19['club'] = p19['club'].astype('category') #setting to correct variable types

In [12]:
p18.drop(p18[p18['player_positions'] == "GK"].index, inplace = True) #remove all goalkeepers from p18 dataset
p19.drop(p19[p19['player_positions'] == "GK"].index, inplace = True) #remove all goalkeepers from p19 dataset

In [13]:
p18.to_csv('players_18_1.csv') #save players_18 dataframe as the new csv file
p19.to_csv('players_19_1.csv') #save players_19 dataframe as the new csv file
m_results.to_csv('match_result_1.csv') #save match_results dataframe as the new csv file

In [14]:
p18_slim = pd.read_csv('players_18_1.csv')
p19_slim = pd.read_csv('players_19_1.csv')
p18_slim.head()

Unnamed: 0.1,Unnamed: 0,short_name,age,height_cm,weight_kg,club,wage_eur,player_positions,pace,shooting,passing,dribbling,defending,physic
0,0,Cristiano Ronaldo,32,185,80,Real Madrid,565000,"LW, ST",90.0,93.0,82.0,90.0,33.0,80.0
1,1,L. Messi,30,170,72,FC Barcelona,565000,RW,89.0,90.0,86.0,96.0,26.0,61.0
2,2,Neymar,25,175,68,Paris Saint-Germain,280000,LW,92.0,84.0,79.0,95.0,30.0,60.0
3,4,L. SuÃ¡rez,30,182,86,FC Barcelona,510000,ST,82.0,90.0,79.0,87.0,42.0,81.0
4,5,R. Lewandowski,28,185,79,FC Bayern MÃ¼nchen,355000,ST,81.0,88.0,75.0,86.0,38.0,82.0


In [15]:
x_18 = p18_slim[['age', 'height_cm', 'weight_kg', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']]
y_18 = p18_slim['wage_eur']

x_19 = p19_slim[['age', 'height_cm', 'weight_kg', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']]
y_19 = p19_slim['wage_eur']

In [16]:
x_train_18, x_test_18 , y_train_18, y_test_18 = train_test_split(x_18, y_18, test_size=0.3, random_state=1)
x_train_19, x_test_19 , y_train_19, y_test_19 = train_test_split(x_19, y_19, test_size=0.3, random_state=1)

In [17]:
clf_app_18 = DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_leaf = 10)
clf_app_18.fit(x_train_18, y_train_18)

DecisionTreeClassifier(max_depth=3, min_samples_leaf=10, random_state=0)

In [20]:
app_tree_text_18 = export_text(clf_app_18, feature_names = list(x_18.columns))
print(app_tree_text_18)

|--- passing <= 58.50
|   |--- defending <= 63.50
|   |   |--- shooting <= 58.50
|   |   |   |--- class: 1000
|   |   |--- shooting >  58.50
|   |   |   |--- class: 2000
|   |--- defending >  63.50
|   |   |--- defending <= 66.50
|   |   |   |--- class: 2000
|   |   |--- defending >  66.50
|   |   |   |--- class: 5000
|--- passing >  58.50
|   |--- dribbling <= 68.50
|   |   |--- defending <= 64.50
|   |   |   |--- class: 1000
|   |   |--- defending >  64.50
|   |   |   |--- class: 6000
|   |--- dribbling >  68.50
|   |   |--- dribbling <= 72.50
|   |   |   |--- class: 3000
|   |   |--- dribbling >  72.50
|   |   |   |--- class: 10000



In [23]:
pred_18 = clf_app_18.predict(x_test_18)
mse_18 = mean_squared_error(y_test_18, pred_18)
print("Test MSE: "+str(mse_18))

Test MSE: 554061912.8949616
