In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score,f1_score
%matplotlib inline

In [47]:
# Load the preprocessed player table from a path relative to the working directory.
df = pd.read_csv(filepath_or_buffer="../data/preprocessed_players_data.csv")

In [48]:
# Cast kit numbers to pandas' nullable integer dtype ('Int64').
df = df.astype({'KIT_NUMBER': 'Int64'})

In [49]:
def map_positions(position):
    """Collapse a detailed position code into a broad playing role.

    Parameters
    ----------
    position : str
        Position code such as 'GK', 'CB' or 'ST'.

    Returns
    -------
    str
        'GOALKEEPER', 'DEFENDER', 'MIDFIELDER' or 'FORWARD'. Any value that
        is not one of the listed codes (matching is exact and case-sensitive)
        yields 'OTHER'.
    """
    # Groups are checked in order; tuple membership uses the same
    # equality-based comparison as testing against each code in turn.
    role_groups = (
        ('GOALKEEPER', ('GK',)),
        ('DEFENDER', ('CB', 'RB', 'LB', 'RWB', 'LWB')),
        ('MIDFIELDER', ('CM', 'CDM', 'CAM', 'RM', 'LM')),
        ('FORWARD', ('ST', 'CF', 'RF', 'LF', 'RW', 'LW')),
    )
    for role, codes in role_groups:
        if position in codes:
            return role
    return 'OTHER'

# Derive the broad role label for every row from its detailed position code.
df['BROAD_POSITION'] = df['POSITION'].map(map_positions)

In [51]:
# Discard the columns that are not carried forward into the model inputs
# (POSITION has already been folded into BROAD_POSITION), then preview the result.
df = df.drop(
    columns=[
        'POTENTIAL',
        'WEAK_FOOT',
        'POSITION',
        'WAGE',
        'RELEASE_CLAUSE',
        'PREFERRED_FOOT_Left',
        'PREFERRED_FOOT_Right',
    ]
)
df.head()

Unnamed: 0,OVERALL_RATING,VALUE,HEIGHT,KIT_NUMBER,SKILL_MOVES,ATTACKING,SKILL,MOVEMENT,POWER,MENTALITY,DEFENDING,GOALKEEPING,AGE,BROAD_POSITION
0,77,21.5,170,11,4,62,76,80,67,58,44,11,24,MIDFIELDER
1,85,73.5,186,10,5,79,81,84,80,74,37,11,26,FORWARD
2,70,3.6,180,64,3,58,72,74,60,63,65,11,21,DEFENDER
3,83,53.0,193,12,2,57,64,72,63,67,85,9,22,DEFENDER
4,85,80.0,178,7,3,74,79,85,72,77,67,10,22,FORWARD


In [52]:
# Separate the label column from the remaining columns, which serve as features.
y = df['BROAD_POSITION']
x = df.drop(columns=['BROAD_POSITION'])

In [53]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# role classes turn out to be imbalanced (this would change the reported scores).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [54]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: an entropy-criterion decision tree with all other settings left at
# their library defaults, fitted on the training split.
tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
tree.fit(X=x_train, y=y_train)

In [55]:
# Predicted role labels for the held-out rows.
y_pred_test = tree.predict(X=x_test)

In [56]:
# Weighted F1 on the same rows the tree was fitted on — a training-set score,
# useful only for comparison against the held-out score.
y_pred_train = tree.predict(X=x_train)
f1_score(y_true=y_train, y_pred=y_pred_train, average='weighted')

0.9757206750106023

In [57]:
# Share of held-out rows whose predicted role matches the true role.
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.764367816091954

In [58]:
# Weighted F1 on the held-out rows (per-class F1 averaged by class support).
f1_score(y_true=y_test, y_pred=y_pred_test, average='weighted')

0.7641434842009986

In [60]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold cross-validated search over split criterion, maximum depth
# and minimum samples required to split a node, scored by accuracy.
# NOTE(review): in scikit-learn "entropy" and "log_loss" both select the Shannon
# information gain, so those candidates duplicate each other. The grid is kept
# unchanged here so the candidate count reported by the search stays the same.
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [5, 10, 15, 20, 30, 50],
    "min_samples_split": [20, 30, 50, 75, 100, 250],
}
# GridSearchCV clones `tree` for every candidate, so the already-fitted baseline
# object is not modified. n_jobs=-1 spreads the independent fits across all
# available cores; the scores are unaffected because the estimator's
# random_state is fixed.
grid = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [61]:
# Display the estimator that the search refit using the best parameter combination.
grid.best_estimator_

In [62]:
# Parameter combination with the highest mean cross-validated accuracy.
grid.best_params_

{'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 50}

In [63]:
# Mean 5-fold cross-validated accuracy of the best combination. It is computed
# on the training split only; the held-out test rows are not involved.
grid.best_score_

0.7889978413099563