In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from catboost import Pool, CatBoostClassifier, cv, CatBoostRegressor



In [2]:
df = pd.read_csv('data_cleaned.csv')
df.head()

Unnamed: 0,ID,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Preferred Foot,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,158023,L. Messi,31,Argentina,94,94,FC Barcelona,110500000.0,565000.0,Left,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,20801,Cristiano Ronaldo,33,Portugal,94,94,Juventus,77000000.0,405000.0,Right,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,190871,Neymar Jr,26,Brazil,92,93,Paris Saint-Germain,118500000.0,290000.0,Right,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,193080,De Gea,27,Spain,91,93,Manchester United,72000000.0,260000.0,Right,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,192985,K. De Bruyne,27,Belgium,91,92,Manchester City,102000000.0,355000.0,Right,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


In [3]:
def face_to_num(df): #returns 1 for real face , else 0
    if (df['Real Face'] == 'Yes'):
        return 1
    else:
        return 0

def right_footed(df): #preferred foot 
    if (df['Preferred Foot'] == 'Right'):
        return 1
    else:
        return 0

#Create a simplified position varaible to account for all player positions
def simple_position(df):
    if (df['Position'] == 'GK'):
        return 'GK'
    elif ((df['Position'] == 'RB') | (df['Position'] == 'LB') | (df['Position'] == 'CB') | (df['Position'] == 'LCB') | (df['Position'] == 'RCB') | (df['Position'] == 'RWB') | (df['Position'] == 'LWB') ):
        return 'DF'
    elif ((df['Position'] == 'LDM') | (df['Position'] == 'CDM') | (df['Position'] == 'RDM')):
        return 'DM'
    elif ((df['Position'] == 'LM') | (df['Position'] == 'LCM') | (df['Position'] == 'CM') | (df['Position'] == 'RCM') | (df['Position'] == 'RM')):
        return 'MF'
    elif ((df['Position'] == 'LAM') | (df['Position'] == 'CAM') | (df['Position'] == 'RAM') | (df['Position'] == 'LW') | (df['Position'] == 'RW')):
        return 'AM'
    elif ((df['Position'] == 'RS') | (df['Position'] == 'ST') | (df['Position'] == 'LS') | (df['Position'] == 'CF') | (df['Position'] == 'LF') | (df['Position'] == 'RF')):
        return 'ST'
    else:
        return df.Position

#Get a count of Nationalities in the Dataset, make of list of those with over 250 Players (our Major Nations)
nat_counts = df.Nationality.value_counts()
nat_list = nat_counts[nat_counts > 250].index.tolist()

#Replace Nationality with a binary indicator variable for 'Major Nation'
def major_nation(df):
    if (df.Nationality in nat_list):
        return 1
    else:
        return 0

#Create a copy of the original dataframe to avoid indexing errors
df1 = df.copy()

#Apply changes to dataset to create new column
df1['Real_Face'] = df1.apply(face_to_num, axis=1)
df1['Right_Foot'] = df1.apply(right_footed, axis=1)
df1['Simple_Position'] = df1.apply(simple_position,axis = 1)
df1['Major_Nation'] = df1.apply(major_nation,axis = 1)

#Split the Work Rate Column in two
tempwork = df1["Work Rate"].str.split("/ ", n = 1, expand = True) 
#Create new column for first work rate
df1["WorkRate1"]= tempwork[0]   
#Create new column for second work rate
df1["WorkRate2"]= tempwork[1]
#Drop original columns used
df1 = df1.drop(['Work Rate','Preferred Foot','Real Face', 'Position','Nationality'], axis = 1)
df1.head()

Unnamed: 0,ID,Name,Age,Overall,Potential,Club,Value,Wage,International Reputation,Weak Foot,...,GKKicking,GKPositioning,GKReflexes,Release Clause,Real_Face,Right_Foot,Simple_Position,Major_Nation,WorkRate1,WorkRate2
0,158023,L. Messi,31,94,94,FC Barcelona,110500000.0,565000.0,5.0,4.0,...,15.0,14.0,8.0,€226.5M,1,0,ST,1,Medium,Medium
1,20801,Cristiano Ronaldo,33,94,94,Juventus,77000000.0,405000.0,5.0,4.0,...,15.0,14.0,11.0,€127.1M,1,1,ST,1,High,Low
2,190871,Neymar Jr,26,92,93,Paris Saint-Germain,118500000.0,290000.0,5.0,5.0,...,15.0,15.0,11.0,€228.1M,1,1,AM,1,High,Medium
3,193080,De Gea,27,91,93,Manchester United,72000000.0,260000.0,4.0,3.0,...,87.0,88.0,94.0,€138.6M,1,1,GK,1,Medium,Medium
4,192985,K. De Bruyne,27,91,92,Manchester City,102000000.0,355000.0,4.0,5.0,...,5.0,10.0,13.0,€196.4M,1,1,MF,1,High,High


In [4]:
df1.columns

Index(['ID', 'Name', 'Age', 'Overall', 'Potential', 'Club', 'Value', 'Wage',
       'International Reputation', 'Weak Foot', 'Skill Moves', 'Body Type',
       'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
       'Height', 'Weight', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
       'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'Crossing',
       'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
       'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'Release Clause',
       'Real_Face', 'Right_Foot', 'Si

In [5]:
df1.shape

(18159, 85)

In [6]:
df1 = df1.drop(['Club','LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
       'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB','Real_Face','Weak Foot'], axis = 1)
df1.shape

(18159, 56)

In [7]:
ot1=pd.get_dummies(df1['WorkRate1'],drop_first=True)
ot2=pd.get_dummies(df1['Simple_Position'],drop_first=True)
ot3=pd.get_dummies(df1['WorkRate2'],drop_first=True)
ot3=pd.get_dummies(df1['Body Type'],drop_first=True)



In [8]:
df1 = df1.drop(['Name','ID', 'Jersey Number'],axis=1)
df1.shape

(18159, 53)

In [9]:
df1 = pd.concat([df1,ot1,ot2,ot3], axis=1)
df1.head()

Unnamed: 0,Age,Overall,Potential,Value,Wage,International Reputation,Skill Moves,Body Type,Joined,Loaned From,...,WorkRate2,Low,Medium,DF,DM,GK,MF,ST,Normal,Stocky
0,31,94,94,110500000.0,565000.0,5.0,4.0,Lean,"Jul 1, 2004",,...,Medium,0,1,0,0,0,0,1,0,0
1,33,94,94,77000000.0,405000.0,5.0,5.0,Normal,"Jul 10, 2018",,...,Low,0,0,0,0,0,0,1,1,0
2,26,92,93,118500000.0,290000.0,5.0,5.0,Lean,"Aug 3, 2017",,...,Medium,0,0,0,0,0,0,0,0,0
3,27,91,93,72000000.0,260000.0,4.0,1.0,Lean,"Jul 1, 2011",,...,Medium,0,1,0,0,1,0,0,0,0
4,27,91,92,102000000.0,355000.0,4.0,4.0,Normal,"Aug 30, 2015",,...,High,0,0,0,0,0,1,0,1,0


In [10]:
df1.columns

Index(['Age', 'Overall', 'Potential', 'Value', 'Wage',
       'International Reputation', 'Skill Moves', 'Body Type', 'Joined',
       'Loaned From', 'Contract Valid Until', 'Height', 'Weight', 'Crossing',
       'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
       'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'Release Clause',
       'Right_Foot', 'Simple_Position', 'Major_Nation', 'WorkRate1',
       'WorkRate2', 'Low', 'Medium', 'DF', 'DM', 'GK', 'MF', 'ST', 'Normal',
       'Stocky'],
      dtype='object')

In [11]:
df1.shape

(18159, 62)

In [12]:
df1['Body Type'].value_counts()

Normal    10597
Lean       6420
Stocky     1142
Name: Body Type, dtype: int64

In [13]:
df1 = df1.drop(['Simple_Position','WorkRate1','WorkRate2','Body Type'], axis = 1)
df1.shape

(18159, 58)

In [14]:
df1.columns

Index(['Age', 'Overall', 'Potential', 'Value', 'Wage',
       'International Reputation', 'Skill Moves', 'Joined', 'Loaned From',
       'Contract Valid Until', 'Height', 'Weight', 'Crossing', 'Finishing',
       'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve',
       'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'Release Clause',
       'Right_Foot', 'Major_Nation', 'Low', 'Medium', 'DF', 'DM', 'GK', 'MF',
       'ST', 'Normal', 'Stocky'],
      dtype='object')

In [15]:
df1 = df1.drop(['Height','Weight','Joined', 'Loaned From','Release Clause', 'Contract Valid Until'],axis=1)

In [16]:
df1.columns

Index(['Age', 'Overall', 'Potential', 'Value', 'Wage',
       'International Reputation', 'Skill Moves', 'Crossing', 'Finishing',
       'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve',
       'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'Right_Foot',
       'Major_Nation', 'Low', 'Medium', 'DF', 'DM', 'GK', 'MF', 'ST', 'Normal',
       'Stocky'],
      dtype='object')

In [17]:
from sklearn.ensemble import RandomForestClassifier


In [18]:
df1.shape

(18159, 52)

In [19]:
df1.to_csv("For_model_fifa.csv",index=False)

In [20]:
target = df1.Overall
df2 = df1.drop(['Overall'], axis = 1)

#Splitting into test and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df2, target, test_size=0.2)

In [23]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(criterion = "gini", 
                                       min_samples_leaf = 1, 
                                       min_samples_split = 10,   
                                       n_estimators=2100, 
                                       max_features='auto', 
                                       oob_score=True, 
                                       random_state=1, 
                                       n_jobs=-1)

random_forest.fit(X_train, y_train)
Y_prediction = random_forest.predict(X_test)

random_forest.score(X_train, y_train)

0.9989674399394232

In [24]:
Y_prediction

array([65, 56, 63, ..., 75, 70, 65], dtype=int64)