# Build the training data set for the classification model

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
% matplotlib inline

### Load historical match results data and build the target

In [2]:
fifa = pd.read_csv('match_results.csv')

In [3]:
fifa.head(2)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False


In [4]:
fifa.shape

(38898, 9)

In [5]:
fifa.isnull().sum()

date          0
home_team     0
away_team     0
home_score    0
away_score    0
tournament    0
city          0
country       0
neutral       0
dtype: int64

In [6]:
fifa.dtypes

date          object
home_team     object
away_team     object
home_score     int64
away_score     int64
tournament    object
city          object
country       object
neutral         bool
dtype: object

In [7]:
fifa1 = fifa.loc[:,['date','home_team','away_team','home_score','away_score']].copy()

In [8]:
fifa1.head(2)

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,1872-11-30,Scotland,England,0,0
1,1873-03-08,England,Scotland,4,2


###### Extract the year of the results to join with other data sources for 2010, 2014 and 2018

In [9]:
fifa1['date'] = pd.to_datetime(fifa1.date)

In [10]:
fifa1.dtypes

date          datetime64[ns]
home_team             object
away_team             object
home_score             int64
away_score             int64
dtype: object

In [11]:
fifa1['date1'] = fifa1.date.dt.year

In [12]:
fifa1.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,date1
0,1872-11-30,Scotland,England,0,0,1872
1,1873-03-08,England,Scotland,4,2,1873
2,1874-03-07,Scotland,England,2,1,1874
3,1875-03-06,England,Scotland,2,2,1875
4,1876-03-04,Scotland,England,3,0,1876


In [13]:
fifa2 = fifa1[fifa1['date1'].isin([2010, 2014, 2018]) ].copy()

In [14]:
fifa2.head(2)

Unnamed: 0,date,home_team,away_team,home_score,away_score,date1
31408,2010-01-02,Iran,Korea DPR,1,0,2010
31409,2010-01-02,Qatar,Mali,0,0,2010


In [15]:
fifa2.shape

(1823, 6)

###### Assign target label to each match ( 0 -> home wins, 1 -> away wins, 2 -> tie)

In [16]:
fifa2['label'] = 2

In [17]:
index_label0 = fifa2[fifa2['home_score'] > fifa2['away_score']].index

In [18]:
index_label1 = fifa2[fifa2['home_score'] < fifa2['away_score']].index

In [19]:
fifa2.loc[index_label0,'label'] = 0

In [20]:
fifa2.loc[index_label1,'label'] = 1

In [21]:
fifa2.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,date1,label
31408,2010-01-02,Iran,Korea DPR,1,0,2010,0
31409,2010-01-02,Qatar,Mali,0,0,2010,2
31410,2010-01-02,Syria,Zimbabwe,6,0,2010,0
31411,2010-01-02,Yemen,Tajikistan,0,1,2010,1
31412,2010-01-03,Angola,Gambia,1,1,2010,2


### Load FIFA players scores to create team features for each year

In [22]:
fifa_stats = pd.read_csv('fifa_stats.csv')

In [23]:
fifa_stats['year'] = ('20' + fifa_stats['year'].astype(str)).astype(int)

In [24]:
fifa_stats.head(2)

Unnamed: 0,age,country_code,dob,full_name,image,nation,potential,rating,short_name,value,wage,year
0,23.0,1,"Feb 2, 1994",E. Hysaj,https://cdn.sofifa.org/1x/18/players/210864.png,Albania,84.0,80.0,E. Hysaj,16.0,58.0,2018
1,22.0,1,"Mar 19, 1995",T. Strakosha,https://cdn.sofifa.org/1x/18/players/212151.png,Albania,1.0,79.0,T. Strakosha,89.0,1.0,2018


In [25]:
fifa_agg = fifa_stats.loc[:,['nation','rating',
                  'year'] ].groupby(['nation', 'year']).mean()

In [26]:
fifa_reset = fifa_agg.reset_index()

In [27]:
fifa_reset.head()

Unnamed: 0,nation,year,rating
0,Afghanistan,2018,59.5
1,Albania,2010,67.727273
2,Albania,2014,68.125
3,Albania,2018,73.818182
4,Algeria,2010,62.090909


### Load FIFA rank values for years 2010, 2014 and 2018

In [28]:
fifarank = pd.read_csv('fifa_rank.csv')

In [29]:
fifarank.head()

Unnamed: 0,nation,Value of Starting 11,Fifa Ranking,year
0,Russia,855.0,66,2018
1,Brazil,933.0,2,2018
2,Iran,778.0,36,2018
3,Japan,839.0,60,2018
4,Mexico,863.0,15,2018


### Build feature vector

###### Join FIFA ranking + players ratings

In [30]:
fifa_reset['nation'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahrain', 'Barbados', 'Belarus',
       'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bolivia',
       'Bosnia Herzegovina', 'Botswana', 'Brazil', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada',
       'Cape Verde', 'Central African Rep.', 'Chad', 'Chile', 'China PR',
       'Chinese Taipei', 'Colombia', 'Comoros', 'Congo', 'Costa Rica',
       'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic',
       'DR Congo', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'England', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Ethiopia', 'FYR Macedonia', 'Faroe Islands', 'Fiji',
       'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Gibraltar', 'Greece', 'Grenada', 'Guam', 'Guatemala',
       'Guinea', 'Guinea Bissau', 'Guyana', 'Haiti', 'H

In [31]:
fifa_reset[fifa_reset['nation'].str.contains('Korea')]

Unnamed: 0,nation,year,rating
234,Korea DPR,2010,69.0
235,Korea DPR,2014,72.0
236,Korea DPR,2018,62.333333
237,Korea Republic,2010,63.272727
238,Korea Republic,2014,73.181818
239,Korea Republic,2018,74.909091


In [32]:
fifa_reset['nation'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahrain', 'Barbados', 'Belarus',
       'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bolivia',
       'Bosnia Herzegovina', 'Botswana', 'Brazil', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada',
       'Cape Verde', 'Central African Rep.', 'Chad', 'Chile', 'China PR',
       'Chinese Taipei', 'Colombia', 'Comoros', 'Congo', 'Costa Rica',
       'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic',
       'DR Congo', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'England', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Ethiopia', 'FYR Macedonia', 'Faroe Islands', 'Fiji',
       'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Gibraltar', 'Greece', 'Grenada', 'Guam', 'Guatemala',
       'Guinea', 'Guinea Bissau', 'Guyana', 'Haiti', 'H

In [33]:
fifa_reset2 = fifa_reset.replace('Korea DPR', 'South Korea')
fifa_reset2 = fifa_reset2.replace('Korea Republic', 'South Korea')
fifa_reset2.head()

Unnamed: 0,nation,year,rating
0,Afghanistan,2018,59.5
1,Albania,2010,67.727273
2,Albania,2014,68.125
3,Albania,2018,73.818182
4,Algeria,2010,62.090909


In [34]:
team_features=fifarank.merge(fifa_reset2, left_on=['nation','year'], right_on=['nation','year'], how='inner')
team_features['nation'].unique()

array(['Russia', 'Brazil', 'Iran', 'Japan', 'Mexico', 'Belgium',
       'South Korea', 'Saudi Arabia', 'Germany', 'England', 'Spain',
       'Nigeria', 'Costa Rica', 'Poland', 'Egypt', 'Iceland', 'Serbia',
       'Portugal', 'France', 'Uruguay', 'Argentina', 'Colombia', 'Panama',
       'Senegal', 'Morocco', 'Tunisia', 'Switzerland', 'Croatia',
       'Sweden', 'Denmark', 'Australia', 'Peru'], dtype=object)

In [35]:
team_fer=team_features.loc[:,['nation', 'Fifa Ranking', 'year', 'rating']]
team_fer.head()

Unnamed: 0,nation,Fifa Ranking,year,rating
0,Russia,66,2018,77.166667
1,Brazil,2,2018,86.0
2,Iran,36,2018,72.272727
3,Japan,60,2018,76.272727
4,Mexico,15,2018,79.181818


###### Join nation features + historical target

In [36]:
fifa3=fifa2.loc[:,['home_team', 'away_team', 'date1', 'label']]
fifa3.head()

Unnamed: 0,home_team,away_team,date1,label
31408,Iran,Korea DPR,2010,0
31409,Qatar,Mali,2010,2
31410,Syria,Zimbabwe,2010,0
31411,Yemen,Tajikistan,2010,1
31412,Angola,Gambia,2010,2


In [37]:
fifa4=fifa3.merge(team_fer, left_on=["home_team", "date1"], right_on=["nation", "year"], how='inner')

In [38]:
team_fer['nation'].unique().shape

(32,)

In [39]:
team_fer[team_fer.year == 2018].shape

(33, 4)

In [40]:
fifa3['home_team'].unique().shape

(205,)

In [41]:
fifa5=fifa4.rename(columns={'Fifa Ranking': "home_rank", 'rating': 'home_rating'})

In [42]:
fifa6=fifa5.merge(team_fer, left_on=["away_team", "date1"], right_on=["nation", "year"], how='inner')

In [43]:
fifa7=fifa6.rename(columns={'Fifa Ranking': "away_rank", 'rating': 'away_rating', 'date1': 'year'})
fifa7.head()

Unnamed: 0,home_team,away_team,year,label,nation_x,home_rank,year_x,home_rating,nation_y,away_rank,year_y,away_rating
0,Denmark,Sweden,2018,1,Denmark,12,2018,79.454545,Sweden,23,2018,77.166667
1,Denmark,Panama,2018,0,Denmark,12,2018,79.454545,Panama,55,2018,68.0
2,Switzerland,Panama,2018,0,Switzerland,6,2018,78.636364,Panama,55,2018,68.0
3,Mexico,Iceland,2018,0,Mexico,15,2018,79.181818,Iceland,22,2018,73.818182
4,Peru,Iceland,2018,0,Peru,11,2018,74.272727,Iceland,22,2018,73.818182


In [44]:
fifa8=fifa7.loc[:,["year", "home_team", "away_team", "home_rank", "away_rank", "home_rating", "away_rating", "label"]]
fifa8.head()

Unnamed: 0,year,home_team,away_team,home_rank,away_rank,home_rating,away_rating,label
0,2018,Denmark,Sweden,12,23,79.454545,77.166667,1
1,2018,Denmark,Panama,12,55,79.454545,68.0,0
2,2018,Switzerland,Panama,6,55,78.636364,68.0,0
3,2018,Mexico,Iceland,15,22,79.181818,73.818182,0
4,2018,Peru,Iceland,11,22,74.272727,73.818182,0


In [45]:
fifa8["rank_dif"]=fifa8.home_rank-fifa8.away_rank
fifa8.head()

Unnamed: 0,year,home_team,away_team,home_rank,away_rank,home_rating,away_rating,label,rank_dif
0,2018,Denmark,Sweden,12,23,79.454545,77.166667,1,-11
1,2018,Denmark,Panama,12,55,79.454545,68.0,0,-43
2,2018,Switzerland,Panama,6,55,78.636364,68.0,0,-49
3,2018,Mexico,Iceland,15,22,79.181818,73.818182,0,-7
4,2018,Peru,Iceland,11,22,74.272727,73.818182,0,-11


In [46]:
fifa8["rating_dif"]=fifa8.home_rating-fifa8.away_rating

In [47]:
fifa9=fifa8.loc[:,["year", "home_team", "away_team", "rank_dif", "rating_dif", "label"]]

In [48]:
fifa9.head()

Unnamed: 0,year,home_team,away_team,rank_dif,rating_dif,label
0,2018,Denmark,Sweden,-11,2.287879,1
1,2018,Denmark,Panama,-43,11.454545,0
2,2018,Switzerland,Panama,-49,10.636364,0
3,2018,Mexico,Iceland,-7,5.363636,0
4,2018,Peru,Iceland,-11,0.454545,0


In [49]:
fifa9.to_csv("Training_fifa2.csv",index=False)