# Data Cleaning

In [1]:
import pandas as pd

## 1. Total players by nationality in the top 5 leagues

In [3]:
# load 2021/2022 stats
stats_2122_df = pd.read_csv("../datasets/Big5/2021-2022/stats.csv")

In [4]:
stats_2122_df.head()

Unnamed: 0,Player,Nation,Pos,Squad,Comp,Age,Born,Playing Time MP,Playing Time Starts,Playing Time Min,...,Per 90 Minutes G+A-PK,Expected xG,Expected npxG,Expected xA,Expected npxG+xA,Per 90 Minutes xG,Per 90 Minutes xA,Per 90 Minutes xG+xA,Per 90 Minutes npxG,Per 90 Minutes npxG+xA
0,Max Aarons,ENG,DF,Norwich City,Premier League,21.0,2000.0,34,32,2881,...,0.06,0.7,0.7,1.4,2.1,0.02,0.04,0.06,0.02,0.06
1,Yunis Abdelhamid,MAR,DF,Reims,Ligue 1,33.0,1987.0,34,34,2983,...,0.06,1.2,1.2,0.8,2.1,0.04,0.02,0.06,0.04,0.06
2,Salis Abdul Samed,GHA,MF,Clermont Foot,Ligue 1,21.0,2000.0,31,29,2462,...,0.04,0.8,0.8,1.1,1.9,0.03,0.04,0.07,0.03,0.07
3,Laurent Abergel,FRA,MF,Lorient,Ligue 1,28.0,1993.0,34,34,2956,...,0.06,2.0,2.0,2.9,4.9,0.06,0.09,0.15,0.06,0.15
4,Charles Abi,FRA,FW,Saint-Étienne,Ligue 1,21.0,2000.0,1,1,45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# check for nulls. We have nulls in Nation, Age and Born columnns
stats_2122_df.isnull().sum()

Player                    0
Nation                    1
Pos                       0
Squad                     0
Comp                      0
Age                       1
Born                      1
Playing Time MP           0
Playing Time Starts       0
Playing Time Min          0
Playing Time 90s          0
Performance Gls           0
Performance Ast           0
Performance G-PK          0
Performance PK            0
Performance PKatt         0
Performance CrdY          0
Performance CrdR          0
Per 90 Minutes Gls        0
Per 90 Minutes Ast        0
Per 90 Minutes G+A        0
Per 90 Minutes G-PK       0
Per 90 Minutes G+A-PK     0
Expected xG               0
Expected npxG             0
Expected xA               0
Expected npxG+xA          0
Per 90 Minutes xG         0
Per 90 Minutes xA         0
Per 90 Minutes xG+xA      0
Per 90 Minutes npxG       0
Per 90 Minutes npxG+xA    0
dtype: int64

In [7]:
# look up rows with null
stats_2122_df[stats_2122_df.isnull().any(axis=1)]

Unnamed: 0,Player,Nation,Pos,Squad,Comp,Age,Born,Playing Time MP,Playing Time Starts,Playing Time Min,...,Per 90 Minutes G+A-PK,Expected xG,Expected npxG,Expected xA,Expected npxG+xA,Per 90 Minutes xG,Per 90 Minutes xA,Per 90 Minutes xG+xA,Per 90 Minutes npxG,Per 90 Minutes npxG+xA
1531,Javier Llabrés,,FW,Mallorca,La Liga,,,5,1,132,...,0.0,0.1,0.1,0.1,0.2,0.1,0.05,0.15,0.1,0.15


In [8]:
# set values for player with null values based on web results
temp_idx = stats_2122_df[stats_2122_df.isnull().any(axis=1)].index.values.astype(int)[0]
print("Set values for", stats_2122_df.at[temp_idx, 'Player'])
stats_2122_df.at[temp_idx, 'Nation'] = 'ESP'
stats_2122_df.at[temp_idx, 'Age'] = 18
stats_2122_df.at[temp_idx, 'Born'] = 2002 

Set values for Javier Llabrés


In [9]:
# check result of cleaning
print(f'Number of rows with null values: {stats_2122_df.isnull().any(axis=1).sum()}')

Number of rows with null values: 0


In [10]:
stats_2122_df.head()

Unnamed: 0,Player,Nation,Pos,Squad,Comp,Age,Born,Playing Time MP,Playing Time Starts,Playing Time Min,...,Per 90 Minutes G+A-PK,Expected xG,Expected npxG,Expected xA,Expected npxG+xA,Per 90 Minutes xG,Per 90 Minutes xA,Per 90 Minutes xG+xA,Per 90 Minutes npxG,Per 90 Minutes npxG+xA
0,Max Aarons,ENG,DF,Norwich City,Premier League,21.0,2000.0,34,32,2881,...,0.06,0.7,0.7,1.4,2.1,0.02,0.04,0.06,0.02,0.06
1,Yunis Abdelhamid,MAR,DF,Reims,Ligue 1,33.0,1987.0,34,34,2983,...,0.06,1.2,1.2,0.8,2.1,0.04,0.02,0.06,0.04,0.06
2,Salis Abdul Samed,GHA,MF,Clermont Foot,Ligue 1,21.0,2000.0,31,29,2462,...,0.04,0.8,0.8,1.1,1.9,0.03,0.04,0.07,0.03,0.07
3,Laurent Abergel,FRA,MF,Lorient,Ligue 1,28.0,1993.0,34,34,2956,...,0.06,2.0,2.0,2.9,4.9,0.06,0.09,0.15,0.06,0.15
4,Charles Abi,FRA,FW,Saint-Étienne,Ligue 1,21.0,2000.0,1,1,45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We create two csv files with the following data:
1. Total players by country
2. Total players by country and leage

In [11]:
player_count_by_nation_df = stats_2122_df.groupby(['Nation']).size().reset_index()
player_count_by_nation_df = player_count_by_nation_df.rename({0:'Count'}, axis=1)
player_count_by_nation_df.head()

Unnamed: 0,Nation,Count
0,ALB,14
1,ALG,29
2,ANG,4
3,ARG,73
4,ARM,3


In [248]:
player_count_by_nation_df.to_csv('../vis_data/total_players_by_nation.csv', index=False)

In [12]:
player_count_by_nation_and_league_df = stats_2122_df.groupby(['Nation', 'Comp']).size().reset_index()
player_count_by_nation_and_league_df = player_count_by_nation_and_league_df.rename({'Comp':'League', 0:'Count'}, axis=1)
player_count_by_nation_and_league_df.head()

Unnamed: 0,Nation,League,Count
0,ALB,La Liga,3
1,ALB,Premier League,1
2,ALB,Serie A,10
3,ALG,Bundesliga,2
4,ALG,La Liga,1


In [250]:
player_count_by_nation_and_league_df.to_csv('../vis_data/total_players_by_nation_and_league.csv', index=False)

## 2. Get coordinates for all countries

### Scrape country coordinates and country code (ISO, FIFA) information

The country coordinates can be obtained from Google's Dataset Publishing Site [here](https://developers.google.com/public-data/docs/canonical/countries_csv).

In [13]:
country_coords_df = pd.read_html("https://developers.google.com/public-data/docs/canonical/countries_csv",flavor='bs4')[0]

In [14]:
country_coords_df.head()

Unnamed: 0,country,latitude,longitude,name
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla


The ISO2 and ISO3 codes for all countries are obtained from WorldData.info [here](https://www.worlddata.info/countrycodes.php).

In [16]:
country_iso_df = pd.read_html("https://www.worlddata.info/countrycodes.php", flavor='bs4')[0]

In [17]:
country_iso_df.head()

Unnamed: 0,Country,ISO 3166-1alpha2,ISO 3166-1alpha3,ISO 3166-1numeric,IOC,Fips 10,LicensePlate,Domain
0,Afghanistan,AF,AFG,4.0,AFG,AF,AFG,.af
1,Åland Islands,AX,ALA,248.0,,,AX,.ax
2,Albania,AL,ALB,8.0,ALB,AL,AL,.al
3,Algeria,DZ,DZA,12.0,ALG,AG,DZ,.dz
4,American Samoa,AS,ASM,16.0,ASA,AQ,USA,.as


Next, we check for country entries which are not found in tables.

In [18]:
print(f'# countries in coords dataset: {country_coords_df.shape[0]}')
print(f'# countries in iso dataset: {country_iso_df.shape[0]}')

# countries in coords dataset: 245
# countries in iso dataset: 249


First, we look for entries in the ISO2-to-ISO3 table that are not in the coordinates table.

In [19]:
coords_countries_set = set(country_coords_df['country'])
iso_countries_set = set(country_iso_df['ISO 3166-1alpha2'])

In [20]:
iso_not_in_coords_df_countries = iso_countries_set.difference(coords_countries_set)
print(iso_not_in_coords_df_countries)

{'CW', 'MF', 'AX', 'BL', 'BQ', 'SX', 'SS'}


In [21]:
filt = country_iso_df['ISO 3166-1alpha2'].isin(list(iso_not_in_coords_df_countries))
country_iso_df[filt]

Unnamed: 0,Country,ISO 3166-1alpha2,ISO 3166-1alpha3,ISO 3166-1numeric,IOC,Fips 10,LicensePlate,Domain
1,Åland Islands,AX,ALA,248.0,,,AX,.ax
27,"Bonaire, Saint Eustatius and Saba",BQ,BES,535.0,,,NL,.bq
56,Curacao,CW,CUW,531.0,,UC,,.cw
186,Saint Barthelemy,BL,BLM,652.0,,TB,,.bl
190,Saint Martin,MF,MAF,663.0,,RN,F,.mf
202,Sint Maarten,SX,SXM,534.0,,NN,,.sx
210,South Sudan,SS,SSD,728.0,,,SSD,.ss


Secondly, we look for entries in the coordinates table that are not in the ISO2-to-ISO3 table.

In [22]:
# 2 digit ISO in coords df but not in iso df
coord_not_in_iso_df_countries = coords_countries_set.difference(iso_countries_set)
print(coord_not_in_iso_df_countries)

{'AN', 'UM', 'GZ'}


In [23]:
filt = country_coords_df['country'].isin(list(coord_not_in_iso_df_countries))
country_coords_df[filt]

Unnamed: 0,country,latitude,longitude,name
7,AN,12.226079,-69.060087,Netherlands Antilles
91,GZ,31.354676,34.308825,Gaza Strip
226,UM,,,U.S. Minor Outlying Islands


We see that most of the countries are rather uncommon countries. Instead of dropping these entries, we should consider the countries which are actually in the football dataset.

We now determine if 3 digit ISO code of all Nations in *"total_players_by_nation.csv"* is in the ISO2-to-ISO3 table.

In [24]:
stats_2122_df.head() # peek at data in football dataset

Unnamed: 0,Player,Nation,Pos,Squad,Comp,Age,Born,Playing Time MP,Playing Time Starts,Playing Time Min,...,Per 90 Minutes G+A-PK,Expected xG,Expected npxG,Expected xA,Expected npxG+xA,Per 90 Minutes xG,Per 90 Minutes xA,Per 90 Minutes xG+xA,Per 90 Minutes npxG,Per 90 Minutes npxG+xA
0,Max Aarons,ENG,DF,Norwich City,Premier League,21.0,2000.0,34,32,2881,...,0.06,0.7,0.7,1.4,2.1,0.02,0.04,0.06,0.02,0.06
1,Yunis Abdelhamid,MAR,DF,Reims,Ligue 1,33.0,1987.0,34,34,2983,...,0.06,1.2,1.2,0.8,2.1,0.04,0.02,0.06,0.04,0.06
2,Salis Abdul Samed,GHA,MF,Clermont Foot,Ligue 1,21.0,2000.0,31,29,2462,...,0.04,0.8,0.8,1.1,1.9,0.03,0.04,0.07,0.03,0.07
3,Laurent Abergel,FRA,MF,Lorient,Ligue 1,28.0,1993.0,34,34,2956,...,0.06,2.0,2.0,2.9,4.9,0.06,0.09,0.15,0.06,0.15
4,Charles Abi,FRA,FW,Saint-Étienne,Ligue 1,21.0,2000.0,1,1,45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We need to use the *Nation* column in the football dataset and compare it with the set ISO3 codes.

In [25]:
stats_2122_iso3_set = set(stats_2122_df['Nation'])
iso_countries_iso3_set = set(country_iso_df['IOC'])

In [26]:
temp = stats_2122_iso3_set.difference(iso_countries_iso3_set)

In [27]:
filt = stats_2122_df['Nation'].isin(list(temp))
stats_2122_df[filt]['Nation'].unique()

array(['ENG', 'NGA', 'SCO', 'REU', 'EQG', 'GUF', 'WAL', 'KVX', 'IRN',
       'SVN', 'GNB', 'NIR', 'MTQ', 'GLP', 'BFA', 'CTA'], dtype=object)

The missing countries in the ISO2-to-ISO3 table are:
- ENG: england
- SCO: scotland
- WAL: wales
- NIR: northern ireland

- NGA: nigeria
- KVX: kosovo
- SVN: slovenia
- GNB: guinea bisseau
- BFA: burkina faso
- EQG: equatorial guinea
- GUF:french guiana
- IRN: iran
- CTA: central african republic
- MTQ: martinique (french)
- REU: reunion (french)
- GLP: guadeloupe (french)

Using [Wikipedia](https://en.wikipedia.org/wiki/Comparison_of_alphabetic_country_codes), we can make a lookup table from 3 digit FIFA country code to ISO3 code.

In [28]:
country_codes_comparison_tables = pd.read_html('https://en.wikipedia.org/wiki/Comparison_of_alphabetic_country_codes', flavor='bs4')

In [29]:
country_fifa_iso_df = country_codes_comparison_tables[0][['Country','FIFA','ISO']]

In [30]:
# save countries which have recognised FIFA codes only
filt = country_fifa_iso_df['FIFA'].isnull()
country_fifa_iso_df = country_fifa_iso_df[~filt]
# country_fifa_iso_df.to_csv('vis_data/country_fifa_iso_lookup.csv')

In [31]:
country_fifa_iso_df.head()

Unnamed: 0,Country,FIFA,ISO
0,Afghanistan,AFG,AFG
2,Albania,ALB,ALB
3,Algeria,ALG,DZA
4,American Samoa[2],ASA,ASM
5,Andorra,AND,AND


In [32]:
# based on nations in 2021/2022, there are 5 missing countries in the FIFA-to-ISO3 lookup:
set(stats_2122_df['Nation']).difference(set(country_fifa_iso_df['FIFA']))

{'GLP', 'GUF', 'KVX', 'MTQ', 'REU'}

We can use the list of FIFA country codes on [Wikipedia](https://en.wikipedia.org/wiki/List_of_FIFA_country_codes) to add new entries to the FIFA-to-ISO3 lookup table.

In [33]:
rem_countries = {
    'FIFA':['GLP', 'GUF','KVX','MTQ','REU'],
    'Country':['Guadeloupe','French Guiana','Kosovo','Martinique','Reunion'],
    'ISO':['GLP','GUF','XKX','MTQ','REU']
}
rem_countries_df = pd.DataFrame(rem_countries)

In [34]:
country_fifa_iso_df = country_fifa_iso_df.append(rem_countries_df, ignore_index=True)

In [35]:
# remove square bracket and numbers from refs in html table
country_fifa_iso_df['Country'] = country_fifa_iso_df['Country'].str.replace('[^a-zA-Z, ]', '', regex=True)

In [36]:
# sort on FIFA code and save to csv file
# country_fifa_iso_df.sort_values(by='FIFA').to_csv('../vis_data/country_fifa_iso_lookup.csv', index=False)

### Join tables to get final table with country code (FIFA, ISO2, ISO3) and coordinates

In [37]:
# output the three tables for clarity
country_coords_df.head()

Unnamed: 0,country,latitude,longitude,name
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla


In [38]:
country_iso_df.head()


Unnamed: 0,Country,ISO 3166-1alpha2,ISO 3166-1alpha3,ISO 3166-1numeric,IOC,Fips 10,LicensePlate,Domain
0,Afghanistan,AF,AFG,4.0,AFG,AF,AFG,.af
1,Åland Islands,AX,ALA,248.0,,,AX,.ax
2,Albania,AL,ALB,8.0,ALB,AL,AL,.al
3,Algeria,DZ,DZA,12.0,ALG,AG,DZ,.dz
4,American Samoa,AS,ASM,16.0,ASA,AQ,USA,.as


In [39]:
# we only need to alpha2 and alpha 3 codes from country_iso_df 
country_iso_df_subset = country_iso_df[['ISO 3166-1alpha2', 'ISO 3166-1alpha3']]

In [40]:
country_fifa_iso_df.head()

Unnamed: 0,Country,FIFA,ISO
0,Afghanistan,AFG,AFG
1,Albania,ALB,ALB
2,Algeria,ALG,DZA
3,American Samoa,ASA,ASM
4,Andorra,AND,AND


In [41]:
print(country_coords_df.shape[0])
print(country_iso_df.shape[0])
print(country_fifa_iso_df.shape[0])

245
249
216


In [42]:
temp_df = country_fifa_iso_df.merge(country_iso_df_subset, how='left', left_on='ISO', right_on='ISO 3166-1alpha3')
temp_df

Unnamed: 0,Country,FIFA,ISO,ISO 3166-1alpha2,ISO 3166-1alpha3
0,Afghanistan,AFG,AFG,AF,AFG
1,Albania,ALB,ALB,AL,ALB
2,Algeria,ALG,DZA,DZ,DZA
3,American Samoa,ASA,ASM,AS,ASM
4,Andorra,AND,AND,AD,AND
...,...,...,...,...,...
211,Guadeloupe,GLP,GLP,GP,GLP
212,French Guiana,GUF,GUF,GF,GUF
213,Kosovo,KVX,XKX,XK,XKX
214,Martinique,MTQ,MTQ,MQ,MTQ


In [43]:
temp_df = temp_df.merge(country_coords_df, how='left', left_on='ISO 3166-1alpha2', right_on='country')
temp_df

Unnamed: 0,Country,FIFA,ISO,ISO 3166-1alpha2,ISO 3166-1alpha3,country,latitude,longitude,name
0,Afghanistan,AFG,AFG,AF,AFG,AF,33.939110,67.709953,Afghanistan
1,Albania,ALB,ALB,AL,ALB,AL,41.153332,20.168331,Albania
2,Algeria,ALG,DZA,DZ,DZA,DZ,28.033886,1.659626,Algeria
3,American Samoa,ASA,ASM,AS,ASM,AS,-14.270972,-170.132217,American Samoa
4,Andorra,AND,AND,AD,AND,AD,42.546245,1.601554,Andorra
...,...,...,...,...,...,...,...,...,...
211,Guadeloupe,GLP,GLP,GP,GLP,GP,16.995971,-62.067641,Guadeloupe
212,French Guiana,GUF,GUF,GF,GUF,GF,3.933889,-53.125782,French Guiana
213,Kosovo,KVX,XKX,XK,XKX,XK,42.602636,20.902977,Kosovo
214,Martinique,MTQ,MTQ,MQ,MTQ,MQ,14.641528,-61.024174,Martinique


Now, we check the FIFA countries which do not have a corresponding ISO2 code (and thus coordinates). We can see that 8 countries do not have coordinates as their ISO2 is null.

In [44]:
temp_df[temp_df.isnull().any(axis=1)]

Unnamed: 0,Country,FIFA,ISO,ISO 3166-1alpha2,ISO 3166-1alpha3,country,latitude,longitude,name
51,Curaao,CUW,CUW,CW,CUW,,,,
61,England,ENG,[7],,,,-22.95764,18.49041,Namibia
104,Kosovo,KOS,[14],,,,-22.95764,18.49041,Namibia
133,Namibia,NAM,NAM,,NAM,,-22.95764,18.49041,Namibia
141,Northern Ireland,NIR,[7],,,,-22.95764,18.49041,Namibia
166,Scotland,SCO,[7],,,,-22.95764,18.49041,Namibia
177,South Sudan,SSD,SSD,SS,SSD,,,,
207,Wales,WAL,[7],,,,-22.95764,18.49041,Namibia


Kosovo is duplicated, so we should remove row 104

In [45]:
temp_df[temp_df['Country'] == 'Kosovo']

Unnamed: 0,Country,FIFA,ISO,ISO 3166-1alpha2,ISO 3166-1alpha3,country,latitude,longitude,name
104,Kosovo,KOS,[14],,,,-22.95764,18.49041,Namibia
213,Kosovo,KVX,XKX,XK,XKX,XK,42.602636,20.902977,Kosovo


In [46]:
temp_df = temp_df.drop(104, axis=0).reset_index(drop=True)

In [47]:
# check if Kosovo has no duplicates now
temp_df[temp_df['Country'] == 'Kosovo']

Unnamed: 0,Country,FIFA,ISO,ISO 3166-1alpha2,ISO 3166-1alpha3,country,latitude,longitude,name
212,Kosovo,KVX,XKX,XK,XKX,XK,42.602636,20.902977,Kosovo


Namibia has an ISO2 code, but it was not listed in the middle table (iso2 and iso3 lookup table)

In [48]:
temp_df[temp_df.isnull().any(axis=1)]

Unnamed: 0,Country,FIFA,ISO,ISO 3166-1alpha2,ISO 3166-1alpha3,country,latitude,longitude,name
51,Curaao,CUW,CUW,CW,CUW,,,,
61,England,ENG,[7],,,,-22.95764,18.49041,Namibia
132,Namibia,NAM,NAM,,NAM,,-22.95764,18.49041,Namibia
140,Northern Ireland,NIR,[7],,,,-22.95764,18.49041,Namibia
165,Scotland,SCO,[7],,,,-22.95764,18.49041,Namibia
176,South Sudan,SSD,SSD,SS,SSD,,,,
206,Wales,WAL,[7],,,,-22.95764,18.49041,Namibia


In [49]:
temp_df.at[132,'ISO 3166-1alpha2'] = 'NA'
temp_df.at[132,'country'] = 'NA'

In [50]:
temp_df[temp_df.isnull().any(axis=1)]

Unnamed: 0,Country,FIFA,ISO,ISO 3166-1alpha2,ISO 3166-1alpha3,country,latitude,longitude,name
51,Curaao,CUW,CUW,CW,CUW,,,,
61,England,ENG,[7],,,,-22.95764,18.49041,Namibia
140,Northern Ireland,NIR,[7],,,,-22.95764,18.49041,Namibia
165,Scotland,SCO,[7],,,,-22.95764,18.49041,Namibia
176,South Sudan,SSD,SSD,SS,SSD,,,,
206,Wales,WAL,[7],,,,-22.95764,18.49041,Namibia


For the remaining countries, we must manually handle their null values as:
- England, Northern Ireland, Scotland and Wales are part of Great Britain, and thus they do not have ISO codes.
- Curacao and South Sudan have ISO2 codes, but they do not have coordinate values.

Coordinates were found on Google Maps using the steps shown [here](https://support.google.com/maps/answer/18539?hl=en&co=GENIE.Platform%3DDesktop).

In [51]:
# curacao
temp_df.at[51,'latitude'] = 12.180859 
temp_df.at[51,'longitude'] = -69.013055

# south sudan
temp_df.at[176,'latitude'] = 7.591712
temp_df.at[176,'longitude'] = 29.931338

# england
temp_df.at[61,'latitude'] = 52.177943
temp_df.at[61,'longitude'] = -1.605872

# northern ireland
temp_df.at[140,'latitude'] = 54.703013
temp_df.at[140,'longitude'] = -6.769446

# scotland
temp_df.at[165,'latitude'] = 57.092069
temp_df.at[165,'longitude'] = -4.747962

# wales
temp_df.at[206,'latitude'] = 52.002437
temp_df.at[206,'longitude'] = -4.374426

In [52]:
temp_df[temp_df.isnull().any(axis=1)]

Unnamed: 0,Country,FIFA,ISO,ISO 3166-1alpha2,ISO 3166-1alpha3,country,latitude,longitude,name
51,Curaao,CUW,CUW,CW,CUW,,12.180859,-69.013055,
61,England,ENG,[7],,,,52.177943,-1.605872,Namibia
140,Northern Ireland,NIR,[7],,,,54.703013,-6.769446,Namibia
165,Scotland,SCO,[7],,,,57.092069,-4.747962,Namibia
176,South Sudan,SSD,SSD,SS,SSD,,7.591712,29.931338,
206,Wales,WAL,[7],,,,52.002437,-4.374426,Namibia


To fill in the ISO code for the British countries, we use the [ISO 3166-2:GB](https://en.wikipedia.org/wiki/ISO_3166-2:GB) entries.

In [53]:
# england
temp_df.at[61,'ISO'] = 'GB-ENG'
# northern ireland
temp_df.at[140,'ISO'] = 'GB-NIR'
# scotland
temp_df.at[165,'ISO'] = 'GB-SCT'
# wales
temp_df.at[206,'ISO'] = 'GB-WLS'

In [54]:
temp_df[temp_df.isnull().any(axis=1)]

Unnamed: 0,Country,FIFA,ISO,ISO 3166-1alpha2,ISO 3166-1alpha3,country,latitude,longitude,name
51,Curaao,CUW,CUW,CW,CUW,,12.180859,-69.013055,
61,England,ENG,GB-ENG,,,,52.177943,-1.605872,Namibia
140,Northern Ireland,NIR,GB-NIR,,,,54.703013,-6.769446,Namibia
165,Scotland,SCO,GB-SCT,,,,57.092069,-4.747962,Namibia
176,South Sudan,SSD,SSD,SS,SSD,,7.591712,29.931338,
206,Wales,WAL,GB-WLS,,,,52.002437,-4.374426,Namibia


Finally, we obtain the final table with FIFA, ISO3 code and coordinates data

In [55]:
final_country_coord_df = temp_df[['Country', 'FIFA', 'ISO', 'latitude', 'longitude']]

In [56]:
final_country_coord_df.head()

Unnamed: 0,Country,FIFA,ISO,latitude,longitude
0,Afghanistan,AFG,AFG,33.93911,67.709953
1,Albania,ALB,ALB,41.153332,20.168331
2,Algeria,ALG,DZA,28.033886,1.659626
3,American Samoa,ASA,ASM,-14.270972,-170.132217
4,Andorra,AND,AND,42.546245,1.601554


In [57]:
# save country coordinates to csv file
final_country_coord_df.to_csv('../vis_data/country_coordinates.csv', index=False)

Note: Minor changes were made to country names manually.

Additional References:
- https://datahub.io/core/country-codes#resource-country-codes
- https://en.wikipedia.org/wiki/List_of_FIFA_country_codes
- https://en.wikipedia.org/wiki/Comparison_of_alphabetic_country_codes
- https://www.countryflagsapi.com/

## 3. Add coordinates data to the Total Players dataset

Load tables in csv files

In [59]:
player_count_by_nation_df = pd.read_csv('../vis_data/total_players_by_nation.csv')
print(player_count_by_nation_df.shape)
player_count_by_nation_df.head()

(104, 2)


Unnamed: 0,Nation,Count
0,ALB,14
1,ALG,29
2,ANG,4
3,ARG,73
4,ARM,3


In [60]:
player_count_by_nation_and_league_df = pd.read_csv('../vis_data/total_players_by_nation_and_league.csv')
print(player_count_by_nation_and_league_df.shape)
player_count_by_nation_and_league_df.head()

(328, 3)


Unnamed: 0,Nation,League,Count
0,ALB,La Liga,3
1,ALB,Premier League,1
2,ALB,Serie A,10
3,ALG,Bundesliga,2
4,ALG,La Liga,1


In [61]:
country_coords_df = pd.read_csv('../vis_data/country_coordinates.csv')
print(country_coords_df.shape)
country_coords_df.head()

(215, 5)


Unnamed: 0,Country,FIFA,ISO,latitude,longitude
0,Afghanistan,AFG,AFG,33.93911,67.709953
1,Albania,ALB,ALB,41.153332,20.168331
2,Algeria,ALG,DZA,28.033886,1.659626
3,American Samoa,ASA,ASM,-14.270972,-170.132217
4,Andorra,AND,AND,42.546245,1.601554


Add coordinates to total players by country dataset

In [62]:
player_count_by_nation_coords_df = player_count_by_nation_df.merge(country_coords_df, left_on='Nation', right_on='FIFA')
print(player_count_by_nation_coords_df.shape)
player_count_by_nation_coords_df.head()

(104, 7)


Unnamed: 0,Nation,Count,Country,FIFA,ISO,latitude,longitude
0,ALB,14,Albania,ALB,ALB,41.153332,20.168331
1,ALG,29,Algeria,ALG,DZA,28.033886,1.659626
2,ANG,4,Angola,ANG,AGO,-11.202692,17.873887
3,ARG,73,Argentina,ARG,ARG,-38.416097,-63.616672
4,ARM,3,Armenia,ARM,ARM,40.069099,45.038189


In [None]:
player_count_by_nation_coords_df[['Country', 'FIFA', 'ISO', 'latitude','longitude','Count']].to_csv('../vis_data/total_player_by_nation_with_coords.csv')

Add coordinates to total players by country and league dataset

In [260]:
player_count_by_nation_and_league__coords_df = player_count_by_nation_and_league_df.merge(country_coords_df, left_on='Nation', right_on='FIFA')
print(player_count_by_nation_and_league__coords_df.shape)
player_count_by_nation_and_league__coords_df.head()

(328, 8)


Unnamed: 0,Nation,League,Count,Country,FIFA,ISO,latitude,longitude
0,ALB,La Liga,3,Albania,ALB,ALB,41.153332,20.168331
1,ALB,Premier League,1,Albania,ALB,ALB,41.153332,20.168331
2,ALB,Serie A,10,Albania,ALB,ALB,41.153332,20.168331
3,ALG,Bundesliga,2,Algeria,ALG,DZA,28.033886,1.659626
4,ALG,La Liga,1,Algeria,ALG,DZA,28.033886,1.659626


In [264]:
player_count_by_nation_and_league__coords_df[['Country', 'League', 'FIFA', 'ISO', 'latitude','longitude','Count']].to_csv('../vis_data/total_player_by_nation_and_league_with_coords.csv')