## Data Cleaning

imports: 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

%matplotlib inline
%config InlineBackend.figure_format='retina'

# 
Concatenate Country DataFrames:

In [2]:
de = pd.read_csv('data/germany_robust.csv')
fr = pd.read_csv('data/france_robust.csv')
es = pd.read_csv('data/spain_robust.csv')
it = pd.read_csv('data/italy_robust.csv')
ne = pd.read_csv('data/netherlands_robust.csv')
en = pd.read_csv('data/england_robust.csv')
tk = pd.read_csv('data/turkey_robust.csv')

In [3]:
df = pd.concat([de, fr, es, it, ne, en, tk])

In [4]:
df.reset_index(inplace = True, drop = True)

# 
Remove non-domestic leagues or leagues we don't have enough data for:

In [5]:
df.groupby('tournament').count()

Unnamed: 0_level_0,name,age,season,club,apps,mins,shots (off target),shots (on target),shots (blocked),goals,...,total_passes,accurate_long_pass,inaccurate_long_pass,accurate_short_pass,inaccurate_short_pass,accurate_crosses,inaccurate_crosses,long_key_pass,short_key_pass,assists
tournament,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACoN,119,119,119,119,119,119,119,119,119,119,...,119,119,119,119,119,119,119,119,119,119
APD,43,43,43,43,43,43,43,43,43,43,...,43,43,43,43,43,43,43,43,43,43
BJL,16,16,16,16,16,16,16,16,16,16,...,16,16,16,16,16,16,16,16,16,16
BSA,63,63,63,63,63,63,63,63,63,63,...,63,63,63,63,63,63,63,63,63,63
CSl,111,111,111,111,111,111,111,111,111,111,...,111,111,111,111,111,111,111,111,111,111
EC,397,397,397,397,397,397,397,397,397,397,...,397,397,397,397,397,397,397,397,397,397
EL1,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
EL2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
EPL,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,...,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725
FL1,1458,1458,1458,1458,1458,1458,1458,1458,1458,1458,...,1458,1458,1458,1458,1458,1458,1458,1458,1458,1458


In [6]:
to_drop = ['ACoN', 'APD', 'BJL', 'BSA', 'CSI', 'EC', 'GB2', 'ICA', 'ICC', 'IEU', 'SA', 'UCL', 'UEC', 'UEL', 'UMLS', 'UNL', 'WC', 'CSl', 'EL1', 'EL2', 'SP']

In [7]:
for index, row in df.iterrows():
    if row['tournament'] in to_drop:
        df = df.drop([index])

In [8]:
df.reset_index(inplace = True, drop = True)

In [9]:
df

Unnamed: 0,name,age,season,tournament,club,apps,mins,shots (off target),shots (on target),shots (blocked),...,total_passes,accurate_long_pass,inaccurate_long_pass,accurate_short_pass,inaccurate_short_pass,accurate_crosses,inaccurate_crosses,long_key_pass,short_key_pass,assists
0,José Manuel Jurado,31.0,2018,SLL,Espanyol,29.0,1509.0,0.2,0.1,0.1,...,20.9,1.0,0.6,16.3,3.0,20.9,1.0,0.1,0.5,0.1
1,José Manuel Jurado,30.0,2017,SLL,Espanyol,31.0,2460.0,0.5,0.4,0.3,...,39.5,2.7,1.5,30.1,5.3,39.5,2.7,0.3,0.6,0.1
2,José Manuel Jurado,29.0,2016,RPL,SpartakMoscow,1.0,69.0,4.0,1.0,2.0,...,32.0,undefined,1.0,28.0,3.0,32.0,undefined,undefined,undefined,undefined
3,José Manuel Jurado,29.0,2016,EPL,Watford,27.0,2027.0,0.7,0.4,0.3,...,35.3,1.2,0.6,28.5,5.0,35.3,1.2,0.2,1.3,undefined
4,José Manuel Jurado,28.0,2015,RPL,SpartakMoscow,18.0,1033.0,0.8,0.6,0.4,...,30.9,1.6,0.8,24.6,4.0,30.9,1.6,0.2,0.8,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9005,Emre Tasdemir,27.0,2019,TS,Galatasaray,7.0,379.0,0.3,undefined,0.1,...,22.0,1.1,1.3,16.9,2.7,22.0,1.1,undefined,0.1,undefined
9006,Emre Tasdemir,26.0,2018,TS,Bursaspor,5.0,180.0,undefined,undefined,0.2,...,12.0,undefined,0.8,8.4,2.8,12.0,undefined,undefined,0.4,undefined
9007,Emre Tasdemir,25.0,2017,TS,Bursaspor,6.0,200.0,0.2,0.2,0.2,...,7.8,0.3,0.3,6.0,1.2,7.8,0.3,undefined,0.3,undefined
9008,Emre Tasdemir,24.0,2016,TS,Bursaspor,22.0,1530.0,0.5,0.2,0.1,...,19.6,0.9,1.8,12.9,4.0,19.6,0.9,0.1,0.6,undefined


# 
Find only players with 4 or more years of data while under 29 and making more than 5 appearances:

In [10]:
df = df.loc[df['age'] < 29]

In [11]:
df = df.loc[df['apps'] > 5]

In [12]:
df = df.loc[df['season'] < 2021]

In [13]:
for index, row in df['name'].value_counts().to_frame().iterrows():
    if row[0] < 4:
        df = df.loc[df['name'] != index]
    

In [14]:
df.groupby('name').count()

Unnamed: 0_level_0,age,season,tournament,club,apps,mins,shots (off target),shots (on target),shots (blocked),goals,...,total_passes,accurate_long_pass,inaccurate_long_pass,accurate_short_pass,inaccurate_short_pass,accurate_crosses,inaccurate_crosses,long_key_pass,short_key_pass,assists
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Hunt,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
Aaron Lennon,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
Aaron Ramsey,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
Abdelaziz Barrada,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
Abdoul Camara,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Éder,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
Éver Banega,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
Óscar De Marcos,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
Óscar Trejo,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5


In [17]:
df.drop_duplicates(inplace = True)

# 
Rearrange player data so there's only rows of 4 consecutive years, meaning some players have more than one row:

In [18]:
final_df = pd.DataFrame()
for player in df.groupby('name').count().index.values:
    player_rows = df.loc[df['name'] == player].sort_values('age').reset_index(drop = True)
    player_df = pd.DataFrame()
    for index, player_row in player_rows[0:4].reset_index(drop = True).iterrows():
        if index == 0:
            player_df = pd.concat([player_df, pd.DataFrame(data = [player_row[0:].values], columns = df.columns + str(index + 1))], axis = 0)
        else:
            player_df = pd.concat([player_df, pd.DataFrame(data = [player_row[0:].values[1:]], columns = df.columns[1:] + str(index + 1))], axis = 1)
        
    player_df2 = pd.DataFrame()
    if len(player_rows)/4 > 2:
        for index, player_row in player_rows[4:8].reset_index(drop = True).iterrows():
            if index == 0:
                player_df2 = pd.concat([player_df2, pd.DataFrame(data = [player_row[0:].values], columns = df.columns + str(index + 1))], axis = 0)
            else:
                player_df2 = pd.concat([player_df2, pd.DataFrame(data = [player_row[0:].values[1:]], columns = df.columns[1:] + str(index + 1))], axis = 1)
            
    player_df3 = pd.DataFrame()
    if len(player_rows)/4 > 3:
        for index, player_row in player_rows[8:12].reset_index(drop = True).iterrows():
            if index == 0:
                player_df3 = pd.concat([player_df3, pd.DataFrame(data = [player_row[0:].values], columns = df.columns + str(index + 1))], axis = 0)
            else:
                player_df3 = pd.concat([player_df3, pd.DataFrame(data = [player_row[0:].values[1:]], columns = df.columns[1:] + str(index + 1))], axis = 1)   
            
    player_df4 = pd.DataFrame()
    if len(player_rows)/4 > 4:
        for index, player_row in player_rows[12:16].reset_index(drop = True).iterrows():
            if index == 0:
                player_df4 = pd.concat([player_df4, pd.DataFrame(data = [player_row[0:].values], columns = df.columns + str(index + 1))], axis = 0)
            else:
                player_df4 = pd.concat([player_df4, pd.DataFrame(data = [player_row[0:].values[1:]], columns = df.columns[1:] + str(index + 1))], axis = 1)   
            
    player_df5 = pd.DataFrame()
    if len(player_rows)/4 > 5:
        for index, player_row in player_rows[16:20].reset_index(drop = True).iterrows():
            if index == 0:
                player_df5 = pd.concat([player_df5, pd.DataFrame(data = [player_row[0:].values], columns = df.columns + str(index + 1))], axis = 0)
            else:
                player_df5 = pd.concat([player_df5, pd.DataFrame(data = [player_row[0:].values[1:]], columns = df.columns[1:] + str(index + 1))], axis = 1)   
            
    player_df6 = pd.DataFrame()
    if len(player_rows)/4 > 6:
        for index, player_row in player_rows[20:24].reset_index(drop = True).iterrows():
            if index == 0:
                player_df6 = pd.concat([player_df6, pd.DataFrame(data = [player_row[0:].values], columns = df.columns + str(index + 1))], axis = 0)
            else:
                player_df6 = pd.concat([player_df6, pd.DataFrame(data = [player_row[0:].values[1:]], columns = df.columns[1:] + str(index + 1))], axis = 1)   
    final_df = pd.concat([final_df, player_df, player_df2, player_df3, player_df4, player_df5, player_df6])

In [19]:
final_df.reset_index(drop = True, inplace = True)

In [20]:
pd.set_option("max_rows", 100)
pd.set_option("max_columns", 200)

# 
Set name to be index:

In [22]:
final_df = final_df.iloc[:, :136].set_index('name1')

In [23]:
final_df.isnull().sum()

age1                   0
season1                0
tournament1            0
club1                  0
apps1                  0
                      ..
accurate_crosses4      1
inaccurate_crosses4    1
long_key_pass4         1
short_key_pass4        1
assists4               1
Length: 108, dtype: int64

# 
Replace undefineds with 0s and set to be float values if numeric column:

In [24]:
for column in final_df.drop(columns = ['season1', 'tournament1', 'club1', 
                                       'season2', 'tournament2', 'club2', 
                                       'season3', 'tournament3', 'club3', 
                                       'season4', 'tournament4', 'club4']).columns:
    final_df[column] = final_df[column].apply(lambda x: float(x) if x != 'undefined' else 0)

In [25]:
for column in final_df.drop(columns = ['season1', 'tournament1', 'club1', 
                                       'season2', 'tournament2', 'club2', 
                                       'season3', 'tournament3', 'club3', 
                                       'season4', 'tournament4', 'club4']):
    final_df[column] = final_df[column].apply(lambda x: float(x))

# 
Create new columns for shots blocked, cross accuracy, key passes, and pass accuracy:

In [27]:
for number in range(1,5):
    final_df['shots_blocked/on_target_shots' + str(number)] = round(final_df['shots (blocked)'+ str(number)]/final_df['shots (on target)' + str(number)], 3)

In [28]:
for number in range(1,5):
    final_df['cross_acc' + str(number)] = round(final_df['accurate_crosses'+ str(number)]/(final_df['accurate_crosses' + str(number)] + final_df['inaccurate_crosses' + str(number)]), 3)

In [29]:
for number in range(1,5):
    final_df['kp_revised' + str(number)] = round((final_df['long_key_pass' + str(number)] + final_df['short_key_pass' + str(number)])*final_df['total_passes' + str(number)], 3)
    
    
    

In [31]:
for number in range(1,5):
    final_df['short_p_acc' + str(number)] = round(final_df['accurate_short_pass' + str(number)]/final_df['total_passes' + str(number)], 3)
    final_df['long_p_acc' + str(number)] = round(final_df['accurate_long_pass' + str(number)]/final_df['total_passes' + str(number)], 3)


In [33]:
final_df.iloc[:, -20:]

Unnamed: 0_level_0,shots_blocked/on_target_shots1,shots_blocked/on_target_shots2,shots_blocked/on_target_shots3,shots_blocked/on_target_shots4,cross_acc1,cross_acc2,cross_acc3,cross_acc4,kp_revised1,kp_revised2,kp_revised3,kp_revised4,short_p_acc1,long_p_acc1,short_p_acc2,long_p_acc2,short_p_acc3,long_p_acc3,short_p_acc4,long_p_acc4
name1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Aaron Hunt,0.833,0.667,1.286,1.000,0.955,0.962,0.946,0.935,44.66,44.96,119.34,87.36,0.743,0.047,0.754,0.039,0.731,0.057,0.728,0.070
Aaron Lennon,0.667,1.500,1.500,2.000,0.973,0.976,0.988,0.981,46.08,46.55,41.99,51.80,0.816,0.027,0.820,0.024,0.850,0.012,0.826,0.019
Aaron Lennon,2.000,0.000,,0.250,0.986,0.971,0.992,0.974,39.90,25.74,12.00,17.19,0.835,0.014,0.748,0.030,0.842,0.008,0.785,0.026
Aaron Ramsey,1.000,2.333,0.800,1.333,0.935,0.935,0.948,0.949,27.84,34.32,90.56,69.81,0.787,0.069,0.825,0.070,0.820,0.055,0.829,0.054
Aaron Ramsey,0.417,1.000,1.000,1.200,0.952,0.966,0.973,0.973,92.26,90.56,83.40,40.00,0.794,0.050,0.823,0.035,0.833,0.027,0.855,0.028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Éder,0.667,1.000,0.667,0.667,0.955,0.959,0.969,0.968,22.80,4.26,19.92,15.00,0.779,0.047,0.789,0.042,0.779,0.032,0.807,0.033
Éver Banega,0.500,1.000,1.667,2.000,0.923,0.924,0.925,0.953,42.16,43.76,152.40,63.90,0.751,0.083,0.788,0.082,0.772,0.081,0.777,0.049
Óscar De Marcos,0.333,0.667,1.000,0.714,0.980,0.982,0.969,0.963,3.96,6.54,37.10,27.36,0.707,0.020,0.661,0.018,0.744,0.032,0.740,0.038
Óscar Trejo,8.688,16.675,13.740,13.960,0.978,0.976,0.979,0.986,29.26,40.10,26.40,54.56,0.741,0.023,0.796,0.025,0.767,0.021,0.745,0.015


In [34]:
final_df['shots_blocked/on_target_shots1'] = final_df['shots_blocked/on_target_shots1'].apply(lambda x: x if x != np.inf else 2.00)
final_df['shots_blocked/on_target_shots2'] = final_df['shots_blocked/on_target_shots2'].apply(lambda x: x if x != np.inf else 2.00)
final_df['shots_blocked/on_target_shots3'] = final_df['shots_blocked/on_target_shots3'].apply(lambda x: x if x != np.inf else 2.00)
final_df['shots_blocked/on_target_shots4'] = final_df['shots_blocked/on_target_shots4'].apply(lambda x: x if x != np.inf else 2.00)


In [36]:
final_df.dropna(inplace = True) # drop null values

# 
Create independent variable columns using goals, assists, and key passes and then aggregate as "future performance":

In [38]:
for number in range(1, 5):
    final_df['G&A&KP ' + str(number)] = (final_df['goals' + str(number)] + final_df['assists' + str(number)] + final_df['long_key_pass' + str(number)] + final_df['short_key_pass' + str(number)])

In [39]:
final_df.iloc[:, -2:]

Unnamed: 0_level_0,G&A&KP 3,G&A&KP 4
name1,Unnamed: 1_level_1,Unnamed: 2_level_1
Aaron Hunt,3.1,2.6
Aaron Lennon,2.0,2.3
Aaron Ramsey,1.8,1.4
Aaron Ramsey,1.5,1.2
Abdelaziz Barrada,1.0,2.9
...,...,...
Éder,1.5,1.3
Éver Banega,2.1,1.7
Óscar De Marcos,1.2,1.1
Óscar Trejo,1.0,1.8


In [40]:
final_df['future_performance'] = final_df.iloc[:, -2:].mean(axis = 1)

In [41]:
final_df.iloc[:, -3:]

Unnamed: 0_level_0,G&A&KP 3,G&A&KP 4,future_performance
name1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aaron Hunt,3.1,2.6,2.85
Aaron Lennon,2.0,2.3,2.15
Aaron Ramsey,1.8,1.4,1.60
Aaron Ramsey,1.5,1.2,1.35
Abdelaziz Barrada,1.0,2.9,1.95
...,...,...,...
Éder,1.5,1.3,1.40
Éver Banega,2.1,1.7,1.90
Óscar De Marcos,1.2,1.1,1.15
Óscar Trejo,1.0,1.8,1.40


# 
Create new columns for square root and cube root to analyze distributions:

In [42]:
for number in range(1, 5):
    final_df['g_square_root ' + str(number)] = final_df['goals' + str(number)]**(1/2)

In [43]:
for number in range(1, 5):
    final_df['g_cube_root ' + str(number)] = final_df['goals' + str(number)]**(1/3)

In [44]:
for column in final_df.iloc[:, -8:].columns:
    final_df[column] = final_df[column].apply(lambda x: x if x != -np.inf else 0)

In [45]:
for number in range(1, 5):
    final_df = final_df.rename(columns = {'outside_of_box_goals' + str(number): 'outside_of_box' + str(number)})

In [48]:
final_df.iloc[:, -33:]

Unnamed: 0_level_0,shots_blocked/on_target_shots1,shots_blocked/on_target_shots2,shots_blocked/on_target_shots3,shots_blocked/on_target_shots4,cross_acc1,cross_acc2,cross_acc3,cross_acc4,kp_revised1,kp_revised2,kp_revised3,kp_revised4,short_p_acc1,long_p_acc1,short_p_acc2,long_p_acc2,short_p_acc3,long_p_acc3,short_p_acc4,long_p_acc4,G&A&KP 1,G&A&KP 2,G&A&KP 3,G&A&KP 4,future_performance,g_square_root 1,g_square_root 2,g_square_root 3,g_square_root 4,g_cube_root 1,g_cube_root 2,g_cube_root 3,g_cube_root 4
name1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
Aaron Hunt,0.833,0.667,1.286,1.000,0.955,0.962,0.946,0.935,44.66,44.96,119.34,87.36,0.743,0.047,0.754,0.039,0.731,0.057,0.728,0.070,1.9,1.8,3.1,2.6,2.85,0.547723,0.316228,0.447214,0.632456,0.669433,0.464159,0.584804,0.736806
Aaron Lennon,0.667,1.500,1.500,2.000,0.973,0.976,0.988,0.981,46.08,46.55,41.99,51.80,0.816,0.027,0.820,0.024,0.850,0.012,0.826,0.019,2.3,2.1,2.0,2.3,2.15,0.316228,0.316228,0.316228,0.316228,0.464159,0.464159,0.464159,0.464159
Aaron Ramsey,1.000,2.333,0.800,1.333,0.935,0.935,0.948,0.949,27.84,34.32,90.56,69.81,0.787,0.069,0.825,0.070,0.820,0.055,0.829,0.054,1.2,0.9,1.8,1.4,1.60,0.447214,0.316228,0.316228,0.000000,0.584804,0.464159,0.464159,0.000000
Aaron Ramsey,0.417,1.000,1.000,1.200,0.952,0.966,0.973,0.973,92.26,90.56,83.40,40.00,0.794,0.050,0.823,0.035,0.833,0.027,0.855,0.028,2.1,2.0,1.5,1.2,1.35,0.632456,0.447214,0.447214,0.000000,0.736806,0.584804,0.584804,0.000000
Abdelaziz Barrada,1.000,0.500,1.000,1.000,0.960,0.959,0.940,0.960,39.15,75.24,12.69,93.34,0.674,0.042,0.705,0.043,0.723,0.064,0.760,0.042,1.6,2.2,1.0,2.9,1.95,0.316228,0.316228,0.316228,0.000000,0.464159,0.464159,0.464159,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Éder,0.667,1.000,0.667,0.667,0.955,0.959,0.969,0.968,22.80,4.26,19.92,15.00,0.779,0.047,0.789,0.042,0.779,0.032,0.807,0.033,1.6,0.4,1.5,1.3,1.40,0.547723,0.316228,0.774597,0.447214,0.669433,0.464159,0.843433,0.584804
Éver Banega,0.500,1.000,1.667,2.000,0.923,0.924,0.925,0.953,42.16,43.76,152.40,63.90,0.751,0.083,0.788,0.082,0.772,0.081,0.777,0.049,1.1,1.0,2.1,1.7,1.90,0.316228,0.316228,0.000000,0.316228,0.464159,0.464159,0.000000,0.464159
Óscar De Marcos,0.333,0.667,1.000,0.714,0.980,0.982,0.969,0.963,3.96,6.54,37.10,27.36,0.707,0.020,0.661,0.018,0.744,0.032,0.740,0.038,0.5,0.8,1.2,1.1,1.15,0.316228,0.000000,0.316228,0.447214,0.464159,0.000000,0.464159,0.584804
Óscar Trejo,8.688,16.675,13.740,13.960,0.978,0.976,0.979,0.986,29.26,40.10,26.40,54.56,0.741,0.023,0.796,0.025,0.767,0.021,0.745,0.015,1.3,1.1,1.0,1.8,1.40,0.316228,0.316228,0.316228,0.316228,0.464159,0.464159,0.464159,0.464159


# 
Save no dummies dataframe to be used in EDA and a with dummies df for modeling:

In [49]:
final_df.to_csv('data/cleaned_data_no_dummies.csv')

In [50]:
final_df = pd.get_dummies(final_df, columns = ['tournament1', 'tournament2', 'tournament3', 'tournament4'])

In [51]:
final_df.to_csv('data/cleaned_data_with_dummies.csv')