In [1]:
import pandas as pd
import os

In [4]:
# Load the 2007-2017 matches; the file uses ';' as its field separator.
train_df = pd.read_csv(
    os.path.abspath('../input/2007-2017.csv'),
    sep=';',
)

Drop the empty rows (rows without a `match_id`) that were found during the initial data exploration.

In [6]:
# Keep only the rows that carry a match_id.
train_df = train_df.loc[train_df['match_id'].notna()]

In [7]:
# Number of missing values in each column of the training set.
train_df.isna().sum()

match_id                   0
tournament_name            0
season                     0
tournament_id              0
date                       0
level                      0
surface                  390
round                      0
best_of                    0
player_id                  0
player_rank              636
player_elo_rating          0
opponent_id                0
opponent_rank            636
opponent_elo_rating        0
outcome                63420
minutes                 8414
p_1st_in                5794
o_1st_in                5794
p_sv_pt                 5794
o_sv_pt                 5794
p_1st_won               5794
o_1st_won               5794
p_2nd_won               5794
o_2nd_won               5794
p_ace                   5794
o_ace                   5794
p_df                    5794
o_df                    5794
p_bp_sv                 5794
o_bp_sv                 5794
p_bp_fc                 5794
o_bp_fc                 5794
p_matches                  0
o_matches     

We can't afford to have missing statistics because almost all of our features will use those.

Trying to complete them would be too complicated, and they represent only about 8% of our dataset, so let's just check which years those records come from:

In [16]:
# Seasons of the rows whose p_1st_in statistic is missing, with their counts.
train_df.loc[train_df['p_1st_in'].isna(), 'season'].value_counts()

2007.0    714
2009.0    664
2008.0    650
2010.0    646
2015.0    626
2011.0    624
2013.0    618
2012.0    618
2014.0    598
2017.0     26
2016.0     10
Name: season, dtype: int64

In [18]:
# Tournaments of the rows whose p_1st_in statistic is missing, with their counts.
train_df.loc[train_df['p_1st_in'].isna(), 'tournament_id'].value_counts()

24.0     2532
21.0     1612
28.0     1560
318.0      48
50.0        6
280.0       4
347.0       4
1.0         4
41.0        4
330.0       2
36.0        2
200.0       2
86.0        2
288.0       2
341.0       2
49.0        2
37.0        2
108.0       2
10.0        2
Name: tournament_id, dtype: int64

In [12]:
# Summary statistics of the season for the rows whose p_1st_in statistic is missing.
train_df.loc[train_df['p_1st_in'].isna(), 'season'].describe()

count    5794.000000
mean     2010.927166
std         2.634907
min      2007.000000
25%      2009.000000
50%      2011.000000
75%      2013.000000
max      2017.000000
Name: season, dtype: float64

In [23]:
# For each season, count the tournaments of the rows whose p_1st_in statistic is missing.
(
    train_df.loc[train_df['p_1st_in'].isna()]
    .groupby('season')['tournament_id']
    .value_counts()
)

season  tournament_id
2007.0  24.0             280
        28.0             190
        21.0             184
        318.0             48
        36.0               2
        86.0               2
        108.0              2
        200.0              2
        288.0              2
        330.0              2
2008.0  24.0             288
        21.0             180
        28.0             180
        280.0              2
2009.0  24.0             286
        21.0             184
        28.0             184
        1.0                4
        41.0               4
        10.0               2
2010.0  24.0             284
        21.0             182
        28.0             180
2011.0  24.0             276
        21.0             180
        28.0             168
2012.0  24.0             278
        21.0             176
        28.0             162
        280.0              2
2013.0  24.0             266
        28.0             178
        21.0             174
2014.0  24.0         

In [41]:
# Inspect the descriptive columns of the three tournaments (ids 21, 24 and 28)
# that account for most of the rows with missing statistics.
train_df.loc[
    train_df['tournament_id'].isin([21, 24, 28]),
    ['tournament_name', 'season', 'level', 'surface', 'best_of'],
]

Unnamed: 0,tournament_name,season,level,surface,best_of
26588,Davis Cup G2,2016.0,D,H,5.0
26589,Davis Cup G2,2016.0,D,H,3.0
26590,Davis Cup G2,2016.0,D,H,3.0
26591,Davis Cup WG,2016.0,D,H,5.0
26592,Davis Cup WG,2016.0,D,H,5.0
...,...,...,...,...,...
65682,Davis Cup WG,2015.0,D,H,3.0
65683,Davis Cup G1,2013.0,D,H,5.0
65684,Davis Cup G2,2007.0,D,H,3.0
65685,Davis Cup WG,2012.0,D,C,5.0


We see that the Davis Cup matches have a lot of missing values. It would be too hard to complete those values, so we will just drop these matches.

We also want to drop the records where player_rank or opponent_rank is missing

In [49]:
# Keep only the rows where the serve statistics and both rankings are present.
train_df = train_df.loc[
    train_df[['p_1st_in', 'player_rank', 'opponent_rank']].notna().all(axis=1)
]

In [50]:
# Re-check the number of missing values per column after dropping rows.
train_df.isna().sum()

match_id                   0
tournament_name            0
season                     0
tournament_id              0
date                       0
level                      0
surface                  114
round                      0
best_of                    0
player_id                  0
player_rank                0
player_elo_rating          0
opponent_id                0
opponent_rank              0
opponent_elo_rating        0
outcome                57592
minutes                 2618
p_1st_in                   0
o_1st_in                   0
p_sv_pt                    0
o_sv_pt                    0
p_1st_won                  0
o_1st_won                  0
p_2nd_won                  0
o_2nd_won                  0
p_ace                      0
o_ace                      0
p_df                       0
o_df                       0
p_bp_sv                    0
o_bp_sv                    0
p_bp_fc                    0
o_bp_fc                    0
p_matches                  0
o_matches     

In [51]:
# Number of remaining rows for each surface code.
train_df.loc[:, 'surface'].value_counts()

H    33690
C    18750
G     6850
P      264
Name: surface, dtype: int64

We also want to drop the records played on carpet (surface value `P`), because that surface is no longer played.

In [56]:
# Remove the rows whose surface is 'P'; rows with a missing surface are kept here.
train_df = train_df.loc[train_df['surface'].ne('P')]

In [57]:
# Missing values per column once the 'P' surface rows are gone.
train_df.isna().sum()

match_id                   0
tournament_name            0
season                     0
tournament_id              0
date                       0
level                      0
surface                  114
round                      0
best_of                    0
player_id                  0
player_rank                0
player_elo_rating          0
opponent_id                0
opponent_rank              0
opponent_elo_rating        0
outcome                57334
minutes                 2618
p_1st_in                   0
o_1st_in                   0
p_sv_pt                    0
o_sv_pt                    0
p_1st_won                  0
o_1st_won                  0
p_2nd_won                  0
o_2nd_won                  0
p_ace                      0
o_ace                      0
p_df                       0
o_df                       0
p_bp_sv                    0
o_bp_sv                    0
p_bp_fc                    0
o_bp_fc                    0
p_matches                  0
o_matches     

In [59]:
# Which tournaments do the rows with a missing surface belong to?
train_df.loc[train_df['surface'].isna(), 'tournament_id'].value_counts()

24.0    68
28.0    46
Name: tournament_id, dtype: int64

Again, these matches are from Davis Cup tournaments, so we can't fill in the missing surface because it changes depending on the location and the year.

In [60]:
# Keep only the rows that have a surface.
train_df = train_df.loc[train_df['surface'].notna()]

In [61]:
# Missing values per column after removing the rows without a surface.
train_df.isna().sum()

match_id                   0
tournament_name            0
season                     0
tournament_id              0
date                       0
level                      0
surface                    0
round                      0
best_of                    0
player_id                  0
player_rank                0
player_elo_rating          0
opponent_id                0
opponent_rank              0
opponent_elo_rating        0
outcome                57224
minutes                 2614
p_1st_in                   0
o_1st_in                   0
p_sv_pt                    0
o_sv_pt                    0
p_1st_won                  0
o_1st_won                  0
p_2nd_won                  0
o_2nd_won                  0
p_ace                      0
o_ace                      0
p_df                       0
o_df                       0
p_bp_sv                    0
o_bp_sv                    0
p_bp_fc                    0
o_bp_fc                    0
p_matches                  0
o_matches     

The `minutes` column isn't a predictor, so we will drop it.

In [62]:
# Remove the 'minutes' column from the training set.
train_df = train_df.drop('minutes', axis=1)

In [65]:
# List the remaining column names as an array.
train_df.columns.to_numpy()

array(['match_id', 'tournament_name', 'season', 'tournament_id', 'date',
       'level', 'surface', 'round', 'best_of', 'player_id', 'player_rank',
       'player_elo_rating', 'opponent_id', 'opponent_rank',
       'opponent_elo_rating', 'outcome', 'p_1st_in', 'o_1st_in',
       'p_sv_pt', 'o_sv_pt', 'p_1st_won', 'o_1st_won', 'p_2nd_won',
       'o_2nd_won', 'p_ace', 'o_ace', 'p_df', 'o_df', 'p_bp_sv',
       'o_bp_sv', 'p_bp_fc', 'o_bp_fc', 'p_matches', 'o_matches',
       'p_sv_gms'], dtype=object)

In [66]:
# Final count of missing values per column in the training set.
train_df.isna().sum()

match_id                   0
tournament_name            0
season                     0
tournament_id              0
date                       0
level                      0
surface                    0
round                      0
best_of                    0
player_id                  0
player_rank                0
player_elo_rating          0
opponent_id                0
opponent_rank              0
opponent_elo_rating        0
outcome                57224
p_1st_in                   0
o_1st_in                   0
p_sv_pt                    0
o_sv_pt                    0
p_1st_won                  0
o_1st_won                  0
p_2nd_won                  0
o_2nd_won                  0
p_ace                      0
o_ace                      0
p_df                       0
o_df                       0
p_bp_sv                    0
o_bp_sv                    0
p_bp_fc                    0
o_bp_fc                    0
p_matches                  0
o_matches                  0
p_sv_gms      

Let's do the same checks and cleaning for the validation and test sets:

In [67]:
# Load the 2018 matches; the file uses ';' as its field separator.
validation_df = pd.read_csv(
    os.path.abspath('../input/2018.csv'),
    sep=';',
)

In [68]:
# Number of missing values in each column of the validation set.
validation_df.isna().sum()

match_id                  0
tournament_name           0
season                    0
tournament_id             0
date                      0
level                     0
surface                   0
round                     0
best_of                   0
player_id                 0
player_rank              28
player_elo_rating         0
opponent_id               0
opponent_rank            28
opponent_elo_rating       0
outcome                5744
minutes                  42
p_1st_in                 24
o_1st_in                 24
p_sv_pt                  24
o_sv_pt                  24
p_1st_won                24
o_1st_won                24
p_2nd_won                24
o_2nd_won                24
p_ace                    24
o_ace                    24
p_df                     24
o_df                     24
p_bp_sv                  24
o_bp_sv                  24
p_bp_fc                  24
o_bp_fc                  24
p_matches                 0
o_matches                 0
p_sv_gms            

In [69]:
# Keep only the rows where the serve statistics, both rankings and the surface are present.
validation_df = validation_df.loc[
    validation_df[['p_1st_in', 'player_rank', 'opponent_rank', 'surface']]
    .notna()
    .all(axis=1)
]

In [70]:
# Remove the 'minutes' column from the validation set.
validation_df = validation_df.drop('minutes', axis=1)

In [71]:
# Missing values per column in the validation set after cleaning.
validation_df.isna().sum()

match_id                  0
tournament_name           0
season                    0
tournament_id             0
date                      0
level                     0
surface                   0
round                     0
best_of                   0
player_id                 0
player_rank               0
player_elo_rating         0
opponent_id               0
opponent_rank             0
opponent_elo_rating       0
outcome                5672
p_1st_in                  0
o_1st_in                  0
p_sv_pt                   0
o_sv_pt                   0
p_1st_won                 0
o_1st_won                 0
p_2nd_won                 0
o_2nd_won                 0
p_ace                     0
o_ace                     0
p_df                      0
o_df                      0
p_bp_sv                   0
o_bp_sv                   0
p_bp_fc                   0
o_bp_fc                   0
p_matches                 0
o_matches                 0
p_sv_gms                  0
dtype: int64

In [72]:
# Number of validation rows for each surface code.
validation_df.loc[:, 'surface'].value_counts()

H    3380
C    1776
G     654
Name: surface, dtype: int64

In [73]:
# Load the 2019 matches; the file uses ';' as its field separator.
test_df = pd.read_csv(
    os.path.abspath('../input/2019.csv'),
    sep=';',
)

In [74]:
# Number of missing values in each column of the test set.
test_df.isna().sum()

match_id                  0
tournament_name           0
season                    0
tournament_id             0
date                      0
level                     0
surface                   0
round                     0
best_of                   0
player_id                 0
player_rank              20
player_elo_rating         0
opponent_id               0
opponent_rank            20
opponent_elo_rating       0
outcome                5232
minutes                 170
p_1st_in                162
o_1st_in                162
p_sv_pt                 162
o_sv_pt                 162
p_1st_won               162
o_1st_won               162
p_2nd_won               162
o_2nd_won               162
p_ace                   162
o_ace                   162
p_df                    162
o_df                    162
p_bp_sv                 162
o_bp_sv                 162
p_bp_fc                 162
o_bp_fc                 162
p_matches                 0
o_matches                 0
p_sv_gms            

In [75]:
# Keep only the rows that have the serve statistics, both rankings and a surface.
test_df = test_df.dropna(subset=['p_1st_in', 'player_rank', 'opponent_rank', 'surface'])
# The training set has its carpet matches (surface 'P') removed, so remove any
# here as well; otherwise the test set could contain a surface value that is
# absent from the training data.
test_df = test_df[test_df['surface'] != 'P']
# 'minutes' is dropped from the training set, so drop it here too.
test_df = test_df.drop(columns=['minutes'])

In [76]:
# Missing values per column in the test set after cleaning.
test_df.isna().sum()

match_id                  0
tournament_name           0
season                    0
tournament_id             0
date                      0
level                     0
surface                   0
round                     0
best_of                   0
player_id                 0
player_rank               0
player_elo_rating         0
opponent_id               0
opponent_rank             0
opponent_elo_rating       0
outcome                5068
p_1st_in                  0
o_1st_in                  0
p_sv_pt                   0
o_sv_pt                   0
p_1st_won                 0
o_1st_won                 0
p_2nd_won                 0
o_2nd_won                 0
p_ace                     0
o_ace                     0
p_df                      0
o_df                      0
p_bp_sv                   0
o_bp_sv                   0
p_bp_fc                   0
o_bp_fc                   0
p_matches                 0
o_matches                 0
p_sv_gms                  0
dtype: int64