### Import Data from CSV

In [153]:
import pandas as pd
import os
import glob
  
  
def load_csv_files(csv_paths):
    """Read every CSV file in ``csv_paths`` and stack the rows into one DataFrame.

    Parameters
    ----------
    csv_paths : iterable of str
        Paths of the CSV files to read, in the order they should be stacked.

    Returns
    -------
    pandas.DataFrame
        The rows of all files, concatenated once. Each file keeps its own
        row labels (the index is not renumbered). An empty DataFrame is
        returned when ``csv_paths`` is empty.
    """
    frames = [pd.read_csv(csv_path) for csv_path in csv_paths]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop, which
    # re-copies all previously loaded rows on every iteration.
    return pd.concat(frames)


path = os.getcwd()
# glob returns matches in arbitrary order; sort so the row order is reproducible.
csv_files = sorted(glob.glob(os.path.join(path, "data/*.csv")))

all_data = load_csv_files(csv_files)

### Data Cleaning

In [154]:
# Columns that the rest of the notebook does not use.
# (The variable keeps its original spelling in case other cells refer to it.)
collumns_to_drop = [
    'tourney_id', 'tourney_date', 'draw_size', 'match_num',
    'winner_id', 'winner_seed', 'winner_entry', 'winner_hand', 'winner_ht',
    'winner_ioc', 'winner_age', 'loser_id', 'loser_name', 'loser_seed', 'loser_entry',
    'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 'w_SvGms',
    'l_SvGms', 'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'
    ]

# Build the cleaned frame in one non-inplace chain. The order matters:
# the unused columns are removed *before* dropna(), so a missing value in
# one of them does not discard the whole row.
all_data = (
    all_data[all_data["tourney_level"] == "G"]  # keep Grand Slam matches only
    # drop=True discards the old row labels instead of adding an 'index'
    # column that would have to be dropped again.
    .reset_index(drop=True)
    # errors="ignore" lets this cell be re-run after the columns are gone.
    .drop(columns=collumns_to_drop, errors="ignore")
    .dropna()  # remove rows that still contain missing values
)
all_data.head()

Unnamed: 0,tourney_name,surface,tourney_level,winner_name,score,best_of,round,minutes,w_ace,w_df,...,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_bpSaved,l_bpFaced
0,Roland Garros,Clay,G,Roger Federer,6-4 6-4 6-3,5,R128,95.0,5.0,1.0,...,1.0,2.0,6.0,4.0,92.0,41.0,26.0,28.0,7.0,12.0
1,Roland Garros,Clay,G,Albert Montanes,7-6(5) 7-6(2) 7-6(3),5,R128,147.0,6.0,2.0,...,4.0,7.0,8.0,2.0,114.0,67.0,48.0,24.0,0.0,3.0
2,Roland Garros,Clay,G,David Marrero,6-3 6-3 5-7 6-4,5,R128,153.0,5.0,1.0,...,5.0,7.0,14.0,3.0,114.0,86.0,57.0,16.0,10.0,15.0
3,Roland Garros,Clay,G,Mario Ancic,6-2 7-6(1) 6-2,5,R128,137.0,9.0,1.0,...,6.0,8.0,4.0,4.0,103.0,72.0,38.0,14.0,7.0,13.0
4,Roland Garros,Clay,G,Alejandro Falla,3-6 7-6(4) 7-6(6) 5-7 6-4,5,R128,216.0,9.0,2.0,...,8.0,11.0,35.0,5.0,163.0,103.0,82.0,37.0,3.0,5.0


### Organize Other Player Data

Remove the matches won by the "Big Three" (Roger Federer, Rafael Nadal and Novak Djokovic) and store the remaining rows in `data_other`.

In [155]:
# Winners that get their own data frame in the cells below.
big_three = ["Roger Federer", "Rafael Nadal", "Novak Djokovic"]

# Exact-name exclusion (rather than three regex substring searches) so this
# filter is the precise complement of the `winner_name == ...` filters used
# for the per-player frames.
data_other = all_data[~all_data["winner_name"].isin(big_three)]
data_other.head()

Unnamed: 0,tourney_name,surface,tourney_level,winner_name,score,best_of,round,minutes,w_ace,w_df,...,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_bpSaved,l_bpFaced
1,Roland Garros,Clay,G,Albert Montanes,7-6(5) 7-6(2) 7-6(3),5,R128,147.0,6.0,2.0,...,4.0,7.0,8.0,2.0,114.0,67.0,48.0,24.0,0.0,3.0
2,Roland Garros,Clay,G,David Marrero,6-3 6-3 5-7 6-4,5,R128,153.0,5.0,1.0,...,5.0,7.0,14.0,3.0,114.0,86.0,57.0,16.0,10.0,15.0
3,Roland Garros,Clay,G,Mario Ancic,6-2 7-6(1) 6-2,5,R128,137.0,9.0,1.0,...,6.0,8.0,4.0,4.0,103.0,72.0,38.0,14.0,7.0,13.0
4,Roland Garros,Clay,G,Alejandro Falla,3-6 7-6(4) 7-6(6) 5-7 6-4,5,R128,216.0,9.0,2.0,...,8.0,11.0,35.0,5.0,163.0,103.0,82.0,37.0,3.0,5.0
5,Roland Garros,Clay,G,Julien Benneteau,6-4 6-4 3-6 3-6 6-3,5,R128,191.0,5.0,9.0,...,4.0,10.0,9.0,4.0,166.0,101.0,66.0,30.0,9.0,16.0


### Organize Federer Data

In [156]:
# Matches won by Roger Federer. `winner_name` is the same on every row after
# the filter, so it is dropped.
# drop() without inplace returns a new frame; calling it in place on a
# filtered slice of `all_data` can trigger pandas' SettingWithCopyWarning.
data_federer = (
    all_data[all_data["winner_name"] == "Roger Federer"]
    .drop(columns=["winner_name"])
)
data_federer.head()

Unnamed: 0,tourney_name,surface,tourney_level,score,best_of,round,minutes,w_ace,w_df,w_svpt,...,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_bpSaved,l_bpFaced
0,Roland Garros,Clay,G,6-4 6-4 6-3,5,R128,95.0,5.0,1.0,79.0,...,1.0,2.0,6.0,4.0,92.0,41.0,26.0,28.0,7.0,12.0
64,Roland Garros,Clay,G,6-7(5) 6-1 6-0 6-4,5,R64,145.0,13.0,1.0,110.0,...,6.0,6.0,2.0,8.0,135.0,83.0,49.0,25.0,12.0,18.0
96,Roland Garros,Clay,G,6-3 6-4 6-2,5,R32,96.0,8.0,1.0,71.0,...,2.0,3.0,2.0,0.0,75.0,47.0,31.0,10.0,2.0,7.0
112,Roland Garros,Clay,G,6-4 7-5 7-5,5,R16,146.0,6.0,3.0,98.0,...,7.0,10.0,5.0,1.0,118.0,74.0,38.0,28.0,10.0,16.0
120,Roland Garros,Clay,G,2-6 6-2 6-3 6-4,5,QF,121.0,11.0,0.0,96.0,...,3.0,6.0,3.0,0.0,100.0,72.0,40.0,17.0,8.0,13.0


### Organize Nadal Data

In [157]:
# Matches won by Rafael Nadal. `winner_name` is the same on every row after
# the filter, so it is dropped.
# drop() without inplace returns a new frame; calling it in place on a
# filtered slice of `all_data` can trigger pandas' SettingWithCopyWarning.
data_nadal = (
    all_data[all_data["winner_name"] == "Rafael Nadal"]
    .drop(columns=["winner_name"])
)
data_nadal.head()

Unnamed: 0,tourney_name,surface,tourney_level,score,best_of,round,minutes,w_ace,w_df,w_svpt,...,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_bpSaved,l_bpFaced
63,Roland Garros,Clay,G,7-5 6-3 6-1,5,R128,154.0,2.0,1.0,90.0,...,2.0,4.0,3.0,5.0,91.0,49.0,25.0,22.0,6.0,13.0
95,Roland Garros,Clay,G,6-4 6-0 6-1,5,R64,114.0,1.0,1.0,74.0,...,7.0,8.0,1.0,3.0,69.0,52.0,25.0,5.0,2.0,9.0
111,Roland Garros,Clay,G,6-1 6-3 6-1,5,R32,118.0,3.0,1.0,78.0,...,7.0,8.0,0.0,0.0,74.0,49.0,26.0,5.0,6.0,13.0
119,Roland Garros,Clay,G,6-1 6-0 6-2,5,R16,114.0,1.0,1.0,69.0,...,3.0,5.0,1.0,6.0,70.0,47.0,20.0,7.0,6.0,15.0
123,Roland Garros,Clay,G,6-1 6-1 6-1,5,QF,104.0,1.0,1.0,65.0,...,5.0,5.0,3.0,3.0,65.0,25.0,13.0,14.0,5.0,13.0


### Organize Djokovic Data

In [158]:
# Matches won by Novak Djokovic. `winner_name` is the same on every row after
# the filter, so it is dropped.
# drop() without inplace returns a new frame; calling it in place on a
# filtered slice of `all_data` can trigger pandas' SettingWithCopyWarning.
data_djoker = (
    all_data[all_data["winner_name"] == "Novak Djokovic"]
    .drop(columns=["winner_name"])
)
data_djoker.head()

Unnamed: 0,tourney_name,surface,tourney_level,score,best_of,round,minutes,w_ace,w_df,w_svpt,...,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_bpSaved,l_bpFaced
47,Roland Garros,Clay,G,4-6 6-3 7-5 6-2,5,R128,149.0,5.0,3.0,112.0,...,3.0,7.0,3.0,5.0,100.0,46.0,30.0,29.0,5.0,12.0
87,Roland Garros,Clay,G,6-1 6-1 6-3,5,R64,80.0,3.0,2.0,68.0,...,0.0,0.0,6.0,2.0,69.0,46.0,26.0,6.0,4.0,10.0
107,Roland Garros,Clay,G,7-5 6-4 6-2,5,R32,137.0,10.0,0.0,85.0,...,3.0,6.0,10.0,3.0,119.0,62.0,37.0,24.0,5.0,12.0
117,Roland Garros,Clay,G,6-4 6-3 6-4,5,R16,140.0,14.0,1.0,77.0,...,7.0,8.0,8.0,4.0,111.0,63.0,45.0,18.0,7.0,12.0
122,Roland Garros,Clay,G,7-5 7-6(3) 7-5,5,QF,187.0,6.0,1.0,125.0,...,4.0,6.0,12.0,7.0,138.0,85.0,58.0,24.0,12.0,16.0
