DATA EXTRACTION

In [8]:
#Data extraction
import pandas as pd
import os
import glob
  
  
# Locate every CSV file in the "data" directory that sits next to the
# current working directory.
path = os.getcwd()
csv_files = glob.glob(os.path.join(path, "../data/*.csv"))

# Read each file exactly once and concatenate in a single call. Calling
# pd.concat inside the loop re-copies the accumulated frame on every
# iteration, which gets slower with each additional file.
frames = [pd.read_csv(csv_file) for csv_file in csv_files]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no CSV files were found.
all_data = pd.concat(frames) if frames else pd.DataFrame()

DATA CLEANING

In [9]:
# Data cleaning: reduce the raw match data to complete Grand Slam rows with
# only the columns used further down.

# Columns that are removed from the frame.
collumns_to_drop = [
    'tourney_id', 'tourney_date', 'draw_size', 'match_num',
    'winner_id', 'winner_seed', 'winner_entry', 'winner_hand', 'winner_ht',
    'winner_ioc', 'winner_age', 'loser_id', 'loser_name', 'loser_seed', 'loser_entry',
    'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 'w_SvGms',
    'l_SvGms', 'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'
    ]

# One chain instead of a series of in-place mutations:
#   1. keep only matches whose tourney_level is "G" (Grand Slams),
#   2. renumber the rows from 0; drop=True discards the old index instead of
#      turning it into an extra "index" column that would have to be dropped,
#   3. remove the unused columns,
#   4. drop every row that still contains a missing value.
all_data = (
    all_data[all_data["tourney_level"] == "G"]
    .reset_index(drop=True)
    .drop(columns=collumns_to_drop)
    .dropna()
)
all_data.head()

Unnamed: 0,tourney_name,surface,tourney_level,winner_name,score,best_of,round,minutes,w_ace,w_df,...,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_bpSaved,l_bpFaced
0,Australian Open,Hard,G,Novak Djokovic,6-3 6-2 6-3,5,F,124.0,8.0,0.0,...,1.0,1.0,3.0,2.0,73.0,47.0,24.0,16.0,3.0,8.0
1,Australian Open,Hard,G,Stefanos Tsitsipas,6-7(11) 7-6(3) 7-5 7-6(5),5,R16,225.0,20.0,1.0,...,12.0,12.0,12.0,0.0,156.0,105.0,83.0,31.0,2.0,3.0
2,Australian Open,Hard,G,Milos Raonic,6-1 6-1 7-6(5),5,R16,119.0,15.0,1.0,...,0.0,1.0,6.0,10.0,90.0,56.0,38.0,12.0,14.0,20.0
3,Australian Open,Hard,G,Frances Tiafoe,4-6 6-4 6-4 7-5,5,R64,179.0,10.0,2.0,...,4.0,6.0,13.0,4.0,134.0,87.0,64.0,23.0,10.0,14.0
4,Australian Open,Hard,G,Roberto Bautista Agut,6-7(6) 6-3 6-2 4-6 6-4,5,R16,238.0,10.0,3.0,...,3.0,5.0,17.0,6.0,167.0,102.0,77.0,28.0,8.0,13.0


SANKEY DATA CREATION BY CONSIDERING TOP 20 TENNIS PLAYERS

In [10]:
# The Sankey diagram is limited to the 20 players with the most wins so that
# the visualization stays readable.

# Total number of Grand Slam matches in the cleaned data.
tgs = all_data.shape[0]

# One row per winner; the number of matches won lands in the 'size' column.
wins_per_player = all_data.groupby('winner_name', as_index=False).size()

# Keep the 20 winners with the largest counts, highest first.
sanky_data = wins_per_player.sort_values(by='size', ascending=False).head(20)
sanky_data

Unnamed: 0,winner_name,size
428,Roger Federer,324
377,Novak Djokovic,284
407,Rafael Nadal,279
50,Andy Murray,182
104,David Ferrer,143
495,Tomas Berdych,140
459,Stan Wawrinka,140
51,Andy Roddick,116
243,Jo-Wilfried Tsonga,115
317,Marin Cilic,115


SOURCE AND TARGET SETUP FOR SANKEY PLOT

In [11]:
# Source node label: the same text on every row, ending with the total
# number of matches.
source_label = 'Grand slam matches from 2003 to 2020: ' + str(tgs)
sanky_data['total_grand_slams'] = source_label

# Target node label: "<winner name>: <matches won>".
# NOTE: winner_name is overwritten with the label, so running this cell a
# second time appends the count to the name again.
wins_as_text = sanky_data['size'].astype(str)
sanky_data["winner_name"] = sanky_data["winner_name"].str.cat(wins_as_text, sep=": ")
sanky_data

Unnamed: 0,winner_name,size,total_grand_slams
428,Roger Federer: 324,324,Grand slam matches from 2003 to 2020: 8732
377,Novak Djokovic: 284,284,Grand slam matches from 2003 to 2020: 8732
407,Rafael Nadal: 279,279,Grand slam matches from 2003 to 2020: 8732
50,Andy Murray: 182,182,Grand slam matches from 2003 to 2020: 8732
104,David Ferrer: 143,143,Grand slam matches from 2003 to 2020: 8732
495,Tomas Berdych: 140,140,Grand slam matches from 2003 to 2020: 8732
459,Stan Wawrinka: 140,140,Grand slam matches from 2003 to 2020: 8732
51,Andy Roddick: 116,116,Grand slam matches from 2003 to 2020: 8732
243,Jo-Wilfried Tsonga: 115,115,Grand slam matches from 2003 to 2020: 8732
317,Marin Cilic: 115,115,Grand slam matches from 2003 to 2020: 8732


PLOTTING DIAGRAM

In [12]:
# Code to generate the Sankey diagram
import plotly.graph_objects as go
 
# Node labels: the distinct source label(s) first, followed by the distinct
# winner labels (the targets). A node's position in this list is the id
# that plotly uses to refer to it.
sources = sanky_data['total_grand_slams'].drop_duplicates().tolist()
platforms = sanky_data['winner_name'].drop_duplicates().tolist()
all_nodes = sources + platforms

# Sizes used to repeat one color for every node and for every link.
node_count = len(all_nodes)
link_count = len(sanky_data['total_grand_slams'])

# Lookup table with one row per node: 'index' is the node id, 'node' its label.
node_ids = pd.DataFrame({'node': all_nodes}).reset_index()

# Attach the node id of the source label and then of the target label to
# each row. The second merge adds the suffixes, which yields the columns
# 'index_source' and 'index_target'.
with_source_id = sanky_data.merge(
    node_ids,
    how='inner',
    left_on="total_grand_slams",
    right_on="node",
)
links = with_source_id.merge(
    node_ids,
    how='inner',
    left_on="winner_name",
    right_on="node",
    suffixes=('_source', '_target'),
)

# Trace definition for plotly.
#   node: the vertical rectangles of the diagram.
#   link: the bands between nodes; "source" and "target" are node ids and
#         "value" sets the thickness of each band.
node_spec = {
    'pad': 15,
    'thickness': 20,
    'line': {'color': "#435951", 'width': 0.5},
    'label': all_nodes,
    'color': ["#84baa6"] * node_count,
}
link_spec = {
    'source': links["index_source"],
    'target': links["index_target"],
    'value': links["size"],
    'color': ['#bdf9e5'] * link_count,
}
sankey_trace = {'type': 'sankey', 'node': node_spec, 'link': link_spec}

# Figure-level settings: title and font size.
figure_layout = {
    'title': "Distribution of Grand Slam match winners (top 20)",
    'font': {'size': 12},
}

fig = go.Figure(data=[sankey_trace], layout=figure_layout)

fig.show()