In [38]:
import numpy as np
import pandas as pd
import plotly as plt
import scipy

## Loading the data set

Reading the .csv file containing the player's data.


In [64]:
df_male_players = pd.read_csv("../data/male_players.csv")
df_female_players = pd.read_csv("../data/female_players.csv")
df_matches = pd.read_csv("../data/tennis_matches.csv")

ValueError: cannot safely convert passed user dtype of int64 for float64 dtyped data in column 14

## Data Understanding

Visualize info about the dataframes: summary, data types.

In [40]:
df_male_players.info()
df_female_players.info()
df_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55208 entries, 0 to 55207
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     55031 non-null  object
 1   surname  55166 non-null  object
dtypes: object(2)
memory usage: 862.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46172 entries, 0 to 46171
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     44505 non-null  object
 1   surname  46172 non-null  object
dtypes: object(2)
memory usage: 721.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186128 entries, 0 to 186127
Data columns (total 50 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Unnamed: 0          186128 non-null  int64  
 1   tourney_id          186073 non-null  object 
 2   tourney_name        186103 non-null  object 
 3   surface             185940 non-null  o

Print the data types of each attribute of the dataframes

In [63]:
print(df_male_players.dtypes)
print(df_female_players.dtypes)
df_matches.dtypes

name       object
surname    object
dtype: object
name       object
surname    object
dtype: object


Unnamed: 0              int64
tourney_id             object
tourney_name           object
surface                object
draw_size             float64
tourney_level          object
tourney_date          float64
match_num             float64
winner_id             float64
winner_entry           object
winner_name            object
winner_hand            object
winner_ht             float64
winner_ioc             object
winner_age            float64
loser_id              float64
loser_entry            object
loser_name             object
loser_hand             object
loser_ht              float64
loser_ioc              object
loser_age             float64
score                  object
best_of               float64
round                  object
minutes               float64
w_ace                 float64
w_df                  float64
w_svpt                float64
w_1stIn               float64
w_1stWon              float64
w_2ndWon              float64
w_SvGms               float64
w_bpSaved 

Display how many null values are in the dataframes.

In [42]:
df_female_players.isnull()
df_male_players.isnull()
df_matches.isnull()

Unnamed: 0.1,Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_entry,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_spectators,tourney_revenue
0,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186123,False,True,True,False,False,True,False,False,True,True,...,True,True,False,False,True,False,False,False,True,True
186124,False,True,False,True,False,False,False,True,True,True,...,True,True,True,True,False,False,False,True,True,True
186125,False,True,True,True,True,False,True,True,True,True,...,True,True,True,False,True,True,True,True,True,True
186126,False,True,True,True,False,False,False,True,True,True,...,False,True,False,True,False,True,True,True,False,True


Drop null values in the dataframes.

For the `df_male_players` dataframe, we drop all rows in which either `name` or `surname` is missing.

For the `df_female_players` dataframe, we drop all rows in which either `name` or `surname` is missing.

For the `df_matches` dataframe, we drop all rows in which either `winner_name` or `loser_name` is missing.

In [43]:
df_male_players.dropna(inplace=True)
df_female_players.dropna(inplace=True)
df_matches = df_matches[(df_matches['winner_name'].notna()) & (df_matches['loser_name'].notna())] #46 elements

Remove duplicates from dataframes.


In [44]:
## There are no duplicates

df_male_players.drop_duplicates(inplace=True)
df_female_players.drop_duplicates(inplace=True)
df_matches.drop_duplicates(inplace=True)

We now analyze each attribute to:
 1. fix values to comply with doimains
 2. enforce the correct datatype on the attribute

In [None]:
# Tourney level
# 'O' and 'W' are not possible values for the attribute. We also notice that 'E', 'J', and 'T' are not present in the dataset.
# We replace such values with U that stands for Undefined
df_matches['tourney_level'].replace(['W', 'O'], 'U', inplace=True)
df_matches['tourney_level'].fillna('U', inplace=True)
df_matches['tourney_level'].unique()

In [None]:
# Tourney date
# Undefined date is 18000101 (January 1st, 1800)
# df_matches['tourney_date'].head()
df_matches['tourney_date'].fillna(18000101, inplace=True)
df_matches['tourney_date'] = pd.to_datetime(df_matches['tourney_date'], format="%Y%m%d")
df_matches.head()

## Data Integration



Create a dataframe with all the male matches and another one for the female matches

In [45]:
df_male_players_tmp = pd.DataFrame()
df_male_players_tmp['player_name'] = df_male_players['name'] + ' ' + df_male_players['surname']
df_male_winners = df_male_players_tmp.join(df_matches.set_index('winner_name'), on='player_name', how='inner')
df_male_losers = df_male_players_tmp.join(df_matches.set_index('loser_name'), on='player_name', how='inner')
df_male_matches = pd.concat([df_male_winners, df_male_losers])

In [62]:
df_male_matches.describe()

Unnamed: 0.1,Unnamed: 0,draw_size,tourney_date,match_num,winner_id,winner_ht,winner_age,loser_id,loser_ht,loser_age,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_spectators,tourney_revenue
count,116822.0,116822.0,116822.0,116822.0,116822.0,56918.0,116791.0,116822.0,45994.0,116661.0,...,113313.0,113317.0,113313.0,113313.0,115459.0,115431.0,111729.0,111699.0,116822.0,116822.0
mean,74959.319794,48.098406,20180990.0,284.05469,117099.151367,186.52732,26.224712,119370.796947,186.343153,26.050706,...,13.367389,11.213569,4.542983,8.397068,250.926874,644.63988,319.705036,440.645941,4820.201204,967007.9
std,33704.73952,30.187381,15805.75,317.432686,25595.6988,7.050625,4.43147,28192.43563,6.927988,4.559639,...,6.272846,3.488622,3.061492,3.724259,240.466802,1306.114844,327.758876,756.380896,3207.48336,718100.7
min,0.0,4.0,20160100.0,1.0,100644.0,145.0,14.798084,100644.0,145.0,14.409309,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,173.0,35879.62
25%,63473.0,32.0,20170310.0,247.0,105254.0,183.0,22.847365,105343.0,183.0,22.628337,...,9.0,9.0,2.0,6.0,98.0,135.0,118.0,100.0,2923.0,574481.2
50%,79035.5,32.0,20180530.0,274.0,106022.0,185.0,25.924709,106078.0,185.0,25.776865,...,13.0,10.0,4.0,8.0,193.0,288.0,228.0,234.0,3611.0,714066.5
75%,102269.0,64.0,20190800.0,288.0,117357.0,190.0,29.218344,123743.0,190.0,29.081451,...,17.0,14.0,6.0,11.0,329.0,607.0,385.0,504.0,5808.0,1105200.0
max,163138.0,128.0,20210820.0,7316.0,222632.0,211.0,95.0,222632.0,211.0,53.932923,...,55.0,50.0,27.0,35.0,2220.0,16950.0,2257.0,16950.0,18086.0,5002794.0


In [46]:
df_female_players_tmp = pd.DataFrame()
df_female_players_tmp['player_name'] = df_female_players['name'] + ' ' + df_female_players['surname']
df_female_winners = df_female_players_tmp.join(df_matches.set_index('winner_name'), on='player_name', how='inner')
df_female_losers = df_female_players_tmp.join(df_matches.set_index('loser_name'), on='player_name', how='inner')
df_female_matches = pd.concat([df_female_winners, df_female_losers])