In [None]:
import numpy as np
import pandas as pd
import plotly as plt
import scipy
import math
from datetime import datetime

## Loading the data set

Reading the .csv files containing the players' and the matches' data.


In [None]:
# Load the three raw CSV files: the two player tables and the match table.
df_male_players, df_female_players, df_matches = (
    pd.read_csv("../data/male_players.csv"),
    pd.read_csv("../data/female_players.csv"),
    pd.read_csv("../data/tennis_matches.csv"),
)

## Data Understanding

Visualize info about the dataframes: summary, data types.

In [None]:
# Print the summary (entries, columns, non-null counts, dtypes, memory) of each
# frame, in the order: male players, female players, matches.
for frame in (df_male_players, df_female_players, df_matches):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55208 entries, 0 to 55207
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     55031 non-null  object
 1   surname  55166 non-null  object
dtypes: object(2)
memory usage: 862.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46172 entries, 0 to 46171
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     44505 non-null  object
 1   surname  46172 non-null  object
dtypes: object(2)
memory usage: 721.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186128 entries, 0 to 186127
Data columns (total 50 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Unnamed: 0          186128 non-null  int64  
 1   tourney_id          186073 non-null  object 
 2   tourney_name        186103 non-null  object 
 3   surface             185940 non-null  o

Print the data types of each attribute of the dataframes

In [None]:
# The two player tables are printed explicitly; the dtypes of the match table
# are left as the last expression so the notebook displays them as cell output.
for players in (df_male_players, df_female_players):
    print(players.dtypes)
df_matches.dtypes

name       object
surname    object
dtype: object
name       object
surname    object
dtype: object


Unnamed: 0              int64
tourney_id             object
tourney_name           object
surface                object
draw_size             float64
tourney_level          object
tourney_date          float64
match_num             float64
winner_id             float64
winner_entry           object
winner_name            object
winner_hand            object
winner_ht             float64
winner_ioc             object
winner_age            float64
loser_id              float64
loser_entry            object
loser_name             object
loser_hand             object
loser_ht              float64
loser_ioc              object
loser_age             float64
score                  object
best_of               float64
round                  object
minutes               float64
w_ace                 float64
w_df                  float64
w_svpt                float64
w_1stIn               float64
w_1stWon              float64
w_2ndWon              float64
w_SvGms               float64
w_bpSaved 

Display how many null values are in the dataframes.

In [None]:
# Count the missing values of every column.
# `isnull()` alone yields a frame of booleans, and only the last expression of
# a cell is displayed, so the per-column totals are computed with `.sum()` and
# the two player tables are printed explicitly.
print(df_female_players.isnull().sum())
print(df_male_players.isnull().sum())
df_matches.isnull().sum()

Unnamed: 0.1,Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_entry,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_spectators,tourney_revenue
0,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186123,False,True,True,False,False,True,False,False,True,True,...,True,True,False,False,True,False,False,False,True,True
186124,False,True,False,True,False,False,False,True,True,True,...,True,True,True,True,False,False,False,True,True,True
186125,False,True,True,True,True,False,True,True,True,True,...,True,True,True,False,True,True,True,True,True,True
186126,False,True,True,True,False,False,False,True,True,True,...,False,True,False,True,False,True,True,True,False,True


Drop null values in the dataframes.

For the `df_male_players` dataframe, we drop all rows in which either `name` or `surname` is missing.

For the `df_female_players` dataframe, we drop all rows in which either `name` or `surname` is missing.

In [None]:
# Remove, in place, every player row that has at least one missing field
# (`dropna` defaults to dropping a row as soon as any column is null).
for players in (df_male_players, df_female_players):
    players.dropna(inplace=True)

### Filling missing values and handling outliers

#### Winner/Loser ID, Winner/Loser Name

Try to fill missing `winner_id`, `loser_id`, `winner_name`, `loser_name`

In [None]:
# Repair missing `winner_id` / `loser_id` / `winner_name` / `loser_name`.
#
# The same player shows up as `winner_*` in some rows and as `loser_*` in
# others, so the known (name, id) pairs are gathered from BOTH roles, always
# pairing a name with the id of the same role. Pairing `winner_name` with
# `loser_id` (or `loser_id` with `winner_name`) would assign the opponent's
# identity to the player.
#
# Only cells that are actually missing are written: values that are already
# present are never overwritten.


def _lookup_from_both_roles(matches, key, value):
    """Map each known `<role>_<key>` to the first non-null `<role>_<value>`.

    Parameters
    ----------
    matches : pd.DataFrame
        Frame holding the `winner_<key>`, `loser_<key>`, `winner_<value>`
        and `loser_<value>` columns.
    key, value : str
        Column suffixes, e.g. ``'name'`` and ``'id'``.

    Returns
    -------
    pd.Series
        Indexed by key (unique). Winner-role pairs take precedence over
        loser-role pairs; within a role the first row of the frame wins.
    """
    pairs = pd.DataFrame({
        'key': pd.concat(
            [matches['winner_' + key], matches['loser_' + key]], ignore_index=True
        ),
        'value': pd.concat(
            [matches['winner_' + value], matches['loser_' + value]], ignore_index=True
        ),
    }).dropna()  # a pair is only usable when both sides are known
    return pairs.drop_duplicates('key').set_index('key')['value']


# Step 1: fill the missing ids from the player's name.
id_by_name = _lookup_from_both_roles(df_matches, 'name', 'id')
for role in ('winner', 'loser'):
    df_matches[role + '_id'] = df_matches[role + '_id'].fillna(
        df_matches[role + '_name'].map(id_by_name)
    )

# Step 2: fill the missing names from the player's id. The lookup is built
# after step 1 so that freshly repaired ids can contribute as well.
name_by_id = _lookup_from_both_roles(df_matches, 'id', 'name')
for role in ('winner', 'loser'):
    df_matches[role + '_name'] = df_matches[role + '_name'].fillna(
        df_matches[role + '_id'].map(name_by_id)
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


For the `df_matches` dataframe, we drop all rows in which `winner_name`, `loser_name`, `winner_id` or `loser_id` are missing.

In [None]:
# Keep only the matches in which both players are fully identified (name and
# id present for winner and loser).
# `.copy()` turns the filtered result into an independent frame, so the column
# assignments performed by the following cells do not operate on a slice of
# the original frame (the source of SettingWithCopyWarning).
df_matches = df_matches.dropna(
    subset=['winner_name', 'loser_name', 'loser_id', 'winner_id']
).copy()

In [None]:
# Both id columns were read as float64; they are converted to int64 here,
# after the cell that drops the rows with a missing id.
df_matches = df_matches.astype({'winner_id': 'int64', 'loser_id': 'int64'})

We now analyze each attribute to:
 1. fix values to comply with domains
 2. enforce the correct datatype on the attribute

#### Surface

We did not perform any transformation on the surface attribute.

In [None]:
# Expected domain: {'Clay', 'Hard', 'Carpet', 'Grass'}.
# Show the distinct values actually present in the column (NaN included).
pd.unique(df_matches['surface'])

array(['Hard', 'Clay', 'Grass', 'Carpet', nan], dtype=object)

#### Draw Size

For draw size:
- we fill null values with 32 (the mean value of draw size)
- since the draw size has to be a power of 2, we substitute all the draw sizes which are not a power of 2 with an appropriate value (e.g.: 31 turns into 32)

In [None]:
# Draw size: possible values = {2**x}.
# Missing values become DEFAULT_DRAW_SIZE; every other value is rounded up to
# the nearest power of two (e.g. 31 -> 32, 33 -> 64).
DEFAULT_DRAW_SIZE = 32


def _round_up_to_power_of_two(size):
    """Return the smallest power of two that is >= `size`.

    Integer bit arithmetic is used instead of `math.log(size, 2) % 1`, whose
    floating-point result is not guaranteed to be exact for every power of two.
    Sizes below 1 cannot be a real draw size and are treated like a missing
    value (DEFAULT_DRAW_SIZE) instead of raising inside `math.log`.
    """
    if size < 1:
        return DEFAULT_DRAW_SIZE
    # (n - 1).bit_length() is the exponent of the smallest power of two >= n.
    return 1 << (math.ceil(size) - 1).bit_length()


# The result is assigned back to the column: calling `fillna(inplace=True)` on
# `df_matches['draw_size']` is a chained in-place update, which does not
# modify the frame when pandas Copy-on-Write is enabled.
df_matches['draw_size'] = (
    df_matches['draw_size']
    .fillna(DEFAULT_DRAW_SIZE)
    .map(_round_up_to_power_of_two)
    .astype('int64')
)

#### Tourney Level

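Replacing the invalid tourney levels (`W`, `O`) and the null values with `U` (undefined). The conversion of `tourney_date` from float to date type is done in the next section.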

In [None]:
# Tourney level
# 'O' and 'W' are not possible values for the attribute. We also notice that 'E', 'J', and 'T' are not present in the dataset.
# Invalid and missing values are both replaced with 'U', which stands for Undefined.
# The cleaned column is assigned back instead of using `inplace=True` on
# `df_matches['tourney_level']`: that chained in-place form does not update
# the frame when pandas Copy-on-Write is enabled.
df_matches['tourney_level'] = (
    df_matches['tourney_level'].replace(['W', 'O'], 'U').fillna('U')
)
df_matches['tourney_level'].unique()

array(['A', 'P', 'G', 'I', 'M', 'PM', 'F', 'D', 'C', '15', '25', '60',
       '100', '80', '10', '50', '75', 'U'], dtype=object)

#### Tourney Date

In [None]:
# Tourney date
# Undefined date is 18000101 (January 1st, 1800)
UNDEFINED_TOURNEY_DATE = 18000101

# The column holds dates as YYYYMMDD numbers stored as floats (e.g. 20181231.0).
# They are converted to integer strings ('20181231') before parsing, so the
# text handed to `to_datetime` matches the "%Y%m%d" format exactly and never
# carries a trailing '.0'.
df_matches['tourney_date'] = pd.to_datetime(
    df_matches['tourney_date']
    .fillna(UNDEFINED_TOURNEY_DATE)
    .astype('int64')
    .astype(str),
    format="%Y%m%d",
)
df_matches['tourney_date'].head()

0   2018-12-31
1   2018-12-31
2   2018-12-31
3   2018-12-31
4   2018-12-31
Name: tourney_date, dtype: datetime64[ns]

#### Match Num

In [None]:
# Match_num. Undefined match_num is -1.
# Missing values are replaced with the sentinel and the column is cast to
# int64. The result is assigned back because a chained
# `df_matches['match_num'].fillna(..., inplace=True)` does not update the
# frame when pandas Copy-on-Write is enabled.
df_matches['match_num'] = df_matches['match_num'].fillna(-1).astype('int64')
df_matches['match_num'].head()

0    300
1    299
2    298
3    297
4    296
Name: match_num, dtype: int64

#### Winner Hand

In [None]:
# winner_hand and loser_hand can only assume values: {R, L, U}
# Missing values are set to 'U'. The filled column is assigned back rather
# than filled through a chained `inplace=True` call, which does not update the
# frame when pandas Copy-on-Write is enabled.
for hand_column in ('winner_hand', 'loser_hand'):
    df_matches[hand_column] = df_matches[hand_column].fillna('U')

#### Height

In [None]:
# winner_ht and loser_ht: outliers and missing values are set to -1, then the
# columns are cast to integers.
#
# Outliers are detected with the interquartile-range rule:
#   lower limit = Q1 - 1.5 * (Q3 - Q1)
#   upper limit = Q3 + 1.5 * (Q3 - Q1)
# The quartiles are computed BEFORE the -1 sentinel is inserted (`quantile`
# skips NaN), so missing heights do not distort the limits, and each column is
# checked against its OWN limits.
HEIGHT_UNDEFINED = -1

for height_column in ('winner_ht', 'loser_ht'):
    heights = df_matches[height_column]
    first_quartile, third_quartile = heights.quantile([0.25, 0.75])
    iqr = third_quartile - first_quartile
    is_outlier = (
        (heights < first_quartile - 1.5 * iqr)
        | (heights > third_quartile + 1.5 * iqr)
    )
    # Outliers become NaN first so that they share the sentinel fill below.
    df_matches[height_column] = (
        heights.mask(is_outlier).fillna(HEIGHT_UNDEFINED).astype('int64')
    )


#### IOC

In [None]:
# Number of matches with a missing country code, winner first and loser second.
for ioc_column in ('winner_ioc', 'loser_ioc'):
    print(df_matches[ioc_column].isnull().sum())

9
4


In [None]:
# Fixing winner_ioc and loser_ioc.
#
# A missing country code is recovered from any other match of the same player
# (matched by name), whether that player appears there as winner or as loser.
# A name is always paired with the ioc of the SAME role, and the recovered
# code is written to the rows where the player holds that role; only missing
# cells are filled.


def _ioc_by_player_name(matches):
    """Map each player name to the first non-null country code recorded for it.

    Pairs are read from (`winner_name`, `winner_ioc`) first and from
    (`loser_name`, `loser_ioc`) second, so a code seen on a winner row takes
    precedence. The returned Series has a unique index of player names.
    """
    pairs = pd.DataFrame({
        'name': pd.concat(
            [matches['winner_name'], matches['loser_name']], ignore_index=True
        ),
        'ioc': pd.concat(
            [matches['winner_ioc'], matches['loser_ioc']], ignore_index=True
        ),
    }).dropna()
    return pairs.drop_duplicates('name').set_index('name')['ioc']


ioc_by_name = _ioc_by_player_name(df_matches)
for role in ('winner', 'loser'):
    df_matches[role + '_ioc'] = df_matches[role + '_ioc'].fillna(
        df_matches[role + '_name'].map(ioc_by_name)
    )


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [None]:
# Re-count the matches still lacking a country code after the repair step.
print(df_matches['winner_ioc'].isnull().sum())
print(df_matches['loser_ioc'].isnull().sum())

0
3


### Age

Substituting all outliers with -1 value:

In [None]:
# How many rows already carry the value -1 as winner age.
len(df_matches.loc[df_matches['winner_age'] == -1])

0

In [None]:
# How many rows already carry the value -1 as loser age.
len(df_matches.loc[df_matches['loser_age'] == -1])

0

In [None]:
# Interquartile-range rule for the winner's age:
#   lower limit = Q1 - 1.5 * (Q3 - Q1)
#   upper limit = Q3 + 1.5 * (Q3 - Q1)
# Ages outside [lower limit, upper limit] are replaced with -1; null ages do
# not satisfy either comparison and are left as they are.
quantiles_winner_age = df_matches['winner_age'].quantile([0.25, 0.5, 0.75])
Q1, Q2, Q3 = quantiles_winner_age.values

winner_lower_limit = Q1 - 1.5 * (Q3 - Q1)
winner_upper_limit = Q3 + 1.5 * (Q3 - Q1)

winner_age_is_outlier = (
    (df_matches['winner_age'] < winner_lower_limit)
    | (df_matches['winner_age'] > winner_upper_limit)
)
df_matches.loc[winner_age_is_outlier, 'winner_age'] = -1


In [None]:
# Interquartile-range rule for the loser's age:
#   lower limit = Q1 - 1.5 * (Q3 - Q1)
#   upper limit = Q3 + 1.5 * (Q3 - Q1)
# Ages outside [lower limit, upper limit] are replaced with -1; null ages do
# not satisfy either comparison and are left as they are.
quantiles_loser_age = df_matches['loser_age'].quantile([0.25, 0.5, 0.75])
Q1, Q2, Q3 = quantiles_loser_age.values

loser_lower_limit = Q1 - 1.5 * (Q3 - Q1)
loser_upper_limit = Q3 + 1.5 * (Q3 - Q1)

loser_age_is_outlier = (
    (df_matches['loser_age'] < loser_lower_limit)
    | (df_matches['loser_age'] > loser_upper_limit)
)
df_matches.loc[loser_age_is_outlier, 'loser_age'] = -1

Doing the same for null values:

In [None]:
# Null ages get the same -1 sentinel used for outliers, then both columns are
# cast to int64 (the cast drops the fractional part of the age).
# The result is assigned back: a chained `fillna(..., inplace=True)` on
# `df_matches[...]` does not update the frame when pandas Copy-on-Write is
# enabled.
for age_column in ('winner_age', 'loser_age'):
    df_matches[age_column] = df_matches[age_column].fillna(-1).astype('int64')

# Ids of the players that have at least one match row with an undefined age.
no_age_winners = df_matches.loc[df_matches['winner_age'] == -1, 'winner_id'].unique()
no_age_losers = df_matches.loc[df_matches['loser_age'] == -1, 'loser_id'].unique()

#2833

In [None]:
# Rows whose winner age is undefined (-1) after outlier and null handling.
int((df_matches['winner_age'] == -1).sum())

4051

In [None]:
# Rows whose loser age is undefined (-1) after outlier and null handling.
int((df_matches['loser_age'] == -1).sum())


7743

Try to compute, if possible, the correct age of the players:

In [None]:
# Reconstruct undefined loser ages (-1) from the player's other matches.
#
# For each player a birth year is estimated as
#     (year of the match) - (age recorded in that match)
# using the first row where the player is the winner with a known age, or,
# when there is none, the first row where the player is the loser with a known
# age. Rows carrying the placeholder date (year <= 1900) are never used as a
# reference and are never filled.
#
# This is a vectorized form of a per-player scan: one lookup table is built
# once and then applied to the whole column, instead of filtering the full
# frame for every player and writing the ages row by row.
match_year = df_matches['tourney_date'].dt.year
has_real_date = match_year > 1900

known_as_winner = has_real_date & (df_matches['winner_age'] != -1)
known_as_loser = has_real_date & (df_matches['loser_age'] != -1)
birth_year_candidates = pd.concat(
    [
        pd.DataFrame({
            'player_id': df_matches.loc[known_as_winner, 'winner_id'],
            'birth_year': (match_year - df_matches['winner_age'])[known_as_winner],
        }),
        pd.DataFrame({
            'player_id': df_matches.loc[known_as_loser, 'loser_id'],
            'birth_year': (match_year - df_matches['loser_age'])[known_as_loser],
        }),
    ],
    ignore_index=True,
)
# Winner rows come first in the concatenation, so keeping the first entry per
# player gives winner-role references precedence over loser-role ones.
birth_year_by_id = (
    birth_year_candidates.drop_duplicates('player_id')
    .set_index('player_id')['birth_year']
)

# Players without any usable reference map to NaN and are left at -1.
estimated_loser_age = match_year - df_matches['loser_id'].map(birth_year_by_id)
can_fill = (
    has_real_date
    & (df_matches['loser_age'] == -1)
    & estimated_loser_age.notna()
)
df_matches.loc[can_fill, 'loser_age'] = estimated_loser_age[can_fill].astype('int64')

In [None]:
# Reconstruct undefined winner ages (-1) from the player's other matches.
#
# For each player a birth year is estimated as
#     (year of the match) - (age recorded in that match)
# using the first row where the player is the winner with a known age, or,
# when there is none, the first row where the player is the loser with a known
# age. Rows carrying the placeholder date (year <= 1900) are never used as a
# reference and are never filled.
#
# This is a vectorized form of a per-player scan: one lookup table is built
# once and then applied to the whole column, instead of filtering the full
# frame for every player and writing the ages row by row.
match_year = df_matches['tourney_date'].dt.year
has_real_date = match_year > 1900

known_as_winner = has_real_date & (df_matches['winner_age'] != -1)
known_as_loser = has_real_date & (df_matches['loser_age'] != -1)
birth_year_candidates = pd.concat(
    [
        pd.DataFrame({
            'player_id': df_matches.loc[known_as_winner, 'winner_id'],
            'birth_year': (match_year - df_matches['winner_age'])[known_as_winner],
        }),
        pd.DataFrame({
            'player_id': df_matches.loc[known_as_loser, 'loser_id'],
            'birth_year': (match_year - df_matches['loser_age'])[known_as_loser],
        }),
    ],
    ignore_index=True,
)
# Winner rows come first in the concatenation, so keeping the first entry per
# player gives winner-role references precedence over loser-role ones.
birth_year_by_id = (
    birth_year_candidates.drop_duplicates('player_id')
    .set_index('player_id')['birth_year']
)

# Players without any usable reference map to NaN and are left at -1.
estimated_winner_age = match_year - df_matches['winner_id'].map(birth_year_by_id)
can_fill = (
    has_real_date
    & (df_matches['winner_age'] == -1)
    & estimated_winner_age.notna()
)
df_matches.loc[can_fill, 'winner_age'] = estimated_winner_age[can_fill].astype('int64')

In [None]:
# Winner ages still undefined (-1) after the reconstruction step.
len(df_matches.loc[df_matches['winner_age'] == -1])

3209

In [None]:
# Loser ages still undefined (-1) after the reconstruction step.
len(df_matches.loc[df_matches['loser_age'] == -1])

7000

In [None]:
# Checkpoint: store the frame as it stands after the age clean-up, without
# writing the index as a column.
df_matches.to_csv(
    path_or_buf="../data/cleaned/tennis_matches_age_cleaned.csv",
    index=False,
)

#### start from here

In [None]:
# Reload the checkpoint written after the age clean-up.
# CSV files carry no dtype information, so `tourney_date` is parsed explicitly;
# otherwise it would come back as plain text instead of datetime64.
df_matches = pd.read_csv(
    "../data/cleaned/tennis_matches_age_cleaned.csv",
    parse_dates=['tourney_date'],
)

#### Best of

Filling null values with 3 (the mean value)

In [None]:
# Missing `best_of` values are set to 3 and the column is cast to int64.
# The result is assigned back: a chained `fillna(..., inplace=True)` on
# `df_matches['best_of']` does not update the frame when pandas Copy-on-Write
# is enabled.
df_matches['best_of'] = df_matches['best_of'].fillna(3).astype('int64')

#### Round

Fixing BR typo and filling null values with 'U' (unknown)

In [None]:
# Rewrite the round code "BR" as "RR" and mark missing rounds as 'U' (unknown).
# The cleaned column is assigned back rather than filled through a chained
# `inplace=True` call, which does not update the frame when pandas
# Copy-on-Write is enabled.
df_matches['round'] = df_matches['round'].replace("BR", "RR").fillna('U')

#### Minutes

Setting to -1 all the null values and then converting the attribute to integer type

In [None]:
# Missing `minutes` values are set to the sentinel -1. Only nulls are touched
# here: a recorded value of 0 is kept as it is, and the integer cast happens
# in the outlier cell that follows.
# The filled column is assigned back because a chained
# `fillna(..., inplace=True)` on `df_matches['minutes']` does not update the
# frame when pandas Copy-on-Write is enabled.
df_matches['minutes'] = df_matches['minutes'].fillna(-1)

Setting all the outliers to the median value of minutes (Q2).

In [None]:
# Interquartile-range rule for the match duration:
#   lower limit = Q1 - 1.5 * (Q3 - Q1)
#   upper limit = Q3 + 1.5 * (Q3 - Q1)
# Durations outside the limits are replaced with the median (Q2).
#
# The -1 sentinel marks a missing duration, not a measurement: it is excluded
# from the quartile computation (so it cannot pull the limits down) and it is
# never replaced with the median, so missing durations stay recognisable.
MINUTES_UNDEFINED = -1

minutes = df_matches['minutes']
is_known = minutes.notna() & (minutes != MINUTES_UNDEFINED)

quantiles_minutes = minutes[is_known].quantile([0.25, 0.5, 0.75])
Q1, Q2, Q3 = quantiles_minutes.values

minutes_lower_limit = Q1 - 1.5 * (Q3 - Q1)
minutes_upper_limit = Q3 + 1.5 * (Q3 - Q1)

is_outlier = is_known & (
    (minutes < minutes_lower_limit) | (minutes > minutes_upper_limit)
)
# The int64 cast drops the fractional part if the median is not a whole number.
df_matches['minutes'] = minutes.mask(is_outlier, Q2).astype('int64')

#### Ace

In [None]:
"""
df_matches['w_ace'].fillna(-1, inplace=True)
df_matches['w_ace'] = df_matches['w_ace'].astype('int64')

df_matches['l_ace'].fillna(-1, inplace=True)
df_matches['l_ace'] = df_matches['l_ace'].astype('int64')
"""

"\ndf_matches['w_ace'].fillna(-1, inplace=True)\ndf_matches['w_ace'] = df_matches['w_ace'].astype('int64')\n\ndf_matches['l_ace'].fillna(-1, inplace=True)\ndf_matches['l_ace'] = df_matches['l_ace'].astype('int64')\n"

In [None]:
"""
#Q1 – 1.5 * (Q3-Q1) lower limit
#Q3 + 1.5 * (Q3-Q1) upper limit

quantiles_waces = df_matches['w_ace'].quantile([0.25,0.5,0.75])

Q1 = quantiles_waces.values[0]
Q2 = quantiles_waces.values[1]
Q3 = quantiles_waces.values[2]

w_ace_lower_limit = Q1 - 1.5 * (Q3-Q1)
w_ace_upper_limit = Q3 + 1.5 * (Q3-Q1)

df_matches.loc[df_matches.w_ace < w_ace_lower_limit, 'w_ace'] = Q2
df_matches.loc[df_matches.w_ace > w_ace_upper_limit, 'w_ace'] = Q2

quantiles_laces = df_matches['l_ace'].quantile([0.25,0.5,0.75])

Q1 = quantiles_laces.values[0]
Q2 = quantiles_laces.values[1]
Q3 = quantiles_laces.values[2]

l_ace_lower_limit = Q1 - 1.5 * (Q3-Q1)
l_ace_upper_limit = Q3 + 1.5 * (Q3-Q1)

df_matches.loc[df_matches.l_ace < l_ace_lower_limit, 'l_ace'] = Q2
df_matches.loc[df_matches.l_ace > l_ace_upper_limit, 'l_ace'] = Q2
"""

"\n#Q1 – 1.5 * (Q3-Q1) lower limit\n#Q3 + 1.5 * (Q3-Q1) upper limit\n\nquantiles_waces = df_matches['w_ace'].quantile([0.25,0.5,0.75])\n\nQ1 = quantiles_waces.values[0]\nQ2 = quantiles_waces.values[1]\nQ3 = quantiles_waces.values[2]\n\nw_ace_lower_limit = Q1 - 1.5 * (Q3-Q1)\nw_ace_upper_limit = Q3 + 1.5 * (Q3-Q1)\n\ndf_matches.loc[df_matches.w_ace < w_ace_lower_limit, 'w_ace'] = Q2\ndf_matches.loc[df_matches.w_ace > w_ace_upper_limit, 'w_ace'] = Q2\n\nquantiles_laces = df_matches['l_ace'].quantile([0.25,0.5,0.75])\n\nQ1 = quantiles_laces.values[0]\nQ2 = quantiles_laces.values[1]\nQ3 = quantiles_laces.values[2]\n\nl_ace_lower_limit = Q1 - 1.5 * (Q3-Q1)\nl_ace_upper_limit = Q3 + 1.5 * (Q3-Q1)\n\ndf_matches.loc[df_matches.l_ace < l_ace_lower_limit, 'l_ace'] = Q2\ndf_matches.loc[df_matches.l_ace > l_ace_upper_limit, 'l_ace'] = Q2\n"

#### TODO (Short Attributes)

#### Rank

In [None]:
# Missing ranks are set to the sentinel -1 and both columns are cast to int64.
# The result is assigned back: a chained `fillna(..., inplace=True)` on
# `df_matches[...]` does not update the frame when pandas Copy-on-Write is
# enabled.
for rank_column in ('winner_rank', 'loser_rank'):
    df_matches[rank_column] = df_matches[rank_column].fillna(-1).astype('int64')

#### Points

We delete winner_rank_points and loser_rank_points since:
- winner_rank and loser_rank give us similar information
- data distribution does not hold significant information

In [None]:
# NOTE(review): the drop below is commented out, so this cell does nothing and
# `winner_rank_points` / `loser_rank_points` remain in `df_matches` (and in
# the CSV written at the end). Uncomment it to actually delete the two columns.
#df_matches.drop(['winner_rank_points', 'loser_rank_points'], axis=1, inplace=True)

## Duplicates

In [None]:
# Remove, in place, the rows that are exact copies of an earlier row
# (the original note on this cell reads: "There are no duplicates").
# NOTE(review): `df_matches` carries an `Unnamed: 0` column; if it is a unique
# per-row counter, no two match rows can ever compare as equal — confirm.
for frame in (df_male_players, df_female_players, df_matches):
    frame.drop_duplicates(inplace=True)

In [None]:
# Write the three cleaned frames to disk, without the index column.
cleaned_outputs = (
    (df_matches, "../data/cleaned/tennis_matches_cleaned.csv"),
    (df_male_players, "../data/cleaned/male_players_cleaned.csv"),
    (df_female_players, "../data/cleaned/female_players_cleaned.csv"),
)
for frame, destination in cleaned_outputs:
    frame.to_csv(destination, index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=de87b132-0371-4d99-94f9-c61923e2507d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>