In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the second-pass cleaned dataset into the working frame.
df = pd.read_csv(filepath_or_buffer='Sources/final_dataset_secondclean.csv')

In [3]:
# (Re)Définitions des taux suivants :
# The Serve Rating© adds four service metrics percentages plus the average number of aces per match and subtracts the average number of double faults per match.
# A player's Return Rating© is determined by adding his winning percentage in the four service return categories.
# A player's Under Pressure Rating© is calculated by adding the percentage of break points converted and saved, percentage of tie-breaks won and percentage of deciding sets won.

In [4]:
# Flag matches that went to a deciding set: the number of sets played
# (winner's sets + loser's sets) equals the 'Best of' value.
# Computed column-wise so the result stays aligned with df's own index
# instead of relying on a freshly built 0..n-1 Series index.
sets_played = df['winner_sets_won'].astype('int') + df['loser_sets_won'].astype('int')

# 1 when the match reached the deciding set, 0 otherwise.
df['winner_deciding_sets_won'] = sets_played.eq(df['Best of']).astype('int64')

In [5]:
# Convert the raw "won" counts into shares of the corresponding match total
# (sets, games, tie-breaks, points) for both the winner and the loser.
# A zero total yields NaN/inf here, following pandas division semantics.
for stat_name, total_col in (
    ('sets', 'nb_sets'),
    ('games', 'nb_games'),
    ('tiebreaks', 'nb_tiebreaks'),
    ('points', 'nb_points'),
):
    for side in ('winner', 'loser'):
        won_col = f'{side}_{stat_name}_won'
        df[won_col] = df[won_col] / df[total_col]

In [6]:
# Replace missing tie-break shares with 0.
# Reassign the column rather than calling fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated in recent pandas and
# leaves df unchanged when Copy-on-Write is enabled.
df['winner_tiebreaks_won'] = df['winner_tiebreaks_won'].fillna(0)
df['loser_tiebreaks_won'] = df['loser_tiebreaks_won'].fillna(0)

In [7]:
# serve_rating
# Sum of four service metrics scaled by 100, plus aces, minus double faults,
# computed the same way for the winner and the loser.
# NOTE(review): break_points_saved is used as the fourth service metric, and the
# * 100 scaling assumes these columns hold fractions rather than counts - TODO confirm.
for side in ('winner', 'loser'):
    serve_metrics_sum = (
        df[f'{side}_first_serves_in']
        + df[f'{side}_first_serve_points_won']
        + df[f'{side}_second_serve_points_won']
        + df[f'{side}_break_points_saved']
    )
    df[f'{side}_serve_rating'] = (
        serve_metrics_sum * 100 + df[f'{side}_aces'] - df[f'{side}_double_faults']
    )

In [8]:
# return_rating
# Sum of first-serve return won, second-serve return won and break points
# converted, scaled by 100, for the winner and the loser.
# NOTE(review): only three return metrics are summed, while the rating definition
# earlier in the notebook mentions four categories - confirm this is intended.
for side in ('winner', 'loser'):
    return_metrics_sum = (
        df[f'{side}_first_serve_return_won']
        + df[f'{side}_second_serve_return_won']
        + df[f'{side}_break_points_converted']
    )
    df[f'{side}_return_rating'] = return_metrics_sum * 100

In [9]:
# under_pressure_rating
# Break points saved + break points converted + tie-breaks won, scaled by 100.
# The winner's rating additionally includes the deciding-set flag; the loser's
# has no such term (presumably because the loser cannot have won a deciding set).
winner_pressure_sum = (
    df['winner_break_points_saved']
    + df['winner_break_points_converted']
    + df['winner_tiebreaks_won']
    + df['winner_deciding_sets_won']
)
df['winner_under_pressure_rating'] = winner_pressure_sum * 100

loser_pressure_sum = (
    df['loser_break_points_saved']
    + df['loser_break_points_converted']
    + df['loser_tiebreaks_won']
)
df['loser_under_pressure_rating'] = loser_pressure_sum * 100

In [10]:
# Columns to drop now that the statistics have been grouped into the three ratings.
col_to_drop = [
    'winner_total_points_won',
    'loser_total_points_won',
    'winner_deciding_sets_won',
]

In [11]:
# Remove the columns listed in col_to_drop from the frame.
df = df.drop(columns=col_to_drop)

In [12]:
# Give the rank columns the same winner_/loser_ prefix as the other player columns.
df = df.rename(columns={'WRank': 'winner_rank', 'LRank': 'loser_rank'})

In [13]:
# Keep and reorder the columns: match-level information first, then the
# winner's columns, then the loser's columns in the same per-player order.
match_columns = [
    'Tournament', 'Date', 'year', 'month', 'day', 'Series', 'Court', 'Surface',
    'prize_money_euro', 'Round', 'Best of', 'match_duration',
    'nb_sets', 'nb_games', 'nb_tiebreaks', 'nb_points',
    'B365W', 'B365L', 'PSW', 'PSL',
]

# Per-player column suffixes; each is prefixed with 'winner_' and then 'loser_'.
player_suffixes = [
    # profile
    'name', 'rank', 'height_cm', 'weight_kg', 'handedness', 'backhand',
    'at_home', 'age', 'nb_year_xp',
    # shares of sets / games / tie-breaks / points won
    'sets_won', 'games_won', 'tiebreaks_won', 'points_won',
    # serve
    'serve_rating', 'aces', 'double_faults',
    'first_serves_in', 'first_serves_total',
    'first_serve_points_won', 'first_serve_points_total',
    'second_serve_points_won', 'second_serve_points_total',
    'break_points_saved', 'break_points_serve_total',
    'service_games_played',
    # return
    'return_rating',
    'first_serve_return_won', 'first_serve_return_total',
    'second_serve_return_won', 'second_serve_return_total',
    'break_points_converted', 'break_points_return_total',
    'return_games_played',
    # service / return point totals
    'service_points_won', 'service_points_total',
    'return_points_won', 'return_points_total',
    # pressure
    'under_pressure_rating',
]

ordered_columns = (
    match_columns
    + ['winner_' + suffix for suffix in player_suffixes]
    + ['loser_' + suffix for suffix in player_suffixes]
)
df = df[ordered_columns]

In [14]:
# Cast the whole-number columns to integers in a single astype call.
# One mapping replaces eighteen per-column assignments and returns a new frame,
# so no column is assigned onto a frame that was itself produced by a selection.
integer_columns = [
    'prize_money_euro', 'match_duration',
    'nb_sets', 'nb_games', 'nb_tiebreaks', 'nb_points',
    'winner_height_cm', 'winner_weight_kg', 'winner_at_home', 'winner_nb_year_xp',
    'winner_service_games_played', 'winner_return_games_played',
    'loser_height_cm', 'loser_weight_kg', 'loser_at_home', 'loser_nb_year_xp',
    'loser_service_games_played', 'loser_return_games_played',
]
# As with the per-column casts, astype('int') raises if a column contains NaN.
df = df.astype({column: 'int' for column in integer_columns})


In [15]:
# Print a summary of the frame (columns, non-null counts, dtypes) to check the result.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33545 entries, 0 to 33544
Data columns (total 96 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Tournament                        33545 non-null  object 
 1   Date                              33545 non-null  object 
 2   year                              33545 non-null  int64  
 3   month                             33545 non-null  int64  
 4   day                               33545 non-null  int64  
 5   Series                            33545 non-null  object 
 6   Court                             33545 non-null  object 
 7   Surface                           33545 non-null  object 
 8   prize_money_euro                  33545 non-null  int32  
 9   Round                             33545 non-null  object 
 10  Best of                           33545 non-null  int64  
 11  match_duration                    33545 non-null  int32  
 12  nb_s

In [16]:
# Write the third-pass cleaned dataset to disk, without the index column.
df.to_csv(path_or_buf='Sources/final_dataset_thirdclean.csv', index=False)