# Data Preprocessing

- Feature Selection
- Data split

    - Train
    - Test
    - 2024 data (Tests)
    - 1968-2023 data (Full Open era)
    - 1990-2023 data (Modern Tennis era)

- Standardize features
- Dimensionality Reduction


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler 

In [2]:
df = pd.read_csv('../data/processed/3_engineered_data.csv', sep=';', encoding='utf-8')
df

Unnamed: 0,SURFACE,TOURNEY_DATE,TOURNEY_LEVEL,PLAYER_0_ID,PLAYER_0_HAND,PLAYER_0_HT,PLAYER_0_AGE,PLAYER_1_ID,PLAYER_1_HAND,PLAYER_1_HT,...,PERC_0_WIN_LAST_50,PERC_0_WIN_LAST_100,PERC_1_WIN_LAST_10,PERC_1_WIN_LAST_25,PERC_1_WIN_LAST_50,PERC_1_WIN_LAST_100,LAST_10_WIN_DIFF,LAST_25_WIN_DIFF,LAST_50_WIN_DIFF,LAST_100_WIN_DIFF
0,Clay,1967-12-28,A,100092,R,185.0,23.1,100035,R,185.0,...,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.00,0.00
1,Clay,1967-12-28,A,100070,R,184.0,25.2,110991,R,185.0,...,0.00,0.00,0.0,0.00,0.00,0.00,-0.0,-0.00,-0.00,-0.00
2,Clay,1967-12-28,A,114613,U,184.0,21.2,100035,R,185.0,...,0.00,0.00,0.0,0.00,0.00,0.00,-0.0,-0.00,-0.00,-0.00
3,Clay,1967-12-28,A,110991,R,184.0,25.6,100092,R,185.0,...,1.00,1.00,0.0,0.00,0.00,0.00,-1.0,-1.00,-1.00,-1.00
4,Clay,1967-12-28,A,100051,R,184.0,27.1,100035,R,185.0,...,0.50,0.50,1.0,1.00,1.00,1.00,0.5,0.50,0.50,0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375461,Hard,2024-12-18,F,210506,R,193.0,20.3,210460,R,180.0,...,0.46,0.49,0.5,0.50,0.50,0.50,0.1,-0.02,-0.04,-0.01
375462,Hard,2024-12-18,F,210506,R,193.0,20.3,209414,R,178.0,...,0.46,0.50,0.3,0.32,0.36,0.36,0.3,0.16,0.10,0.14
375463,Hard,2024-12-18,F,210150,R,193.0,19.2,210530,L,180.0,...,0.45,0.45,0.7,0.64,0.62,0.62,0.2,0.19,0.17,0.17
375464,Hard,2024-12-18,F,210460,R,180.0,19.6,209414,R,178.0,...,0.36,0.34,0.0,0.00,0.00,0.00,-0.3,-0.32,-0.36,-0.34


## Feature Selection

- Data analysis done in **data_visualization.ipynb**

In [3]:
df.columns

Index(['SURFACE', 'TOURNEY_DATE', 'TOURNEY_LEVEL', 'PLAYER_0_ID',
       'PLAYER_0_HAND', 'PLAYER_0_HT', 'PLAYER_0_AGE', 'PLAYER_1_ID',
       'PLAYER_1_HAND', 'PLAYER_1_HT', 'PLAYER_1_AGE', 'SCORE', 'BEST_OF',
       'AGE_DIFF', 'HT_DIFF', 'PLAYER_0_RATING', 'PLAYER_1_RATING',
       'RATING_DIFF', 'PLAYER_0_SURFACE_RATING', 'PLAYER_1_SURFACE_RATING',
       'SURFACE_RATING_DIFF', 'PERC_0_WIN_LAST_10', 'PERC_0_WIN_LAST_25',
       'PERC_0_WIN_LAST_50', 'PERC_0_WIN_LAST_100', 'PERC_1_WIN_LAST_10',
       'PERC_1_WIN_LAST_25', 'PERC_1_WIN_LAST_50', 'PERC_1_WIN_LAST_100',
       'LAST_10_WIN_DIFF', 'LAST_25_WIN_DIFF', 'LAST_50_WIN_DIFF',
       'LAST_100_WIN_DIFF'],
      dtype='object')

In [4]:
df = df.drop(df[['SURFACE', 'TOURNEY_LEVEL', 'PLAYER_0_ID',
       'PLAYER_0_HAND', 'PLAYER_0_HT', 'PLAYER_0_AGE', 'PLAYER_1_ID',
       'PLAYER_1_HAND', 'PLAYER_1_HT', 'PLAYER_1_AGE', 'BEST_OF']], axis=1)

In [5]:
df

Unnamed: 0,TOURNEY_DATE,SCORE,AGE_DIFF,HT_DIFF,PLAYER_0_RATING,PLAYER_1_RATING,RATING_DIFF,PLAYER_0_SURFACE_RATING,PLAYER_1_SURFACE_RATING,SURFACE_RATING_DIFF,...,PERC_0_WIN_LAST_50,PERC_0_WIN_LAST_100,PERC_1_WIN_LAST_10,PERC_1_WIN_LAST_25,PERC_1_WIN_LAST_50,PERC_1_WIN_LAST_100,LAST_10_WIN_DIFF,LAST_25_WIN_DIFF,LAST_50_WIN_DIFF,LAST_100_WIN_DIFF
0,1967-12-28,0,-5.5,-5.5,2000.000,2000.000,0.000,2000.000,2000.000,0.000,...,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.00,0.00
1,1967-12-28,1,-0.4,-0.4,2000.000,1997.682,2.318,2000.000,1997.682,2.318,...,0.00,0.00,0.0,0.00,0.00,0.00,-0.0,-0.00,-0.00,-0.00
2,1967-12-28,1,-7.4,-7.4,2000.000,1996.842,3.158,2000.000,1996.842,3.158,...,0.00,0.00,0.0,0.00,0.00,0.00,-0.0,-0.00,-0.00,-0.00
3,1967-12-28,1,2.5,2.5,2000.000,2004.211,-4.211,2000.000,2004.211,-4.211,...,1.00,1.00,0.0,0.00,0.00,0.00,-1.0,-1.00,-1.00,-1.00
4,1967-12-28,1,-1.5,-1.5,2000.741,2001.024,-0.283,2000.741,2001.024,-0.283,...,0.50,0.50,1.0,1.00,1.00,1.00,0.5,0.50,0.50,0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375461,2024-12-18,0,0.7,0.7,2172.068,2015.096,156.972,2129.714,2011.262,118.452,...,0.46,0.49,0.5,0.50,0.50,0.50,0.1,-0.02,-0.04,-0.01
375462,2024-12-18,0,-0.2,-0.2,2165.139,2100.983,64.156,2124.686,2059.176,65.510,...,0.46,0.50,0.3,0.32,0.36,0.36,0.3,0.16,0.10,0.14
375463,2024-12-18,1,0.2,0.2,2163.977,2051.259,112.718,2110.263,2033.247,77.016,...,0.45,0.45,0.7,0.64,0.62,0.62,0.2,0.19,0.17,0.17
375464,2024-12-18,1,-0.9,-0.9,2000.000,2098.214,-98.214,2000.000,2053.909,-53.909,...,0.36,0.34,0.0,0.00,0.00,0.00,-0.3,-0.32,-0.36,-0.34


In [6]:
df.describe()

Unnamed: 0,SCORE,AGE_DIFF,HT_DIFF,PLAYER_0_RATING,PLAYER_1_RATING,RATING_DIFF,PLAYER_0_SURFACE_RATING,PLAYER_1_SURFACE_RATING,SURFACE_RATING_DIFF,PERC_0_WIN_LAST_10,...,PERC_0_WIN_LAST_50,PERC_0_WIN_LAST_100,PERC_1_WIN_LAST_10,PERC_1_WIN_LAST_25,PERC_1_WIN_LAST_50,PERC_1_WIN_LAST_100,LAST_10_WIN_DIFF,LAST_25_WIN_DIFF,LAST_50_WIN_DIFF,LAST_100_WIN_DIFF
count,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,...,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0
mean,0.5,8.989038e-19,8.989038e-19,2155.878052,2155.878052,-3.9059730000000006e-17,2086.075768,2086.075768,4.4207140000000005e-17,0.526473,...,0.540374,0.539536,0.446978,0.455909,0.458362,0.45613,-4.7310719999999996e-21,2.483813e-20,-1.005353e-19,1.005353e-19
std,0.500001,5.520826,5.520826,76.487966,76.487966,85.52164,64.108162,64.108162,72.43444,0.216344,...,0.174712,0.169038,0.220605,0.194398,0.185374,0.180101,0.2959526,0.2535424,0.2376087,0.2283735
min,0.0,-41.7,-41.7,1918.139,1918.139,-390.268,1942.864,1942.864,-354.838,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0
25%,0.0,-3.6,-3.6,2112.5105,2112.5105,-52.29,2028.80225,2028.80225,-46.177,0.4,...,0.44,0.45,0.3,0.36,0.38,0.38,-0.2,-0.16,-0.14,-0.13
50%,0.5,0.0,0.0,2167.8295,2167.8295,0.0,2085.299,2085.299,0.0,0.5,...,0.54,0.53,0.5,0.48,0.48,0.47,0.0,0.0,0.0,-0.0
75%,1.0,3.6,3.6,2208.60375,2208.60375,52.29,2132.39775,2132.39775,46.177,0.7,...,0.66,0.65,0.6,0.57,0.57,0.56,0.2,0.16,0.14,0.13
max,1.0,41.7,41.7,2427.004,2427.004,390.268,2373.133,2373.133,354.838,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Data Split

In [7]:
feature = df.drop('SCORE', axis=1)
feature.head(3)

Unnamed: 0,TOURNEY_DATE,AGE_DIFF,HT_DIFF,PLAYER_0_RATING,PLAYER_1_RATING,RATING_DIFF,PLAYER_0_SURFACE_RATING,PLAYER_1_SURFACE_RATING,SURFACE_RATING_DIFF,PERC_0_WIN_LAST_10,...,PERC_0_WIN_LAST_50,PERC_0_WIN_LAST_100,PERC_1_WIN_LAST_10,PERC_1_WIN_LAST_25,PERC_1_WIN_LAST_50,PERC_1_WIN_LAST_100,LAST_10_WIN_DIFF,LAST_25_WIN_DIFF,LAST_50_WIN_DIFF,LAST_100_WIN_DIFF
0,1967-12-28,-5.5,-5.5,2000.0,2000.0,0.0,2000.0,2000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1967-12-28,-0.4,-0.4,2000.0,1997.682,2.318,2000.0,1997.682,2.318,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0
2,1967-12-28,-7.4,-7.4,2000.0,1996.842,3.158,2000.0,1996.842,3.158,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0


In [8]:
target = df[['SCORE']]
target.value_counts()

SCORE
0        187733
1        187733
Name: count, dtype: int64

In [9]:
feature.to_csv('../data/processed/5.1_data_feature.csv', sep=';', encoding='utf-8', index=False)
target.to_csv('../data/processed/5.1_data_target.csv', sep=';', encoding='utf-8', index=False)

## Standardize Features

#### StandardScaler (mean = 0, std = 1)

In [10]:
scaler = StandardScaler()

In [11]:
feature_std_esc = scaler.fit_transform(feature.loc[:, feature.columns != 'TOURNEY_DATE'])
feature_std_esc = pd.DataFrame(feature_std_esc)

In [12]:
feature_std_esc['TOURNEY_DATE'] = feature['TOURNEY_DATE']
feature_std_esc.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,TOURNEY_DATE
0,-0.996229,-0.996229,-2.037945,-2.037945,4.56724e-19,-1.342667,-1.342667,-6.103063999999999e-19,-2.433507,-2.884624,...,-3.191816,-2.026146,-2.345243,-2.472635,-2.532638,1.598593e-20,-9.796454e-20,4.2311339999999996e-19,-4.402236e-19,1967-12-28
1,-0.072453,-0.072453,-2.037945,-2.06825,0.02710429,-1.342667,-1.378824,0.0320014,-2.433507,-2.884624,...,-3.191816,-2.026146,-2.345243,-2.472635,-2.532638,1.598593e-20,-9.796454e-20,4.2311339999999996e-19,-4.402236e-19,1967-12-28
2,-1.340381,-1.340381,-2.037945,-2.079232,0.03692637,-1.342667,-1.391927,0.0435981,-2.433507,-2.884624,...,-3.191816,-2.026146,-2.345243,-2.472635,-2.532638,1.598593e-20,-9.796454e-20,4.2311339999999996e-19,-4.402236e-19,1967-12-28


In [13]:
feature_std_esc.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0
mean,-1.2773899999999998e-19,-1.2773899999999998e-19,3.410611e-15,-2.53858e-15,2.5074679999999996e-19,4.369846e-15,4.350467e-15,4.257965e-20,-3.512348e-16,6.758242e-16,1.356493e-16,5.886211e-16,-1.090039e-16,2.252747e-16,-1.114262e-16,-3.512348e-16,6.339636999999999e-19,5.582664999999999e-19,-2.7440219999999996e-19,-1.040836e-19
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-7.553228,-7.553228,-3.108193,-3.108193,-4.563389,-2.233911,-2.233911,-4.898754,-2.433507,-2.884624,-3.092944,-3.191816,-2.026146,-2.345243,-2.472635,-2.532638,-3.378924,-3.944119,-4.208606,-4.378797
25%,-0.6520772,-0.6520772,-0.566986,-0.566986,-0.611425,-0.8933901,-0.8933901,-0.6375015,-0.5845952,-0.6238505,-0.5745138,-0.5296834,-0.6662498,-0.4933655,-0.4227219,-0.4227083,-0.6757848,-0.631059,-0.5892049,-0.5692436
50%,-1.6282069999999998e-19,-1.6282069999999998e-19,0.1562529,0.1562529,4.56724e-19,-0.01211654,-0.01211654,-6.103063999999999e-19,-0.1223672,-0.0855711,-0.002143344,-0.05641538,0.2403476,0.123927,0.1167291,0.07701194,1.598593e-20,-9.796454e-20,4.2311339999999996e-19,-4.402236e-19
75%,0.6520772,0.6520772,0.6893342,0.6893342,0.611425,0.7225608,0.7225608,0.6375015,0.8020887,0.7218481,0.6847013,0.6534867,0.6936463,0.5868963,0.6022349,0.5767321,0.6757848,0.631059,0.5892049,0.5692436
max,7.553228,7.553228,3.544692,3.544692,4.563389,4.477708,4.477708,4.898754,2.188773,2.49817,2.630761,2.724034,2.506841,2.798861,2.921874,3.019809,3.378924,3.944119,4.208606,4.378797


#### RobustScaler (better with outliers)

In [14]:
robust = RobustScaler()

In [15]:
feature_robust_esc = robust.fit_transform(feature.loc[:, feature.columns != 'TOURNEY_DATE'])
feature_robust_esc = pd.DataFrame(feature_robust_esc)

In [16]:
feature_robust_esc['TOURNEY_DATE'] = feature['TOURNEY_DATE']
feature_robust_esc.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,TOURNEY_DATE
0,-0.763889,-0.763889,-1.746527,-1.746527,0.0,-0.823385,-0.823385,0.0,-1.666667,-2.08,...,-2.65,-1.666667,-2.285714,-2.526316,-2.611111,0.0,0.0,0.0,0.0,1967-12-28
1,-0.055556,-0.055556,-1.746527,-1.77065,0.022165,-0.823385,-0.845761,0.025099,-1.666667,-2.08,...,-2.65,-1.666667,-2.285714,-2.526316,-2.611111,-0.0,-0.0,-0.0,-0.0,1967-12-28
2,-1.027778,-1.027778,-1.746527,-1.779391,0.030197,-0.823385,-0.853869,0.034195,-1.666667,-2.08,...,-2.65,-1.666667,-2.285714,-2.526316,-2.611111,-0.0,-0.0,-0.0,-0.0,1967-12-28


In [17]:
feature_robust_esc.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0,375466.0
mean,9.462145e-20,9.462145e-20,-0.1243734,-0.1243734,2.081672e-19,0.007498,0.007498,4.636451e-19,0.088244,0.063589,0.001702,0.047682,-0.17674,-0.114719,-0.113887,-0.077055,-1.6085649999999998e-19,4.7310719999999996e-21,-1.1827679999999999e-21,3.713892e-19
std,0.7667814,0.7667814,0.7959765,0.7959765,0.8177629,0.618832,0.618832,0.7843129,0.721146,0.743109,0.794147,0.845188,0.735351,0.925703,0.975652,1.000561,0.7398815,0.7923199,0.8486024,0.8783598
min,-5.791667,-5.791667,-2.598419,-2.598419,-3.731765,-1.374915,-1.374915,-3.842151,-1.666667,-2.08,-2.454545,-2.65,-1.666667,-2.285714,-2.526316,-2.611111,-2.5,-3.125,-3.571429,-3.846154
25%,-0.5,-0.5,-0.5756804,-0.5756804,-0.5,-0.545359,-0.545359,-0.5,-0.333333,-0.4,-0.454545,-0.4,-0.666667,-0.571429,-0.526316,-0.5,-0.5,-0.5,-0.5,-0.5
50%,0.0,0.0,2.366176e-15,2.366176e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
75%,0.5,0.5,0.4243196,0.4243196,0.5,0.454641,0.454641,0.5,0.666667,0.6,0.545455,0.6,0.333333,0.428571,0.473684,0.5,0.5,0.5,0.5,0.5
max,5.791667,5.791667,2.697115,2.697115,3.731765,2.778441,2.778441,3.842151,1.666667,1.92,2.090909,2.35,1.666667,2.47619,2.736842,2.944444,2.5,3.125,3.571429,3.846154


In [18]:
feature_std_esc.to_csv('../data/processed/5.2_data_feature_std_esc.csv', sep=';', encoding='utf-8', index=False)
feature_robust_esc.to_csv('../data/processed/5.2_data_feature_robust_esc.csv', sep=';', encoding='utf-8', index=False)

## Dimensionality Reduction