**Model Input Construction**

In [36]:
import pandas as pd

# Read the match-level feature table and preview its first rows.
gdf = pd.read_csv(filepath_or_buffer="../Data/gdf.csv")
gdf.head(n=5)

Unnamed: 0,surface,round,player1,w_name,player2,l_name,height_diff,age_diff,rank_diff,rank_point_diff,...,df_diff,w_1stIn_%,l_1stIn_%,1stIn_diff,w_1stWon_%,l_1stWon_%,1stWon_diff,w_2ndWon_%,l_2ndWon_%,2ndWon_diff
0,Hard,R128,101222,Stefan Edberg,101432,Dimitri Poliakov,5.0,2.0,161.0,3687.0,...,-1.268412,75.531915,69.230769,6.301146,74.647887,59.722222,14.925665,39.130435,34.375,4.755435
1,Hard,R128,100995,Eduardo Masso,101242,Brad Pearce,3.0,2.2,-43.0,-175.0,...,1.74959,77.956989,62.146893,15.810097,61.37931,60.909091,0.470219,65.853659,43.283582,22.570076
2,Hard,R128,101138,Pat Cash,101120,Karel Novacek,-7.0,-0.1,-52.0,-413.0,...,-2.509111,65.853659,67.816092,-1.962433,66.666667,52.542373,14.124294,64.285714,42.857143,21.428571
3,Hard,R128,100870,Christo Van Rensburg,101256,Mark Kratzmann,7.0,3.6,23.0,79.0,...,0.013028,57.894737,48.514851,9.379885,75.0,53.061224,21.938776,68.75,50.0,18.75
4,Hard,R128,101731,Nuno Marques,101416,Joao Cunha Silva,17.0,-2.4,12.0,41.0,...,0.877193,75.18797,61.111111,14.076859,73.0,74.025974,-1.025974,48.484848,57.142857,-8.658009


In [37]:
# Every column whose name contains "diff" is a candidate model input.
all_diff_cols = gdf.columns[gdf.columns.str.contains('diff')]

# The serve-statistic differences are excluded because they were found to leak
# the match outcome into the features (presumably they are only known once the
# match has been played -- TODO confirm against the feature-engineering step).
leaky_cols = ['ace_diff', 'df_diff', '1stIn_diff', '1stWon_diff', '2ndWon_diff']
diff_columns = all_diff_cols.drop(leaky_cols)
diff_columns

Index(['height_diff', 'age_diff', 'rank_diff', 'rank_point_diff', 'seed_diff',
       'exp_diff', 'h2h_diff', 'past_wins_diff'],
      dtype='object')

Why not take all three w_age, l_age and age_diff?

We keep only the differences because, when features are exact linear combinations of each other (e.g. age_diff = w_age - l_age), tree models cope but waste splits on the redundant columns, while linear models can suffer from multicollinearity, which inflates the variance of the coefficients.

### Data Doubling

**w_df is from the winner's perspective**

In [38]:
# Winner-perspective rows: player1 is the match winner, so the label is always 1.
context_cols = ['surface', 'round', 'player1', 'player2']
w_df = gdf[context_cols + list(diff_columns)].assign(target=1)
w_df.head()

Unnamed: 0,surface,round,player1,player2,height_diff,age_diff,rank_diff,rank_point_diff,seed_diff,exp_diff,h2h_diff,past_wins_diff,target
0,Hard,R128,101222,101432,5.0,2.0,161.0,3687.0,-1.0,2.0,0,0,1
1,Hard,R128,100995,101242,3.0,2.2,-43.0,-175.0,0.0,2.2,0,0,1
2,Hard,R128,101138,101120,-7.0,-0.1,-52.0,-413.0,0.0,-0.1,0,0,1
3,Hard,R128,100870,101256,7.0,3.6,23.0,79.0,0.0,3.6,0,0,1
4,Hard,R128,101731,101416,17.0,-2.4,12.0,41.0,0.0,-2.4,0,0,1


**l_df is the same data from the loser's perspective**

In [39]:
# Loser-perspective rows: swap the two players, flip the sign of every
# difference feature, keep the match context, and label the row 0.
mirrored = w_df[diff_columns].mul(-1)
mirrored = mirrored.assign(
    player1=w_df['player2'],
    player2=w_df['player1'],
    surface=w_df['surface'],
    round=w_df['round'],
    target=0,
)
# Same column order as w_df so the two frames stack cleanly.
l_df = mirrored.loc[:, w_df.columns]
l_df.head()

Unnamed: 0,surface,round,player1,player2,height_diff,age_diff,rank_diff,rank_point_diff,seed_diff,exp_diff,h2h_diff,past_wins_diff,target
0,Hard,R128,101432,101222,-5.0,-2.0,-161.0,-3687.0,1.0,-2.0,0,0,0
1,Hard,R128,101242,100995,-3.0,-2.2,43.0,175.0,-0.0,-2.2,0,0,0
2,Hard,R128,101120,101138,7.0,0.1,52.0,413.0,-0.0,0.1,0,0,0
3,Hard,R128,101256,100870,-7.0,-3.6,-23.0,-79.0,-0.0,-3.6,0,0,0
4,Hard,R128,101416,101731,-17.0,2.4,-12.0,-41.0,-0.0,2.4,0,0,0


In [40]:
# Stack both perspectives, then shuffle all rows with a fixed seed so winner
# and loser rows are interleaved reproducibly.
stacked = pd.concat([w_df, l_df], ignore_index=True)
df = stacked.sample(n=len(stacked), random_state=42, ignore_index=True)
df.shape

(33328, 13)

In [41]:
# The player identifiers are not used as predictors, so they are removed here.
id_cols = ['player1', 'player2']
df = df.drop(columns=id_cols)
df

Unnamed: 0,surface,round,height_diff,age_diff,rank_diff,rank_point_diff,seed_diff,exp_diff,h2h_diff,past_wins_diff,target
0,Clay,R32,-16.0,-1.7,-82.0,-3104.0,9.0,-2.0,0,-17,0
1,Hard,R32,-8.0,0.3,41.0,930.0,-11.0,2.2,0,-15,0
2,Grass,R128,0.0,8.6,-43.0,-184.0,0.0,4.6,0,15,1
3,Hard,R16,11.0,-2.1,-9.0,-818.0,13.0,-0.9,0,0,1
4,Grass,R64,13.0,1.1,6.0,31.0,0.0,2.6,0,-2,1
...,...,...,...,...,...,...,...,...,...,...,...
33323,Grass,R128,-3.0,-5.9,-29.0,-132.0,-0.0,-6.4,0,1,0
33324,Clay,R128,5.0,5.0,-25.0,-343.0,32.0,-1.0,0,-8,1
33325,Hard,R128,2.0,-0.8,1.0,18.0,0.0,0.5,0,0,1
33326,Hard,R64,0.0,-5.7,224.0,769.0,0.0,-5.7,0,6,1


### Scaling

In [42]:
# StandardScaler is part of sklearn.preprocessing; importing it from
# sklearn.discriminant_analysis only works because that module happens to
# import it internally.
from sklearn.preprocessing import StandardScaler

# Standardise every numeric feature column; the label column is left untouched.
# NOTE(review): the scaler is fitted on the complete frame. If a train/test
# split is made downstream, consider fitting on the training part only --
# confirm against the modelling notebook.
scaler = StandardScaler()

# Select numeric columns explicitly rather than "everything that is not
# object", so string-typed columns can never be handed to the scaler.
num_cols = (
    df.drop(columns=['target'])
      .select_dtypes(include='number')
      .columns
      .tolist()
)

df[num_cols] = scaler.fit_transform(df[num_cols])
df

Unnamed: 0,surface,round,height_diff,age_diff,rank_diff,rank_point_diff,seed_diff,exp_diff,h2h_diff,past_wins_diff,target
0,Clay,R32,-1.704919,-0.324874,-0.692203,-1.233859,0.729958,-0.579618,0.0,-0.285321,0
1,Hard,R32,-0.852460,0.057331,0.346101,0.369681,-0.892171,0.637580,0.0,-0.251754,0
2,Grass,R128,0.000000,1.643481,-0.362984,-0.073141,0.000000,1.333122,0.0,0.251754,1
3,Hard,R16,1.172132,-0.401315,-0.075973,-0.325160,1.054383,-0.260828,0.0,0.000000,1
4,Grass,R64,1.385247,0.210213,0.050649,0.012323,0.000000,0.753504,0.0,-0.033567,1
...,...,...,...,...,...,...,...,...,...,...,...
33323,Grass,R128,-0.319672,-1.127504,-0.244803,-0.052471,-0.000000,-1.854779,0.0,0.016784,0
33324,Clay,R128,0.532787,0.955512,-0.211037,-0.136345,2.595405,-0.289809,0.0,-0.134269,1
33325,Hard,R128,0.213115,-0.152882,0.008441,0.007155,0.000000,0.144905,0.0,0.000000,1
33326,Hard,R64,0.000000,-1.089284,1.890895,0.305682,0.000000,-1.651912,0.0,0.100701,1


In [43]:
# Print each column's dtype and non-null count for the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33328 entries, 0 to 33327
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   surface          33328 non-null  object 
 1   round            33328 non-null  object 
 2   height_diff      33328 non-null  float64
 3   age_diff         33328 non-null  float64
 4   rank_diff        33328 non-null  float64
 5   rank_point_diff  33328 non-null  float64
 6   seed_diff        33328 non-null  float64
 7   exp_diff         33328 non-null  float64
 8   h2h_diff         33328 non-null  float64
 9   past_wins_diff   33328 non-null  float64
 10  target           33328 non-null  int64  
dtypes: float64(8), int64(1), object(2)
memory usage: 2.8+ MB


### One hot encoding

In [44]:
from sklearn.preprocessing import OneHotEncoder

# The object-typed columns are the ones that get one-hot encoded.
categorical_cols = df.select_dtypes(include=['object']).columns

# Dense output so the result can be wrapped in a DataFrame directly.
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
cats_encoded = enc.fit_transform(df[categorical_cols])

# Wrap the encoded array in a frame with one named column per category level.
encoded_names = enc.get_feature_names_out(categorical_cols)
cats_encoded_df = pd.DataFrame(cats_encoded, columns=encoded_names)

# Put the remaining columns and the indicator columns side by side.
remaining_part = df.drop(columns=categorical_cols).reset_index(drop=True)
df_enc = pd.concat([remaining_part, cats_encoded_df], axis=1)

df_enc

Unnamed: 0,height_diff,age_diff,rank_diff,rank_point_diff,seed_diff,exp_diff,h2h_diff,past_wins_diff,target,surface_Clay,surface_Grass,surface_Hard,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_SF
0,-1.704919,-0.324874,-0.692203,-1.233859,0.729958,-0.579618,0.0,-0.285321,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.852460,0.057331,0.346101,0.369681,-0.892171,0.637580,0.0,-0.251754,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.000000,1.643481,-0.362984,-0.073141,0.000000,1.333122,0.0,0.251754,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.172132,-0.401315,-0.075973,-0.325160,1.054383,-0.260828,0.0,0.000000,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.385247,0.210213,0.050649,0.012323,0.000000,0.753504,0.0,-0.033567,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33323,-0.319672,-1.127504,-0.244803,-0.052471,-0.000000,-1.854779,0.0,0.016784,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
33324,0.532787,0.955512,-0.211037,-0.136345,2.595405,-0.289809,0.0,-0.134269,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
33325,0.213115,-0.152882,0.008441,0.007155,0.000000,0.144905,0.0,0.000000,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
33326,0.000000,-1.089284,1.890895,0.305682,0.000000,-1.651912,0.0,0.100701,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Exporting the pre-processed (final) data for modeling

In [45]:
# Move the label to the last column position, then write the frame to disk
# without the index.
target_col = df_enc.pop('target')
df_enc.insert(len(df_enc.columns), 'target', target_col)

df_enc.to_csv("../Data/final_data.csv", index=False)