<h1 align="center">🎾 - Tennis Betting 👨‍💻🔬</h1>

<h2 align="center">EDA</h2>
<p style="text-align:center">
   Thomas Bury & Vilius Jaseliunas<br>
   Allianz<br>
</p>

# Database schema

In [23]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from arfs.preprocessing import OrdinalEncoderPandas
from lightgbm import LGBMClassifier

data = pd.read_csv("../data/atp_data.csv", low_memory=False)
data['Date'] = pd.to_datetime(data['Date'])
data.head()


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,Wsets,Lsets,Comment,PSW,PSL,B365W,B365L,elo_winner,elo_loser,proba_elo
0,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,...,2.0,0.0,Completed,,,,,1500.0,1500.0,0.5
1,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Kiefer N.,...,2.0,0.0,Completed,,,,,1500.0,1500.0,0.5
2,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Gaudio G.,...,2.0,1.0,Completed,,,,,1500.0,1500.0,0.5
3,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,El Aynaoui Y.,...,2.0,1.0,Completed,,,,,1500.0,1500.0,0.5
4,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Cherkasov A.,...,2.0,0.0,Completed,,,,,1500.0,1500.0,0.5


In [24]:
data.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'Wsets',
       'Lsets', 'Comment', 'PSW', 'PSL', 'B365W', 'B365L', 'elo_winner',
       'elo_loser', 'proba_elo'],
      dtype='object')

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44708 entries, 0 to 44707
Data columns (total 23 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         44708 non-null  int64         
 1   Location    44708 non-null  object        
 2   Tournament  44708 non-null  object        
 3   Date        44708 non-null  datetime64[ns]
 4   Series      44708 non-null  object        
 5   Court       44708 non-null  object        
 6   Surface     44708 non-null  object        
 7   Round       44708 non-null  object        
 8   Best of     44708 non-null  int64         
 9   Winner      44708 non-null  object        
 10  Loser       44708 non-null  object        
 11  WRank       44708 non-null  int64         
 12  LRank       44708 non-null  int64         
 13  Wsets       44521 non-null  float64       
 14  Lsets       44521 non-null  float64       
 15  Comment     44708 non-null  object        
 16  PSW         32743 non-

## Renaming columns

To avoid obvious data leakage, rename the columns by replacing the W by P1 and L by P2. Then permute P1 and P2 1 every two rows (preventing the winner to be always P1).

In [26]:
df = data.copy()
df.columns = df.columns.str.lower()
df.rename(columns={'wrank': 'rank_p1', 
                   'lrank': 'rank_p2',
                   'wsets': 'sets_p1',
                   'lsets': 'sets_p2',
                   'psw': 'ps_p1',
                   'psl': 'ps_p2', 
                   'b365w': 'b365_p1',
                   'b365l': 'b365_p2'},
          inplace=True)
df.columns = df.columns.str.lower()
df.rename(columns=lambda x: x.replace('winner', 'p1').replace('loser', 'p2'), inplace=True)
# Print the modified column names
print(df.columns)


Index(['atp', 'location', 'tournament', 'date', 'series', 'court', 'surface',
       'round', 'best of', 'p1', 'p2', 'rank_p1', 'rank_p2', 'sets_p1',
       'sets_p2', 'comment', 'ps_p1', 'ps_p2', 'b365_p1', 'b365_p2', 'elo_p1',
       'elo_p2', 'proba_elo'],
      dtype='object')


In [27]:
p1_columns = df.filter(like='p1').columns
p2_columns = df.filter(like='p2').columns

print("Columns containing P1:")
print(p1_columns)

print("Columns containing P2:")
print(p2_columns)

Columns containing P1:
Index(['p1', 'rank_p1', 'sets_p1', 'ps_p1', 'b365_p1', 'elo_p1'], dtype='object')
Columns containing P2:
Index(['p2', 'rank_p2', 'sets_p2', 'ps_p2', 'b365_p2', 'elo_p2'], dtype='object')


In [28]:
mask = df.index % 2 == 1
df_permuted = df.copy()
# swap the values P1 <-> P2 every two rows
mask = df.index % 2 == 1

df_permuted .loc[mask, p1_columns], df_permuted .loc[mask, p2_columns] = df.loc[mask, p2_columns].values, df.loc[mask, p1_columns].values

In [29]:
player_columns = sorted(list(set(p1_columns).union(set(p2_columns))))
df[player_columns ].head()

Unnamed: 0,b365_p1,b365_p2,elo_p1,elo_p2,p1,p2,ps_p1,ps_p2,rank_p1,rank_p2,sets_p1,sets_p2
0,,,1500.0,1500.0,Dosedel S.,Ljubicic I.,,,63,77,2.0,0.0
1,,,1500.0,1500.0,Kiefer N.,Tarango J.,,,6,59,2.0,0.0
2,,,1500.0,1500.0,Gaudio G.,Luxa P.,,,73,174,2.0,1.0
3,,,1500.0,1500.0,El Aynaoui Y.,Dupuis A.,,,33,78,2.0,1.0
4,,,1500.0,1500.0,Cherkasov A.,Arazi H.,,,206,35,2.0,0.0


In [30]:
df_permuted[player_columns ].head()

Unnamed: 0,b365_p1,b365_p2,elo_p1,elo_p2,p1,p2,ps_p1,ps_p2,rank_p1,rank_p2,sets_p1,sets_p2
0,,,1500.0,1500.0,Dosedel S.,Ljubicic I.,,,63,77,2.0,0.0
1,,,1500.0,1500.0,Tarango J.,Kiefer N.,,,59,6,0.0,2.0
2,,,1500.0,1500.0,Gaudio G.,Luxa P.,,,73,174,2.0,1.0
3,,,1500.0,1500.0,Dupuis A.,El Aynaoui Y.,,,78,33,1.0,2.0
4,,,1500.0,1500.0,Cherkasov A.,Arazi H.,,,206,35,2.0,0.0


## Define the target
The target is the mask, whether or not the P1 and P2 have been swapped.

In [31]:
df_permuted['target'] = mask
df_permuted.head()

Unnamed: 0,atp,location,tournament,date,series,court,surface,round,best of,p1,...,sets_p2,comment,ps_p1,ps_p2,b365_p1,b365_p2,elo_p1,elo_p2,proba_elo,target
0,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,...,0.0,Completed,,,,,1500.0,1500.0,0.5,False
1,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Tarango J.,...,2.0,Completed,,,,,1500.0,1500.0,0.5,True
2,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Gaudio G.,...,1.0,Completed,,,,,1500.0,1500.0,0.5,False
3,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Dupuis A.,...,2.0,Completed,,,,,1500.0,1500.0,0.5,True
4,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Cherkasov A.,...,0.0,Completed,,,,,1500.0,1500.0,0.5,False


## Feature engineering
To be improved

In [32]:

# Extract year, month, and day into separate columns
df_permuted['year'] = df_permuted['date'].dt.year
df_permuted['month'] = df_permuted['date'].dt.month
df_permuted['day'] = df_permuted['date'].dt.day


## Split the data

As the time goes by, players improve or age. It's important to keep the time ordering to avoid leakage of information

In [33]:
df_permuted['date'].head()

0   2000-01-03
1   2000-01-03
2   2000-01-03
3   2000-01-03
4   2000-01-03
Name: date, dtype: datetime64[ns]

In [34]:
df_permuted = df_permuted.sort_values('date')


In [35]:
df_permuted['date'].head()

0    2000-01-03
68   2000-01-03
67   2000-01-03
66   2000-01-03
65   2000-01-03
Name: date, dtype: datetime64[ns]

In [36]:
df_permuted['date'].tail()

44703   2018-03-03
44704   2018-03-03
44705   2018-03-03
44706   2018-03-04
44707   2018-03-04
Name: date, dtype: datetime64[ns]

In [37]:
predictors = df_permuted.columns.drop(['target', 'date'])
X = df_permuted[predictors].copy()
y = df_permuted['target'].copy() * 1

In [38]:
ts_split = TimeSeriesSplit(n_splits=2)
all_splits = list(ts_split.split(X, y))
train_idx, test_idx = all_splits[0]

In [39]:
train_idx

array([    0,     1,     2, ..., 14901, 14902, 14903])

## Train the model
vanilla ligthGBM for now, not even early stopping. 

In [40]:
X_train = X.iloc[train_idx, :].copy()
y_train = y[train_idx].copy()


model= Pipeline([ ("encoder", OrdinalEncoderPandas()), ("gbm", LGBMClassifier())])
model.fit(X_train, y_train)

In [43]:
X_train= X.iloc[train_idx, :].copy()
y_pred_insample = model.predict(X_train)
target_names = ["winner", "loser"]
print(classification_report(y_train, y_pred_insample, target_names=target_names))

              precision    recall  f1-score   support

      winner       0.79      0.81      0.80      7452
       loser       0.81      0.79      0.80      7452

    accuracy                           0.80     14904
   macro avg       0.80      0.80      0.80     14904
weighted avg       0.80      0.80      0.80     14904



In [41]:
X_test= X.iloc[test_idx, :].copy()
y_test = y[test_idx].copy()
y_pred = model.predict(X_test)
target_names = ["winner", "loser"]
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

      winner       0.54      0.51      0.53      7451
       loser       0.54      0.57      0.55      7451

    accuracy                           0.54     14902
   macro avg       0.54      0.54      0.54     14902
weighted avg       0.54      0.54      0.54     14902



In [42]:
print(accuracy_score(y_test, y_pred))

0.5399275265065092
