##### Australian Open 2026 Winner Prediction using Monte Carlo Simulation

Goal: Build a reliable pre-match win probability model and use it to simulate the 2026 Australian Open bracket.

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# pandas' chained-assignment warnings — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

##### Collating ATP match level data downloaded from https://github.com/JeffSackmann/tennis_atp.git

In [2]:
# Load one CSV per season, tag each with its season, and stack them into a single frame.
# NOTE(review): range(2017, 2024) has an exclusive upper bound, so only seasons
# 2017-2023 are loaded — extend the range if the 2024/2025 files are available.
data_dir = Path("./data")
dfs = [
    pd.read_csv(data_dir / f"atp_matches_{year}.csv").assign(season=year)
    for year in range(2017, 2024)
]
df = pd.concat(dfs, ignore_index=True)

##### Sample of the collated data

In [3]:
# Peek at 10 randomly chosen matches (no seed is passed, so the rows differ on every run)
df.sample(n=10)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,season
14368,2022-540,Wimbledon,Grass,128,G,20220627,115,207989,5.0,,...,88.0,47.0,28.0,12.0,14.0,7.0,4890.0,155.0,363.0,2022
18190,2023-5014,Shanghai Masters,Hard,128,M,20231002,204,106415,,,...,47.0,19.0,16.0,1.0,2.0,48.0,970.0,47.0,971.0,2023
6305,2019-6932,Rio de Janeiro,Clay,32,A,20190218,293,111513,,,...,21.0,13.0,9.0,2.0,5.0,90.0,627.0,73.0,726.0,2019
14537,2022-0414,Hamburg,Clay,32,A,20220718,297,207989,1.0,,...,20.0,2.0,7.0,6.0,11.0,6.0,4845.0,26.0,1440.0,2022
17370,2023-540,Wimbledon,Grass,128,G,20230703,149,106234,,,...,53.0,35.0,21.0,14.0,20.0,50.0,888.0,75.0,758.0,2023
13402,2022-0807,Acapulco,Hard,32,A,20220221,270,100644,2.0,,...,48.0,26.0,15.0,3.0,5.0,3.0,7515.0,47.0,1153.0,2022
17165,2023-0440,s Hertogenbosch,Grass,32,A,20230612,286,206173,2.0,,...,29.0,13.0,9.0,9.0,12.0,9.0,3300.0,47.0,880.0,2023
16786,2023-1536,Madrid Masters,Clay,128,M,20230424,294,106234,,Q,...,27.0,19.0,11.0,1.0,2.0,121.0,497.0,99.0,606.0,2023
11254,2021-0499,Delray Beach,Hard,32,A,20210104,294,200624,,,...,39.0,12.0,14.0,4.0,7.0,119.0,566.0,25.0,1850.0,2021
11157,2021-0451,Doha,Hard,32,A,20210308,296,126094,3.0,,...,,,,,,8.0,5019.0,46.0,1372.0,2021


##### Step 1: Exploratory Data Analysis

##### a. Selecting necessary columns

In [4]:
# List every column available in the combined frame before choosing a subset
df.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'season'],
      dtype='object')

In [5]:
# Columns kept for modelling: tournament context, player identity, and the
# pre-match attributes of each player. In-match statistics (aces, break points, ...)
# are deliberately left out of this list.
columns = [
    # Tournament attributes
    "tourney_name",
    "surface",
    "draw_size",
    "tourney_level",
    "tourney_date",
    "season",
    "round",
    "best_of",
    # Player identity attributes
    "winner_id",
    "winner_name",
    "loser_id",
    "loser_name",
    # Winner pre-match attributes
    "winner_age",
    "winner_ht",
    "winner_hand",
    "winner_ioc",
    "winner_seed",
    "winner_entry",
    "winner_rank",
    "winner_rank_points",
    # Loser pre-match attributes
    "loser_age",
    "loser_ht",
    "loser_hand",
    "loser_ioc",
    "loser_seed",
    "loser_entry",
    "loser_rank",
    "loser_rank_points",
]
# Take an explicit copy: later cells add and overwrite columns on `data`, and writing
# into a bare selection of `df` goes through pandas' chained-assignment path.
data = df[columns].copy()
data.sample(10)

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,tourney_date,season,round,best_of,winner_id,winner_name,...,winner_rank,winner_rank_points,loser_age,loser_ht,loser_hand,loser_ioc,loser_seed,loser_entry,loser_rank,loser_rank_points
7729,Canada Masters,Hard,64,M,20190805,2019,R32,3,105932,Nikoloz Basilashvili,...,17.0,1975.0,29.2,193.0,R,GER,,,35.0,1265.0
14484,Bastad,Clay,32,A,20220711,2022,QF,3,105807,Pablo Carreno Busta,...,18.0,1926.0,29.9,170.0,R,ARG,3.0,,14.0,2325.0
14729,Washington,Hard,64,A,20220801,2022,R32,3,144707,Mikael Ymer,...,115.0,475.0,28.9,185.0,R,RUS,15.0,,36.0,1175.0
7075,Roland Garros,Clay,128,G,20190527,2019,R128,5,120424,Yannick Maden,...,114.0,504.0,25.2,178.0,R,BEL,,Q,178.0,286.0
11605,Roland Garros,Clay,128,G,20210531,2021,R32,5,100644,Alexander Zverev,...,6.0,6990.0,25.9,188.0,R,SRB,,,55.0,1168.0
11919,Australian Open,Hard,128,G,20210208,2021,R128,5,126205,Tommy Paul,...,53.0,1080.0,28.9,185.0,R,GEO,,,40.0,1405.0
13294,Doha,Hard,32,A,20220214,2022,QF,3,126239,Arthur Rinderknech,...,61.0,1042.0,22.8,185.0,L,CAN,1.0,,12.0,2930.0
14140,Roland Garros,Clay,128,G,20220523,2022,R64,5,208029,Holger Rune,...,40.0,1159.0,30.1,185.0,R,SUI,,,96.0,668.0
4008,Barcelona,Clay,64,A,20180423,2018,R64,3,104468,Gilles Simon,...,70.0,780.0,24.1,193.0,R,BLR,,Q,117.0,485.0
5172,Winston-Salem,Hard,64,A,20180820,2018,SF,3,105449,Steve Johnson,...,34.0,1235.0,27.1,188.0,R,ESP,2.0,,12.0,2380.0


##### b. Handling date-time objects

In [6]:
# Inspect column dtypes; `tourney_date` is stored as an integer and needs parsing
data.dtypes

tourney_name           object
surface                object
draw_size               int64
tourney_level          object
tourney_date            int64
season                  int64
round                  object
best_of                 int64
winner_id               int64
winner_name            object
loser_id                int64
loser_name             object
winner_age            float64
winner_ht             float64
winner_hand            object
winner_ioc             object
winner_seed           float64
winner_entry           object
winner_rank           float64
winner_rank_points    float64
loser_age             float64
loser_ht              float64
loser_hand             object
loser_ioc              object
loser_seed            float64
loser_entry            object
loser_rank            float64
loser_rank_points     float64
dtype: object

In [7]:
# Parse the integer YYYYMMDD `tourney_date` into a datetime, then derive the calendar
# year of the tournament from it.
# `assign` builds a new frame instead of writing columns into `data` (which was
# created by selecting columns from `df`), so no chained assignment takes place.
# Entries are evaluated in order, so `tournament_year` sees the parsed dates.
data = data.assign(
    tourney_date=lambda frame: pd.to_datetime(frame['tourney_date'], format='%Y%m%d'),
    tournament_year=lambda frame: frame['tourney_date'].dt.year,
)

##### c. Renaming columns for easier understanding

In [8]:
# Spell out "tournament" in the column names. The result is rebound rather than
# renamed with `inplace=True`, so a derived frame is never mutated in place.
data = data.rename(columns={
    'tourney_date': 'tournament_date',
    'tourney_name': 'tournament_name',
    'tourney_level': 'tournament_level',
})

##### d. Since we are predicting the Australian Open, which is played on hard courts, only matches played on a hard surface are considered

In [9]:
# Keep hard-court matches only. The explicit copy makes the filtered frame independent,
# so the imputation and feature cells that follow can safely write into it.
data = data.loc[data['surface'] == "Hard"].copy()

##### e. Checking for null values in dataset

In [10]:
# Count missing values per column
data.isnull().sum()

tournament_name          0
surface                  0
draw_size                0
tournament_level         0
tournament_date          0
season                   0
round                    0
best_of                  0
winner_id                0
winner_name              0
loser_id                 0
loser_name               0
winner_age               1
winner_ht               45
winner_hand              0
winner_ioc               0
winner_seed           6444
winner_entry          9575
winner_rank             39
winner_rank_points      39
loser_age                0
loser_ht               162
loser_hand               1
loser_ioc                0
loser_seed            8353
loser_entry           8707
loser_rank             145
loser_rank_points      145
tournament_year          0
dtype: int64

##### f. Missing value treatment

In [11]:
# Only one hard-court row is missing `winner_age`; its value was looked up manually
# (Ari Fahresi, 17 years old in 2019) and is filled in here.
# To see the affected row, run `data[data['winner_age'].isna()]` before this cell.
missing_winner_age = data['winner_age'].isna()
# Guard: the hard-coded age is only valid for that single known row. If the input data
# changes and more ages go missing, fail loudly instead of labelling them all as 17.
assert missing_winner_age.sum() <= 1, (
    "Expected at most one missing winner_age; re-check the manual imputation"
)
data.loc[missing_winner_age, 'winner_age'] = 17.0

In [12]:
# Only one hard-court row is missing `loser_hand`; the player (Juan Sebastian Dominguez
# Collado) was looked up manually and plays right-handed.
# To see the affected player, run `data.loc[data['loser_hand'].isna(), 'loser_name']`
# before this cell.
missing_loser_hand = data['loser_hand'].isna()
# Guard: the hard-coded value is only valid for that single known row.
assert missing_loser_hand.sum() <= 1, (
    "Expected at most one missing loser_hand; re-check the manual imputation"
)
data.loc[missing_loser_hand, 'loser_hand'] = 'R'

###### Sources: 
###### Based on his birthdate of August 2, 2002, Indonesian tennis player Ari Fahresi was 17 years old during most of 2019. (Gemini)
###### Based on tennis statistics profiles, Juan Sebastian Dominguez Collado plays right-handed


> Ranking and ranking points are important and dropping them would induce bias against elite players and using average rank would be misleading.

> When rank is missing, it is usually because the player is weak or new to the circuit.

> Therefore, we will impute the rank as max rank + 1 and rank points as 0

In [13]:
# Worst (numerically largest) rank observed on either side of any match, ignoring NaNs.
observed_ranks = data[['winner_rank', 'loser_rank']]
max_rank = observed_ranks.max().max()
# Players without a rank are placed one position behind the worst observed rank
# and given zero ranking points.
imputed_rank = max_rank + 1
for side in ('winner', 'loser'):
    data[f'{side}_rank'] = data[f'{side}_rank'].fillna(imputed_rank)
    data[f'{side}_rank_points'] = data[f'{side}_rank_points'].fillna(0)

> The remaining missing values in the seed and entry variables are structural: they reflect tournament design (unseeded players and direct acceptances) rather than data gaps, so they are left untreated.

> Player height, while potentially influential, was excluded from the current analysis due to incomplete coverage and the absence of serve-level modeling; this avoids introducing bias from physiological assumptions.


##### Step 2: Feature Engineering

##### 1. Rank based strength

In [14]:
# Rank gap, loser minus winner: a positive value means the winner held the better
# (numerically lower) rank.
data['rank_diff'] = data['loser_rank'].sub(data['winner_rank'])
# log(1 + points) for each side compresses the heavy right tail of ranking points.
for side in ('winner', 'loser'):
    data[f'log_{side}_rank_points'] = np.log1p(data[f'{side}_rank_points'])
# Winner minus loser on the log scale.
data['log_rank_points_diff'] = data['log_winner_rank_points'].sub(data['log_loser_rank_points'])


##### 2. Age and experience

In [15]:
# Make sure both age columns are floats before subtracting.
age_cols = ['winner_age', 'loser_age']
data[age_cols] = data[age_cols].astype(float)
# Age gap, winner minus loser: a positive value means the winner was the older player.
data['age_diff'] = data['winner_age'].sub(data['loser_age'])


##### 3. Rounds context

In [27]:
# Encode the draw stage as an ordinal, from 1 (R128) up to 7 (final).
# NOTE(review): Series.map leaves any round label absent from this list as NaN — check
# whether the data contains other round codes (e.g. round-robin) and decide how to rank them.
round_sequence = ['R128', 'R64', 'R32', 'R16', 'QF', 'SF', 'F']
round_to_order = {label: position for position, label in enumerate(round_sequence, start=1)}
data['round_order'] = data['round'].map(round_to_order)

##### Exporting the pre-processed data for further analysis

In [28]:
# Persist the processed hard-court matches (row index omitted) for the modelling step.
output_path = Path("Hard_Court_Data.csv")
data.to_csv(output_path, index=False)