In [385]:
import pandas as pd
from datetime import datetime

In [386]:
data = pd.read_csv('RawATPData.csv')
data.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
0,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,63,77,-1,-1,-1.0,-1.0,6-4 6-2
1,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,56,5,-1,-1,-1.0,-1.0,3-6 3-6
2,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,40,655,-1,-1,-1.0,-1.0,6-7 7-5 6-3
3,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,87,65,-1,-1,-1.0,-1.0,1-6 4-6
4,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,81,198,-1,-1,-1.0,-1.0,7-6 5-7 6-4


In [387]:
data.shape

(63811, 17)

In [388]:
data.dtypes

Tournament     object
Date           object
Series         object
Court          object
Surface        object
Round          object
Best of         int64
Player_1       object
Player_2       object
Winner         object
Rank_1          int64
Rank_2          int64
Pts_1           int64
Pts_2           int64
Odd_1         float64
Odd_2         float64
Score          object
dtype: object

In [389]:
data["opp_code"] = data["Player_2"].astype("category").cat.codes
data["tour_code"] = data["Tournament"].astype("category").cat.codes
data["surface_code"] = data["Surface"].astype("category").cat.codes
data["round_code"] = data["Round"].astype("category").cat.codes
data["winner_code"] = data["Winner"].astype("category").cat.codes
data["court_code"] = data["Court"].astype("category").cat.codes


In [390]:
data['player_1_wins'] = (data['Winner'] == data['Player_1']).astype(int)
data['rank_diff'] = data['Rank_2'] - data['Rank_1']  # Positive = Player 1 better ranked
data['pts_diff'] = data['Pts_1'] - data['Pts_2']
data["rank_diff"] = data["rank_diff"].astype("category").cat.codes
data["pts_diff"] = data["pts_diff"].astype("category").cat.codes


In [391]:
del data["opp_code"]
del data["winner_code"]
del data["Score"]
data

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,Pts_2,Odd_1,Odd_2,tour_code,surface_code,round_code,court_code,player_1_wins,rank_diff,pts_diff
0,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,...,-1,-1.00,-1.00,24,3,0,1,1,827,4664
1,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,...,-1,-1.00,-1.00,24,3,0,1,0,762,4664
2,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,...,-1,-1.00,-1.00,24,3,0,1,1,1387,4664
3,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,...,-1,-1.00,-1.00,24,3,0,1,0,791,4664
4,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,...,-1,-1.00,-1.00,24,3,0,1,1,930,4664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63806,Japan Open Tennis Championships,2024-09-29 00:00:00.000,ATP500,Outdoor,Hard,Quarterfinals,3,Rune H.,Nishikori K.,Rune H.,...,288,1.57,2.38,125,3,4,1,1,999,6985
63807,Japan Open Tennis Championships,2024-09-29 00:00:00.000,ATP500,Outdoor,Hard,Quarterfinals,3,Shelton B.,Fils A.,Fils A.,...,1775,1.67,2.20,125,3,4,1,0,820,5379
63808,Japan Open Tennis Championships,2024-09-30 00:00:00.000,ATP500,Outdoor,Hard,Semifinals,3,Humbert U.,Machac T.,Humbert U.,...,1374,1.57,2.38,125,3,6,1,1,829,5660
63809,Japan Open Tennis Championships,2024-09-30 00:00:00.000,ATP500,Outdoor,Hard,Semifinals,3,Rune H.,Fils A.,Fils A.,...,1775,1.67,2.20,125,3,6,1,0,823,5669


In [392]:
from sklearn.ensemble import RandomForestClassifier

In [393]:

rf = RandomForestClassifier(
    n_estimators=200,           
    min_samples_split=10,       
    min_samples_leaf=5,       
    max_depth=15,             
    max_features='sqrt',       
    random_state=42            
)

In [394]:
data['Date'] = pd.to_datetime(data['Date'])
train = data[data["Date"] < datetime(2023, 1, 1)]
test = data[data["Date"] >= datetime(2023, 1, 1)]

In [395]:
predictors = [
    "Best of", "Rank_1", "Rank_2", "Pts_1", "Pts_2",
    "tour_code", "surface_code", "round_code", "court_code", "pts_diff", "rank_diff"
]








In [396]:
rf.fit(train[predictors], train["player_1_wins"])

In [397]:
preds= rf.predict(test[predictors])

In [398]:
from sklearn.metrics import accuracy_score, precision_score
accuracy_score = accuracy_score(test["player_1_wins"], preds)
accuracy_score
print(f"Model accuracy: %{accuracy_score * 100:.2f}")

Model accuracy: %63.22


In [399]:
combined = pd.DataFrame(dict(actual=test["player_1_wins"], predicted=preds))

In [400]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1504,924
1,862,1566


In [401]:
precision_score(test["player_1_wins"], preds)

0.6289156626506024

In [402]:
grouped_data_1 = data.groupby("Player_1")
# grouped_data_2 = data.groupby("Player_2")


In [411]:
def rolling_averages_1(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(8, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

# def rolling_averages_2(group, cols, new_cols):
#     group = group.sort_values("Date")
#     rolling_stats = group[cols].rolling(5, closed='left').mean()
#     group[new_cols] = rolling_stats
#     group = group.dropna(subset=new_cols)
#     return group


In [404]:
# data['hard_win'] = data['player_1_wins'] * (data['Surface'] == 'Hard')
# data['clay_win'] = data['player_1_wins'] * (data['Surface'] == 'Clay')
# data['grass_win'] = data['player_1_wins'] * (data['Surface'] == 'Grass')

In [405]:
cols1 = ["Rank_1", "Pts_1", "player_1_wins"]
new_cols1 = ["p1_rank_L5", "p1_pts_L5", "p1_wr_L5"]

# cols2 = ["Rank_1", "Pts_1", "player_1_wins", "hard_win", "clay_win", "grass_win"]  # Same columns!
# new_cols2 = ["p2_rank_L5", "p2_pts_L5", "p2_wr_L5", "p2_hard_wr_L5", "p2_clay_wr_L5", "p2_grass_wr_L5"]

In [412]:
# Apply the rolling_averages_1 function to each group in grouped_data_1
tennis_rolling_p1 = grouped_data_1.apply(rolling_averages_1, cols=cols1, new_cols=new_cols1).reset_index(drop=True)

In [408]:
# def make_predictions(data, predictors):
#     train = data[data["Date"] < datetime(2023, 1, 1)]  # Fixed: "Date" not "date"
#     test = data[data["Date"] >= datetime(2023, 1, 1)]
#     rf.fit(train[predictors], train["player_1_wins"])
#     preds = rf.predict(test[predictors])
#     combined = pd.DataFrame(dict(actual=test["player_1_wins"], predicted=preds), index=test.index)
#     error = precision_score(test["player_1_wins"], preds)
#     return combined, error
def make_predictions(data, predictors):
    train = data[data["Date"] < '2023-01-01']
    test = data[data["Date"] >= '2023-01-01']
    rf.fit(train[predictors], train["player_1_wins"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["player_1_wins"], predicted=preds), index=test.index)
    error = precision_score(test["player_1_wins"], preds)
    return combined, error

In [413]:
# # Test baseline (no rolling stats)
# combined_baseline, error_baseline = make_predictions(data, predictors)
# print(f"Baseline accuracy: {error_baseline:.4f}")

# # Test with Player_1 rolling stats only
# p1_predictors = predictors + new_cols1
# combined_p1, error_p1 = make_predictions(tennis_rolling_p1, p1_predictors)
# print(f"P1 rolling accuracy: {error_p1:.4f}")
# print(f"Improvement: {error_p1 - error_baseline:.4f}")


combined, error = make_predictions(tennis_rolling_p1, predictors + new_cols1)

In [414]:
error

0.6263031275060145

In [358]:
combined

Unnamed: 0,actual,predicted
333,0,0
450,0,1
451,0,0
452,1,1
453,0,0
...,...,...
58441,1,1
58442,1,1
58443,0,0
58444,1,1


Earliest test date: 2023-01-01 00:00:00
Latest test date: 2024-10-02 00:00:00
