In [25]:
import pandas as pd
from datetime import datetime

In [26]:
data = pd.read_csv('RawATPData.csv')
data.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
0,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,63,77,-1,-1,-1.0,-1.0,6-4 6-2
1,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,56,5,-1,-1,-1.0,-1.0,3-6 3-6
2,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,40,655,-1,-1,-1.0,-1.0,6-7 7-5 6-3
3,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,87,65,-1,-1,-1.0,-1.0,1-6 4-6
4,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,81,198,-1,-1,-1.0,-1.0,7-6 5-7 6-4


In [27]:
data.shape

(63811, 17)

In [28]:
data.dtypes

Tournament     object
Date           object
Series         object
Court          object
Surface        object
Round          object
Best of         int64
Player_1       object
Player_2       object
Winner         object
Rank_1          int64
Rank_2          int64
Pts_1           int64
Pts_2           int64
Odd_1         float64
Odd_2         float64
Score          object
dtype: object

In [29]:
data["opp_code"] = data["Player_2"].astype("category").cat.codes
data["tour_code"] = data["Tournament"].astype("category").cat.codes
data["surface_code"] = data["Surface"].astype("category").cat.codes
data["round_code"] = data["Round"].astype("category").cat.codes
data["winner_code"] = data["Winner"].astype("category").cat.codes
data["court_code"] = data["Court"].astype("category").cat.codes


In [30]:
data['player_1_wins'] = (data['Winner'] == data['Player_1']).astype(int)
data['rank_diff'] = data['Rank_2'] - data['Rank_1']  # Positive = Player 1 better ranked
data['pts_diff'] = data['Pts_1'] - data['Pts_2']
data["rank_diff"] = data["rank_diff"].astype("category").cat.codes
data["pts_diff"] = data["pts_diff"].astype("category").cat.codes


In [31]:

data

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,Score,opp_code,tour_code,surface_code,round_code,winner_code,court_code,player_1_wins,rank_diff,pts_diff
0,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,...,6-4 6-2,750,24,3,0,251,1,1,827,4664
1,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,...,3-6 3-6,356,24,3,0,273,1,0,762,4664
2,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,...,6-7 7-5 6-3,62,24,3,0,278,1,1,1387,4664
3,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,...,1-6 4-6,379,24,3,0,290,1,0,791,4664
4,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,...,7-6 5-7 6-4,1406,24,3,0,314,1,1,930,4664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63806,Japan Open Tennis Championships,2024-09-29 00:00:00.000,ATP500,Outdoor,Hard,Quarterfinals,3,Rune H.,Nishikori K.,Rune H.,...,3-6 6-2 7-5,941,125,3,4,857,1,1,999,6985
63807,Japan Open Tennis Championships,2024-09-29 00:00:00.000,ATP500,Outdoor,Hard,Quarterfinals,3,Shelton B.,Fils A.,Fils A.,...,5-7 7-6 6-7,389,125,3,4,299,1,0,820,5379
63808,Japan Open Tennis Championships,2024-09-30 00:00:00.000,ATP500,Outdoor,Hard,Semifinals,3,Humbert U.,Machac T.,Humbert U.,...,6-3 3-6 6-2,779,125,3,6,439,1,1,829,5660
63809,Japan Open Tennis Championships,2024-09-30 00:00:00.000,ATP500,Outdoor,Hard,Semifinals,3,Rune H.,Fils A.,Fils A.,...,6-7 6-7,389,125,3,6,299,1,0,823,5669


In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=5)

In [34]:
data['Date'] = pd.to_datetime(data['Date'])
train = data[data["Date"] < datetime(2023, 1, 1)]
test = data[data["Date"] >= datetime(2023, 1, 1)]

In [35]:
predictors = [
    "Best of", "Rank_1", "Rank_2", "Pts_1", "Pts_2",
    "tour_code", "surface_code", "round_code", "court_code", "pts_diff", "rank_diff"
]








In [36]:
rf.fit(train[predictors], train["player_1_wins"])

In [37]:
preds= rf.predict(test[predictors])

In [38]:
from sklearn.metrics import accuracy_score, precision_score
accuracy_score = accuracy_score(test["player_1_wins"], preds)
accuracy_score
print(f"Model accuracy: %{accuracy_score * 100:.2f}")

Model accuracy: %61.80


In [39]:
# combined = pd.DataFrame(dict(actual=test["player_1_wins"], predicted=preds))

In [40]:
# pd.crosstab(index=combined["actual"], columns=combined["predicted"], rownames=["Actual"], colnames=["Predicted"])

In [41]:
# precision_score(test["player_1_wins"], preds)

In [42]:
# grouped_data = data.groupby("Player_1")

In [43]:
# group = grouped_data.get_group("Sinner J.").sort_values("Date")
# group

In [44]:
# def rolling_averages(group, cols, new_cols):
#     group = group.sort_values("Date")
#     rolling_stats = group[cols].rolling(3, closed='left').mean()
#     group[new_cols] = rolling_stats
#     group = group.dropna(subset=new_cols)
#     return group

In [45]:
# cols = ["Rank_1", "Pts_1", "player_1_wins"]
# new_cols = [f"{col}_rolling_avg" for col in cols]

# rolling_avg = rolling_averages(group, cols, new_cols)

In [46]:
# def make_predictions(data, predictors):
#     data['Date'] = pd.to_datetime(data['Date'])
#     train = data[data["Date"] < datetime(2023, 1, 1)]
#     test = data[data["Date"] >= datetime(2023, 1, 1)]
#     rf.fit(train[predictors], train["player_1_wins"])
#     preds = rf.predict(test[predictors])
#     combined = pd.DataFrame(dict(actual=test["player_1_wins"], predicted=preds), index=test.index)
#     error = precision_score(test["player_1_wins"], preds)
#     return combined, error

In [47]:
# combined, error = make_predictions(rolling_avg, predictors + new_cols)
# print(f"Model accuracy with rolling averages: %{error * 100:.2f}")