In [200]:
import pandas as pd
from datetime import datetime

In [201]:
data = pd.read_csv('RawATPData.csv')
data.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
0,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,63,77,-1,-1,-1.0,-1.0,6-4 6-2
1,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,56,5,-1,-1,-1.0,-1.0,3-6 3-6
2,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,40,655,-1,-1,-1.0,-1.0,6-7 7-5 6-3
3,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,87,65,-1,-1,-1.0,-1.0,1-6 4-6
4,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,81,198,-1,-1,-1.0,-1.0,7-6 5-7 6-4


In [202]:
data.shape

(63811, 17)

In [203]:
data.dtypes

Tournament     object
Date           object
Series         object
Court          object
Surface        object
Round          object
Best of         int64
Player_1       object
Player_2       object
Winner         object
Rank_1          int64
Rank_2          int64
Pts_1           int64
Pts_2           int64
Odd_1         float64
Odd_2         float64
Score          object
dtype: object

In [204]:
data["opp_code"] = data["Player_2"].astype("category").cat.codes
data["tour_code"] = data["Tournament"].astype("category").cat.codes
data["surface_code"] = data["Surface"].astype("category").cat.codes
data["round_code"] = data["Round"].astype("category").cat.codes
data["winner_code"] = data["Winner"].astype("category").cat.codes
data["court_code"] = data["Court"].astype("category").cat.codes


In [205]:
data['player_1_wins'] = (data['Winner'] == data['Player_1']).astype(int)

data['rank_diff'] = data['Rank_2'] - data['Rank_1'] 
data['pts_diff'] = data['Pts_1'] - data['Pts_2']
data["rank_diff"] = data["rank_diff"].astype("category").cat.codes
data["pts_diff"] = data["pts_diff"].astype("category").cat.codes


In [206]:
del data["opp_code"]
del data["winner_code"]
del data["Score"]
data

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,Pts_2,Odd_1,Odd_2,tour_code,surface_code,round_code,court_code,player_1_wins,rank_diff,pts_diff
0,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,...,-1,-1.00,-1.00,24,3,0,1,1,827,4664
1,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,...,-1,-1.00,-1.00,24,3,0,1,0,762,4664
2,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,...,-1,-1.00,-1.00,24,3,0,1,1,1387,4664
3,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,...,-1,-1.00,-1.00,24,3,0,1,0,791,4664
4,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,...,-1,-1.00,-1.00,24,3,0,1,1,930,4664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63806,Japan Open Tennis Championships,2024-09-29 00:00:00.000,ATP500,Outdoor,Hard,Quarterfinals,3,Rune H.,Nishikori K.,Rune H.,...,288,1.57,2.38,125,3,4,1,1,999,6985
63807,Japan Open Tennis Championships,2024-09-29 00:00:00.000,ATP500,Outdoor,Hard,Quarterfinals,3,Shelton B.,Fils A.,Fils A.,...,1775,1.67,2.20,125,3,4,1,0,820,5379
63808,Japan Open Tennis Championships,2024-09-30 00:00:00.000,ATP500,Outdoor,Hard,Semifinals,3,Humbert U.,Machac T.,Humbert U.,...,1374,1.57,2.38,125,3,6,1,1,829,5660
63809,Japan Open Tennis Championships,2024-09-30 00:00:00.000,ATP500,Outdoor,Hard,Semifinals,3,Rune H.,Fils A.,Fils A.,...,1775,1.67,2.20,125,3,6,1,0,823,5669


In [207]:
from sklearn.ensemble import RandomForestClassifier

In [208]:
from sklearn.metrics import accuracy_score, precision_score

In [209]:
# pd.crosstab(index=combined["actual"], columns=combined["predicted"])

In [210]:
grouped_data_1 = data.groupby("Player_1")
# grouped_data_2 = data.groupby("Player_2")


In [211]:
def rolling_averages_1(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(10, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group



In [212]:
def calculate_surface_rolling_averages(data):
    surfaces = ['Hard', 'Clay', 'Grass']
    
    for surface in surfaces:
        surface_data = data[data['Surface'] == surface].copy()
        
        if len(surface_data) == 0:
            continue
        
        grouped = surface_data.groupby('Player_1')
        
        def surface_rolling_avg(group):
            group = group.sort_values('Date')
            group[f'p1_{surface.lower()}_wr_L5'] = group['player_1_wins'].rolling(8, closed='left').mean()
            group[f'p1_{surface.lower()}_rank_L5'] = group['Rank_1'].rolling(8, closed='left').mean()
            return group
        
        surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
        
        merge_cols = ['Date', 'Player_1', f'p1_{surface.lower()}_wr_L5', f'p1_{surface.lower()}_rank_L5']
        data = data.merge(
            surface_rolling[merge_cols], 
            on=['Date', 'Player_1'], 
            how='left'
        )
    
    return data

In [213]:
def calculate_player_2_rolling_averages(data):


    p2_data = data.copy()
    
    p2_data['Temp_Player_1'] = p2_data['Player_1']
    p2_data['Player_1'] = p2_data['Player_2']  
    p2_data['Player_2'] = p2_data['Temp_Player_1']  
    
    p2_data['Temp_Rank_1'] = p2_data['Rank_1']
    p2_data['Rank_1'] = p2_data['Rank_2']
    p2_data['Rank_2'] = p2_data['Temp_Rank_1']
    
    p2_data['Temp_Pts_1'] = p2_data['Pts_1']
    p2_data['Pts_1'] = p2_data['Pts_2']
    p2_data['Pts_2'] = p2_data['Temp_Pts_1']
    

    p2_data['player_1_wins'] = (p2_data['Winner'] == p2_data['Player_1']).astype(int)
    

    cols = ["Rank_1", "player_1_wins"]
    new_cols = ["temp_rank_L10", "temp_wr_L10"]
    
    grouped_p2 = p2_data.groupby("Player_1")
    p2_rolling = grouped_p2.apply(rolling_averages_1, cols=cols, new_cols=new_cols).reset_index(drop=True)
    
    p2_rolling = calculate_surface_rolling_averages(p2_rolling)
    

    merge_cols = ['Date', 'Player_1', 'temp_rank_L10', 'temp_wr_L10']
    
    for surface in ['hard', 'clay', 'grass']:
        if f'p1_{surface}_wr_L8' in p2_rolling.columns:
            merge_cols.extend([f'p1_{surface}_wr_L8', f'p1_{surface}_rank_L8'])
    
    p2_stats = p2_rolling[merge_cols].copy()
    rename_dict = {
        'Player_1': 'Player_2',
        'temp_rank_L10': 'p2_rank_L10',
        'temp_wr_L10': 'p2_wr_L10'
    }
    
    for surface in ['hard', 'clay', 'grass']:
        if f'p1_{surface}_wr_L8' in p2_stats.columns:
            rename_dict[f'p1_{surface}_wr_L8'] = f'p2_{surface}_wr_L8'
            rename_dict[f'p1_{surface}_rank_L8'] = f'p2_{surface}_rank_L8'
    
    p2_stats = p2_stats.rename(columns=rename_dict)
    
    data = data.merge(p2_stats, on=['Date', 'Player_2'], how='left')
    
    return data

In [214]:
cols1 = ["Rank_1", "player_1_wins"]
new_cols1 = ["p1_rank_L10",  "p1_wr_L10"]

In [215]:

tennis_rolling_p1 = grouped_data_1.apply(rolling_averages_1, cols=cols1, new_cols=new_cols1).reset_index(drop=True)
tennis_rolling_p1 = calculate_surface_rolling_averages(tennis_rolling_p1)

tennis_rolling_complete = calculate_player_2_rolling_averages(tennis_rolling_p1)

rolling_predictors = ["p1_rank_L10", "p1_wr_L10", "p2_rank_L10", "p2_wr_L10"]

surface_predictors = []
for surface in ['hard', 'clay', 'grass']:
    for player in ['p1', 'p2']:
        surface_predictors.extend([f'{player}_{surface}_wr_L8', f'{player}_{surface}_rank_L8'])

surface_predictors

  tennis_rolling_p1 = grouped_data_1.apply(rolling_averages_1, cols=cols1, new_cols=new_cols1).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  p2_rolling = grouped_p2.apply(rolling_averages_1, cols=cols, new_cols=new_cols).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)


['p1_hard_wr_L8',
 'p1_hard_rank_L8',
 'p2_hard_wr_L8',
 'p2_hard_rank_L8',
 'p1_clay_wr_L8',
 'p1_clay_rank_L8',
 'p2_clay_wr_L8',
 'p2_clay_rank_L8',
 'p1_grass_wr_L8',
 'p1_grass_rank_L8',
 'p2_grass_wr_L8',
 'p2_grass_rank_L8']

In [216]:
base_predictors = [
    "Best of", "Rank_1", "Rank_2", "Pts_1", "Pts_2",
    "tour_code", "surface_code", "round_code", "court_code", "pts_diff", "rank_diff"
]


In [217]:
all_predictors = base_predictors + rolling_predictors + surface_predictors
available_predictors = [col for col in all_predictors if col in tennis_rolling_complete.columns]

print(f"Using {len(available_predictors)} predictors: {available_predictors}")

Using 15 predictors: ['Best of', 'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2', 'tour_code', 'surface_code', 'round_code', 'court_code', 'pts_diff', 'rank_diff', 'p1_rank_L10', 'p1_wr_L10', 'p2_rank_L10', 'p2_wr_L10']


In [241]:
# def make_predictions(data, predictors):
#     train = data[data["Date"] < datetime(2023, 1, 1)]  # Fixed: "Date" not "date"
#     test = data[data["Date"] >= datetime(2023, 1, 1)]
#     rf.fit(train[predictors], train["player_1_wins"])
#     preds = rf.predict(test[predictors])
#     combined = pd.DataFrame(dict(actual=test["player_1_wins"], predicted=preds), index=test.index)
#     error = precision_score(test["player_1_wins"], preds)
#     return combined, error
def make_predictions(data, predictors):
    train = data[data["Date"] < '2023-01-01']
    test = data[data["Date"] >= '2023-01-01']

    rf = RandomForestClassifier (
    n_estimators=500,
    min_samples_split=50,
    min_samples_leaf=20,
    max_depth=12,
    max_features='sqrt',
    random_state=42
    )

    rf.fit(train[predictors], train["player_1_wins"])
    preds = rf.predict(test[predictors])
    
    accuracy = accuracy_score(test["player_1_wins"], preds)
    precision = precision_score(test["player_1_wins"], preds)
    
    combined = pd.DataFrame(
        dict(actual=test["player_1_wins"], predicted=preds), 
        index=test.index
    )
    
    return combined, accuracy, precision, rf

In [242]:

print("ENHANCED MODEL WITH SURFACE ROLLING AVERAGES")


combined, accuracy, precision, model = make_predictions(tennis_rolling_complete, available_predictors)

print(f"Model Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Model Precision: {precision:.4f} ({precision*100:.2f}%)")

feature_importance = pd.DataFrame({
    'feature': available_predictors,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

feature_importance


ENHANCED MODEL WITH SURFACE ROLLING AVERAGES
Model Accuracy: 0.6409 (64.09%)
Model Precision: 0.6468 (64.68%)


Unnamed: 0,feature,importance
10,rank_diff,0.155587
11,p1_rank_L10,0.147641
14,p2_wr_L10,0.128104
1,Rank_1,0.116588
2,Rank_2,0.102978
13,p2_rank_L10,0.084004
5,tour_code,0.073731
12,p1_wr_L10,0.060521
7,round_code,0.052758
0,Best of,0.015986


In [220]:
print(tennis_rolling_complete.columns.tolist())

['Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of', 'Player_1', 'Player_2', 'Winner', 'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2', 'Odd_1', 'Odd_2', 'tour_code', 'surface_code', 'round_code', 'court_code', 'player_1_wins', 'rank_diff', 'pts_diff', 'p1_rank_L10', 'p1_wr_L10', 'p1_hard_wr_L5', 'p1_hard_rank_L5', 'p1_clay_wr_L5', 'p1_clay_rank_L5', 'p1_grass_wr_L5', 'p1_grass_rank_L5', 'p2_rank_L10', 'p2_wr_L10']
