In [151]:
import pandas as pd
from datetime import datetime

In [152]:
data = pd.read_csv('RawATPData.csv')
data.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
0,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,63,77,-1,-1,-1.0,-1.0,6-4 6-2
1,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,56,5,-1,-1,-1.0,-1.0,3-6 3-6
2,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,40,655,-1,-1,-1.0,-1.0,6-7 7-5 6-3
3,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,87,65,-1,-1,-1.0,-1.0,1-6 4-6
4,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,81,198,-1,-1,-1.0,-1.0,7-6 5-7 6-4


In [153]:
data.shape

(63811, 17)

In [154]:
data.dtypes

Tournament     object
Date           object
Series         object
Court          object
Surface        object
Round          object
Best of         int64
Player_1       object
Player_2       object
Winner         object
Rank_1          int64
Rank_2          int64
Pts_1           int64
Pts_2           int64
Odd_1         float64
Odd_2         float64
Score          object
dtype: object

In [155]:
data["opp_code"] = data["Player_2"].astype("category").cat.codes
data["tour_code"] = data["Tournament"].astype("category").cat.codes
data["surface_code"] = data["Surface"].astype("category").cat.codes
data["round_code"] = data["Round"].astype("category").cat.codes
data["winner_code"] = data["Winner"].astype("category").cat.codes
data["court_code"] = data["Court"].astype("category").cat.codes


In [156]:
data['player_1_wins'] = (data['Winner'] == data['Player_1']).astype(int)

data['rank_diff'] = data['Rank_2'] - data['Rank_1'] 
data["rank_diff"] = data["rank_diff"].astype("category").cat.codes



In [157]:
del data["opp_code"]
del data["winner_code"]
del data["Score"]
data

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,Pts_1,Pts_2,Odd_1,Odd_2,tour_code,surface_code,round_code,court_code,player_1_wins,rank_diff
0,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,...,-1,-1,-1.00,-1.00,24,3,0,1,1,827
1,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,...,-1,-1,-1.00,-1.00,24,3,0,1,0,762
2,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,...,-1,-1,-1.00,-1.00,24,3,0,1,1,1387
3,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,...,-1,-1,-1.00,-1.00,24,3,0,1,0,791
4,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,...,-1,-1,-1.00,-1.00,24,3,0,1,1,930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63806,Japan Open Tennis Championships,2024-09-29 00:00:00.000,ATP500,Outdoor,Hard,Quarterfinals,3,Rune H.,Nishikori K.,Rune H.,...,2780,288,1.57,2.38,125,3,4,1,1,999
63807,Japan Open Tennis Championships,2024-09-29 00:00:00.000,ATP500,Outdoor,Hard,Quarterfinals,3,Shelton B.,Fils A.,Fils A.,...,2490,1775,1.67,2.20,125,3,4,1,0,820
63808,Japan Open Tennis Championships,2024-09-30 00:00:00.000,ATP500,Outdoor,Hard,Semifinals,3,Humbert U.,Machac T.,Humbert U.,...,2370,1374,1.57,2.38,125,3,6,1,1,829
63809,Japan Open Tennis Championships,2024-09-30 00:00:00.000,ATP500,Outdoor,Hard,Semifinals,3,Rune H.,Fils A.,Fils A.,...,2780,1775,1.67,2.20,125,3,6,1,0,823


In [158]:
from sklearn.ensemble import RandomForestClassifier

In [159]:
from sklearn.metrics import accuracy_score, precision_score

In [160]:
# pd.crosstab(index=combined["actual"], columns=combined["predicted"])

In [161]:
grouped_data_1 = data.groupby("Player_1")


In [162]:
def rolling_averages_1(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(10, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group



In [163]:
def calculate_surface_rolling_averages(data):
    surfaces = ['Hard', 'Clay', 'Grass']
    
    for surface in surfaces:
        surface_data = data[data['Surface'] == surface].copy()
        
        if len(surface_data) == 0:
            continue
        
        grouped = surface_data.groupby('Player_1')
        
        def surface_rolling_avg(group):
            group = group.sort_values('Date')
            group[f'p1_{surface.lower()}_wr_L5'] = group['player_1_wins'].rolling(8, closed='left').mean()
            group[f'p1_{surface.lower()}_rank_L5'] = group['Rank_1'].rolling(8, closed='left').mean()
            return group
        
        surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
        
        merge_cols = ['Date', 'Player_1', f'p1_{surface.lower()}_wr_L5', f'p1_{surface.lower()}_rank_L5']
        data = data.merge(
            surface_rolling[merge_cols], 
            on=['Date', 'Player_1'], 
            how='left'
        )
    
    return data

In [164]:
def calculate_player_2_rolling_averages(data):


    p2_data = data.copy()
    
    p2_data['Temp_Player_1'] = p2_data['Player_1']
    p2_data['Player_1'] = p2_data['Player_2']  
    p2_data['Player_2'] = p2_data['Temp_Player_1']  
    
    p2_data['Temp_Rank_1'] = p2_data['Rank_1']
    p2_data['Rank_1'] = p2_data['Rank_2']
    p2_data['Rank_2'] = p2_data['Temp_Rank_1']
    
    p2_data['Temp_Pts_1'] = p2_data['Pts_1']
    p2_data['Pts_1'] = p2_data['Pts_2']
    p2_data['Pts_2'] = p2_data['Temp_Pts_1']
    

    p2_data['player_1_wins'] = (p2_data['Winner'] == p2_data['Player_1']).astype(int)
    

    cols = ["Rank_1", "player_1_wins"]
    new_cols = ["temp_rank_L10", "temp_wr_L10"]
    
    grouped_p2 = p2_data.groupby("Player_1")
    p2_rolling = grouped_p2.apply(rolling_averages_1, cols=cols, new_cols=new_cols).reset_index(drop=True)
    
    p2_rolling = calculate_surface_rolling_averages(p2_rolling)
    

    merge_cols = ['Date', 'Player_1', 'temp_rank_L10', 'temp_wr_L10']
    
    for surface in ['hard', 'clay', 'grass']:
        if f'p1_{surface}_wr_L8' in p2_rolling.columns:
            merge_cols.extend([f'p1_{surface}_wr_L8', f'p1_{surface}_rank_L8'])
    
    p2_stats = p2_rolling[merge_cols].copy()
    rename_dict = {
        'Player_1': 'Player_2',
        'temp_rank_L10': 'p2_rank_L10',
        'temp_wr_L10': 'p2_wr_L10'
    }
    
    for surface in ['hard', 'clay', 'grass']:
        if f'p1_{surface}_wr_L8' in p2_stats.columns:
            rename_dict[f'p1_{surface}_wr_L8'] = f'p2_{surface}_wr_L8'
            rename_dict[f'p1_{surface}_rank_L8'] = f'p2_{surface}_rank_L8'
    
    p2_stats = p2_stats.rename(columns=rename_dict)
    
    data = data.merge(p2_stats, on=['Date', 'Player_2'], how='left')
    
    return data

In [165]:
cols1 = ["Rank_1", "player_1_wins"]
new_cols1 = ["p1_rank_L10",  "p1_wr_L10"]

In [166]:

tennis_rolling_p1 = grouped_data_1.apply(rolling_averages_1, cols=cols1, new_cols=new_cols1).reset_index(drop=True)
tennis_rolling_p1 = calculate_surface_rolling_averages(tennis_rolling_p1)

tennis_rolling_complete = calculate_player_2_rolling_averages(tennis_rolling_p1)

rolling_predictors = ["p1_rank_L10", "p1_wr_L10", "p2_rank_L10", "p2_wr_L10"]

surface_predictors = []
for surface in ['hard', 'clay', 'grass']:
    for player in ['p1', 'p2']:
        surface_predictors.extend([f'{player}_{surface}_wr_L8', f'{player}_{surface}_rank_L8'])

surface_predictors

  tennis_rolling_p1 = grouped_data_1.apply(rolling_averages_1, cols=cols1, new_cols=new_cols1).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  p2_rolling = grouped_p2.apply(rolling_averages_1, cols=cols, new_cols=new_cols).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)


['p1_hard_wr_L8',
 'p1_hard_rank_L8',
 'p2_hard_wr_L8',
 'p2_hard_rank_L8',
 'p1_clay_wr_L8',
 'p1_clay_rank_L8',
 'p2_clay_wr_L8',
 'p2_clay_rank_L8',
 'p1_grass_wr_L8',
 'p1_grass_rank_L8',
 'p2_grass_wr_L8',
 'p2_grass_rank_L8']

In [167]:
base_predictors = [
    "Best of", "Rank_1", "Rank_2",
    "tour_code", "surface_code", "round_code", "court_code", "rank_diff"
]


In [168]:
all_predictors = base_predictors + rolling_predictors + surface_predictors
available_predictors = [col for col in all_predictors if col in tennis_rolling_complete.columns]

print(f"Using {len(available_predictors)} predictors: {available_predictors}")

Using 12 predictors: ['Best of', 'Rank_1', 'Rank_2', 'tour_code', 'surface_code', 'round_code', 'court_code', 'rank_diff', 'p1_rank_L10', 'p1_wr_L10', 'p2_rank_L10', 'p2_wr_L10']


In [169]:

def make_predictions(data, predictors):
    train = data[data["Date"] < '2023-01-01']
    test = data[data["Date"] >= '2023-01-01']

    rf = RandomForestClassifier (
    n_estimators=500,
    min_samples_split=50,
    min_samples_leaf=20,
    max_depth=12,
    max_features='sqrt',
    random_state=1
    )

    rf.fit(train[predictors], train["player_1_wins"])
    preds = rf.predict(test[predictors])
    
    accuracy = accuracy_score(test["player_1_wins"], preds)
    precision = precision_score(test["player_1_wins"], preds)
    
    combined = pd.DataFrame(
        dict(actual=test["player_1_wins"], predicted=preds), 
        index=test.index
    )
    
    return combined, accuracy, precision, rf

In [170]:

print("Mens Tennis Match Prediction Model")


combined, accuracy, precision, model = make_predictions(tennis_rolling_complete, available_predictors)

print(f"Model Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Model Precision: {precision:.4f} ({precision*100:.2f}%)")

feature_importance = pd.DataFrame({
    'feature': available_predictors,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

feature_importance


Mens Tennis Match Prediction Model
Model Accuracy: 0.6247 (62.47%)
Model Precision: 0.6598 (65.98%)


Unnamed: 0,feature,importance
7,rank_diff,0.164215
8,p1_rank_L10,0.156514
11,p2_wr_L10,0.132554
1,Rank_1,0.118319
2,Rank_2,0.105216
10,p2_rank_L10,0.085357
3,tour_code,0.078508
9,p1_wr_L10,0.060092
5,round_code,0.057791
0,Best of,0.016181


In [171]:
print(tennis_rolling_complete.columns.tolist())

['Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of', 'Player_1', 'Player_2', 'Winner', 'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2', 'Odd_1', 'Odd_2', 'tour_code', 'surface_code', 'round_code', 'court_code', 'player_1_wins', 'rank_diff', 'p1_rank_L10', 'p1_wr_L10', 'p1_hard_wr_L5', 'p1_hard_rank_L5', 'p1_clay_wr_L5', 'p1_clay_rank_L5', 'p1_grass_wr_L5', 'p1_grass_rank_L5', 'p2_rank_L10', 'p2_wr_L10']


AI Generated Testing Function

In [172]:
#EVERYTHING IN THIS CELL IS AI GENERATED -> I did this for time sake to test my model on some matches

def predict_tennis_match(player_1_name, player_2_name, player_1_rank, player_2_rank, 
                        player_1_points, player_2_points, tournament_name, surface, 
                        round_name, court_type="Outdoor", best_of_sets=3):
    """
    Predict the outcome of any tennis match using your trained model
    """
    
    print(f"{tournament_name} - {round_name}")
    print(f"{player_1_name} (#{player_1_rank}) vs {player_2_name} (#{player_2_rank})")
    print(f"Surface: {surface}")
    print()
    
    # Calculate advantages
    rank_advantage = player_2_rank - player_1_rank
    points_advantage = player_1_points - player_2_points
    
    # Get rolling stats
    def get_player_rolling_stats(player_name, is_player_1=True):
        recent_data = tennis_rolling_complete[tennis_rolling_complete['Date'] >= '2023-01-01']
        
        if is_player_1:
            player_matches = recent_data[recent_data['Player_1'] == player_name]
        else:
            p1_matches = recent_data[recent_data['Player_1'] == player_name]
            p2_matches = recent_data[recent_data['Player_2'] == player_name]
            player_matches = pd.concat([p1_matches, p2_matches])
        
        if len(player_matches) > 0:
            latest_match = player_matches.iloc[-1]
            if is_player_1:
                rank_l10 = latest_match.get('p1_rank_L10', player_1_rank)
                wr_l10 = latest_match.get('p1_wr_L10', 0.5)
            else:
                rank_l10 = latest_match.get('p2_rank_L10', player_2_rank)
                wr_l10 = latest_match.get('p2_wr_L10', 0.5)
            return rank_l10, wr_l10
        else:
            return (player_1_rank, 0.5) if is_player_1 else (player_2_rank, 0.5)
    
    p1_rank_l10, p1_wr_l10 = get_player_rolling_stats(player_1_name, True)
    p2_rank_l10, p2_wr_l10 = get_player_rolling_stats(player_2_name, False)
    
    # Categorical mapping
    def safe_categorical_map(value, category_series):
        categories = category_series.astype("category").cat.categories
        try:
            return list(categories).index(value)
        except ValueError:
            return 0
    
    tour_code = safe_categorical_map(tournament_name, tennis_rolling_complete["Tournament"])
    surface_code = safe_categorical_map(surface, tennis_rolling_complete["Surface"])
    round_code = safe_categorical_map(round_name, tennis_rolling_complete["Round"])
    court_code = safe_categorical_map(court_type, tennis_rolling_complete["Court"])
    
    # Handle differences encoding
    temp_data = tennis_rolling_complete.copy()
    temp_data.loc[len(temp_data)] = [None] * len(temp_data.columns)
    temp_data.loc[len(temp_data)-1, 'Rank_1'] = player_1_rank
    temp_data.loc[len(temp_data)-1, 'Rank_2'] = player_2_rank  
    temp_data.loc[len(temp_data)-1, 'Pts_1'] = player_1_points
    temp_data.loc[len(temp_data)-1, 'Pts_2'] = player_2_points
    
    temp_data['rank_diff_raw'] = temp_data['Rank_2'] - temp_data['Rank_1']
    temp_data['pts_diff_raw'] = temp_data['Pts_1'] - temp_data['Pts_2']
    
    rank_diff_code = temp_data['rank_diff_raw'].astype("category").cat.codes.iloc[-1]
    pts_diff_code = temp_data['pts_diff_raw'].astype("category").cat.codes.iloc[-1]
    
    # Create prediction data
    prediction_data = pd.DataFrame({
        "Best of": [best_of_sets],
        "Rank_1": [player_1_rank],
        "Rank_2": [player_2_rank], 
        "Pts_1": [player_1_points],
        "Pts_2": [player_2_points],
        "tour_code": [tour_code],
        "surface_code": [surface_code],
        "round_code": [round_code],
        "court_code": [court_code],
        "pts_diff": [pts_diff_code],
        "rank_diff": [rank_diff_code],
        "p1_rank_L10": [p1_rank_l10],
        "p1_wr_L10": [p1_wr_l10],
        "p2_rank_L10": [p2_rank_l10],
        "p2_wr_L10": [p2_wr_l10]
    })
    
    # Make prediction
    try:
        prediction = model.predict(prediction_data[available_predictors])[0]
        probabilities = model.predict_proba(prediction_data[available_predictors])[0]
    except Exception as e:
        print(f"Error: {e}")
        return None
    
    # Results
    if prediction == 1:
        winner = player_1_name
        confidence = probabilities[1] * 100
    else:
        winner = player_2_name
        confidence = probabilities[0] * 100
    
    print(f"Predicted Winner: {winner}")
    print(f"Confidence: {confidence:.1f}%")
    print()
    print(f"{player_1_name}: {probabilities[1]*100:.1f}%")
    print(f"{player_2_name}: {probabilities[0]*100:.1f}%")
    
    if abs(rank_advantage) >= 10:
        print(f"\nRanking gap: {abs(rank_advantage)} positions")
    
    return {
        'predicted_winner': winner,
        'player_1_win_probability': probabilities[1],
        'player_2_win_probability': probabilities[0], 
        'confidence': confidence,
        'prediction_binary': prediction
    }

# def quick_predict(p1_name, p2_name, p1_rank, p2_rank, p1_pts=None, p2_pts=None, 
#                   surface="Hard", tournament="ATP Tournament"):
#     """Quick prediction with minimal inputs"""
#     if p1_pts is None:
#         p1_pts = max(3000 - p1_rank * 20, 100)
#     if p2_pts is None:
#         p2_pts = max(3000 - p2_rank * 20, 100)
    
#     return predict_tennis_match(
#         player_1_name=p1_name,
#         player_2_name=p2_name,
#         player_1_rank=p1_rank,
#         player_2_rank=p2_rank,
#         player_1_points=p1_pts,
#         player_2_points=p2_pts,
#         tournament_name=tournament,
#         surface=surface,
#         round_name="1st Round"
#     )



In [173]:

result = predict_tennis_match(
    player_1_name="Djokovic N.",    
    player_2_name="Sinner J.",         
    player_1_rank=1,                      
    player_2_rank=4,                      
    player_1_points=12000,               
    player_2_points=9500,                
    tournament_name="Australian Open",     
    surface="Hard",                       
    round_name="Semifinals",             
    best_of_sets=5
)

Australian Open - Semifinals
Djokovic N. (#1) vs Sinner J. (#4)
Surface: Hard

Predicted Winner: Sinner J.
Confidence: 60.7%

Djokovic N.: 39.3%
Sinner J.: 60.7%


  temp_data.loc[len(temp_data)] = [None] * len(temp_data.columns)
