In [285]:
import pandas as pd
from datetime import datetime

In [286]:
data = pd.read_csv('RawATPData.csv')
data.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
0,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,63,77,-1,-1,-1.0,-1.0,6-4 6-2
1,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,56,5,-1,-1,-1.0,-1.0,3-6 3-6
2,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,40,655,-1,-1,-1.0,-1.0,6-7 7-5 6-3
3,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,87,65,-1,-1,-1.0,-1.0,1-6 4-6
4,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,81,198,-1,-1,-1.0,-1.0,7-6 5-7 6-4


In [287]:
data.shape

(63811, 17)

In [288]:
def calculate_weighted_h2h(data, decay_factor=0.1):
    """Calculate H2H with exponential decay for older matches"""
    
    from datetime import datetime
    import numpy as np
    
    data = data.sort_values('Date').reset_index(drop=True)
    data['h2h_weighted_p1'] = 0.5
    
    pair_matches = {}
    
    for idx, row in data.iterrows():
        player_1 = row['Player_1']
        player_2 = row['Player_2']
        winner = row['Winner']
        current_date = pd.to_datetime(row['Date'])
        
        pair_key = tuple(sorted([player_1, player_2]))
        
        if pair_key not in pair_matches:
            pair_matches[pair_key] = []
        
        # Calculate weighted H2H based on previous matches
        if len(pair_matches[pair_key]) > 0:
            weighted_score = 0
            total_weight = 0
            
            for match in pair_matches[pair_key]:
                match_date = pd.to_datetime(match['date'])
                days_ago = (current_date - match_date).days
                weight = np.exp(-decay_factor * days_ago / 365)  # Decay over years
                
                if match['winner'] == player_1:
                    weighted_score += weight
                total_weight += weight
            
            if total_weight > 0:
                data.at[idx, 'h2h_weighted_p1'] = weighted_score / total_weight
        
        # Add current match to history
        pair_matches[pair_key].append({
            'winner': winner,
            'date': row['Date']
        })
    
    return data

In [289]:
def calculate_h2h_records(data):
    """Calculate head-to-head records for all player pairs"""
    
    # Sort data chronologically
    data = data.sort_values('Date').reset_index(drop=True)
    
    # Initialize H2H columns
    data['h2h_wins_p1'] = 0
    data['h2h_total_matches'] = 0
    data['h2h_win_rate_p1'] = 0.5  # Default 50% if no history
    
    # Track H2H records for each player pair
    h2h_records = {}
    
    for idx, row in data.iterrows():
        player_1 = row['Player_1']
        player_2 = row['Player_2']
        winner = row['Winner']
        
        # Create ordered pair key (alphabetical order to ensure consistency)
        pair_key = tuple(sorted([player_1, player_2]))
        
        # Initialize if first meeting
        if pair_key not in h2h_records:
            h2h_records[pair_key] = {'total': 0, player_1: 0, player_2: 0}
        
        # Get current H2H record BEFORE this match
        total_previous = h2h_records[pair_key]['total']
        p1_wins = h2h_records[pair_key][player_1]
        
        # Store H2H stats before this match
        data.at[idx, 'h2h_total_matches'] = total_previous
        data.at[idx, 'h2h_wins_p1'] = p1_wins
        
        if total_previous > 0:
            data.at[idx, 'h2h_win_rate_p1'] = p1_wins / total_previous
        else:
            data.at[idx, 'h2h_win_rate_p1'] = 0.5
        
        # Update H2H record after this match
        h2h_records[pair_key]['total'] += 1
        h2h_records[pair_key][winner] += 1
    
    return data, h2h_records

In [290]:
def calculate_surface_h2h(data):
    """Calculate H2H records by surface type"""
    
    surfaces = ['Hard', 'Clay', 'Grass']
    
    for surface in surfaces:
        surface_data = data[data['Surface'] == surface].copy()
        surface_data = surface_data.sort_values('Date')
        
        # Initialize surface H2H tracking
        surface_h2h = {}
        surface_data[f'h2h_{surface.lower()}_wins_p1'] = 0
        surface_data[f'h2h_{surface.lower()}_total'] = 0
        surface_data[f'h2h_{surface.lower()}_wr_p1'] = 0.5
        
        for idx, row in surface_data.iterrows():
            player_1 = row['Player_1']
            player_2 = row['Player_2']
            winner = row['Winner']
            
            pair_key = tuple(sorted([player_1, player_2]))
            
            if pair_key not in surface_h2h:
                surface_h2h[pair_key] = {'total': 0, player_1: 0, player_2: 0}
            
            # Store current H2H before match
            total = surface_h2h[pair_key]['total']
            p1_wins = surface_h2h[pair_key][player_1]
            
            surface_data.at[idx, f'h2h_{surface.lower()}_wins_p1'] = p1_wins
            surface_data.at[idx, f'h2h_{surface.lower()}_total'] = total
            
            if total > 0:
                surface_data.at[idx, f'h2h_{surface.lower()}_wr_p1'] = p1_wins / total
            
            # Update after match
            surface_h2h[pair_key]['total'] += 1
            surface_h2h[pair_key][winner] += 1
        
        # Merge back to main data
        merge_cols = ['Date', 'Player_1', 'Player_2'] + [col for col in surface_data.columns if f'h2h_{surface.lower()}' in col]
        data = data.merge(surface_data[merge_cols], on=['Date', 'Player_1', 'Player_2'], how='left')
    
    return data

In [291]:
def calculate_recent_h2h(data, lookback_matches=5):
    """Calculate H2H record for last N meetings only"""
    
    data = data.sort_values('Date').reset_index(drop=True)
    data[f'h2h_last_{lookback_matches}_wins_p1'] = 0
    data[f'h2h_last_{lookback_matches}_total'] = 0
    
    # Store match history for each pair
    pair_history = {}
    
    for idx, row in data.iterrows():
        player_1 = row['Player_1']
        player_2 = row['Player_2']
        winner = row['Winner']
        
        pair_key = tuple(sorted([player_1, player_2]))
        
        if pair_key not in pair_history:
            pair_history[pair_key] = []
        
        # Get last N matches between these players
        recent_matches = pair_history[pair_key][-lookback_matches:]
        
        # Count Player 1 wins in recent matches
        p1_recent_wins = sum(1 for match in recent_matches if match['winner'] == player_1)
        recent_total = len(recent_matches)
        
        data.at[idx, f'h2h_last_{lookback_matches}_wins_p1'] = p1_recent_wins
        data.at[idx, f'h2h_last_{lookback_matches}_total'] = recent_total
        
        # Add this match to history
        pair_history[pair_key].append({
            'winner': winner,
            'date': row['Date'],
            'surface': row['Surface']
        })
    
    return data

In [292]:
data.dtypes

Tournament     object
Date           object
Series         object
Court          object
Surface        object
Round          object
Best of         int64
Player_1       object
Player_2       object
Winner         object
Rank_1          int64
Rank_2          int64
Pts_1           int64
Pts_2           int64
Odd_1         float64
Odd_2         float64
Score          object
dtype: object

In [293]:
data["opp_code"] = data["Player_2"].astype("category").cat.codes
data["tour_code"] = data["Tournament"].astype("category").cat.codes
data["surface_code"] = data["Surface"].astype("category").cat.codes
data["round_code"] = data["Round"].astype("category").cat.codes
data["winner_code"] = data["Winner"].astype("category").cat.codes
data["court_code"] = data["Court"].astype("category").cat.codes


In [294]:
data['player_1_wins'] = (data['Winner'] == data['Player_1']).astype(int)

data['rank_diff'] = data['Rank_2'] - data['Rank_1'] 
data["rank_diff"] = data["rank_diff"].astype("category").cat.codes



In [295]:
del data["opp_code"]
del data["winner_code"]
del data["Score"]
data

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,Pts_1,Pts_2,Odd_1,Odd_2,tour_code,surface_code,round_code,court_code,player_1_wins,rank_diff
0,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,...,-1,-1,-1.00,-1.00,24,3,0,1,1,827
1,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,...,-1,-1,-1.00,-1.00,24,3,0,1,0,762
2,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,...,-1,-1,-1.00,-1.00,24,3,0,1,1,1387
3,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,...,-1,-1,-1.00,-1.00,24,3,0,1,0,791
4,Australian Hardcourt Championships,2000-01-03 00:00:00.000,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,...,-1,-1,-1.00,-1.00,24,3,0,1,1,930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63806,Japan Open Tennis Championships,2024-09-29 00:00:00.000,ATP500,Outdoor,Hard,Quarterfinals,3,Rune H.,Nishikori K.,Rune H.,...,2780,288,1.57,2.38,125,3,4,1,1,999
63807,Japan Open Tennis Championships,2024-09-29 00:00:00.000,ATP500,Outdoor,Hard,Quarterfinals,3,Shelton B.,Fils A.,Fils A.,...,2490,1775,1.67,2.20,125,3,4,1,0,820
63808,Japan Open Tennis Championships,2024-09-30 00:00:00.000,ATP500,Outdoor,Hard,Semifinals,3,Humbert U.,Machac T.,Humbert U.,...,2370,1374,1.57,2.38,125,3,6,1,1,829
63809,Japan Open Tennis Championships,2024-09-30 00:00:00.000,ATP500,Outdoor,Hard,Semifinals,3,Rune H.,Fils A.,Fils A.,...,2780,1775,1.67,2.20,125,3,6,1,0,823


In [296]:
data, h2h_records = calculate_h2h_records(data)
data = calculate_surface_h2h(data)
data = calculate_recent_h2h(data, lookback_matches=5)
data = calculate_weighted_h2h(data)

In [297]:
from sklearn.ensemble import RandomForestClassifier

In [298]:
from sklearn.metrics import accuracy_score, precision_score

In [299]:
# pd.crosstab(index=combined["actual"], columns=combined["predicted"])

In [300]:
grouped_data_1 = data.groupby("Player_1")


In [301]:
def rolling_averages_1(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(10, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group



In [302]:
def calculate_surface_rolling_averages(data):
    surfaces = ['Hard', 'Clay', 'Grass']
    
    for surface in surfaces:
        surface_data = data[data['Surface'] == surface].copy()
        
        if len(surface_data) == 0:
            continue
        
        grouped = surface_data.groupby('Player_1')
        
        def surface_rolling_avg(group):
            group = group.sort_values('Date')
            group[f'p1_{surface.lower()}_wr_L5'] = group['player_1_wins'].rolling(8, closed='left').mean()
            group[f'p1_{surface.lower()}_rank_L5'] = group['Rank_1'].rolling(8, closed='left').mean()
            return group
        
        surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
        
        merge_cols = ['Date', 'Player_1', f'p1_{surface.lower()}_wr_L5', f'p1_{surface.lower()}_rank_L5']
        data = data.merge(
            surface_rolling[merge_cols], 
            on=['Date', 'Player_1'], 
            how='left'
        )
    
    return data

In [303]:
def calculate_player_2_rolling_averages(data):


    p2_data = data.copy()
    
    p2_data['Temp_Player_1'] = p2_data['Player_1']
    p2_data['Player_1'] = p2_data['Player_2']  
    p2_data['Player_2'] = p2_data['Temp_Player_1']  
    
    p2_data['Temp_Rank_1'] = p2_data['Rank_1']
    p2_data['Rank_1'] = p2_data['Rank_2']
    p2_data['Rank_2'] = p2_data['Temp_Rank_1']
    
    p2_data['Temp_Pts_1'] = p2_data['Pts_1']
    p2_data['Pts_1'] = p2_data['Pts_2']
    p2_data['Pts_2'] = p2_data['Temp_Pts_1']
    

    p2_data['player_1_wins'] = (p2_data['Winner'] == p2_data['Player_1']).astype(int)
    

    cols = ["Rank_1", "player_1_wins"]
    new_cols = ["temp_rank_L10", "temp_wr_L10"]
    
    grouped_p2 = p2_data.groupby("Player_1")
    p2_rolling = grouped_p2.apply(rolling_averages_1, cols=cols, new_cols=new_cols).reset_index(drop=True)
    
    p2_rolling = calculate_surface_rolling_averages(p2_rolling)
    

    merge_cols = ['Date', 'Player_1', 'temp_rank_L10', 'temp_wr_L10']
    
    for surface in ['hard', 'clay', 'grass']:
        if f'p1_{surface}_wr_L8' in p2_rolling.columns:
            merge_cols.extend([f'p1_{surface}_wr_L8', f'p1_{surface}_rank_L8'])
    
    p2_stats = p2_rolling[merge_cols].copy()
    rename_dict = {
        'Player_1': 'Player_2',
        'temp_rank_L10': 'p2_rank_L10',
        'temp_wr_L10': 'p2_wr_L10'
    }
    
    for surface in ['hard', 'clay', 'grass']:
        if f'p1_{surface}_wr_L8' in p2_stats.columns:
            rename_dict[f'p1_{surface}_wr_L8'] = f'p2_{surface}_wr_L8'
            rename_dict[f'p1_{surface}_rank_L8'] = f'p2_{surface}_rank_L8'
    
    p2_stats = p2_stats.rename(columns=rename_dict)
    
    data = data.merge(p2_stats, on=['Date', 'Player_2'], how='left')
    
    return data

In [304]:
import numpy as np
import pandas as pd

def calculate_elo_ratings(data, initial_rating=1500, k_factor=32):
    """
    Calculate ELO ratings for tennis players
    """
    # Sort matches chronologically - important for ELO calculation
    data = data.sort_values('Date').reset_index(drop=True)
    
    # Track each player's current ELO rating
    elo_ratings = {}
    
    # Add ELO columns to data
    data['elo_1'] = 0.0  # Player 1's ELO before this match
    data['elo_2'] = 0.0  # Player 2's ELO before this match
    data['elo_diff'] = 0.0  # ELO difference (P1 - P2)
    
    # Process each match in chronological order
    for idx, row in data.iterrows():
        player_1 = row['Player_1']
        player_2 = row['Player_2']
        winner = row['Winner']
        
        # Get current ELO ratings (start at 1500 for new players)
        if player_1 not in elo_ratings:
            elo_ratings[player_1] = initial_rating
        if player_2 not in elo_ratings:
            elo_ratings[player_2] = initial_rating
            
        elo_1 = elo_ratings[player_1]
        elo_2 = elo_ratings[player_2]
        
        # Store ELO ratings before this match
        data.at[idx, 'elo_1'] = elo_1
        data.at[idx, 'elo_2'] = elo_2
        data.at[idx, 'elo_diff'] = elo_1 - elo_2
        
        # Calculate expected win probability for Player 1
        expected_1 = 1 / (1 + 10**((elo_2 - elo_1) / 400))
        
        # Determine who actually won (1 = win, 0 = loss)
        actual_1 = 1 if winner == player_1 else 0
        actual_2 = 1 - actual_1
        
        # Update ELO ratings based on result
        new_elo_1 = elo_1 + k_factor * (actual_1 - expected_1)
        new_elo_2 = elo_2 + k_factor * (actual_2 - (1 - expected_1))
        
        # Save updated ratings for next match
        elo_ratings[player_1] = new_elo_1
        elo_ratings[player_2] = new_elo_2
    
    return data









# Train your model with ELO features


In [305]:
cols1 = ["Rank_1", "player_1_wins"]
new_cols1 = ["p1_rank_L10",  "p1_wr_L10"]

In [306]:

# Add ELO features to your existing model
tennis_with_elo = calculate_elo_ratings(data.copy())

# Your existing pipeline but with ELO data
grouped_data_1 = tennis_with_elo.groupby("Player_1")
tennis_rolling_p1 = grouped_data_1.apply(rolling_averages_1, cols=cols1, new_cols=new_cols1).reset_index(drop=True)
tennis_rolling_p1 = calculate_surface_rolling_averages(tennis_rolling_p1)
tennis_rolling_complete = calculate_player_2_rolling_averages(tennis_rolling_p1)
h2h_predictors = [
    "h2h_wins_p1", "h2h_total_matches", "h2h_win_rate_p1",
    "h2h_hard_wins_p1", "h2h_clay_wins_p1", "h2h_grass_wins_p1",
    "h2h_last_5_wins_p1", "h2h_weighted_p1"
]

rolling_predictors = ["p1_rank_L10", "p1_wr_L10", "p2_rank_L10", "p2_wr_L10"]

surface_predictors=["p1_hard_wr_L5", "p1_clay_wr_L5", "p1_grass_wr_L5",
    "p1_hard_rank_L5", "p1_clay_rank_L5", "p1_grass_rank_L5",
    "p2_hard_wr_L5", "p2_clay_wr_L5", "p2_grass_wr_L5",
    "p2_hard_rank_L5", "p2_clay_rank_L5", "p2_grass_rank_L5"]

  tennis_rolling_p1 = grouped_data_1.apply(rolling_averages_1, cols=cols1, new_cols=new_cols1).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  p2_rolling = grouped_p2.apply(rolling_averages_1, cols=cols, new_cols=new_cols).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)
  surface_rolling = grouped.apply(surface_rolling_avg).reset_index(drop=True)


In [307]:
base_predictors = [
    "Best of", "Rank_1", "Rank_2",
    "tour_code", "surface_code", "round_code", "court_code", "rank_diff"
]
elo_predictors = ["elo_1", "elo_2", "elo_diff"]


In [308]:
# Combine all predictors
all_predictors = base_predictors + elo_predictors + rolling_predictors + surface_predictors + h2h_predictors
available_predictors = [col for col in all_predictors if col in tennis_rolling_complete.columns]


print(f"Using {len(available_predictors)} predictors: {available_predictors}")

Using 29 predictors: ['Best of', 'Rank_1', 'Rank_2', 'tour_code', 'surface_code', 'round_code', 'court_code', 'rank_diff', 'elo_1', 'elo_2', 'elo_diff', 'p1_rank_L10', 'p1_wr_L10', 'p2_rank_L10', 'p2_wr_L10', 'p1_hard_wr_L5', 'p1_clay_wr_L5', 'p1_grass_wr_L5', 'p1_hard_rank_L5', 'p1_clay_rank_L5', 'p1_grass_rank_L5', 'h2h_wins_p1', 'h2h_total_matches', 'h2h_win_rate_p1', 'h2h_hard_wins_p1', 'h2h_clay_wins_p1', 'h2h_grass_wins_p1', 'h2h_last_5_wins_p1', 'h2h_weighted_p1']


In [309]:

def make_predictions(data, predictors):
    train = data[data["Date"] < '2023-01-01']
    test = data[data["Date"] >= '2023-01-01']

    rf = RandomForestClassifier (
    n_estimators=500,
    min_samples_split=50,
    min_samples_leaf=20,
    max_depth=12,
    max_features='sqrt',
    random_state=1
    )

    rf.fit(train[predictors], train["player_1_wins"])
    preds = rf.predict(test[predictors])
    
    accuracy = accuracy_score(test["player_1_wins"], preds)
    precision = precision_score(test["player_1_wins"], preds)
    
    combined = pd.DataFrame(
        dict(actual=test["player_1_wins"], predicted=preds), 
        index=test.index
    )
    
    return combined, accuracy, precision, rf

In [310]:

print("Mens Tennis Match Prediction Model")


combined, accuracy, precision, model = make_predictions(tennis_rolling_complete, available_predictors)

print(f"Model Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Model Precision: {precision:.4f} ({precision*100:.2f}%)")

feature_importance = pd.DataFrame({
    'feature': available_predictors,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

feature_importance


Mens Tennis Match Prediction Model
Model Accuracy: 0.6444 (64.44%)
Model Precision: 0.6678 (66.78%)


Unnamed: 0,feature,importance
10,elo_diff,0.143355
7,rank_diff,0.084085
14,p2_wr_L10,0.082165
8,elo_1,0.078717
11,p1_rank_L10,0.07748
1,Rank_1,0.065235
9,elo_2,0.05869
2,Rank_2,0.05622
13,p2_rank_L10,0.047065
18,p1_hard_rank_L5,0.038159


In [311]:
print(tennis_rolling_complete.columns.tolist())

['Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of', 'Player_1', 'Player_2', 'Winner', 'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2', 'Odd_1', 'Odd_2', 'tour_code', 'surface_code', 'round_code', 'court_code', 'player_1_wins', 'rank_diff', 'h2h_wins_p1', 'h2h_total_matches', 'h2h_win_rate_p1', 'h2h_hard_wins_p1', 'h2h_hard_total', 'h2h_hard_wr_p1', 'h2h_clay_wins_p1', 'h2h_clay_total', 'h2h_clay_wr_p1', 'h2h_grass_wins_p1', 'h2h_grass_total', 'h2h_grass_wr_p1', 'h2h_last_5_wins_p1', 'h2h_last_5_total', 'h2h_weighted_p1', 'elo_1', 'elo_2', 'elo_diff', 'p1_rank_L10', 'p1_wr_L10', 'p1_hard_wr_L5', 'p1_hard_rank_L5', 'p1_clay_wr_L5', 'p1_clay_rank_L5', 'p1_grass_wr_L5', 'p1_grass_rank_L5', 'p2_rank_L10', 'p2_wr_L10']


AI Generated Testing Function

In [312]:
# #EVERYTHING IN THIS CELL IS AI GENERATED -> I did this for time sake to test my model on some matches

# def predict_tennis_match(player_1_name, player_2_name, player_1_rank, player_2_rank, 
#                         player_1_points, player_2_points, tournament_name, surface, 
#                         round_name, court_type="Outdoor", best_of_sets=3):
#     """
#     Predict the outcome of any tennis match using your trained model
#     """
    
#     print(f"{tournament_name} - {round_name}")
#     print(f"{player_1_name} (#{player_1_rank}) vs {player_2_name} (#{player_2_rank})")
#     print(f"Surface: {surface}")
#     print()
    
#     # Calculate advantages
#     rank_advantage = player_2_rank - player_1_rank
#     points_advantage = player_1_points - player_2_points
    
#     # Get rolling stats
#     def get_player_rolling_stats(player_name, is_player_1=True):
#         recent_data = tennis_rolling_complete[tennis_rolling_complete['Date'] >= '2023-01-01']
        
#         if is_player_1:
#             player_matches = recent_data[recent_data['Player_1'] == player_name]
#         else:
#             p1_matches = recent_data[recent_data['Player_1'] == player_name]
#             p2_matches = recent_data[recent_data['Player_2'] == player_name]
#             player_matches = pd.concat([p1_matches, p2_matches])
        
#         if len(player_matches) > 0:
#             latest_match = player_matches.iloc[-1]
#             if is_player_1:
#                 rank_l10 = latest_match.get('p1_rank_L10', player_1_rank)
#                 wr_l10 = latest_match.get('p1_wr_L10', 0.5)
#             else:
#                 rank_l10 = latest_match.get('p2_rank_L10', player_2_rank)
#                 wr_l10 = latest_match.get('p2_wr_L10', 0.5)
#             return rank_l10, wr_l10
#         else:
#             return (player_1_rank, 0.5) if is_player_1 else (player_2_rank, 0.5)
    
#     p1_rank_l10, p1_wr_l10 = get_player_rolling_stats(player_1_name, True)
#     p2_rank_l10, p2_wr_l10 = get_player_rolling_stats(player_2_name, False)
    
#     # Categorical mapping
#     def safe_categorical_map(value, category_series):
#         categories = category_series.astype("category").cat.categories
#         try:
#             return list(categories).index(value)
#         except ValueError:
#             return 0
    
#     tour_code = safe_categorical_map(tournament_name, tennis_rolling_complete["Tournament"])
#     surface_code = safe_categorical_map(surface, tennis_rolling_complete["Surface"])
#     round_code = safe_categorical_map(round_name, tennis_rolling_complete["Round"])
#     court_code = safe_categorical_map(court_type, tennis_rolling_complete["Court"])
    
#     # Handle differences encoding
#     temp_data = tennis_rolling_complete.copy()
#     temp_data.loc[len(temp_data)] = [None] * len(temp_data.columns)
#     temp_data.loc[len(temp_data)-1, 'Rank_1'] = player_1_rank
#     temp_data.loc[len(temp_data)-1, 'Rank_2'] = player_2_rank  
#     temp_data.loc[len(temp_data)-1, 'Pts_1'] = player_1_points
#     temp_data.loc[len(temp_data)-1, 'Pts_2'] = player_2_points
    
#     temp_data['rank_diff_raw'] = temp_data['Rank_2'] - temp_data['Rank_1']
#     temp_data['pts_diff_raw'] = temp_data['Pts_1'] - temp_data['Pts_2']
    
#     rank_diff_code = temp_data['rank_diff_raw'].astype("category").cat.codes.iloc[-1]
#     pts_diff_code = temp_data['pts_diff_raw'].astype("category").cat.codes.iloc[-1]
    
#     # Create prediction data
#     prediction_data = pd.DataFrame({
#         "Best of": [best_of_sets],
#         "Rank_1": [player_1_rank],
#         "Rank_2": [player_2_rank], 
#         "Pts_1": [player_1_points],
#         "Pts_2": [player_2_points],
#         "tour_code": [tour_code],
#         "surface_code": [surface_code],
#         "round_code": [round_code],
#         "court_code": [court_code],
#         "pts_diff": [pts_diff_code],
#         "rank_diff": [rank_diff_code],
#         "p1_rank_L10": [p1_rank_l10],
#         "p1_wr_L10": [p1_wr_l10],
#         "p2_rank_L10": [p2_rank_l10],
#         "p2_wr_L10": [p2_wr_l10]
#     })
    
#     # Make prediction
#     try:
#         prediction = model.predict(prediction_data[available_predictors])[0]
#         probabilities = model.predict_proba(prediction_data[available_predictors])[0]
#     except Exception as e:
#         print(f"Error: {e}")
#         return None
    
#     # Results
#     if prediction == 1:
#         winner = player_1_name
#         confidence = probabilities[1] * 100
#     else:
#         winner = player_2_name
#         confidence = probabilities[0] * 100
    
#     print(f"Predicted Winner: {winner}")
#     print(f"Confidence: {confidence:.1f}%")
#     print()
#     print(f"{player_1_name}: {probabilities[1]*100:.1f}%")
#     print(f"{player_2_name}: {probabilities[0]*100:.1f}%")
    
#     if abs(rank_advantage) >= 10:
#         print(f"\nRanking gap: {abs(rank_advantage)} positions")
    
#     return {
#         'predicted_winner': winner,
#         'player_1_win_probability': probabilities[1],
#         'player_2_win_probability': probabilities[0], 
#         'confidence': confidence,
#         'prediction_binary': prediction
#     }

# # def quick_predict(p1_name, p2_name, p1_rank, p2_rank, p1_pts=None, p2_pts=None, 
# #                   surface="Hard", tournament="ATP Tournament"):
# #     """Quick prediction with minimal inputs"""
# #     if p1_pts is None:
# #         p1_pts = max(3000 - p1_rank * 20, 100)
# #     if p2_pts is None:
# #         p2_pts = max(3000 - p2_rank * 20, 100)
    
# #     return predict_tennis_match(
# #         player_1_name=p1_name,
# #         player_2_name=p2_name,
# #         player_1_rank=p1_rank,
# #         player_2_rank=p2_rank,
# #         player_1_points=p1_pts,
# #         player_2_points=p2_pts,
# #         tournament_name=tournament,
# #         surface=surface,
# #         round_name="1st Round"
# #     )



In [313]:

result = predict_tennis_match(
    player_1_name="Djokovic N.",    
    player_2_name="Sinner J.",         
    player_1_rank=1,                      
    player_2_rank=4,                      
    player_1_points=12000,               
    player_2_points=9500,                
    tournament_name="Australian Open",     
    surface="Hard",                       
    round_name="Semifinals",             
    best_of_sets=5
)

Australian Open - Semifinals
Djokovic N. (#1) vs Sinner J. (#4)
Surface: Hard

Error: "['elo_1', 'elo_2', 'elo_diff', 'p1_hard_wr_L5', 'p1_clay_wr_L5', 'p1_grass_wr_L5', 'p1_hard_rank_L5', 'p1_clay_rank_L5', 'p1_grass_rank_L5', 'h2h_wins_p1', 'h2h_total_matches', 'h2h_win_rate_p1', 'h2h_hard_wins_p1', 'h2h_clay_wins_p1', 'h2h_grass_wins_p1', 'h2h_last_5_wins_p1', 'h2h_weighted_p1'] not in index"


  temp_data.loc[len(temp_data)] = [None] * len(temp_data.columns)
