# Predictive Modeling


In [12]:
import pandas as pd
import numpy as np
from pathlib import Path

# Read the match-level table (one row per match, player A vs. player B).
data_path = Path("../data/clean-match/atp_matches_match_level_1.csv")
df = pd.read_csv(data_path)

print(f"Loaded {len(df):,} records")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")

# Baseline delta features; only those actually present in the file are processed.
baseline_features = ['delta_rank', 'delta_age', 'delta_ht', 'delta_rank_points']
present_features = [name for name in baseline_features if name in df.columns]

# Step 1: record missingness as a 0/1 flag BEFORE the values are overwritten,
# so the model can still tell "imputed 0" apart from a genuine difference of 0.
for name in present_features:
    flag_col = f'{name}_missing'
    df[flag_col] = df[name].isna().astype(int)
    print(f"Created {flag_col}: {df[flag_col].sum():,} missing values ({df[flag_col].mean():.2%})")

# Step 2: replace the missing deltas with 0 (done before standardization).
print("\nImputing missing values with 0:")
for name in present_features:
    n_missing = df[name].isna().sum()
    df[name] = df[name].fillna(0)
    print(f"  {name}: imputed {n_missing:,} missing values")

df.head()


Loaded 58,502 records
Shape: (58502, 23)

Columns: ['tourney_id', 'tourney_name', 'tourney_date', 'surface', 'round', 'draw_size', 'player_A_id', 'player_A_name', 'player_A_rank', 'player_A_rank_points', 'player_A_age', 'player_A_ht', 'player_B_id', 'player_B_name', 'player_B_rank', 'player_B_rank_points', 'player_B_age', 'player_B_ht', 'delta_rank', 'delta_age', 'delta_ht', 'delta_rank_points', 'target']
Created delta_rank_missing: 1,222 missing values (2.09%)
Created delta_age_missing: 6 missing values (0.01%)
Created delta_ht_missing: 2,271 missing values (3.88%)
Created delta_rank_points_missing: 1,222 missing values (2.09%)

Imputing missing values with 0:
  delta_rank: imputed 1,222 missing values
  delta_age: imputed 6 missing values
  delta_ht: imputed 2,271 missing values
  delta_rank_points: imputed 1,222 missing values


Unnamed: 0,tourney_id,tourney_name,tourney_date,surface,round,draw_size,player_A_id,player_A_name,player_A_rank,player_A_rank_points,...,player_B_ht,delta_rank,delta_age,delta_ht,delta_rank_points,target,delta_rank_missing,delta_age_missing,delta_ht_missing,delta_rank_points_missing
0,2005-1536,Madrid Masters,20051017,Hard,R64,48,102720,Tomas Zib,63.0,621.0,...,198.0,21.0,5.5,-20.0,-206.0,0,0,0,0,0
1,2005-1536,Madrid Masters,20051017,Hard,R64,48,102845,Carlos Moya,33.0,1005.0,...,183.0,-8.0,5.0,7.0,169.0,1,0,0,0,0
2,2005-1536,Madrid Masters,20051017,Hard,R64,48,102450,Tim Henman,26.0,1120.0,...,188.0,-2.0,6.7,-3.0,5.0,1,0,0,0,0
3,2005-1536,Madrid Masters,20051017,Hard,R64,48,104022,Mikhail Youzhny,29.0,1090.0,...,190.0,-26.0,0.4,-7.0,412.0,0,0,0,0,0
4,2005-1536,Madrid Masters,20051017,Hard,R64,48,103017,Nicolas Kiefer,30.0,1070.0,...,180.0,-15.0,4.7,3.0,299.0,0,0,0,0,0


In [13]:
# Temporal split: train on 2006-2021, evaluate on 2022-2024.
# The year is the first four characters of tourney_date (format: YYYYMMDD);
# unparseable values become NaN and therefore land in neither split.
df['year'] = pd.to_numeric(df['tourney_date'].astype(str).str[:4], errors='coerce')

# Inclusive year ranges; rows outside both ranges are in neither subset.
train_mask = df['year'].between(2006, 2021)
test_mask = df['year'].between(2022, 2024)

# Independent copies so later edits do not write back into df.
df_train = df.loc[train_mask].copy()
df_test = df.loc[test_mask].copy()

print(f"Training set: {len(df_train):,} rows (2006-2021)")
print(f"Test set:      {len(df_test):,} rows (2022-2024)")
print(f"\nTraining set year range: {df_train['year'].min():.0f} - {df_train['year'].max():.0f}")
print(f"Test set year range:      {df_test['year'].min():.0f} - {df_test['year'].max():.0f}")


Training set: 46,259 rows (2006-2021)
Test set:      8,979 rows (2022-2024)

Training set year range: 2006 - 2021
Test set year range:      2022 - 2024


In [14]:
# Baseline design matrix: the four imputed delta features followed by their
# 0/1 missingness flags (same order as delta_features).
delta_features = ['delta_rank_points', 'delta_rank', 'delta_age', 'delta_ht']
missing_indicators = [feat + '_missing' for feat in delta_features]
feature_cols = [*delta_features, *missing_indicators]
target_col = 'target'

# Features / target for each side of the temporal split.
X_train, y_train = df_train[feature_cols], df_train[target_col]
X_test, y_test = df_test[feature_cols], df_test[target_col]

# Shape summary for both subsets.
for header, X_part, y_part in (
    (f"Training set:", X_train, y_train),
    (f"\nTest set:", X_test, y_test),
):
    print(header)
    print(f"  Features shape: {X_part.shape}")
    print(f"  Target shape: {y_part.shape}")

print(f"\nFeatures ({len(feature_cols)}): {feature_cols}")

# Class balance check for both subsets.
for header, y_part in (
    (f"\nTarget distribution (train):", y_train),
    (f"\nTarget distribution (test):", y_test),
):
    print(header)
    print(y_part.value_counts().sort_index())


Training set:
  Features shape: (46259, 8)
  Target shape: (46259,)

Test set:
  Features shape: (8979, 8)
  Target shape: (8979,)

Features (8): ['delta_rank_points', 'delta_rank', 'delta_age', 'delta_ht', 'delta_rank_points_missing', 'delta_rank_missing', 'delta_age_missing', 'delta_ht_missing']

Target distribution (train):
target
0    23050
1    23209
Name: count, dtype: int64

Target distribution (test):
target
0    4528
1    4451
Name: count, dtype: int64


In [15]:
# Standardize the baseline features with StandardScaler.
from sklearn.preprocessing import StandardScaler

# Learn mean/scale from the training rows only, then apply the same
# transformation to both subsets so no test-set statistics leak in.
scaler = StandardScaler().fit(X_train)

# Re-wrap the transformed arrays as DataFrames so column names and the
# original row index survive.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Sanity check. pandas .std() uses the sample estimator (ddof=1) by default,
# so values a hair above 1.0 are expected rather than exactly 1.0.
print("Standardized training features - mean values (should be near zero):")
print(X_train_scaled.mean().round(6))
print(f"\nStandardized training features - std values (should be ~1.0):")
print(X_train_scaled.std().round(6))


Standardized training features - mean values (should be near zero):
delta_rank_points           -0.0
delta_rank                   0.0
delta_age                   -0.0
delta_ht                     0.0
delta_rank_points_missing   -0.0
delta_rank_missing          -0.0
delta_age_missing            0.0
delta_ht_missing             0.0
dtype: float64

Standardized training features - std values (should be ~1.0):
delta_rank_points            1.000011
delta_rank                   1.000011
delta_age                    1.000011
delta_ht                     1.000011
delta_rank_points_missing    1.000011
delta_rank_missing           1.000011
delta_age_missing            1.000011
delta_ht_missing             1.000011
dtype: float64


In [16]:
# Fit the baseline logistic regression on the standardized features.
from sklearn.linear_model import LogisticRegression

# Regularization is left at the library default (C=1.0).
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_scaled, y_train)

print("Logistic Regression Model Trained")
print(f"Number of iterations: {lr_model.n_iter_[0]}")
print(f"Regularization strength (C): {lr_model.C}")

# One row per feature, ordered by coefficient magnitude (largest first).
# Inputs were standardized, so magnitudes are comparable across features.
coefficients = pd.DataFrame(
    {'feature': X_train_scaled.columns, 'coefficient': lr_model.coef_[0]}
).sort_values('coefficient', key=lambda col: col.abs(), ascending=False)

print("\nModel Coefficients (sorted by absolute value):")
print("=" * 60)
print(coefficients.to_string(index=False))


Logistic Regression Model Trained
Number of iterations: 9
Regularization strength (C): 1.0

Model Coefficients (sorted by absolute value):
                  feature  coefficient
        delta_rank_points     0.801364
               delta_rank    -0.506150
                 delta_ht     0.107098
                delta_age    -0.060202
delta_rank_points_missing     0.048178
       delta_rank_missing     0.048178
         delta_ht_missing     0.044656
        delta_age_missing    -0.024570


In [17]:
# Score the baseline model on the held-out test years.
from sklearn.metrics import roc_auc_score, log_loss

# Second column of predict_proba is taken as P(target == 1).
y_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1]

# Ranking quality (ROC AUC) and probabilistic quality (log loss).
roc_auc = roc_auc_score(y_test, y_pred_proba)
log_loss_score = log_loss(y_test, y_pred_proba)

rule = "=" * 50
print("Model Performance on Test Set:")
print(rule)
print(f"ROC AUC:  {roc_auc:.4f}")
print(f"Log Loss: {log_loss_score:.4f}")

# Coefficient table (same model as above), largest magnitude first.
print("\n" + rule)
print("Model Coefficients (including missingness indicators):")
print(rule)
coefficients = pd.DataFrame(
    {'feature': X_train_scaled.columns, 'coefficient': lr_model.coef_[0]}
).sort_values('coefficient', key=lambda col: col.abs(), ascending=False)

print(coefficients.to_string(index=False))


Model Performance on Test Set:
ROC AUC:  0.7008
Log Loss: 0.6317

Model Coefficients (including missingness indicators):
                  feature  coefficient
        delta_rank_points     0.801364
               delta_rank    -0.506150
                 delta_ht     0.107098
                delta_age    -0.060202
delta_rank_points_missing     0.048178
       delta_rank_missing     0.048178
         delta_ht_missing     0.044656
        delta_age_missing    -0.024570


In [18]:
# Preparation for the rolling 52-week win percentage.
# Parse tourney_date (YYYYMMDD) into a real datetime so date arithmetic works;
# values that do not parse become NaT.
date_text = df['tourney_date'].astype(str)
df['match_date'] = pd.to_datetime(date_text, format='%Y%m%d', errors='coerce')

# Work on a copy ordered by player A and date.
df_sorted = df.sort_values(by=['player_A_id', 'match_date']).copy()

print("Calculating rolling 52-week win percentages...")
print(f"Total matches: {len(df_sorted):,}")


Calculating rolling 52-week win percentages...
Total matches: 58,502


In [19]:
# Calculate win percentage for each player over rolling 52-week window
# Exclude the current match from calculations
# A player can appear as either player A or player B in different matches

def calculate_win_pct(row, df_all, player_id, date_col, window_days=365):
    """Rolling win percentage of one player over the window before a match.

    Parameters
    ----------
    row : mapping / pandas.Series
        The current match; only ``row[date_col]`` is read.
    df_all : pandas.DataFrame
        All matches. Must contain ``player_A_id``, ``player_B_id``, ``target``
        and ``date_col``. ``target == 1`` is counted as a win for player A and
        ``target == 0`` as a win for player B.
    player_id :
        Identifier of the player to evaluate (may appear on either side).
    date_col : str
        Name of the datetime column used for the window.
    window_days : int, default 365
        Length of the look-back window in days.

    Returns
    -------
    (win_pct, has_history) : tuple
        ``(np.nan, False)`` when the date or player id is missing or when the
        player has no match in the window; otherwise ``(wins / matches, True)``.

    Notes
    -----
    The window is ``[match_date - window_days, match_date)``. The strict upper
    bound drops the current match, and with it every other row that carries
    the same date.
    """
    match_date = row[date_col]

    if pd.isna(match_date) or pd.isna(player_id):
        return np.nan, False  # (win_pct, has_history)

    window_start = match_date - pd.Timedelta(days=window_days)

    # Boolean masks over the full frame; no filtered copy is materialised.
    is_player_a = df_all['player_A_id'] == player_id
    is_player_b = df_all['player_B_id'] == player_id
    in_window = (df_all[date_col] >= window_start) & (df_all[date_col] < match_date)
    played = (is_player_a | is_player_b) & in_window

    total_matches = int(played.sum())
    if total_matches == 0:
        return np.nan, False  # No history

    # Win = listed as A with target 1, or listed as B with target 0.
    target = df_all['target']
    won = (is_player_a & (target == 1)) | (is_player_b & (target == 0))
    wins = (played & won).sum()

    return wins / total_matches, True

# Calculate win percentages for player A and player B.
# calculate_win_pct scans the whole frame for every row, so each row is
# evaluated ONCE per player and the (win_pct, has_history) pair is unpacked
# afterwards, instead of running the same scan twice.
print("Calculating win percentages for player A...")
player_A_results = df_sorted.apply(
    lambda row: calculate_win_pct(row, df_sorted, row['player_A_id'], 'match_date'),
    axis=1
)
df_sorted['player_A_win_pct_52w'] = [win_pct for win_pct, _ in player_A_results]
df_sorted['player_A_has_history'] = [has_history for _, has_history in player_A_results]

print("Calculating win percentages for player B...")
player_B_results = df_sorted.apply(
    lambda row: calculate_win_pct(row, df_sorted, row['player_B_id'], 'match_date'),
    axis=1
)
df_sorted['player_B_win_pct_52w'] = [win_pct for win_pct, _ in player_B_results]
df_sorted['player_B_has_history'] = [has_history for _, has_history in player_B_results]

# Calculate delta win percentage (NaN whenever either player has no history)
df_sorted['delta_win_pct_52w'] = df_sorted['player_A_win_pct_52w'] - df_sorted['player_B_win_pct_52w']

# Create missing-history indicators (1 = no prior match in the window)
df_sorted['player_A_no_history'] = (~df_sorted['player_A_has_history']).astype(int)
df_sorted['player_B_no_history'] = (~df_sorted['player_B_has_history']).astype(int)

print(f"\nWin percentage statistics:")
print(f"Player A win pct - mean: {df_sorted['player_A_win_pct_52w'].mean():.3f}, missing: {df_sorted['player_A_win_pct_52w'].isna().sum():,}")
print(f"Player B win pct - mean: {df_sorted['player_B_win_pct_52w'].mean():.3f}, missing: {df_sorted['player_B_win_pct_52w'].isna().sum():,}")
print(f"Delta win pct - mean: {df_sorted['delta_win_pct_52w'].mean():.3f}, missing: {df_sorted['delta_win_pct_52w'].isna().sum():,}")
print(f"Player A no history: {df_sorted['player_A_no_history'].sum():,} ({df_sorted['player_A_no_history'].mean():.2%})")
print(f"Player B no history: {df_sorted['player_B_no_history'].sum():,} ({df_sorted['player_B_no_history'].mean():.2%})")

# Update df with the new features, restoring the original row order
df = df_sorted.sort_index().copy()


Calculating win percentages for player A...
Calculating win percentages for player B...

Win percentage statistics:
Player A win pct - mean: 0.503, missing: 1,794
Player B win pct - mean: 0.502, missing: 2,914
Delta win pct - mean: 0.002, missing: 4,144
Player A no history: 1,794 (3.07%)
Player B no history: 2,914 (4.98%)


In [20]:
# Rebuild the train/test subsets so they carry the new win-percentage columns.
# Same temporal split as before: 2006-2021 vs. 2022-2024 (inclusive).
df['year'] = pd.to_numeric(df['tourney_date'].astype(str).str[:4], errors='coerce')

train_mask = df['year'].between(2006, 2021)
test_mask = df['year'].between(2022, 2024)

df_train = df.loc[train_mask].copy()
df_test = df.loc[test_mask].copy()

print(f"Training set: {len(df_train):,} rows (2006-2021)")
print(f"Test set:      {len(df_test):,} rows (2022-2024)")


Training set: 46,259 rows (2006-2021)
Test set:      8,979 rows (2022-2024)


In [21]:
# Design matrix for the second model: baseline deltas and their missingness
# flags, plus the 52-week win-percentage delta and the no-history flags.
delta_features = ['delta_rank_points', 'delta_rank', 'delta_age', 'delta_ht']
missing_indicators = [feat + '_missing' for feat in delta_features]
win_pct_features = ['delta_win_pct_52w']
history_indicators = ['player_A_no_history', 'player_B_no_history']

feature_cols = [*delta_features, *missing_indicators, *win_pct_features, *history_indicators]
target_col = 'target'

# Explicit copies: the feature frames are modified below.
X_train, y_train = df_train[feature_cols].copy(), df_train[target_col]
X_test, y_test = df_test[feature_cols].copy(), df_test[target_col]

# The win-percentage delta is NaN when either player has no history;
# fill with 0 and let the no-history flags carry that information.
for X_part in (X_train, X_test):
    X_part[win_pct_features] = X_part[win_pct_features].fillna(0)

for header, X_part, y_part in (
    (f"Training set:", X_train, y_train),
    (f"\nTest set:", X_test, y_test),
):
    print(header)
    print(f"  Features shape: {X_part.shape}")
    print(f"  Target shape: {y_part.shape}")

print(f"\nFeatures ({len(feature_cols)}): {feature_cols}")
print(f"\nMissing values in training set:")
print(X_train.isnull().sum())


Training set:
  Features shape: (46259, 11)
  Target shape: (46259,)

Test set:
  Features shape: (8979, 11)
  Target shape: (8979,)

Features (11): ['delta_rank_points', 'delta_rank', 'delta_age', 'delta_ht', 'delta_rank_points_missing', 'delta_rank_missing', 'delta_age_missing', 'delta_ht_missing', 'delta_win_pct_52w', 'player_A_no_history', 'player_B_no_history']

Missing values in training set:
delta_rank_points            0
delta_rank                   0
delta_age                    0
delta_ht                     0
delta_rank_points_missing    0
delta_rank_missing           0
delta_age_missing            0
delta_ht_missing             0
delta_win_pct_52w            0
player_A_no_history          0
player_B_no_history          0
dtype: int64


In [22]:
# Standardize the extended feature set (baseline + win percentage features).
from sklearn.preprocessing import StandardScaler

# Statistics come from the training rows only; the test rows reuse them.
scaler = StandardScaler().fit(X_train)

# Keep column names and row index by wrapping the arrays back into DataFrames.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Sanity check (pandas .std() uses ddof=1, hence values marginally above 1.0).
print("Standardized training features - mean values (should be near zero):")
print(X_train_scaled.mean().round(6))
print(f"\nStandardized training features - std values (should be ~1.0):")
print(X_train_scaled.std().round(6))


Standardized training features - mean values (should be near zero):
delta_rank_points           -0.0
delta_rank                   0.0
delta_age                   -0.0
delta_ht                     0.0
delta_rank_points_missing   -0.0
delta_rank_missing          -0.0
delta_age_missing            0.0
delta_ht_missing             0.0
delta_win_pct_52w           -0.0
player_A_no_history          0.0
player_B_no_history          0.0
dtype: float64

Standardized training features - std values (should be ~1.0):
delta_rank_points            1.000011
delta_rank                   1.000011
delta_age                    1.000011
delta_ht                     1.000011
delta_rank_points_missing    1.000011
delta_rank_missing           1.000011
delta_age_missing            1.000011
delta_ht_missing             1.000011
delta_win_pct_52w            1.000011
player_A_no_history          1.000011
player_B_no_history          1.000011
dtype: float64


In [23]:
# Second model: same estimator settings, extended (win percentage) feature set.
from sklearn.linear_model import LogisticRegression

# Regularization stays at the library default (C=1.0); fit() returns the estimator.
lr_model_winpct = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

print("Logistic Regression Model Trained (with win percentage features)")
print(f"Number of iterations: {lr_model_winpct.n_iter_[0]}")
print(f"Regularization strength (C): {lr_model_winpct.C}")


Logistic Regression Model Trained (with win percentage features)
Number of iterations: 8
Regularization strength (C): 1.0


In [24]:
# Score the win-percentage model on the held-out test years.
from sklearn.metrics import roc_auc_score, log_loss

# Second column of predict_proba is taken as P(target == 1).
y_pred_proba = lr_model_winpct.predict_proba(X_test_scaled)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_proba)
log_loss_score = log_loss(y_test, y_pred_proba)

rule = "=" * 60
print("Model Performance on Test Set (with win percentage features):")
print(rule)
print(f"ROC AUC:  {roc_auc:.4f}")
print(f"Log Loss: {log_loss_score:.4f}")

# Coefficient table for this model, largest magnitude first.
print("\n" + rule)
print("Model Coefficients (including win percentage and history indicators):")
print(rule)
coefficients = pd.DataFrame(
    {'feature': X_train_scaled.columns, 'coefficient': lr_model_winpct.coef_[0]}
).sort_values('coefficient', key=lambda col: col.abs(), ascending=False)

print(coefficients.to_string(index=False))


Model Performance on Test Set (with win percentage features):
ROC AUC:  0.7011
Log Loss: 0.6304

Model Coefficients (including win percentage and history indicators):
                  feature  coefficient
        delta_rank_points     0.619405
               delta_rank    -0.376798
        delta_win_pct_52w     0.291340
      player_B_no_history     0.125546
                 delta_ht     0.091360
      player_A_no_history    -0.086261
                delta_age    -0.060970
delta_rank_points_missing     0.041618
       delta_rank_missing     0.041618
         delta_ht_missing     0.038814
        delta_age_missing    -0.024139


In [25]:
# Calculate surface-specific rolling 52-week win percentage for each player
# Only use matches on the same surface as the current match

def calculate_surface_win_pct(row, df_all, player_id, date_col, surface_col, window_days=365):
    """Rolling win percentage of one player on the current match's surface.

    Parameters
    ----------
    row : mapping / pandas.Series
        The current match; ``row[date_col]`` and ``row[surface_col]`` are read.
    df_all : pandas.DataFrame
        All matches. Must contain ``player_A_id``, ``player_B_id``, ``target``,
        ``date_col`` and ``surface_col``. ``target == 1`` is counted as a win
        for player A and ``target == 0`` as a win for player B.
    player_id :
        Identifier of the player to evaluate (may appear on either side).
    date_col : str
        Name of the datetime column used for the window.
    surface_col : str
        Name of the surface column; only rows equal to the current match's
        surface are counted.
    window_days : int, default 365
        Length of the look-back window in days.

    Returns
    -------
    (win_pct, has_history) : tuple
        ``(np.nan, False)`` when the date, player id or surface is missing, or
        when the player has no same-surface match in the window; otherwise
        ``(wins / matches, True)``.

    Notes
    -----
    The window is ``[match_date - window_days, match_date)``. The strict upper
    bound drops the current match, and with it every other row that carries
    the same date.
    """
    match_date = row[date_col]
    surface = row[surface_col]

    if pd.isna(match_date) or pd.isna(player_id) or pd.isna(surface):
        return np.nan, False  # (win_pct, has_history)

    window_start = match_date - pd.Timedelta(days=window_days)

    # Boolean masks over the full frame; no filtered copy is materialised.
    is_player_a = df_all['player_A_id'] == player_id
    is_player_b = df_all['player_B_id'] == player_id
    same_surface = df_all[surface_col] == surface
    in_window = (df_all[date_col] >= window_start) & (df_all[date_col] < match_date)
    played = (is_player_a | is_player_b) & same_surface & in_window

    total_matches = int(played.sum())
    if total_matches == 0:
        return np.nan, False  # No history on this surface

    # Win = listed as A with target 1, or listed as B with target 0.
    target = df_all['target']
    won = (is_player_a & (target == 1)) | (is_player_b & (target == 0))
    wins = (played & won).sum()

    return wins / total_matches, True

print("Calculating surface-specific win percentages...")


Calculating surface-specific win percentages...


In [26]:
# Calculate surface-specific win percentages for player A and player B.
# calculate_surface_win_pct scans the whole frame for every row, so each row
# is evaluated ONCE per player and the (win_pct, has_history) pair is unpacked
# afterwards, instead of running the same scan twice.
print("Calculating surface-specific win percentages for player A...")
player_A_surface_results = df.apply(
    lambda row: calculate_surface_win_pct(row, df, row['player_A_id'], 'match_date', 'surface'),
    axis=1
)
df['player_A_win_pct_52w_surface'] = [win_pct for win_pct, _ in player_A_surface_results]
df['player_A_has_surface_history'] = [has_history for _, has_history in player_A_surface_results]

print("Calculating surface-specific win percentages for player B...")
player_B_surface_results = df.apply(
    lambda row: calculate_surface_win_pct(row, df, row['player_B_id'], 'match_date', 'surface'),
    axis=1
)
df['player_B_win_pct_52w_surface'] = [win_pct for win_pct, _ in player_B_surface_results]
df['player_B_has_surface_history'] = [has_history for _, has_history in player_B_surface_results]

# Calculate delta surface win percentage (NaN whenever either side has no surface history)
df['delta_win_pct_52w_surface'] = df['player_A_win_pct_52w_surface'] - df['player_B_win_pct_52w_surface']

# Create missing-surface-history indicators (1 = no prior same-surface match in the window)
df['player_A_no_surface_history'] = (~df['player_A_has_surface_history']).astype(int)
df['player_B_no_surface_history'] = (~df['player_B_has_surface_history']).astype(int)

print(f"\nSurface-specific win percentage statistics:")
print(f"Player A surface win pct - mean: {df['player_A_win_pct_52w_surface'].mean():.3f}, missing: {df['player_A_win_pct_52w_surface'].isna().sum():,}")
print(f"Player B surface win pct - mean: {df['player_B_win_pct_52w_surface'].mean():.3f}, missing: {df['player_B_win_pct_52w_surface'].isna().sum():,}")
print(f"Delta surface win pct - mean: {df['delta_win_pct_52w_surface'].mean():.3f}, missing: {df['delta_win_pct_52w_surface'].isna().sum():,}")
print(f"Player A no surface history: {df['player_A_no_surface_history'].sum():,} ({df['player_A_no_surface_history'].mean():.2%})")
print(f"Player B no surface history: {df['player_B_no_surface_history'].sum():,} ({df['player_B_no_surface_history'].mean():.2%})")


Calculating surface-specific win percentages for player A...
Calculating surface-specific win percentages for player B...

Surface-specific win percentage statistics:
Player A surface win pct - mean: 0.492, missing: 4,445
Player B surface win pct - mean: 0.492, missing: 6,053
Delta surface win pct - mean: 0.001, missing: 8,749
Player A no surface history: 4,445 (7.60%)
Player B no surface history: 6,053 (10.35%)


In [27]:
# Rebuild the train/test subsets so they carry the surface-specific columns.
# Same temporal split as before: 2006-2021 vs. 2022-2024 (inclusive).
df['year'] = pd.to_numeric(df['tourney_date'].astype(str).str[:4], errors='coerce')

train_mask = df['year'].between(2006, 2021)
test_mask = df['year'].between(2022, 2024)

df_train = df.loc[train_mask].copy()
df_test = df.loc[test_mask].copy()

print(f"Training set: {len(df_train):,} rows (2006-2021)")
print(f"Test set:      {len(df_test):,} rows (2022-2024)")


Training set: 46,259 rows (2006-2021)
Test set:      8,979 rows (2022-2024)


In [28]:
# Design matrix for the third model: everything from the win-percentage model
# plus the surface-specific win-percentage delta and its no-history flags.
delta_features = ['delta_rank_points', 'delta_rank', 'delta_age', 'delta_ht']
missing_indicators = [feat + '_missing' for feat in delta_features]
win_pct_features = ['delta_win_pct_52w']
history_indicators = ['player_A_no_history', 'player_B_no_history']
surface_win_pct_features = ['delta_win_pct_52w_surface']
surface_history_indicators = ['player_A_no_surface_history', 'player_B_no_surface_history']

feature_cols = [
    *delta_features,
    *missing_indicators,
    *win_pct_features,
    *history_indicators,
    *surface_win_pct_features,
    *surface_history_indicators,
]
target_col = 'target'

# Explicit copies: the feature frames are modified below.
X_train, y_train = df_train[feature_cols].copy(), df_train[target_col]
X_test, y_test = df_test[feature_cols].copy(), df_test[target_col]

# Both win-percentage deltas are NaN when a player has no history;
# fill with 0 and let the no-history flags carry that information.
imputed_cols = win_pct_features + surface_win_pct_features
for X_part in (X_train, X_test):
    X_part[imputed_cols] = X_part[imputed_cols].fillna(0)

for header, X_part, y_part in (
    (f"Training set:", X_train, y_train),
    (f"\nTest set:", X_test, y_test),
):
    print(header)
    print(f"  Features shape: {X_part.shape}")
    print(f"  Target shape: {y_part.shape}")

print(f"\nFeatures ({len(feature_cols)}): {feature_cols}")
print(f"\nMissing values in training set:")
print(X_train.isnull().sum())


Training set:
  Features shape: (46259, 14)
  Target shape: (46259,)

Test set:
  Features shape: (8979, 14)
  Target shape: (8979,)

Features (14): ['delta_rank_points', 'delta_rank', 'delta_age', 'delta_ht', 'delta_rank_points_missing', 'delta_rank_missing', 'delta_age_missing', 'delta_ht_missing', 'delta_win_pct_52w', 'player_A_no_history', 'player_B_no_history', 'delta_win_pct_52w_surface', 'player_A_no_surface_history', 'player_B_no_surface_history']

Missing values in training set:
delta_rank_points              0
delta_rank                     0
delta_age                      0
delta_ht                       0
delta_rank_points_missing      0
delta_rank_missing             0
delta_age_missing              0
delta_ht_missing               0
delta_win_pct_52w              0
player_A_no_history            0
player_B_no_history            0
delta_win_pct_52w_surface      0
player_A_no_surface_history    0
player_B_no_surface_history    0
dtype: int64


In [29]:
# Standardize the full feature set (including surface-specific win percentage features).
from sklearn.preprocessing import StandardScaler

# Statistics come from the training rows only; the test rows reuse them.
scaler = StandardScaler().fit(X_train)

# Keep column names and row index by wrapping the arrays back into DataFrames.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Sanity check (pandas .std() uses ddof=1, hence values marginally above 1.0).
print("Standardized training features - mean values (should be near zero):")
print(X_train_scaled.mean().round(6))
print(f"\nStandardized training features - std values (should be ~1.0):")
print(X_train_scaled.std().round(6))


Standardized training features - mean values (should be near zero):
delta_rank_points             -0.0
delta_rank                     0.0
delta_age                     -0.0
delta_ht                       0.0
delta_rank_points_missing     -0.0
delta_rank_missing            -0.0
delta_age_missing              0.0
delta_ht_missing               0.0
delta_win_pct_52w             -0.0
player_A_no_history            0.0
player_B_no_history            0.0
delta_win_pct_52w_surface     -0.0
player_A_no_surface_history   -0.0
player_B_no_surface_history    0.0
dtype: float64

Standardized training features - std values (should be ~1.0):
delta_rank_points              1.000011
delta_rank                     1.000011
delta_age                      1.000011
delta_ht                       1.000011
delta_rank_points_missing      1.000011
delta_rank_missing             1.000011
delta_age_missing              1.000011
delta_ht_missing               1.000011
delta_win_pct_52w              1.000011
play

In [30]:
# Third model: same estimator settings, feature set extended with the
# surface-specific win percentage features.
from sklearn.linear_model import LogisticRegression

# Regularization stays at the library default (C=1.0); fit() returns the estimator.
lr_model_surface = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

print("Logistic Regression Model Trained (with surface-specific win percentage features)")
print(f"Number of iterations: {lr_model_surface.n_iter_[0]}")
print(f"Regularization strength (C): {lr_model_surface.C}")


Logistic Regression Model Trained (with surface-specific win percentage features)
Number of iterations: 11
Regularization strength (C): 1.0


In [31]:
# Score the surface-aware model on the held-out test years.
from sklearn.metrics import roc_auc_score, log_loss

# Second column of predict_proba is taken as P(target == 1).
y_pred_proba = lr_model_surface.predict_proba(X_test_scaled)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_proba)
log_loss_score = log_loss(y_test, y_pred_proba)

rule = "=" * 70
print("Model Performance on Test Set (with surface-specific win percentage features):")
print(rule)
print(f"ROC AUC:  {roc_auc:.4f}")
print(f"Log Loss: {log_loss_score:.4f}")

# Coefficient table for this model, largest magnitude first.
print("\n" + rule)
print("Model Coefficients (including surface-specific win percentage and history indicators):")
print(rule)
coefficients = pd.DataFrame(
    {'feature': X_train_scaled.columns, 'coefficient': lr_model_surface.coef_[0]}
).sort_values('coefficient', key=lambda col: col.abs(), ascending=False)

print(coefficients.to_string(index=False))


Model Performance on Test Set (with surface-specific win percentage features):
ROC AUC:  0.7051
Log Loss: 0.6280

Model Coefficients (including surface-specific win percentage and history indicators):
                    feature  coefficient
          delta_rank_points     0.588654
                 delta_rank    -0.346537
  delta_win_pct_52w_surface     0.262349
player_B_no_surface_history     0.142599
player_A_no_surface_history    -0.121118
          delta_win_pct_52w     0.119240
                   delta_ht     0.089405
                  delta_age    -0.065514
        player_B_no_history     0.046551
  delta_rank_points_missing     0.042409
         delta_rank_missing     0.042409
           delta_ht_missing     0.039033
        player_A_no_history    -0.026788
          delta_age_missing    -0.024971


In [32]:
# Calculate rolling last-10-weeks (70 days) win percentage for each player
# Only use matches played in the 70 days prior to the current match date

def calculate_10w_win_pct(row, df_all, player_id, date_col, window_days=70):
    """Rolling win percentage for one player over the days before a match.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        The current match; only ``row[date_col]`` is read.
    df_all : pandas.DataFrame
        All matches. Must contain ``player_A_id``, ``player_B_id``,
        ``target`` and ``date_col``.
    player_id : scalar
        Identifier of the player whose form is being measured.
    date_col : str
        Name of the date column used for windowing.
    window_days : int, default 70
        Length of the look-back window in days (70 days = 10 weeks).

    Returns
    -------
    tuple
        ``(win_pct, has_history)``. ``win_pct`` is the share of matches the
        player won inside ``[match_date - window_days, match_date)``;
        ``has_history`` is ``True`` when at least one such match exists.
        ``(np.nan, False)`` is returned when the date or player id is missing
        or when the window holds no match for the player.
    """
    match_date = row[date_col]

    if pd.isna(match_date) or pd.isna(player_id):
        return np.nan, False  # (win_pct, has_history)

    window_start = match_date - pd.Timedelta(days=window_days)

    # The player may appear on either side of a match
    is_player_a = df_all['player_A_id'] == player_id
    is_player_b = df_all['player_B_id'] == player_id

    # Strictly earlier dates only: this excludes the current match (and any
    # other match stamped with the same date) from the look-back window.
    dates = df_all[date_col]
    in_window = (dates >= window_start) & (dates < match_date)

    played = (is_player_a | is_player_b) & in_window
    total_matches = int(played.sum())
    if total_matches == 0:
        return np.nan, False  # No history inside the window

    # A win is target == 1 when listed as player A, target == 0 as player B
    won = played & (
        (is_player_a & (df_all['target'] == 1)) |
        (is_player_b & (df_all['target'] == 0))
    )
    wins = won.sum()

    return wins / total_matches, True

print("Calculating last-10-weeks (70-day) win percentages...")


Calculating last-10-weeks (70-day) win percentages...


In [33]:
# Calculate last-10-weeks win percentages for player A and player B
def _rolling_10w_stats(matches, id_col):
    """Run calculate_10w_win_pct once per row for the player in ``id_col``.

    Each call scans the whole frame, so the (win_pct, has_history) tuple is
    computed a single time per row and then unpacked into two columns rather
    than being recomputed separately for each element.

    Returns a DataFrame indexed like ``matches`` with columns
    ``win_pct`` and ``has_history``.
    """
    per_match = matches.apply(
        lambda row: calculate_10w_win_pct(row, matches, row[id_col], 'match_date'),
        axis=1,
    )
    return pd.DataFrame(
        per_match.tolist(),
        index=matches.index,
        columns=['win_pct', 'has_history'],
    )


print("Calculating last-10-weeks win percentages for player A...")
player_A_10w_stats = _rolling_10w_stats(df, 'player_A_id')
df['player_A_win_pct_10w'] = player_A_10w_stats['win_pct']
df['player_A_has_10w_history'] = player_A_10w_stats['has_history']

print("Calculating last-10-weeks win percentages for player B...")
player_B_10w_stats = _rolling_10w_stats(df, 'player_B_id')
df['player_B_win_pct_10w'] = player_B_10w_stats['win_pct']
df['player_B_has_10w_history'] = player_B_10w_stats['has_history']

# Calculate delta win percentage (Player A - Player B); NaN if either side has no history
df['delta_win_pct_10w'] = df['player_A_win_pct_10w'] - df['player_B_win_pct_10w']

# Create binary indicators for players with no matches in the last 10 weeks
df['player_A_no_10w_history'] = (~df['player_A_has_10w_history']).astype(int)
df['player_B_no_10w_history'] = (~df['player_B_has_10w_history']).astype(int)

print(f"\nLast-10-weeks win percentage statistics:")
print(f"Player A 10w win pct - mean: {df['player_A_win_pct_10w'].mean():.3f}, missing: {df['player_A_win_pct_10w'].isna().sum():,}")
print(f"Player B 10w win pct - mean: {df['player_B_win_pct_10w'].mean():.3f}, missing: {df['player_B_win_pct_10w'].isna().sum():,}")
print(f"Delta 10w win pct - mean: {df['delta_win_pct_10w'].mean():.3f}, missing: {df['delta_win_pct_10w'].isna().sum():,}")
print(f"Player A no 10w history: {df['player_A_no_10w_history'].sum():,} ({df['player_A_no_10w_history'].mean():.2%})")
print(f"Player B no 10w history: {df['player_B_no_10w_history'].sum():,} ({df['player_B_no_10w_history'].mean():.2%})")


Calculating last-10-weeks win percentages for player A...
Calculating last-10-weeks win percentages for player B...

Last-10-weeks win percentage statistics:
Player A 10w win pct - mean: 0.468, missing: 6,785
Player B 10w win pct - mean: 0.482, missing: 8,345
Delta 10w win pct - mean: -0.013, missing: 12,100
Player A no 10w history: 6,785 (11.60%)
Player B no 10w history: 8,345 (14.26%)


In [34]:
# Rebuild the temporal train/test split so both frames carry the 10-week columns.
# The calendar year is taken from the first four characters of tourney_date;
# values that do not parse become NaN and land in neither split.
df['year'] = pd.to_numeric(df['tourney_date'].astype(str).str[:4], errors='coerce')

# Inclusive year ranges: 2006-2021 for fitting, 2022-2024 held out
train_mask = df['year'].between(2006, 2021)
test_mask = df['year'].between(2022, 2024)

df_train = df.loc[train_mask].copy()
df_test = df.loc[test_mask].copy()

print(
    f"Training set: {len(df_train):,} rows (2006-2021)",
    f"Test set:      {len(df_test):,} rows (2022-2024)",
    sep="\n",
)


Training set: 46,259 rows (2006-2021)
Test set:      8,979 rows (2022-2024)


In [35]:
# Assemble the model inputs: baseline deltas, their missing flags, and the three
# win-percentage families (52-week, 52-week on surface, last 10 weeks), each
# paired with its "no history" indicators.
delta_features = ['delta_rank_points', 'delta_rank', 'delta_age', 'delta_ht']
missing_indicators = [name + '_missing' for name in delta_features]
win_pct_features = ['delta_win_pct_52w']
history_indicators = ['player_A_no_history', 'player_B_no_history']
surface_win_pct_features = ['delta_win_pct_52w_surface']
surface_history_indicators = ['player_A_no_surface_history', 'player_B_no_surface_history']
win_pct_10w_features = ['delta_win_pct_10w']
history_10w_indicators = ['player_A_no_10w_history', 'player_B_no_10w_history']

feature_cols = [
    *delta_features,
    *missing_indicators,
    *win_pct_features,
    *history_indicators,
    *surface_win_pct_features,
    *surface_history_indicators,
    *win_pct_10w_features,
    *history_10w_indicators,
]
target_col = 'target'

# Feature matrices and target vectors for each split
X_train = df_train[feature_cols].copy()
X_test = df_test[feature_cols].copy()
y_train = df_train[target_col]
y_test = df_test[target_col]

# A missing win-percentage delta (no history for a player) is filled with 0;
# the accompanying no-history indicator columns keep that case distinguishable.
X_train = X_train.fillna({
    'delta_win_pct_52w': 0,
    'delta_win_pct_52w_surface': 0,
    'delta_win_pct_10w': 0,
})
X_test = X_test.fillna({
    'delta_win_pct_52w': 0,
    'delta_win_pct_52w_surface': 0,
    'delta_win_pct_10w': 0,
})

print(
    "Training set:",
    f"  Features shape: {X_train.shape}",
    f"  Target shape: {y_train.shape}",
    "\nTest set:",
    f"  Features shape: {X_test.shape}",
    f"  Target shape: {y_test.shape}",
    f"\nFeatures ({len(feature_cols)}): {feature_cols}",
    "\nMissing values in training set:",
    sep="\n",
)
print(X_train.isnull().sum())


Training set:
  Features shape: (46259, 17)
  Target shape: (46259,)

Test set:
  Features shape: (8979, 17)
  Target shape: (8979,)

Features (17): ['delta_rank_points', 'delta_rank', 'delta_age', 'delta_ht', 'delta_rank_points_missing', 'delta_rank_missing', 'delta_age_missing', 'delta_ht_missing', 'delta_win_pct_52w', 'player_A_no_history', 'player_B_no_history', 'delta_win_pct_52w_surface', 'player_A_no_surface_history', 'player_B_no_surface_history', 'delta_win_pct_10w', 'player_A_no_10w_history', 'player_B_no_10w_history']

Missing values in training set:
delta_rank_points              0
delta_rank                     0
delta_age                      0
delta_ht                       0
delta_rank_points_missing      0
delta_rank_missing             0
delta_age_missing              0
delta_ht_missing               0
delta_win_pct_52w              0
player_A_no_history            0
player_B_no_history            0
delta_win_pct_52w_surface      0
player_A_no_surface_history    0
pla

In [36]:
# Z-score the feature matrix (now including the last-10-weeks features)
from sklearn.preprocessing import StandardScaler

# Learn means and scales from the training split only, then apply the same
# transform to both splits so no test-set statistics leak into the scaler.
scaler = StandardScaler().fit(X_train)

# Wrap the transformed arrays in DataFrames so column names and row index survive
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

print("Standardized training features - mean values (should be near zero):")
print(X_train_scaled.mean().round(6))
# pandas .std() uses the sample estimator (ddof=1) while StandardScaler divides
# by the population standard deviation, so values land marginally above 1.
print("\nStandardized training features - std values (should be ~1.0):")
print(X_train_scaled.std().round(6))


Standardized training features - mean values (should be near zero):
delta_rank_points             -0.0
delta_rank                     0.0
delta_age                     -0.0
delta_ht                       0.0
delta_rank_points_missing     -0.0
delta_rank_missing            -0.0
delta_age_missing              0.0
delta_ht_missing               0.0
delta_win_pct_52w             -0.0
player_A_no_history            0.0
player_B_no_history            0.0
delta_win_pct_52w_surface     -0.0
player_A_no_surface_history   -0.0
player_B_no_surface_history    0.0
delta_win_pct_10w             -0.0
player_A_no_10w_history       -0.0
player_B_no_10w_history       -0.0
dtype: float64

Standardized training features - std values (should be ~1.0):
delta_rank_points              1.000011
delta_rank                     1.000011
delta_age                      1.000011
delta_ht                       1.000011
delta_rank_points_missing      1.000011
delta_rank_missing             1.000011
delta_age_missing  

In [37]:
# Fit a fresh logistic regression on the standardized features that now
# include the last-10-weeks win percentage columns
from sklearn.linear_model import LogisticRegression

# C is left at its default (1.0); the seed and iteration cap are set explicitly
lr_model_10w = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

print(
    "Logistic Regression Model Trained (with last-10-weeks win percentage features)",
    f"Number of iterations: {lr_model_10w.n_iter_[0]}",
    f"Regularization strength (C): {lr_model_10w.C}",
    sep="\n",
)


Logistic Regression Model Trained (with last-10-weeks win percentage features)
Number of iterations: 11
Regularization strength (C): 1.0


In [38]:
# Score the last-10-weeks logistic regression on the held-out test split
from sklearn.metrics import roc_auc_score, log_loss

# Keep only the second predict_proba column: the probability assigned to the
# model's second class, which is what both metrics below consume.
y_pred_proba = lr_model_10w.predict_proba(X_test_scaled)[:, 1]

# Threshold-free ranking quality (ROC AUC) and probabilistic loss (log loss)
roc_auc = roc_auc_score(y_test, y_pred_proba)
log_loss_score = log_loss(y_test, y_pred_proba)

print(
    "Model Performance on Test Set (with last-10-weeks win percentage features):",
    "=" * 70,
    f"ROC AUC:  {roc_auc:.4f}",
    f"Log Loss: {log_loss_score:.4f}",
    sep="\n",
)

# Coefficient table header
print(
    "\n" + "=" * 70,
    "Model Coefficients (including last-10-weeks win percentage and history indicators):",
    "=" * 70,
    sep="\n",
)

# One row per feature, ordered by coefficient magnitude (largest first).
# The helper column used for ordering is dropped before display.
coefficients = (
    pd.DataFrame({
        'feature': X_train_scaled.columns,
        'coefficient': lr_model_10w.coef_[0],
    })
    .assign(abs_coefficient=lambda table: table['coefficient'].abs())
    .sort_values('abs_coefficient', ascending=False)
    .drop(columns='abs_coefficient')
)

print(coefficients.to_string(index=False))


Model Performance on Test Set (with last-10-weeks win percentage features):
ROC AUC:  0.7086
Log Loss: 0.6253

Model Coefficients (including last-10-weeks win percentage and history indicators):
                    feature  coefficient
          delta_rank_points     0.565192
                 delta_rank    -0.307678
  delta_win_pct_52w_surface     0.236121
          delta_win_pct_10w     0.159672
player_B_no_surface_history     0.124845
player_A_no_surface_history    -0.109921
    player_B_no_10w_history     0.103795
                   delta_ht     0.086545
    player_A_no_10w_history    -0.082406
                  delta_age    -0.061415
          delta_win_pct_52w     0.055460
         delta_rank_missing     0.042720
  delta_rank_points_missing     0.042720
           delta_ht_missing     0.038614
          delta_age_missing    -0.024972
        player_B_no_history     0.024965
        player_A_no_history    -0.014227


In [42]:
# Compare model performance before and after adding last-10-weeks features
# Note: each model is evaluated on its own feature set, standardized with a
# scaler fitted on the matching training columns.

# Re-create feature sets for comparison
# Surface model features (before 10-week)
delta_features = ['delta_rank_points', 'delta_rank', 'delta_age', 'delta_ht']
missing_indicators = [f'{feat}_missing' for feat in delta_features]
win_pct_features = ['delta_win_pct_52w']
history_indicators = ['player_A_no_history', 'player_B_no_history']
surface_win_pct_features = ['delta_win_pct_52w_surface']
surface_history_indicators = ['player_A_no_surface_history', 'player_B_no_surface_history']
feature_cols_surface = (delta_features + missing_indicators + win_pct_features + history_indicators + 
                        surface_win_pct_features + surface_history_indicators)

# 10-week model features (after 10-week)
win_pct_10w_features = ['delta_win_pct_10w']
history_10w_indicators = ['player_A_no_10w_history', 'player_B_no_10w_history']
feature_cols_10w = feature_cols_surface + win_pct_10w_features + history_10w_indicators

# Prepare test sets for each model
X_test_surface = df_test[feature_cols_surface].copy()
X_test_10w = df_test[feature_cols_10w].copy()

# Impute missing win-percentage deltas with 0 (same rule used at training time).
# Every column named here was just selected above, so no membership check is needed.
for col in ['delta_win_pct_52w', 'delta_win_pct_52w_surface']:
    X_test_surface[col] = X_test_surface[col].fillna(0)
    X_test_10w[col] = X_test_10w[col].fillna(0)
for col in ['delta_win_pct_10w']:
    X_test_10w[col] = X_test_10w[col].fillna(0)

# Standardize test sets using scalers fitted on training data
# For surface model - the original scaler was overwritten, so refit it on the
# surface-only training columns
X_train_surface = df_train[feature_cols_surface].copy()
for col in ['delta_win_pct_52w', 'delta_win_pct_52w_surface']:
    X_train_surface[col] = X_train_surface[col].fillna(0)
scaler_surface = StandardScaler()
scaler_surface.fit(X_train_surface)
X_test_surface_scaled = pd.DataFrame(
    scaler_surface.transform(X_test_surface),
    columns=X_test_surface.columns,
    index=X_test_surface.index
)

# For 10-week model - use existing scaler
# NOTE(review): this relies on X_test_scaled still being the frame produced by
# the 10-week standardization cell; re-run that cell first if the kernel state
# may have changed.
X_test_10w_scaled = X_test_scaled[feature_cols_10w]

# Get predictions
y_pred_proba_surface = lr_model_surface.predict_proba(X_test_surface_scaled)[:, 1]
roc_auc_surface = roc_auc_score(y_test, y_pred_proba_surface)
log_loss_surface = log_loss(y_test, y_pred_proba_surface)

y_pred_proba_10w = lr_model_10w.predict_proba(X_test_10w_scaled)[:, 1]
roc_auc_10w = roc_auc_score(y_test, y_pred_proba_10w)
log_loss_10w = log_loss(y_test, y_pred_proba_10w)

# Signed changes (after - before), used for the "Change" column below.
# ROC AUC: higher is better, so a positive change is an improvement.
roc_auc_improvement = roc_auc_10w - roc_auc_surface
# Log loss: lower is better, so a NEGATIVE change is an improvement.
log_loss_improvement = log_loss_10w - log_loss_surface
# Positive when the 10-week model lowers the log loss. The assessment below
# uses this signed value so that a worse (higher) log loss is never reported
# as an improvement.
log_loss_reduction = log_loss_surface - log_loss_10w

print("Model Performance Comparison:")
print("=" * 80)
print(f"{'Metric':<20} {'Before (Surface)':<20} {'After (+10-week)':<20} {'Change':<20}")
print("-" * 80)
print(f"{'ROC AUC':<20} {roc_auc_surface:<20.4f} {roc_auc_10w:<20.4f} {roc_auc_improvement:+.4f}")
print(f"{'Log Loss':<20} {log_loss_surface:<20.4f} {log_loss_10w:<20.4f} {log_loss_improvement:+.4f}")
print("=" * 80)

# Relative changes, expressed as a percentage of the surface-model value
roc_auc_pct_improvement = (roc_auc_improvement / roc_auc_surface) * 100
log_loss_pct_improvement = (log_loss_improvement / log_loss_surface) * 100

print(f"\nPercentage Improvements:")
print(f"  ROC AUC: {roc_auc_pct_improvement:+.2f}%")
print(f"  Log Loss: {log_loss_pct_improvement:+.2f}% (lower is better)")

# Summary assessment
print(f"\n{'='*80}")
print("Summary Assessment:")
print(f"{'='*80}")

if roc_auc_improvement > 0.01:
    print("✓ Meaningful improvement in ROC AUC (>0.01 increase)")
elif roc_auc_improvement > 0.005:
    print("~ Modest improvement in ROC AUC (0.005-0.01 increase)")
elif roc_auc_improvement > 0:
    print("~ Small improvement in ROC AUC (<0.005 increase)")
else:
    print("✗ No improvement or slight decrease in ROC AUC")

if log_loss_reduction > 0.01:
    print("✓ Meaningful improvement in Log Loss (>0.01 decrease)")
elif log_loss_reduction > 0.005:
    print("~ Modest improvement in Log Loss (0.005-0.01 decrease)")
elif log_loss_reduction > 0:
    print("~ Small improvement in Log Loss (<0.005 decrease)")
else:
    print("✗ No improvement or slight increase in Log Loss")

# Overall assessment
if roc_auc_improvement > 0.005 and log_loss_reduction > 0.005:
    print(f"\nOverall: The addition of last-10-weeks features provides a meaningful improvement.")
elif roc_auc_improvement > 0 or log_loss_reduction > 0:
    print(f"\nOverall: The addition of last-10-weeks features provides modest improvement.")
else:
    print(f"\nOverall: The addition of last-10-weeks features does not meaningfully improve performance.")


Model Performance Comparison:
Metric               Before (Surface)     After (+10-week)     Change              
--------------------------------------------------------------------------------
ROC AUC              0.7051               0.7086               +0.0035
Log Loss             0.6280               0.6253               -0.0027

Percentage Improvements:
  ROC AUC: +0.49%
  Log Loss: -0.44% (lower is better)

Summary Assessment:
~ Small improvement in ROC AUC (<0.005 increase)
~ Small improvement in Log Loss (<0.005 decrease)

Overall: The addition of last-10-weeks features provides modest improvement.
