In [1]:
# Uncomment to upgrade packages
# !pip3 install pandas --upgrade --quiet
# !pip3 install numpy  --upgrade --quiet
# !pip3 install scipy --upgrade --quiet
# !pip3 install statsmodels  --upgrade --quiet
# !pip3 install seaborn  --upgrade --quiet
# !pip3 install matplotlib  --upgrade --quiet
# !pip3 install scikit-learn  --upgrade  --quiet
# !pip install scikit-optimize  --quiet
# !pip install -U --quiet yellowbrick

In [2]:
# Imports de librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns      
import missingno as msno
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures

from apafib import load_medical_costs  
from sklearn.model_selection import train_test_split
from scipy import stats
from matplotlib.lines import Line2D

# PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Definiciones
RND = 16 

columns = ['age','sex','bmi','children','smoker','region','charges']
columns_no_target = ['age','sex','bmi','children','smoker','region']
numerical_columns = ['age', 'children', 'bmi']
numerical_columns_with_target = ['age', 'children', 'bmi', 'charges']
categorical_columns = ['sex', 'smoker', 'region']   
target = 'charges'

sns.set(style="whitegrid", font_scale=1.05)

# Funciones auxiliares
def format_pval(p):
    """Formatea p-value: decimal si p >= 1e-3, si no en notación mantisa*10^{exp} (LaTeX)."""
    if p == 0:
        return r"$0$"
    exp = int(np.floor(np.log10(p)))
    if exp >= -3:
        return f"{p:.4f}"                # p mostrado como decimal (4 decimales)
    else:
        mant = p / (10.0**exp)
        # devolver string LaTeX, por ejemplo: $1.94\times10^{-19}$
        return rf"${mant:.2f}\times10^{{{exp}}}$"
    

from time import time
from datetime import timedelta
init_time = time()


# quitar warnings:
import warnings
warnings.filterwarnings("ignore", message="findfont:.*")

In [3]:
matches = pd.read_csv('./data/atp_matches.csv')
matches.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,year,month,day,month_name,tourney_points
0,2011-339,Brisbane,Hard,32,A,20110102,1,104417,1.0,,...,4.0,5.0,5580.0,173.0,309.0,2011,1,2,January,250
1,2011-339,Brisbane,Hard,32,A,20110102,2,103582,,,...,5.0,58.0,835.0,75.0,643.0,2011,1,2,January,250
2,2011-339,Brisbane,Hard,32,A,20110102,3,105051,,Q,...,8.0,196.0,263.0,204.0,243.0,2011,1,2,January,250
3,2011-339,Brisbane,Hard,32,A,20110102,4,104797,8.0,,...,3.0,40.0,1031.0,43.0,975.0,2011,1,2,January,250
4,2011-339,Brisbane,Hard,32,A,20110102,5,103888,4.0,,...,6.0,16.0,1991.0,83.0,600.0,2011,1,2,January,250


División 70/30

In [4]:
display(matches.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
tourney_id,39541.0,1894.0,2011-580,127.0,,,,,,,
tourney_name,39541.0,1078.0,Roland Garros,1778.0,,,,,,,
surface,39488.0,4.0,Hard,23060.0,,,,,,,
draw_size,39541.0,,,,57.467489,42.144138,2.0,32.0,32.0,96.0,128.0
tourney_level,39541.0,6.0,A,21148.0,,,,,,,
tourney_date,39541.0,,,,20174404.064313,41143.884302,20110102.0,20140319.0,20170731.0,20210726.0,20241218.0
match_num,39541.0,,,,167.239802,151.260611,1.0,21.0,190.0,283.0,1701.0
winner_id,39541.0,,,,117737.747477,29694.476989,100644.0,104607.0,105373.0,109739.0,212721.0
winner_seed,16884.0,,,,7.543592,6.988957,1.0,3.0,5.0,9.0,33.0
winner_entry,5347.0,10.0,Q,2968.0,,,,,,,


Vamos a empezar limpiando los datos.

Primero eliminaremos los identificadores y las variables day y month-name (ya están en tourney_date). Eliminaré también la variable seed, ya que se puede sacar directamente del ranking ATP del jugador. También eliminare tourney level que no hace falta (explica que era cada cosa: G-> grand slam, D-> davis...), amb punts ya ho veiem.

## Dataset Overview

This dataset contains **39,541 ATP tennis matches** from **2011-2024** including:

### Tournament Classification (tournament_points)
The dataset now includes **tournament_points** which represents the ATP ranking points awarded to the winner:

- **Grand Slam (2000 pts)**: Australian Open, French Open, Wimbledon, US Open - 6,985 matches (17.7%)
- **ATP Finals (1500 pts)**: Season-ending championship - 272 matches (0.7%)
- **Masters 1000 (1000 pts)**: Indian Wells, Miami, Monte Carlo, etc. - 7,681 matches (19.4%)
- **ATP 500 (500 pts)**: Barcelona, Dubai, Rotterdam, etc. - 6,887 matches (17.4%)
- **ATP 250 (250 pts)**: Brisbane, Doha, Sydney, etc. - 14,261 matches (36.1%)
- **Davis Cup (0 pts)**: Team event, no individual points - 3,455 matches (8.7%)

In [5]:
matches.drop(columns=['tourney_id', 'winner_id', 'loser_id', 'day', 'month_name', 'loser_seed', 'winner_seed', 'tourney_level'], inplace=True)

Después organizaremos en la siguiente estructura:
1. winner será el ganador
2. loser será el perdedor 

In [6]:
columnas_a_renombrar = {
    'w_ace': 'winner_aces',
    'w_df': 'winner_double_faults',
    'w_svpt': 'winner_serve_points_total',
    'w_1stIn': 'winner_first_serves_in',
    'w_1stWon': 'winner_first_serve_points_won',
    'w_2ndWon': 'winner_second_serve_points_won',
    'w_SvGms': 'winner_service_games',
    'w_bpSaved': 'winner_break_points_saved',
    'w_bpFaced': 'winner_break_points_faced',
    'l_ace': 'loser_aces',
    'l_df': 'loser_double_faults',
    'l_svpt': 'loser_serve_points_total',
    'l_1stIn': 'loser_first_serves_in',
    'l_1stWon': 'loser_first_serve_points_won',
    'l_2ndWon': 'loser_second_serve_points_won',
    'l_SvGms': 'loser_service_games',
    'l_bpSaved': 'loser_break_points_saved',
    'l_bpFaced': 'loser_break_points_faced'
}

# Renombrar las columnas en el DataFrame
matches = matches.rename(columns=columnas_a_renombrar)
print("Columnas renombradas de w_* y l_* a winner_* y loser_*")
display(matches.describe(include='all').T)

Columnas renombradas de w_* y l_* a winner_* y loser_*


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
tourney_name,39541.0,1078.0,Roland Garros,1778.0,,,,,,,
surface,39488.0,4.0,Hard,23060.0,,,,,,,
draw_size,39541.0,,,,57.467489,42.144138,2.0,32.0,32.0,96.0,128.0
tourney_date,39541.0,,,,20174404.064313,41143.884302,20110102.0,20140319.0,20170731.0,20210726.0,20241218.0
match_num,39541.0,,,,167.239802,151.260611,1.0,21.0,190.0,283.0,1701.0
winner_entry,5347.0,10.0,Q,2968.0,,,,,,,
winner_name,39541.0,1051.0,Novak Djokovic,813.0,,,,,,,
winner_hand,39541.0,4.0,R,34321.0,,,,,,,
winner_ht,39113.0,,,,187.368727,7.376512,3.0,183.0,188.0,193.0,211.0
winner_ioc,39541.0,97.0,ESP,4242.0,,,,,,,


In [7]:
import numpy as np

# Ordenar cronológicamente ANTES del shuffle (crítico para evitar data leakage temporal)
matches = matches.sort_values('tourney_date').reset_index(drop=True)
print("DataFrame ordenado por fecha.")

# Configurar semilla para reproducibilidad
np.random.seed(RND)

# Crear máscara de shuffle: True = mantener orden, False = invertir
shuffle_mask = np.random.rand(len(matches)) > 0.5

# Inicializar columnas player1 y player2
# Identificar todas las columnas que empiezan con winner_ o loser_
winner_cols = [col for col in matches.columns if col.startswith('winner_')]
loser_cols = [col for col in matches.columns if col.startswith('loser_')]

# Crear diccionario de mapeo
winner_to_player = {col: col.replace('winner_', 'player1_') for col in winner_cols}
loser_to_player = {col: col.replace('loser_', 'player2_') for col in loser_cols}

# Copiar y renombrar columnas
for winner_col, player1_col in winner_to_player.items():
    matches[player1_col] = matches[winner_col]
    
for loser_col, player2_col in loser_to_player.items():
    matches[player2_col] = matches[loser_col]

# Aplicar shuffle: intercambiar player1 y player2 según la máscara
print(f"Aplicando shuffle aleatorio a {(~shuffle_mask).sum()} de {len(matches)} partidos...")

for i in range(len(matches)):
    if not shuffle_mask[i]:  # Si shuffle_mask[i] es False, intercambiar
        # Intercambiar todas las columnas player1 y player2
        for winner_col in winner_cols:
            player1_col = winner_col.replace('winner_', 'player1_')
            player2_col = winner_col.replace('winner_', 'player2_')
            
            # Swap
            temp = matches.at[i, player1_col]
            matches.at[i, player1_col] = matches.at[i, player2_col]
            matches.at[i, player2_col] = temp

# Crear variable target: player1_wins
# Si shuffle_mask[i] es True, player1 es el winner original (ganó)
# Si shuffle_mask[i] es False, player1 es el loser original (perdió)
matches['player1_wins'] = shuffle_mask.astype(int)

print(f"\n✓ Shuffle completado")
print(f"  - player1_wins = 1: {matches['player1_wins'].sum()} partidos ({matches['player1_wins'].mean()*100:.1f}%)")
print(f"  - player1_wins = 0: {(1-matches['player1_wins']).sum()} partidos ({(1-matches['player1_wins'].mean())*100:.1f}%)")
print(f"\nPrimeras filas con nueva estructura:")
display(matches[['player1_name', 'player2_name', 'player1_wins']].head(10))

DataFrame ordenado por fecha.
Aplicando shuffle aleatorio a 19842 de 39541 partidos...

✓ Shuffle completado
  - player1_wins = 1: 19699 partidos (49.8%)
  - player1_wins = 0: 19842 partidos (50.2%)

Primeras filas con nueva estructura:


Unnamed: 0,player1_name,player2_name,player1_wins
0,Ryan Harrison,Robin Soderling,0
1,Marcos Baghdatis,Lukasz Kubot,1
2,Andy Roddick,Alexandr Dolgopolov,1
3,Matthew Ebden,Robin Soderling,0
4,Adrian Mannarino,Mardy Fish,0
5,Dudi Sela,Michael Berrer,0
6,Denis Istomin,Thiemo De Bakker,1
7,John Millman,Matthew Ebden,0
8,Benjamin Becker,Santiago Giraldo,0
9,Andy Roddick,Marcos Baghdatis,1


## Shuffle aleatorio: winner/loser → player1/player2

**¿Por qué es necesario?**

Si mantenemos la estructura `winner_*` vs `loser_*`, el modelo aprenderá simplemente que "el jugador etiquetado como winner siempre gana", lo cual es **data leakage implícito**.

**Solución:**
1. Para cada partido, shuffle aleatorio de quién es `player1` y quién es `player2`
2. Crear variable target `player1_wins` (1 si player1 ganó, 0 si perdió)
3. El modelo aprende de características reales, no de la estructura de los datos

## Feature Engineering

Hay muchas variables que nos darían data leakage, y solo las queremos para hacer feature engineering y posteriormente las eliminaremos.

Vamos a identificar cuales son todas las variables que causarían data leakage:

**Todas las variables empezadas por `winner_` o `loser_` (ya transformadas a player1/player2), la variable `score` y la variable `minutes`.**

Teniendo en cuenta esto, vamos a crear nuevas variables (feature engineering) trabajando siempre con la estructura **player1 vs player2**.

### 1. H2H (Head-to-Head)

Vamos a crear el H2H de cada jugador contra su oponente.

In [8]:
# Feature #1: H2H (Head-to-Head)
from collections import defaultdict

# --- Preparar la estructura para guardar los récords H2H ---
# La clave será una tupla ordenada con los nombres de los dos jugadores.
# El valor será un contador de victorias para cada uno en ese enfrentamiento.
h2h_records = defaultdict(lambda: defaultdict(int))

# Listas para guardar los resultados calculados
player1_h2h_list = []
player2_h2h_list = []

# --- Iterar sobre cada partido para calcular el H2H ---
print("Calculando H2H para cada partido...")
for idx, row in matches.iterrows():
    player1_name = row['player1_name']
    player2_name = row['player2_name']
    player1_won = row['player1_wins']
    
    # Crear una clave única y ordenada para el par de jugadores
    player_pair_key = tuple(sorted((player1_name, player2_name)))
    
    # Consultar el historial H2H *antes* de este partido
    player1_wins_before = h2h_records[player_pair_key][player1_name]
    player2_wins_before = h2h_records[player_pair_key][player2_name]
    
    # Guardar los valores históricos
    player1_h2h_list.append(player1_wins_before)
    player2_h2h_list.append(player2_wins_before)
    
    # Actualizar el registro con el resultado de *este* partido para el futuro
    if player1_won == 1:
        h2h_records[player_pair_key][player1_name] += 1
    else:
        h2h_records[player_pair_key][player2_name] += 1

# --- Añadir las nuevas columnas al DataFrame ---
matches['player1_h2h_wins'] = player1_h2h_list
matches['player2_h2h_wins'] = player2_h2h_list
print("✓ Nuevas columnas H2H añadidas al DataFrame.")

# --- Verificar el resultado ---
print("\nPrimeros 10 partidos:")
print(matches[['player1_name', 'player2_name', 'player1_h2h_wins', 'player2_h2h_wins', 'player1_wins']].head(10))
print("\nÚltimos 10 partidos:")
print(matches[['player1_name', 'player2_name', 'player1_h2h_wins', 'player2_h2h_wins', 'player1_wins']].tail(10))

Calculando H2H para cada partido...
✓ Nuevas columnas H2H añadidas al DataFrame.

Primeros 10 partidos:
       player1_name         player2_name  player1_h2h_wins  player2_h2h_wins  \
0     Ryan Harrison      Robin Soderling                 0                 0   
1  Marcos Baghdatis         Lukasz Kubot                 0                 0   
2      Andy Roddick  Alexandr Dolgopolov                 0                 0   
3     Matthew Ebden      Robin Soderling                 0                 0   
4  Adrian Mannarino           Mardy Fish                 0                 0   
5         Dudi Sela       Michael Berrer                 0                 0   
6     Denis Istomin     Thiemo De Bakker                 0                 0   
7      John Millman        Matthew Ebden                 0                 0   
8   Benjamin Becker     Santiago Giraldo                 0                 0   
9      Andy Roddick     Marcos Baghdatis                 0                 0   

   player1_wins

### 2. Victorias y derrotas
Total de victorias, total de derrotas, %winrate, lo mismo pero anual (YTD - Year To Date)

In [9]:
# Feature #2: Victorias y derrotas totales y anuales (YTD) - VERSIÓN CORREGIDA
from collections import defaultdict

# Diccionarios para tracking de estadísticas globales y anuales
player_stats = defaultdict(lambda: {'wins': 0, 'losses': 0})
player_stats_ytd = defaultdict(lambda: {'wins': 0, 'losses': 0, 'current_year': None})

# Listas para almacenar las estadísticas calculadas
player1_total_wins_list = []
player1_total_losses_list = []
player1_winrate_list = []
player1_ytd_wins_list = []
player1_ytd_losses_list = []
player1_ytd_winrate_list = []

player2_total_wins_list = []
player2_total_losses_list = []
player2_winrate_list = []
player2_ytd_wins_list = []
player2_ytd_losses_list = []
player2_ytd_winrate_list = []

print("Calculando estadísticas de victorias y derrotas...")
year_resets = 0  # Counter para debug

for idx, row in matches.iterrows():
    player1_name = row['player1_name']
    player2_name = row['player2_name']
    player1_won = row['player1_wins']
    year = row['year']  # ✅ USAR LA COLUMNA YEAR DIRECTAMENTE
    
    # --- PLAYER1 ---
    # Resetear YTD si cambió el año
    if player_stats_ytd[player1_name]['current_year'] is None:
        # Primera vez que vemos a este jugador
        player_stats_ytd[player1_name]['current_year'] = year
    elif player_stats_ytd[player1_name]['current_year'] != year:
        # Cambió el año, resetear
        player_stats_ytd[player1_name]['wins'] = 0
        player_stats_ytd[player1_name]['losses'] = 0
        player_stats_ytd[player1_name]['current_year'] = year
        year_resets += 1
    
    # Stats globales ANTES del partido (carrera completa)
    player1_total_wins = player_stats[player1_name]['wins']
    player1_total_losses = player_stats[player1_name]['losses']
    player1_total_matches = player1_total_wins + player1_total_losses
    player1_winrate = player1_total_wins / player1_total_matches if player1_total_matches > 0 else 0.0
    
    # Stats YTD ANTES del partido (solo del año actual)
    player1_ytd_wins = player_stats_ytd[player1_name]['wins']
    player1_ytd_losses = player_stats_ytd[player1_name]['losses']
    player1_ytd_matches = player1_ytd_wins + player1_ytd_losses
    player1_ytd_winrate = player1_ytd_wins / player1_ytd_matches if player1_ytd_matches > 0 else 0.0
    
    # Guardar valores de player1
    player1_total_wins_list.append(player1_total_wins)
    player1_total_losses_list.append(player1_total_losses)
    player1_winrate_list.append(player1_winrate)
    player1_ytd_wins_list.append(player1_ytd_wins)
    player1_ytd_losses_list.append(player1_ytd_losses)
    player1_ytd_winrate_list.append(player1_ytd_winrate)
    
    # --- PLAYER2 ---
    # Resetear YTD si cambió el año
    if player_stats_ytd[player2_name]['current_year'] is None:
        # Primera vez que vemos a este jugador
        player_stats_ytd[player2_name]['current_year'] = year
    elif player_stats_ytd[player2_name]['current_year'] != year:
        # Cambió el año, resetear
        player_stats_ytd[player2_name]['wins'] = 0
        player_stats_ytd[player2_name]['losses'] = 0
        player_stats_ytd[player2_name]['current_year'] = year
        year_resets += 1
    
    # Stats globales ANTES del partido (carrera completa)
    player2_total_wins = player_stats[player2_name]['wins']
    player2_total_losses = player_stats[player2_name]['losses']
    player2_total_matches = player2_total_wins + player2_total_losses
    player2_winrate = player2_total_wins / player2_total_matches if player2_total_matches > 0 else 0.0
    
    # Stats YTD ANTES del partido (solo del año actual)
    player2_ytd_wins = player_stats_ytd[player2_name]['wins']
    player2_ytd_losses = player_stats_ytd[player2_name]['losses']
    player2_ytd_matches = player2_ytd_wins + player2_ytd_losses
    player2_ytd_winrate = player2_ytd_wins / player2_ytd_matches if player2_ytd_matches > 0 else 0.0
    
    # Guardar valores de player2
    player2_total_wins_list.append(player2_total_wins)
    player2_total_losses_list.append(player2_total_losses)
    player2_winrate_list.append(player2_winrate)
    player2_ytd_wins_list.append(player2_ytd_wins)
    player2_ytd_losses_list.append(player2_ytd_losses)
    player2_ytd_winrate_list.append(player2_ytd_winrate)
    
    # --- Actualizar estadísticas DESPUÉS del partido ---
    if player1_won == 1:
        # Player1 gana
        player_stats[player1_name]['wins'] += 1
        player_stats_ytd[player1_name]['wins'] += 1
        # Player2 pierde
        player_stats[player2_name]['losses'] += 1
        player_stats_ytd[player2_name]['losses'] += 1
    else:
        # Player2 gana
        player_stats[player2_name]['wins'] += 1
        player_stats_ytd[player2_name]['wins'] += 1
        # Player1 pierde
        player_stats[player1_name]['losses'] += 1
        player_stats_ytd[player1_name]['losses'] += 1

# Añadir las columnas al DataFrame
matches['player1_total_wins'] = player1_total_wins_list
matches['player1_total_losses'] = player1_total_losses_list
matches['player1_winrate'] = player1_winrate_list
matches['player1_ytd_wins'] = player1_ytd_wins_list
matches['player1_ytd_losses'] = player1_ytd_losses_list
matches['player1_ytd_winrate'] = player1_ytd_winrate_list

matches['player2_total_wins'] = player2_total_wins_list
matches['player2_total_losses'] = player2_total_losses_list
matches['player2_winrate'] = player2_winrate_list
matches['player2_ytd_wins'] = player2_ytd_wins_list
matches['player2_ytd_losses'] = player2_ytd_losses_list
matches['player2_ytd_winrate'] = player2_ytd_winrate_list

print(f"✓ Columnas de victorias y derrotas añadidas al DataFrame.")
print(f"✓ Se detectaron {year_resets} reseteos de año YTD")

# Verificación de que YTD y Total son diferentes
print("\n🔍 Verificación de diferencias YTD vs Total:")
print(f"   Total wins - Media: {matches['player1_total_wins'].mean():.2f}, Max: {matches['player1_total_wins'].max()}")
print(f"   YTD wins   - Media: {matches['player1_ytd_wins'].mean():.2f}, Max: {matches['player1_ytd_wins'].max()}")
print(f"   Total winrate - Media: {matches['player1_winrate'].mean():.3f}")
print(f"   YTD winrate   - Media: {matches['player1_ytd_winrate'].mean():.3f}")

if matches['player1_ytd_wins'].max() < matches['player1_total_wins'].max():
    print("\n✅ YTD Max < Total Max → El YTD está funcionando correctamente!")
else:
    print("\n⚠️  YTD Max == Total Max → Revisar implementación...")

print("\nÚltimos 10 partidos con comparación YTD vs Total:")
display(matches[['player1_name', 'player2_name', 'year',
                 'player1_total_wins', 'player1_ytd_wins',
                 'player1_total_losses', 'player1_ytd_losses',
                 'player1_winrate', 'player1_ytd_winrate', 'player1_wins']].tail(10))

Calculando estadísticas de victorias y derrotas...
✓ Columnas de victorias y derrotas añadidas al DataFrame.
✓ Se detectaron 4362 reseteos de año YTD

🔍 Verificación de diferencias YTD vs Total:
   Total wins - Media: 91.23, Max: 811
   YTD wins   - Media: 11.43, Max: 83
   Total winrate - Media: 0.491
   YTD winrate   - Media: 0.482

✅ YTD Max < Total Max → El YTD está funcionando correctamente!

Últimos 10 partidos con comparación YTD vs Total:


Unnamed: 0,player1_name,player2_name,year,player1_total_wins,player1_ytd_wins,player1_total_losses,player1_ytd_losses,player1_winrate,player1_ytd_winrate,player1_wins
39531,Alex Michelsen,Nishesh Basavareddy,2024,40,33,37,29,0.519481,0.532258,1
39532,Jakub Mensik,Arthur Fils,2024,28,25,20,19,0.583333,0.568182,0
39533,Learner Tien,Arthur Fils,2024,4,4,5,3,0.444444,0.571429,1
39534,Joao Fonseca,Arthur Fils,2024,9,9,8,7,0.529412,0.5625,1
39535,Learner Tien,Alex Michelsen,2024,5,5,5,3,0.5,0.625,1
39536,Nishesh Basavareddy,Luca Van Assche,2024,0,0,1,1,0.0,0.0,0
39537,Juncheng Shang,Luca Van Assche,2024,29,25,31,20,0.483333,0.555556,0
39538,Nishesh Basavareddy,Juncheng Shang,2024,0,0,2,2,0.0,0.0,1
39539,Learner Tien,Joao Fonseca,2024,6,6,5,3,0.545455,0.666667,0
39540,Luca Van Assche,Joao Fonseca,2024,22,10,38,18,0.366667,0.357143,0


### 3. Forma actual
Últimos 10 partidos: ganados, perdidos, % ganados

In [10]:
# Feature #3: Forma actual - Últimos 10 partidos
from collections import defaultdict, deque

# Diccionario para mantener los últimos resultados de cada jugador
# Usamos deque con maxlen=10 para mantener solo los últimos 10 partidos
player_recent_results = defaultdict(lambda: deque(maxlen=10))

# Listas para almacenar las estadísticas de forma
player1_last10_wins_list = []
player1_last10_losses_list = []
player1_last10_winrate_list = []

player2_last10_wins_list = []
player2_last10_losses_list = []
player2_last10_winrate_list = []

print("Calculando forma actual (últimos 10 partidos)...")
for idx, row in matches.iterrows():
    player1_name = row['player1_name']
    player2_name = row['player2_name']
    player1_won = row['player1_wins']
    
    # --- PLAYER1 ---
    # Obtener los últimos resultados ANTES del partido actual
    player1_recent = list(player_recent_results[player1_name])
    player1_last10_wins = sum(player1_recent)  # 1 = victoria, 0 = derrota
    player1_last10_losses = len(player1_recent) - player1_last10_wins
    player1_last10_winrate = player1_last10_wins / len(player1_recent) if len(player1_recent) > 0 else 0.0
    
    # Guardar valores
    player1_last10_wins_list.append(player1_last10_wins)
    player1_last10_losses_list.append(player1_last10_losses)
    player1_last10_winrate_list.append(player1_last10_winrate)
    
    # --- PLAYER2 ---
    # Obtener los últimos resultados ANTES del partido actual
    player2_recent = list(player_recent_results[player2_name])
    player2_last10_wins = sum(player2_recent)  # 1 = victoria, 0 = derrota
    player2_last10_losses = len(player2_recent) - player2_last10_wins
    player2_last10_winrate = player2_last10_wins / len(player2_recent) if len(player2_recent) > 0 else 0.0
    
    # Guardar valores
    player2_last10_wins_list.append(player2_last10_wins)
    player2_last10_losses_list.append(player2_last10_losses)
    player2_last10_winrate_list.append(player2_last10_winrate)
    
    # --- Actualizar resultados recientes DESPUÉS del partido ---
    if player1_won == 1:
        # Player1 gana (añadir 1)
        player_recent_results[player1_name].append(1)
        # Player2 pierde (añadir 0)
        player_recent_results[player2_name].append(0)
    else:
        # Player2 gana (añadir 1)
        player_recent_results[player2_name].append(1)
        # Player1 pierde (añadir 0)
        player_recent_results[player1_name].append(0)

# Añadir las columnas al DataFrame
matches['player1_last10_wins'] = player1_last10_wins_list
matches['player1_last10_losses'] = player1_last10_losses_list
matches['player1_last10_winrate'] = player1_last10_winrate_list

matches['player2_last10_wins'] = player2_last10_wins_list
matches['player2_last10_losses'] = player2_last10_losses_list
matches['player2_last10_winrate'] = player2_last10_winrate_list

print("✓ Columnas de forma actual añadidas al DataFrame.")
print("\nVerificación de las nuevas columnas:")
print(matches[['player1_name', 'player2_name', 'player1_last10_wins', 'player1_last10_losses', 
               'player1_last10_winrate', 'player2_last10_wins', 'player2_last10_losses', 'player2_last10_winrate', 'player1_wins']].tail(10))

Calculando forma actual (últimos 10 partidos)...
✓ Columnas de forma actual añadidas al DataFrame.

Verificación de las nuevas columnas:
              player1_name         player2_name  player1_last10_wins  \
39531       Alex Michelsen  Nishesh Basavareddy                    6   
39532         Jakub Mensik          Arthur Fils                    6   
39533         Learner Tien          Arthur Fils                    4   
39534         Joao Fonseca          Arthur Fils                    6   
39535         Learner Tien       Alex Michelsen                    5   
39536  Nishesh Basavareddy      Luca Van Assche                    0   
39537       Juncheng Shang      Luca Van Assche                    4   
39538  Nishesh Basavareddy       Juncheng Shang                    0   
39539         Learner Tien         Joao Fonseca                    6   
39540      Luca Van Assche         Joao Fonseca                    3   

       player1_last10_losses  player1_last10_winrate  player2_last10_w

## Eliminar columnas con data leakage

Ahora eliminamos todas las columnas originales de `winner_*` y `loser_*` (que ya fueron transformadas a `player1_*` y `player2_*`), además de `score` y `minutes`.

In [11]:
# Columnas con data leakage a eliminar
columnas_data_leakage = [
    'winner_aces',
    'winner_double_faults',
    'winner_serve_points_total',
    'winner_first_serves_in',
    'winner_first_serve_points_won',
    'winner_second_serve_points_won',
    'winner_service_games',
    'winner_break_points_saved',
    'winner_break_points_faced',
    'loser_aces',
    'loser_double_faults',
    'loser_serve_points_total',
    'loser_first_serves_in',
    'loser_first_serve_points_won',
    'loser_second_serve_points_won',
    'loser_service_games',
    'loser_break_points_saved',
    'loser_break_points_faced',
    'winner_name',
    'loser_name',
    'score',
    'minutes'
]

# Eliminar también las columnas player1_* y player2_* de estadísticas de partido
# (las que vienen del partido en sí, no las features engineered)
player_match_stats = [
    'player1_aces',
    'player1_double_faults',
    'player1_serve_points_total',
    'player1_first_serves_in',
    'player1_first_serve_points_won',
    'player1_second_serve_points_won',
    'player1_service_games',
    'player1_break_points_saved',
    'player1_break_points_faced',
    'player2_aces',
    'player2_double_faults',
    'player2_serve_points_total',
    'player2_first_serves_in',
    'player2_first_serve_points_won',
    'player2_second_serve_points_won',
    'player2_service_games',
    'player2_break_points_saved',
    'player2_break_points_faced',
]

# Combinar y eliminar solo las que existen
all_leakage_cols = columnas_data_leakage + player_match_stats
cols_to_drop = [col for col in all_leakage_cols if col in matches.columns]

print(f"Eliminando {len(cols_to_drop)} columnas con data leakage...")
matches.drop(columns=cols_to_drop, inplace=True)

print("✓ Columnas eliminadas")
print(f"\nColumnas restantes: {len(matches.columns)}")
print("\nPrimeras columnas del DataFrame limpio:")
display(matches.head())

Eliminando 40 columnas con data leakage...
✓ Columnas eliminadas

Columnas restantes: 61

Primeras columnas del DataFrame limpio:


Unnamed: 0,tourney_name,surface,draw_size,tourney_date,match_num,winner_entry,winner_hand,winner_ht,winner_ioc,winner_age,...,player2_winrate,player2_ytd_wins,player2_ytd_losses,player2_ytd_winrate,player1_last10_wins,player1_last10_losses,player1_last10_winrate,player2_last10_wins,player2_last10_losses,player2_last10_winrate
0,Brisbane,Hard,32,20110102,1,,R,193.0,SWE,26.3,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
1,Brisbane,Hard,32,20110102,23,,R,183.0,CYP,25.5,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
2,Brisbane,Hard,32,20110102,24,,R,188.0,USA,28.3,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
3,Brisbane,Hard,32,20110102,25,,R,193.0,SWE,26.3,...,1.0,1,0,1.0,0,0,0.0,1,0,1.0
4,Brisbane,Hard,32,20110102,5,,R,188.0,USA,29.0,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0


In [12]:
# Verificación final del dataset
print("="*80)
print("VERIFICACIÓN FINAL DEL DATASET")
print("="*80)

print(f"\n📊 Dimensiones: {matches.shape[0]:,} partidos × {matches.shape[1]} columnas")

print(f"\n🎯 Target balance:")
print(f"   player1_wins = 1: {matches['player1_wins'].sum():,} ({matches['player1_wins'].mean()*100:.2f}%)")
print(f"   player1_wins = 0: {(1-matches['player1_wins']).sum():,} ({(1-matches['player1_wins'].mean())*100:.2f}%)")

print(f"\n📝 Columnas del dataset:")
player1_cols = [col for col in matches.columns if col.startswith('player1_')]
player2_cols = [col for col in matches.columns if col.startswith('player2_')]
other_cols = [col for col in matches.columns if not col.startswith('player1_') and not col.startswith('player2_')]

print(f"\n   Player1 features ({len(player1_cols)}):")
for col in sorted(player1_cols):
    print(f"      - {col}")

print(f"\n   Player2 features ({len(player2_cols)}):")
for col in sorted(player2_cols):
    print(f"      - {col}")

print(f"\n   Otras columnas ({len(other_cols)}):")
for col in sorted(other_cols):
    print(f"      - {col}")

print("\n" + "="*80)
print("Dataset listo para el modelado ✅")
print("="*80)

VERIFICACIÓN FINAL DEL DATASET

📊 Dimensiones: 39,541 partidos × 61 columnas

🎯 Target balance:
   player1_wins = 1: 19,699 (49.82%)
   player1_wins = 0: 19,842 (50.18%)

📝 Columnas del dataset:

   Player1 features (19):
      - player1_age
      - player1_entry
      - player1_h2h_wins
      - player1_hand
      - player1_ht
      - player1_ioc
      - player1_last10_losses
      - player1_last10_winrate
      - player1_last10_wins
      - player1_name
      - player1_rank
      - player1_rank_points
      - player1_total_losses
      - player1_total_wins
      - player1_winrate
      - player1_wins
      - player1_ytd_losses
      - player1_ytd_winrate
      - player1_ytd_wins

   Player2 features (18):
      - player2_age
      - player2_entry
      - player2_h2h_wins
      - player2_hand
      - player2_ht
      - player2_ioc
      - player2_last10_losses
      - player2_last10_winrate
      - player2_last10_wins
      - player2_name
      - player2_rank
      - player2_rank_point

In [13]:
# Limpiar columnas duplicadas winner_* y loser_* que quedaron del original
# (ya tenemos player1_* y player2_* que son las versiones con shuffle)
remaining_winner_loser_cols = [col for col in matches.columns 
                               if col.startswith('winner_') or col.startswith('loser_')]

if remaining_winner_loser_cols:
    print(f"Eliminando {len(remaining_winner_loser_cols)} columnas duplicadas winner_*/loser_*:")
    for col in remaining_winner_loser_cols:
        print(f"   - {col}")
    matches.drop(columns=remaining_winner_loser_cols, inplace=True)
    print(f"\n✓ Columnas eliminadas. Nuevas dimensiones: {matches.shape}")
else:
    print("✓ No hay columnas winner_*/loser_* adicionales para eliminar.")

Eliminando 14 columnas duplicadas winner_*/loser_*:
   - winner_entry
   - winner_hand
   - winner_ht
   - winner_ioc
   - winner_age
   - loser_entry
   - loser_hand
   - loser_ht
   - loser_ioc
   - loser_age
   - winner_rank
   - winner_rank_points
   - loser_rank
   - loser_rank_points

✓ Columnas eliminadas. Nuevas dimensiones: (39541, 47)


In [14]:
# Mostrar ejemplos aleatorios del dataset final
print("Ejemplos de partidos con la nueva estructura:\n")
sample = matches[['player1_name', 'player2_name', 'player1_rank', 'player2_rank',
                  'player1_h2h_wins', 'player2_h2h_wins', 
                  'player1_winrate', 'player2_winrate',
                  'player1_last10_winrate', 'player2_last10_winrate',
                  'player1_wins']].sample(10, random_state=RND)

display(sample)

Ejemplos de partidos con la nueva estructura:



Unnamed: 0,player1_name,player2_name,player1_rank,player2_rank,player1_h2h_wins,player2_h2h_wins,player1_winrate,player2_winrate,player1_last10_winrate,player2_last10_winrate,player1_wins
21247,Gerald Melzer,Pablo Andujar,110.0,1821.0,1,0,0.369565,0.417004,0.5,0.1,0
18543,Benoit Paire,Taylor Fritz,40.0,136.0,0,0,0.477778,0.404255,0.5,0.4,0
5434,Milos Raonic,Andy Murray,16.0,4.0,2,1,0.663366,0.8,0.7,0.8,0
9141,Fabio Fognini,Alex Bogomolov Jr,16.0,89.0,0,0,0.526627,0.431193,0.5,0.6,1
12962,Gael Monfils,Guillermo Garcia Lopez,15.0,45.0,1,1,0.671362,0.473684,0.8,0.5,0
246,Marin Cilic,Santiago Giraldo,15.0,59.0,0,0,0.5,0.75,0.5,0.75,1
24702,Lucas Pouille,Hubert Hurkacz,28.0,52.0,0,1,0.554502,0.409091,0.4,0.6,0
265,Nicolas Mahut,Viktor Troicki,132.0,27.0,0,0,1.0,0.666667,1.0,0.666667,0
1003,Pablo Andujar,Potito Starace,69.0,47.0,0,1,0.5,0.578947,0.6,0.6,1
11749,Dominic Thiem,David Goffin,39.0,28.0,0,2,0.482143,0.495146,0.4,1.0,0


---

## 📋 Resumen de transformaciones aplicadas

### 1. **Limpieza inicial**
- ✅ Eliminadas columnas innecesarias: IDs, seeds, tourney_level, day, month_name
- ✅ Renombradas columnas de estadísticas: `w_*` → `winner_*`, `l_*` → `loser_*`

### 2. **Shuffle aleatorio (player1/player2)**
- ✅ Ordenamiento cronológico del dataset (crítico para evitar data leakage temporal)
- ✅ Shuffle aleatorio ~50/50 de winner/loser → player1/player2
- ✅ Creación de variable target: `player1_wins` (balanceada)
- ✅ **Elimina data leakage implícito**: El modelo no puede aprender "el winner siempre gana"

### 3. **Feature Engineering**
Todas las features calculadas usando **solo información disponible antes de cada partido**:

#### **Feature #1: H2H (Head-to-Head)**
- `player1_h2h_wins`: Victorias previas de player1 contra player2
- `player2_h2h_wins`: Victorias previas de player2 contra player1

#### **Feature #2: Victorias y Derrotas**
Estadísticas globales:
- `player1_total_wins`, `player1_total_losses`, `player1_winrate`
- `player2_total_wins`, `player2_total_losses`, `player2_winrate`

Estadísticas YTD (Year-To-Date):
- `player1_ytd_wins`, `player1_ytd_losses`, `player1_ytd_winrate`
- `player2_ytd_wins`, `player2_ytd_losses`, `player2_ytd_winrate`

#### **Feature #3: Forma Actual**
Últimos 10 partidos:
- `player1_last10_wins`, `player1_last10_losses`, `player1_last10_winrate`
- `player2_last10_wins`, `player2_last10_losses`, `player2_last10_winrate`

### 4. **Eliminación de data leakage**
- ✅ Eliminadas todas las estadísticas del partido (aces, dobles faltas, puntos ganados, etc.)
- ✅ Eliminadas columnas `score` y `minutes`
- ✅ Eliminadas columnas duplicadas `winner_*` y `loser_*` originales

### 5. **Dataset final**
- **39,541 partidos** × **47 columnas**
- **Target balanceado**: 49.82% player1 gana, 50.18% player2 gana
- **Sin data leakage**: Todas las features usan solo información histórica
- **Listo para modelado** 🚀

## ✅ Verificación final de la estructura

El dataset ahora tiene la estructura correcta para evitar data leakage:

### Características clave:
1. **Nomenclatura neutral**: `player1_*` y `player2_*` en lugar de `winner_*` y `loser_*`
2. **Shuffle aleatorio**: ~50% de los partidos tienen player1 como ganador, ~50% tienen player2 como ganador
3. **Variable target**: `player1_wins` (1 = player1 ganó, 0 = player2 ganó)
4. **Features engineered**: H2H, victorias/derrotas totales y YTD, forma actual (últimos 10 partidos)
5. **Sin data leakage**: Todas las estadísticas de partido eliminadas

### ¿Por qué este enfoque?
Sin el shuffle, el modelo aprendería simplemente que "el jugador en la columna winner siempre gana", en lugar de aprender patrones basados en características reales (ranking, H2H, forma, etc.).

In [15]:
display(matches.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
tourney_name,39541.0,1078.0,Australian Open,1778.0,,,,,,,
surface,39488.0,4.0,Hard,23060.0,,,,,,,
draw_size,39541.0,,,,57.467489,42.144138,2.0,32.0,32.0,96.0,128.0
tourney_date,39541.0,,,,20174404.064313,41143.884302,20110102.0,20140319.0,20170731.0,20210726.0,20241218.0
match_num,39541.0,,,,167.239802,151.260611,1.0,21.0,190.0,283.0,1701.0
best_of,39541.0,,,,3.433929,0.824367,3.0,3.0,3.0,3.0,5.0
round,39541.0,9.0,R32,12112.0,,,,,,,
year,39541.0,,,,2017.383501,4.113749,2011.0,2014.0,2017.0,2021.0,2024.0
month,39541.0,,,,5.530766,3.00719,1.0,3.0,5.0,8.0,12.0
tourney_points,39541.0,,,,735.129359,664.846518,0.0,250.0,500.0,1000.0,2000.0


Vemos algo que parece raro y es un jugador con altura 3cm player1_ht, vamos a entrar en más detalle:

In [16]:
import pandas as pd

# Umbral de altura poco realista
umbral_altura_minima = 150

# --- 1. Crear un diccionario para almacenar los datos anómalos ---
jugadores_con_altura_anomala = {}

# --- 2. Filtrar y extraer datos de la columna del jugador 1 ---
df_anomalo_p1 = matches.loc[matches['player1_ht'] < umbral_altura_minima, ['player1_name', 'player1_ht']]
for index, row in df_anomalo_p1.iterrows():
    jugadores_con_altura_anomala[row['player1_name']] = row['player1_ht']

# --- 3. Filtrar y extraer datos de la columna del jugador 2 ---
df_anomalo_p2 = matches.loc[matches['player2_ht'] < umbral_altura_minima, ['player2_name', 'player2_ht']]
for index, row in df_anomalo_p2.iterrows():
    jugadores_con_altura_anomala[row['player2_name']] = row['player2_ht']

# --- 4. Imprimir la lista final y única ---
if jugadores_con_altura_anomala:
    print("Jugadores con alturas anómalas encontradas:")
    for jugador, altura in jugadores_con_altura_anomala.items():
        print(f"- {jugador}: {altura} cm")
else:
    print("No se encontraron jugadores con alturas anómalas.")

Jugadores con alturas anómalas encontradas:
- Jorge Brian Panta Herreros: 3.0 cm
- Johannes Ingildsen: 15.0 cm
- Viacheslav Bielinskyi: 71.0 cm


Al ser muy pocos jugadores, corregiré los datos a mano:

In [17]:
# Jorge Brian Panta Herreros-> no hay informacion sobre su altura en internet-> eliminar
matches.drop(matches[matches['player1_name'] == 'Jorge Brian Panta Herreros'].index, inplace=True)
matches.drop(matches[matches['player2_name'] == 'Jorge Brian Panta Herreros'].index, inplace=True)

# Johannes Ingildsen-> 193 cm segun Wikipedia-> corregir
matches.loc[matches['player1_name'] == 'Johannes Ingildsen', 'player1_ht'] = 193
matches.loc[matches['player2_name'] == 'Johannes Ingildsen', 'player2_ht'] = 193

# Viacheslav Bielinski-> no hay informacion sobre su altura en internet-> eliminar
matches.drop(matches[matches['player1_name'] == 'Viacheslav Bielinskyi'].index, inplace=True)
matches.drop(matches[matches['player2_name'] == 'Viacheslav Bielinskyi'].index, inplace=True)

In [18]:
display(matches.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
tourney_name,39533.0,1078.0,Roland Garros,1778.0,,,,,,,
surface,39481.0,4.0,Hard,23058.0,,,,,,,
draw_size,39533.0,,,,57.478309,42.141537,2.0,32.0,32.0,96.0,128.0
tourney_date,39533.0,,,,20174403.606127,41142.947638,20110102.0,20140319.0,20170731.0,20210726.0,20241218.0
match_num,39533.0,,,,167.273038,151.257867,1.0,21.0,190.0,283.0,1701.0
best_of,39533.0,,,,3.433967,0.824393,3.0,3.0,3.0,3.0,5.0
round,39533.0,9.0,R32,12112.0,,,,,,,
year,39533.0,,,,2017.383452,4.113654,2011.0,2014.0,2017.0,2021.0,2024.0
month,39533.0,,,,5.53105,3.007147,1.0,3.0,5.0,8.0,12.0
tourney_points,39533.0,,,,735.278122,664.831526,0.0,250.0,500.0,1000.0,2000.0


# Tratamiento de valores anomalos
