In [19]:
# Uncomment to upgrade packages
#!pip3 install pandas --upgrade --quiet
#!pip3 install numpy  --upgrade --quiet
#!pip3 install scipy --upgrade --quiet
#!pip3 install statsmodels  --upgrade --quiet
#!pip3 install seaborn  --upgrade --quiet
#!pip3 install matplotlib  --upgrade --quiet
#!pip3 install scikit-learn  --upgrade  --quiet
#!pip install scikit-optimize  --quiet
#!pip install -U --quiet yellowbrick

In [20]:
# Imports de librer√≠as
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns      
import missingno as msno
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures

from apafib import load_medical_costs  
from sklearn.model_selection import train_test_split
from scipy import stats
from matplotlib.lines import Line2D

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Definiciones
RND = 16 

sns.set(style="whitegrid", font_scale=1.05)

# Funciones auxiliares
def format_pval(p):
    """Formatea p-value: decimal si p >= 1e-3, si no en notaci√≥n mantisa*10^{exp} (LaTeX)."""
    if p == 0:
        return r"$0$"
    exp = int(np.floor(np.log10(p)))
    if exp >= -3:
        return f"{p:.4f}"                # p mostrado como decimal (4 decimales)
    else:
        mant = p / (10.0**exp)
        # devolver string LaTeX, por ejemplo: $1.94\times10^{-19}$
        return rf"${mant:.2f}\times10^{{{exp}}}$"
    

from time import time
from datetime import timedelta
init_time = time()


# quitar warnings:
import warnings
warnings.filterwarnings("ignore", message="findfont:.*")

# Pr√°ctica de APA

Nosotros haremos un modelo de ML para predecir el ganador de partidos de tenis. Utilizaremos el repo de Jeff Sackman entre 2011 y 2024 como dataset (estamos hablando de m√°s de 39k filas de datos). Es un problema de clasificaci√≥n (binaria).

In [21]:
matches = pd.read_csv('./data/atp_matches.csv')
matches.head()
matches.shape

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,year,month,day,month_name,tourney_points
0,2011-339,Brisbane,Hard,32,A,20110102,1,104417,1.0,,...,4.0,5.0,5580.0,173.0,309.0,2011,1,2,January,250
1,2011-339,Brisbane,Hard,32,A,20110102,2,103582,,,...,5.0,58.0,835.0,75.0,643.0,2011,1,2,January,250
2,2011-339,Brisbane,Hard,32,A,20110102,3,105051,,Q,...,8.0,196.0,263.0,204.0,243.0,2011,1,2,January,250
3,2011-339,Brisbane,Hard,32,A,20110102,4,104797,8.0,,...,3.0,40.0,1031.0,43.0,975.0,2011,1,2,January,250
4,2011-339,Brisbane,Hard,32,A,20110102,5,103888,4.0,,...,6.0,16.0,1991.0,83.0,600.0,2011,1,2,January,250


(39541, 54)

Sabemos que tendremos mucho trabajo con el preprocesado de los datos, incluso haciendo feature engineering para a√±adir posibles variables √∫tiles.

Haremos entonces lo que nos dijiste en clase: entregar la precisi√≥n del modelo de ML (para asegurarnos que es mejor que la aleatoriedad: >50% de precisi√≥n).

Obviamente nos reservamos el derecho para poder a√±adir features que puedan ser interesantes, para ver a qu√© nivel de precisi√≥n somos capaces de llegar: 70%? 80%?

In [22]:
display(matches.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
tourney_id,39541.0,1894.0,2011-580,127.0,,,,,,,
tourney_name,39541.0,1078.0,Australian Open,1778.0,,,,,,,
surface,39488.0,4.0,Hard,23060.0,,,,,,,
draw_size,39541.0,,,,57.467489,42.144138,2.0,32.0,32.0,96.0,128.0
tourney_level,39541.0,6.0,A,21148.0,,,,,,,
tourney_date,39541.0,,,,20174404.064313,41143.884302,20110102.0,20140319.0,20170731.0,20210726.0,20241218.0
match_num,39541.0,,,,167.239802,151.260611,1.0,21.0,190.0,283.0,1701.0
winner_id,39541.0,,,,117737.747477,29694.476989,100644.0,104607.0,105373.0,109739.0,212721.0
winner_seed,16884.0,,,,7.543592,6.988957,1.0,3.0,5.0,9.0,33.0
winner_entry,5347.0,10.0,Q,2968.0,,,,,,,


Vamos a empezar limpiando los datos.

- Se eliminan identificadores personales: winner_id, loser_id, winner_name, loser_name, winner_ioc, loser_ioc.

- Se eliminan day y month_name porque la fecha ya est√° en tourney_date.

- Se eliminan las variables de seed (winner_seed, loser_seed), ya que pueden derivarse del ranking ATP cercano al partido.

- Se elimina tourney_level.

In [23]:
matches.drop(columns=['tourney_id', 'winner_id', 'loser_id', 'day', 'month_name', 'loser_seed', 'winner_seed', 'tourney_level'], inplace=True)

### Control del Data Leakage

En el contexto del aprendizaje autom√°tico, el *data leakage* sucede cuando el modelo tiene acceso a datos que, en la vida real, no deber√≠a conocer al momento de hacer predicciones. Esto incluye informaci√≥n futura o datos que dependen directamente del resultado que estamos intentando predecir.

En nuestro caso, las fugas pueden presentarse de tres formas:

1. **Temporal** ‚Üí si usamos partidos futuros para generar estad√≠sticas de los jugadores actuales.  
   Soluci√≥n: ordenar cronol√≥gicamente los partidos antes de calcular cualquier feature.

2. **Estructural** ‚Üí si el modelo distingue sistem√°ticamente entre el jugador etiquetado como ‚Äúwinner‚Äù y ‚Äúloser‚Äù.  
   Soluci√≥n: crear una estructura sim√©trica `player1/player2` aleatorizada para evitar sesgos.

3. **De variables** ‚Üí si incluimos estad√≠sticas generadas despu√©s del partido (p. ej. `winner_aces`).  
   Soluci√≥n: eliminar columnas con datos del resultado o dependientes de √©l.

Controlar el *data leakage* es esencial para garantizar que el modelo aprenda patrones reales y generalizables, y no una visi√≥n privilegiada del pasado.


In [24]:
import numpy as np

# Ordenar cronol√≥gicamente ANTES del shuffle (cr√≠tico para evitar data leakage temporal)
matches = matches.sort_values('tourney_date').reset_index(drop=True)
print("DataFrame ordenado por fecha.")

# Configurar semilla para reproducibilidad
np.random.seed(RND)

# Crear m√°scara de shuffle: True = mantener orden, False = invertir
shuffle_mask = np.random.rand(len(matches)) > 0.5

# Inicializar columnas player1 y player2
# Identificar todas las columnas que empiezan con winner_ o loser_
winner_cols = [col for col in matches.columns if col.startswith('winner_')]
loser_cols = [col for col in matches.columns if col.startswith('loser_')]

# Crear diccionario de mapeo
winner_to_player = {col: col.replace('winner_', 'player1_') for col in winner_cols}
loser_to_player = {col: col.replace('loser_', 'player2_') for col in loser_cols}

# Copiar y renombrar columnas
for winner_col, player1_col in winner_to_player.items():
    matches[player1_col] = matches[winner_col]
    
for loser_col, player2_col in loser_to_player.items():
    matches[player2_col] = matches[loser_col]

# Aplicar shuffle: intercambiar player1 y player2 seg√∫n la m√°scara
print(f"Aplicando shuffle aleatorio a {(~shuffle_mask).sum()} de {len(matches)} partidos...")

for i in range(len(matches)):
    if not shuffle_mask[i]:  # Si shuffle_mask[i] es False, intercambiar
        # Intercambiar todas las columnas player1 y player2
        for winner_col in winner_cols:
            player1_col = winner_col.replace('winner_', 'player1_')
            player2_col = winner_col.replace('winner_', 'player2_')
            
            # Swap
            temp = matches.at[i, player1_col]
            matches.at[i, player1_col] = matches.at[i, player2_col]
            matches.at[i, player2_col] = temp

# Crear variable target: player1_wins
# Si shuffle_mask[i] es True, player1 es el winner original (gan√≥)
# Si shuffle_mask[i] es False, player1 es el loser original (perdi√≥)
matches['player1_wins'] = shuffle_mask.astype(int)

print(f"\n‚úì Shuffle completado")
print(f"  - player1_wins = 1: {matches['player1_wins'].sum()} partidos ({matches['player1_wins'].mean()*100:.1f}%)")
print(f"  - player1_wins = 0: {(1-matches['player1_wins']).sum()} partidos ({(1-matches['player1_wins'].mean())*100:.1f}%)")
print(f"\nPrimeras filas con nueva estructura:")
display(matches[['player1_name', 'player2_name', 'player1_wins']].head(10))

DataFrame ordenado por fecha.
Aplicando shuffle aleatorio a 19842 de 39541 partidos...

‚úì Shuffle completado
  - player1_wins = 1: 19699 partidos (49.8%)
  - player1_wins = 0: 19842 partidos (50.2%)

Primeras filas con nueva estructura:


Unnamed: 0,player1_name,player2_name,player1_wins
0,Ryan Harrison,Robin Soderling,0
1,Michael Berrer,Dudi Sela,1
2,Matthew Ebden,John Millman,1
3,Thiemo De Bakker,Denis Istomin,0
4,Adrian Mannarino,Mardy Fish,0
5,Tobias Kamke,Radek Stepanek,0
6,Ricardas Berankis,Arnaud Clement,1
7,Bernard Tomic,Florian Mayer,0
8,Philipp Petzschner,Feliciano Lopez,0
9,Kevin Anderson,Peter Luczak,1


## Shuffle aleatorio: winner/loser ‚Üí player1/player2

**¬øPor qu√© es necesario?**

Si mantenemos la estructura `winner_*` vs `loser_*`, el modelo aprender√° simplemente que "el jugador etiquetado como winner siempre gana", lo cual es **data leakage impl√≠cito**.

**Soluci√≥n:**
1. Para cada partido, shuffle aleatorio de qui√©n es `player1` y qui√©n es `player2`
2. Crear variable target `player1_wins` (1 si player1 gan√≥, 0 si perdi√≥)
3. El modelo aprende de caracter√≠sticas reales, no de la estructura de los datos

## Feature Engineering

Variables que causar√≠an leakage:
- Todas las columnas que empiezan por `winner_` o `loser_` (ya transformadas a `player1`/`player2`)
- `score`
- `minutes`

Estrategia:
- Trabajar siempre con la estructura `player1` vs `player2`.
- Crear nuevas variables a partir de estas fuentes, evitando exponer informaci√≥n del resultado.
- Eliminar las columnas de leakage tras generar las nuevas caracter√≠sticas.

### 1. H2H (Head-to-Head)

Calculamos el historial de enfrentamientos directos entre los dos jugadores. Para cada partido se obtiene cu√°ntas veces un jugador ha ganado al otro anteriormente. Esta feature a√±ade contexto m√°s all√° de las estad√≠sticas individuales, pues por estilo de juego ciertos jugadores dominan a otros.

In [25]:
# Feature #1: H2H (Head-to-Head)
from collections import defaultdict

# --- Preparar la estructura para guardar los r√©cords H2H ---
# La clave ser√° una tupla ordenada con los nombres de los dos jugadores.
# El valor ser√° un contador de victorias para cada uno en ese enfrentamiento.
h2h_records = defaultdict(lambda: defaultdict(int))

# Listas para guardar los resultados calculados
player1_h2h_list = []
player2_h2h_list = []

# --- Iterar sobre cada partido para calcular el H2H ---
print("Calculando H2H para cada partido...")
for idx, row in matches.iterrows():
    player1_name = row['player1_name']
    player2_name = row['player2_name']
    player1_won = row['player1_wins']
    
    # Crear una clave √∫nica y ordenada para el par de jugadores
    player_pair_key = tuple(sorted((player1_name, player2_name)))
    
    # Consultar el historial H2H *antes* de este partido
    player1_wins_before = h2h_records[player_pair_key][player1_name]
    player2_wins_before = h2h_records[player_pair_key][player2_name]
    
    # Guardar los valores hist√≥ricos
    player1_h2h_list.append(player1_wins_before)
    player2_h2h_list.append(player2_wins_before)
    
    # Actualizar el registro con el resultado de *este* partido para el futuro
    if player1_won == 1:
        h2h_records[player_pair_key][player1_name] += 1
    else:
        h2h_records[player_pair_key][player2_name] += 1

# --- A√±adir las nuevas columnas al DataFrame ---
matches['player1_h2h_wins'] = player1_h2h_list
matches['player2_h2h_wins'] = player2_h2h_list
print("‚úì Nuevas columnas H2H a√±adidas al DataFrame.")

# --- Verificar el resultado ---
print("\nPrimeros 10 partidos:")
print(matches[['player1_name', 'player2_name', 'player1_h2h_wins', 'player2_h2h_wins', 'player1_wins']].head(10))
print("\n√öltimos 10 partidos:")
print(matches[['player1_name', 'player2_name', 'player1_h2h_wins', 'player2_h2h_wins', 'player1_wins']].tail(10))

Calculando H2H para cada partido...
‚úì Nuevas columnas H2H a√±adidas al DataFrame.

Primeros 10 partidos:
         player1_name     player2_name  player1_h2h_wins  player2_h2h_wins  \
0       Ryan Harrison  Robin Soderling                 0                 0   
1      Michael Berrer        Dudi Sela                 0                 0   
2       Matthew Ebden     John Millman                 0                 0   
3    Thiemo De Bakker    Denis Istomin                 0                 0   
4    Adrian Mannarino       Mardy Fish                 0                 0   
5        Tobias Kamke   Radek Stepanek                 0                 0   
6   Ricardas Berankis   Arnaud Clement                 0                 0   
7       Bernard Tomic    Florian Mayer                 0                 0   
8  Philipp Petzschner  Feliciano Lopez                 0                 0   
9      Kevin Anderson     Peter Luczak                 0                 0   

   player1_wins  
0             0 

### 2. Victorias y derrotas totales y anuales
Calculamos el rendimiento tanto hist√≥rico como del a√±o en curso de cada jugador. Esta informaci√≥n permite al modelo evaluar la forma general del jugador y su consistencia a lo largo del tiempo. Por otro lado, las estad√≠sticas del a√±o actual (Year-to-Date) ayudan a capturar tendencias y estado de forma.

In [26]:
# Feature #2: Victorias y derrotas totales y anuales (YTD) - VERSI√ìN CORREGIDA
from collections import defaultdict

# Diccionarios para tracking de estad√≠sticas globales y anuales
player_stats = defaultdict(lambda: {'wins': 0, 'losses': 0})
player_stats_ytd = defaultdict(lambda: {'wins': 0, 'losses': 0, 'current_year': None})

# Listas para almacenar las estad√≠sticas calculadas
player1_total_wins_list = []
player1_total_losses_list = []
player1_winrate_list = []
player1_ytd_wins_list = []
player1_ytd_losses_list = []
player1_ytd_winrate_list = []

player2_total_wins_list = []
player2_total_losses_list = []
player2_winrate_list = []
player2_ytd_wins_list = []
player2_ytd_losses_list = []
player2_ytd_winrate_list = []

print("Calculando estad√≠sticas de victorias y derrotas...")
year_resets = 0  # Counter para debug

for idx, row in matches.iterrows():
    player1_name = row['player1_name']
    player2_name = row['player2_name']
    player1_won = row['player1_wins']
    year = row['year']  # ‚úÖ USAR LA COLUMNA YEAR DIRECTAMENTE
    
    # --- PLAYER1 ---
    # Resetear YTD si cambi√≥ el a√±o
    if player_stats_ytd[player1_name]['current_year'] is None:
        # Primera vez que vemos a este jugador
        player_stats_ytd[player1_name]['current_year'] = year
    elif player_stats_ytd[player1_name]['current_year'] != year:
        # Cambi√≥ el a√±o, resetear
        player_stats_ytd[player1_name]['wins'] = 0
        player_stats_ytd[player1_name]['losses'] = 0
        player_stats_ytd[player1_name]['current_year'] = year
        year_resets += 1
    
    # Stats globales ANTES del partido (carrera completa)
    player1_total_wins = player_stats[player1_name]['wins']
    player1_total_losses = player_stats[player1_name]['losses']
    player1_total_matches = player1_total_wins + player1_total_losses
    player1_winrate = player1_total_wins / player1_total_matches if player1_total_matches > 0 else 0.0
    
    # Stats YTD ANTES del partido (solo del a√±o actual)
    player1_ytd_wins = player_stats_ytd[player1_name]['wins']
    player1_ytd_losses = player_stats_ytd[player1_name]['losses']
    player1_ytd_matches = player1_ytd_wins + player1_ytd_losses
    player1_ytd_winrate = player1_ytd_wins / player1_ytd_matches if player1_ytd_matches > 0 else 0.0
    
    # Guardar valores de player1
    player1_total_wins_list.append(player1_total_wins)
    player1_total_losses_list.append(player1_total_losses)
    player1_winrate_list.append(player1_winrate)
    player1_ytd_wins_list.append(player1_ytd_wins)
    player1_ytd_losses_list.append(player1_ytd_losses)
    player1_ytd_winrate_list.append(player1_ytd_winrate)
    
    # --- PLAYER2 ---
    # Resetear YTD si cambi√≥ el a√±o
    if player_stats_ytd[player2_name]['current_year'] is None:
        # Primera vez que vemos a este jugador
        player_stats_ytd[player2_name]['current_year'] = year
    elif player_stats_ytd[player2_name]['current_year'] != year:
        # Cambi√≥ el a√±o, resetear
        player_stats_ytd[player2_name]['wins'] = 0
        player_stats_ytd[player2_name]['losses'] = 0
        player_stats_ytd[player2_name]['current_year'] = year
        year_resets += 1
    
    # Stats globales ANTES del partido (carrera completa)
    player2_total_wins = player_stats[player2_name]['wins']
    player2_total_losses = player_stats[player2_name]['losses']
    player2_total_matches = player2_total_wins + player2_total_losses
    player2_winrate = player2_total_wins / player2_total_matches if player2_total_matches > 0 else 0.0
    
    # Stats YTD ANTES del partido (solo del a√±o actual)
    player2_ytd_wins = player_stats_ytd[player2_name]['wins']
    player2_ytd_losses = player_stats_ytd[player2_name]['losses']
    player2_ytd_matches = player2_ytd_wins + player2_ytd_losses
    player2_ytd_winrate = player2_ytd_wins / player2_ytd_matches if player2_ytd_matches > 0 else 0.0
    
    # Guardar valores de player2
    player2_total_wins_list.append(player2_total_wins)
    player2_total_losses_list.append(player2_total_losses)
    player2_winrate_list.append(player2_winrate)
    player2_ytd_wins_list.append(player2_ytd_wins)
    player2_ytd_losses_list.append(player2_ytd_losses)
    player2_ytd_winrate_list.append(player2_ytd_winrate)
    
    # --- Actualizar estad√≠sticas DESPU√âS del partido ---
    if player1_won == 1:
        # Player1 gana
        player_stats[player1_name]['wins'] += 1
        player_stats_ytd[player1_name]['wins'] += 1
        # Player2 pierde
        player_stats[player2_name]['losses'] += 1
        player_stats_ytd[player2_name]['losses'] += 1
    else:
        # Player2 gana
        player_stats[player2_name]['wins'] += 1
        player_stats_ytd[player2_name]['wins'] += 1
        # Player1 pierde
        player_stats[player1_name]['losses'] += 1
        player_stats_ytd[player1_name]['losses'] += 1

# A√±adir las columnas al DataFrame
matches['player1_total_wins'] = player1_total_wins_list
matches['player1_total_losses'] = player1_total_losses_list
matches['player1_winrate'] = player1_winrate_list
matches['player1_ytd_wins'] = player1_ytd_wins_list
matches['player1_ytd_losses'] = player1_ytd_losses_list
matches['player1_ytd_winrate'] = player1_ytd_winrate_list

matches['player2_total_wins'] = player2_total_wins_list
matches['player2_total_losses'] = player2_total_losses_list
matches['player2_winrate'] = player2_winrate_list
matches['player2_ytd_wins'] = player2_ytd_wins_list
matches['player2_ytd_losses'] = player2_ytd_losses_list
matches['player2_ytd_winrate'] = player2_ytd_winrate_list

print(f"‚úì Columnas de victorias y derrotas a√±adidas al DataFrame.")
print(f"‚úì Se detectaron {year_resets} reseteos de a√±o YTD")

# Verificaci√≥n de que YTD y Total son diferentes
print("\nüîç Verificaci√≥n de diferencias YTD vs Total:")
print(f"   Total wins - Media: {matches['player1_total_wins'].mean():.2f}, Max: {matches['player1_total_wins'].max()}")
print(f"   YTD wins   - Media: {matches['player1_ytd_wins'].mean():.2f}, Max: {matches['player1_ytd_wins'].max()}")
print(f"   Total winrate - Media: {matches['player1_winrate'].mean():.3f}")
print(f"   YTD winrate   - Media: {matches['player1_ytd_winrate'].mean():.3f}")

if matches['player1_ytd_wins'].max() < matches['player1_total_wins'].max():
    print("\n‚úÖ YTD Max < Total Max ‚Üí El YTD est√° funcionando correctamente!")
else:
    print("\n‚ö†Ô∏è  YTD Max == Total Max ‚Üí Revisar implementaci√≥n...")

print("\n√öltimos 10 partidos con comparaci√≥n YTD vs Total:")
display(matches[['player1_name', 'player2_name', 'year',
                 'player1_total_wins', 'player1_ytd_wins',
                 'player1_total_losses', 'player1_ytd_losses',
                 'player1_winrate', 'player1_ytd_winrate', 'player1_wins']].tail(10))

Calculando estad√≠sticas de victorias y derrotas...
‚úì Columnas de victorias y derrotas a√±adidas al DataFrame.
‚úì Se detectaron 4362 reseteos de a√±o YTD

üîç Verificaci√≥n de diferencias YTD vs Total:
   Total wins - Media: 90.94, Max: 812
   YTD wins   - Media: 11.44, Max: 80
   Total winrate - Media: 0.491
   YTD winrate   - Media: 0.479

‚úÖ YTD Max < Total Max ‚Üí El YTD est√° funcionando correctamente!

√öltimos 10 partidos con comparaci√≥n YTD vs Total:


Unnamed: 0,player1_name,player2_name,year,player1_total_wins,player1_ytd_wins,player1_total_losses,player1_ytd_losses,player1_winrate,player1_ytd_winrate,player1_wins
39531,Learner Tien,Arthur Fils,2024,4,4,5,3,0.444444,0.571429,1
39532,Jakub Mensik,Joao Fonseca,2024,28,25,19,18,0.595745,0.581395,0
39533,Joao Fonseca,Arthur Fils,2024,10,10,8,7,0.555556,0.588235,1
39534,Arthur Fils,Jakub Mensik,2024,62,37,46,28,0.574074,0.569231,1
39535,Luca Van Assche,Juncheng Shang,2024,20,8,38,18,0.344828,0.307692,1
39536,Nishesh Basavareddy,Luca Van Assche,2024,1,1,0,0,1.0,1.0,0
39537,Alex Michelsen,Learner Tien,2024,39,32,37,29,0.513158,0.52459,0
39538,Joao Fonseca,Learner Tien,2024,11,11,8,7,0.578947,0.611111,1
39539,Luca Van Assche,Alex Michelsen,2024,22,10,38,18,0.366667,0.357143,0
39540,Nishesh Basavareddy,Alex Michelsen,2024,1,1,1,1,0.5,0.5,0


### 3. Forma actual
Calculamos la proporci√≥n de victorias en los √∫ltimos 10 partidos de cada jugador. Esta feature refleja el estado de forma actual que es crucial porque muchos jugadores tienen rachas de confianza o bajones temporales que pueden afectar significativamente en sus resultados a corto plazo.

In [27]:
# Feature #3: Forma actual - √öltimos 10 partidos
from collections import defaultdict, deque

# Diccionario para mantener los √∫ltimos resultados de cada jugador
# Usamos deque con maxlen=10 para mantener solo los √∫ltimos 10 partidos
player_recent_results = defaultdict(lambda: deque(maxlen=10))

# Listas para almacenar las estad√≠sticas de forma
player1_last10_wins_list = []
player1_last10_losses_list = []
player1_last10_winrate_list = []

player2_last10_wins_list = []
player2_last10_losses_list = []
player2_last10_winrate_list = []

print("Calculando forma actual (√∫ltimos 10 partidos)...")
for idx, row in matches.iterrows():
    player1_name = row['player1_name']
    player2_name = row['player2_name']
    player1_won = row['player1_wins']
    
    # --- PLAYER1 ---
    # Obtener los √∫ltimos resultados ANTES del partido actual
    player1_recent = list(player_recent_results[player1_name])
    player1_last10_wins = sum(player1_recent)  # 1 = victoria, 0 = derrota
    player1_last10_losses = len(player1_recent) - player1_last10_wins
    player1_last10_winrate = player1_last10_wins / len(player1_recent) if len(player1_recent) > 0 else 0.0
    
    # Guardar valores
    player1_last10_wins_list.append(player1_last10_wins)
    player1_last10_losses_list.append(player1_last10_losses)
    player1_last10_winrate_list.append(player1_last10_winrate)
    
    # --- PLAYER2 ---
    # Obtener los √∫ltimos resultados ANTES del partido actual
    player2_recent = list(player_recent_results[player2_name])
    player2_last10_wins = sum(player2_recent)  # 1 = victoria, 0 = derrota
    player2_last10_losses = len(player2_recent) - player2_last10_wins
    player2_last10_winrate = player2_last10_wins / len(player2_recent) if len(player2_recent) > 0 else 0.0
    
    # Guardar valores
    player2_last10_wins_list.append(player2_last10_wins)
    player2_last10_losses_list.append(player2_last10_losses)
    player2_last10_winrate_list.append(player2_last10_winrate)
    
    # --- Actualizar resultados recientes DESPU√âS del partido ---
    if player1_won == 1:
        # Player1 gana (a√±adir 1)
        player_recent_results[player1_name].append(1)
        # Player2 pierde (a√±adir 0)
        player_recent_results[player2_name].append(0)
    else:
        # Player2 gana (a√±adir 1)
        player_recent_results[player2_name].append(1)
        # Player1 pierde (a√±adir 0)
        player_recent_results[player1_name].append(0)

# A√±adir las columnas al DataFrame
matches['player1_last10_wins'] = player1_last10_wins_list
matches['player1_last10_losses'] = player1_last10_losses_list
matches['player1_last10_winrate'] = player1_last10_winrate_list

matches['player2_last10_wins'] = player2_last10_wins_list
matches['player2_last10_losses'] = player2_last10_losses_list
matches['player2_last10_winrate'] = player2_last10_winrate_list

print("‚úì Columnas de forma actual a√±adidas al DataFrame.")
print("\nVerificaci√≥n de las nuevas columnas:")
print(matches[['player1_name', 'player2_name', 'player1_last10_wins', 'player1_last10_losses', 
               'player1_last10_winrate', 'player2_last10_wins', 'player2_last10_losses', 'player2_last10_winrate', 'player1_wins']].tail(10))

Calculando forma actual (√∫ltimos 10 partidos)...
‚úì Columnas de forma actual a√±adidas al DataFrame.

Verificaci√≥n de las nuevas columnas:
              player1_name     player2_name  player1_last10_wins  \
39531         Learner Tien      Arthur Fils                    4   
39532         Jakub Mensik     Joao Fonseca                    6   
39533         Joao Fonseca      Arthur Fils                    7   
39534          Arthur Fils     Jakub Mensik                    5   
39535      Luca Van Assche   Juncheng Shang                    1   
39536  Nishesh Basavareddy  Luca Van Assche                    1   
39537       Alex Michelsen     Learner Tien                    6   
39538         Joao Fonseca     Learner Tien                    7   
39539      Luca Van Assche   Alex Michelsen                    3   
39540  Nishesh Basavareddy   Alex Michelsen                    1   

       player1_last10_losses  player1_last10_winrate  player2_last10_wins  \
39531                      5    

## Eliminar columnas con riesgo de data leakage

En este paso eliminamos todas las columnas originales `winner_*` y `loser_*`, que ya fueron transformadas a `player1_*` y `player2_*`, as√≠ como `score` y `minutes`. 
> score: contiene directamente el resultado del partido.

> time: refleja la duraci√≥n del partido, informaci√≥n que solo se obtiene despu√©s del encuentro.

Tambi√©n se retiran todas las variables que contienen informaci√≥n posterior al resultado del partido, como estad√≠sticas registradas despu√©s de conocer al ganador o perdedor, para evitar que el modelo ‚Äúhaga trampa‚Äù y sobrestime su rendimiento. Esta limpieza es fundamental para que el entrenamiento se base √∫nicamente en datos disponibles antes del partido, garantizando as√≠ la validez del modelo y eliminando cualquier informaci√≥n directa del partido, como aces, dobles faltas o puntos ganados.

In [28]:
# Columnas con data leakage a eliminar
columnas_data_leakage = [
    'winner_aces',
    'winner_double_faults',
    'winner_serve_points_total',
    'winner_first_serves_in',
    'winner_first_serve_points_won',
    'winner_second_serve_points_won',
    'winner_service_games',
    'winner_break_points_saved',
    'winner_break_points_faced',
    'loser_aces',
    'loser_double_faults',
    'loser_serve_points_total',
    'loser_first_serves_in',
    'loser_first_serve_points_won',
    'loser_second_serve_points_won',
    'loser_service_games',
    'loser_break_points_saved',
    'loser_break_points_faced',
    'winner_name',
    'loser_name',
    'score',
    'minutes'
]

# Eliminar tambi√©n las columnas player1_* y player2_* de estad√≠sticas de partido
# (las que vienen del partido en s√≠, no las features engineered)
player_match_stats = [
    'player1_aces',
    'player1_double_faults',
    'player1_serve_points_total',
    'player1_first_serves_in',
    'player1_first_serve_points_won',
    'player1_second_serve_points_won',
    'player1_service_games',
    'player1_break_points_saved',
    'player1_break_points_faced',
    'player2_aces',
    'player2_double_faults',
    'player2_serve_points_total',
    'player2_first_serves_in',
    'player2_first_serve_points_won',
    'player2_second_serve_points_won',
    'player2_service_games',
    'player2_break_points_saved',
    'player2_break_points_faced',
]

# Combinar y eliminar solo las que existen
all_leakage_cols = columnas_data_leakage + player_match_stats
cols_to_drop = [col for col in all_leakage_cols if col in matches.columns]

print(f"Eliminando {len(cols_to_drop)} columnas con data leakage...")
matches.drop(columns=cols_to_drop, inplace=True)

print("‚úì Columnas eliminadas")
print(f"\nColumnas restantes: {len(matches.columns)}")
print("\nPrimeras columnas del DataFrame limpio:")
display(matches.head())

Eliminando 4 columnas con data leakage...
‚úì Columnas eliminadas

Columnas restantes: 79

Primeras columnas del DataFrame limpio:


Unnamed: 0,tourney_name,surface,draw_size,tourney_date,match_num,winner_entry,winner_hand,winner_ht,winner_ioc,winner_age,...,player2_winrate,player2_ytd_wins,player2_ytd_losses,player2_ytd_winrate,player1_last10_wins,player1_last10_losses,player1_last10_winrate,player2_last10_wins,player2_last10_losses,player2_last10_winrate
0,Brisbane,Hard,32,20110102,1,,R,193.0,SWE,26.3,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
1,Brisbane,Hard,32,20110102,2,,L,193.0,GER,30.5,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
2,Brisbane,Hard,32,20110102,3,Q,R,188.0,AUS,23.1,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
3,Brisbane,Hard,32,20110102,4,,R,185.0,UZB,24.3,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
4,Brisbane,Hard,32,20110102,5,,R,188.0,USA,29.0,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0


Ahora revisaremos que el dataset est√© completo y bien estructurado, comprobando el tama√±o, que las categor√≠as est√©n equilibradas y que las columnas sean coherentes y consistentes. Adem√°s, se eliminan posibles duplicados de las variables winner_* y loser_*, que ya fueron reemplazadas por las nuevas columnas. El objetivo es dejar el dataset completamente limpio y listo para la fase de modelado.

In [29]:
# Verificaci√≥n final del dataset
print("="*80)
print("VERIFICACI√ìN FINAL DEL DATASET")
print("="*80)

print(f"\nüìä Dimensiones: {matches.shape[0]:,} partidos √ó {matches.shape[1]} columnas")

print(f"\nüéØ Target balance:")
print(f"   player1_wins = 1: {matches['player1_wins'].sum():,} ({matches['player1_wins'].mean()*100:.2f}%)")
print(f"   player1_wins = 0: {(1-matches['player1_wins']).sum():,} ({(1-matches['player1_wins'].mean())*100:.2f}%)")

print(f"\nüìù Columnas del dataset:")
player1_cols = [col for col in matches.columns if col.startswith('player1_')]
player2_cols = [col for col in matches.columns if col.startswith('player2_')]
other_cols = [col for col in matches.columns if not col.startswith('player1_') and not col.startswith('player2_')]

print(f"\n   Player1 features ({len(player1_cols)}):")
for col in sorted(player1_cols):
    print(f"      - {col}")

print(f"\n   Player2 features ({len(player2_cols)}):")
for col in sorted(player2_cols):
    print(f"      - {col}")

print(f"\n   Otras columnas ({len(other_cols)}):")
for col in sorted(other_cols):
    print(f"      - {col}")

print("\n" + "="*80)
print("Dataset listo para el modelado ‚úÖ")
print("="*80)

VERIFICACI√ìN FINAL DEL DATASET

üìä Dimensiones: 39,541 partidos √ó 79 columnas

üéØ Target balance:
   player1_wins = 1: 19,699 (49.82%)
   player1_wins = 0: 19,842 (50.18%)

üìù Columnas del dataset:

   Player1 features (19):
      - player1_age
      - player1_entry
      - player1_h2h_wins
      - player1_hand
      - player1_ht
      - player1_ioc
      - player1_last10_losses
      - player1_last10_winrate
      - player1_last10_wins
      - player1_name
      - player1_rank
      - player1_rank_points
      - player1_total_losses
      - player1_total_wins
      - player1_winrate
      - player1_wins
      - player1_ytd_losses
      - player1_ytd_winrate
      - player1_ytd_wins

   Player2 features (18):
      - player2_age
      - player2_entry
      - player2_h2h_wins
      - player2_hand
      - player2_ht
      - player2_ioc
      - player2_last10_losses
      - player2_last10_winrate
      - player2_last10_wins
      - player2_name
      - player2_rank
      - player2

In [None]:
# Limpiar columnas duplicadas winner_* y loser_* que quedaron del original
# (ya tenemos player1_* y player2_* que son las versiones con shuffle)
remaining_winner_loser_cols = [col for col in matches.columns 
                               if col.startswith('winner_') or col.startswith('loser_')]

if remaining_winner_loser_cols:
    print(f"Eliminando {len(remaining_winner_loser_cols)} columnas duplicadas winner_*/loser_*:")
    for col in remaining_winner_loser_cols:
        print(f"   - {col}")
    matches.drop(columns=remaining_winner_loser_cols, inplace=True)
    print(f"\n‚úì Columnas eliminadas. Nuevas dimensiones: {matches.shape}")
else:
    print("‚úì No hay columnas winner_*/loser_* adicionales para eliminar.")

Ahora dividimos entre entrenamiento y test, 70-30.

In [None]:
matches_train, matches_test = train_test_split(matches, test_size=0.3, shuffle=False)

## Creaci√≥n de la variable objetivo (Target)

El dataset ya contiene la columna `player1_wins` que indica si el jugador 1 gan√≥ el partido (1) o perdi√≥ (0).

**Decisi√≥n de nomenclatura:**
- Mantenemos la estructura `player_1` vs `player_2` (est√°ndar en deportes)
- Renombramos `player1_wins` ‚Üí `winner` para mayor claridad
- **Interpretaci√≥n:**
  - `winner = 1` ‚Üí player_1 gan√≥ el partido
  - `winner = 0` ‚Üí player_2 gan√≥ el partido

Esta variable ser√° nuestro **target** para los modelos de clasificaci√≥n.

In [30]:
# Verificar que la columna player1_wins existe
print("üîç Verificando columna objetivo...")
if 'player1_wins' in matches.columns:
    print("‚úÖ La columna 'player1_wins' existe en el dataset")
    
    # Renombrar para mayor claridad conceptual
    matches.rename(columns={'player1_wins': 'winner'}, inplace=True)
    matches_train.rename(columns={'player1_wins': 'winner'}, inplace=True)
    matches_test.rename(columns={'player1_wins': 'winner'}, inplace=True)
    
    print("‚úÖ Columna renombrada: 'player1_wins' ‚Üí 'winner'")
    print(f"\nüìä Balance del target:")
    print(f"   winner = 1 (player_1 gan√≥): {matches_train['winner'].sum():,} partidos ({matches_train['winner'].mean()*100:.2f}%)")
    print(f"   winner = 0 (player_2 gan√≥): {(~matches_train['winner'].astype(bool)).sum():,} partidos ({(1-matches_train['winner'].mean())*100:.2f}%)")
else:
    print("‚ùå ERROR: La columna 'player1_wins' no se encuentra en el dataset")
    print(f"Columnas disponibles: {matches.columns.tolist()}")

üîç Verificando columna objetivo...
‚úÖ La columna 'player1_wins' existe en el dataset


NameError: name 'matches_train' is not defined