In [1]:
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# --- Cargar nuestro dataset maestro, la única fuente de verdad ---
DATA_PATH = '../data/processed/master_player_dataset.csv'
df = pd.read_csv(DATA_PATH)

print("Dataset maestro cargado. Dimensiones:", df.shape)
df.info()


Dataset maestro cargado. Dimensiones: (6280, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6280 entries, 0 to 6279
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   player_id                 6280 non-null   int64  
 1   name                      6280 non-null   object 
 2   age                       6280 non-null   float64
 3   position                  6280 non-null   object 
 4   sub_position              6280 non-null   object 
 5   foot                      6198 non-null   object 
 6   height_in_cm              6228 non-null   float64
 7   country_of_citizenship    6084 non-null   object 
 8   current_club_name         6280 non-null   object 
 9   contract_expiration_date  5504 non-null   object 
 10  goals                     6280 non-null   int64  
 11  assists                   6280 non-null   int64  
 12  minutes_played            6280 non-null   int64  
 13  games_played  

In [3]:
# =============================================================================
# 1. Features de Rendimiento Normalizadas
# =============================================================================
# Las métricas brutas (goles, asistencias) son engañosas. 
# Un jugador con 10 goles en 3000 minutos es diferente a uno con 10 goles en 1000 minutos.
# Normalizamos por cada 90 minutos jugados.

df['minutes_per_90'] = df['minutes_played'] / 90

# Evitar división por cero si un jugador tiene 0 minutos (aunque ya filtramos > 90)
df['goals_per_90'] = df['goals'] / df['minutes_per_90']
df['assists_per_90'] = df['assists'] / df['minutes_per_90']
df['goals_plus_assists_per_90'] = (df['goals'] + df['assists']) / df['minutes_per_90']

# Llenar posibles NaNs o Infs si 'minutes_per_90' es 0
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df[['goals_per_90', 'assists_per_90', 'goals_plus_assists_per_90']] = df[['goals_per_90', 'assists_per_90', 'goals_plus_assists_per_90']].fillna(0)

print("\nFeatures 'por 90 minutos' creadas.")
print(df[['name', 'minutes_played', 'goals', 'goals_per_90']].head())


Features 'por 90 minutos' creadas.
                  name  minutes_played  goals  goals_per_90
0         James Milner            1231      0      0.000000
1   Zlatan Ibrahimović             144      1      0.625000
2   Christophe Lepoint            1756      1      0.051253
3  Anastasios Tsokanis            1315      3      0.205323
4        Jonas Hofmann            2738     12      0.394449


In [4]:
# =============================================================================
# 2. Features de Contrato y Mercado
# =============================================================================
# La duración restante del contrato es un factor clave en el valor de mercado.
# Un jugador con 6 meses de contrato vale menos porque puede irse gratis pronto.

df['contract_expiration_date'] = pd.to_datetime(df['contract_expiration_date'], errors='coerce')
df['valuation_date'] = pd.to_datetime(df['valuation_date'])

# Calcular meses restantes de contrato en el momento de la valoración
df['contract_months_remaining'] = (df['contract_expiration_date'] - df['valuation_date']).dt.days / 30.44
# Si el contrato ya expiró (valor negativo) o no hay fecha, lo tratamos como 0.
df['contract_months_remaining'] = df['contract_months_remaining'].fillna(0).clip(lower=0)

print("\nFeature 'contract_months_remaining' creada.")
print(df[['name', 'valuation_date', 'contract_expiration_date', 'contract_months_remaining']].head())


Feature 'contract_months_remaining' creada.
                  name valuation_date contract_expiration_date  \
0         James Milner     2024-12-16               2025-06-30   
1   Zlatan Ibrahimović     2023-06-15                      NaT   
2   Christophe Lepoint     2023-06-09               2024-06-30   
3  Anastasios Tsokanis     2024-12-18               2025-06-30   
4        Jonas Hofmann     2025-03-27               2027-06-30   

   contract_months_remaining  
0                   6.438896  
1                   0.000000  
2                  12.713535  
3                   6.373193  
4                  27.102497  


In [5]:
# 3. Codificación de Variables Categóricas
# =============================================================================
# Los modelos de ML necesitan números, no texto. Convertiremos las posiciones y el pie preferido.
# Usaremos One-Hot Encoding (pd.get_dummies).

# Primero, simplifiquemos las posiciones para evitar demasiadas columnas
df['position_simple'] = df['position'].replace({
    'Attack': 'Ofensivo', 'Midfield': 'Mediocampo', 
    'Defender': 'Defensa', 'Goalkeeper': 'Portero'
})

categorical_features = ['position_simple', 'foot']
df_encoded = pd.get_dummies(df, columns=categorical_features, prefix=categorical_features, drop_first=True)

print(f"\nVariables categóricas codificadas. Nuevas dimensiones: {df_encoded.shape}")
df_encoded.info()


Variables categóricas codificadas. Nuevas dimensiones: (6280, 25)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6280 entries, 0 to 6279
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   player_id                   6280 non-null   int64         
 1   name                        6280 non-null   object        
 2   age                         6280 non-null   float64       
 3   position                    6280 non-null   object        
 4   sub_position                6280 non-null   object        
 5   height_in_cm                6228 non-null   float64       
 6   country_of_citizenship      6084 non-null   object        
 7   current_club_name           6280 non-null   object        
 8   contract_expiration_date    5504 non-null   datetime64[ns]
 9   goals                       6280 non-null   int64         
 10  assists                     6280 non-null   int64    

In [6]:
# --- Vista Previa del Dataset Enriquecido ---
print("\n--- Vista Previa Final ---")
# Mostramos algunas de las columnas nuevas y antiguas
preview_cols = [
    'name', 'market_value_in_eur', 'age', 'goals_per_90', 
    'contract_months_remaining', 'position_simple_Ofensivo', 
    'position_simple_Portero', 'foot_Right'
]
# Filtrar para asegurarse de que las columnas existen
preview_cols_exist = [col for col in preview_cols if col in df_encoded.columns]
print(df_encoded[preview_cols_exist].head())


--- Vista Previa Final ---
                  name  market_value_in_eur        age  goals_per_90  \
0         James Milner              1000000  38.948665      0.000000   
1   Zlatan Ibrahimović              2000000  41.697467      0.625000   
2   Christophe Lepoint                50000  38.622861      0.051253   
3  Anastasios Tsokanis               300000  33.631759      0.205323   
4        Jonas Hofmann              3000000  32.700890      0.394449   

   contract_months_remaining  position_simple_Ofensivo  \
0                   6.438896                     False   
1                   0.000000                      True   
2                  12.713535                     False   
3                   6.373193                     False   
4                  27.102497                     False   

   position_simple_Portero  
0                    False  
1                    False  
2                    False  
3                    False  
4                    False  


In [7]:
# --- Guardar el dataset enriquecido para el siguiente paso ---
OUTPUT_PATH = '../data/processed/featured_player_dataset.csv'
df_encoded.to_csv(OUTPUT_PATH, index=False)
print(f"Dataset enriquecido guardado en: {OUTPUT_PATH}")

Dataset enriquecido guardado en: ../data/processed/featured_player_dataset.csv
