In [None]:
import os

import fastf1
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset

# FastF1 configuration: cache downloaded session data on disk.
# Cache.enable_cache() expects the directory to exist already, so create it
# first instead of relying on the user to have done so.
os.makedirs('f1_cache', exist_ok=True)
fastf1.Cache.enable_cache('f1_cache')

# PyTorch model class
class LapTimeNN(nn.Module):
    """Feed-forward regressor that maps a feature vector to a single value.

    Layer widths are input_size -> 64 -> 32 -> 16 -> 1. Every hidden linear
    layer is followed by a ReLU, and the first two activations are followed
    by Dropout(0.2).
    """

    def __init__(self, input_size):
        """Build the layer stack for inputs with ``input_size`` features."""
        super().__init__()
        # Layers are created in the same order as they are applied so that
        # parameter names in the state dict stay ``model.<index>.*``.
        stack = [
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the sequential stack to ``x`` and return its output."""
        prediction = self.model(x)
        return prediction



In [None]:
def load_and_prepare_data(race_year=2023, race_name='Spain'):
    """
    Load one race session through FastF1 and build the modelling table.

    Steps:
      1. Load the race ('R') session and take its laps.
      2. Attach per-lap weather columns when the laps object can provide them.
      3. Engineer features on the *full* lap list (so per-driver differences
         compare consecutive laps and the fuel proxy uses the real race length).
      4. Drop laps outside the 5%-95% lap-time quantile band (safety car,
         pit stops, etc.) and convert lap time to seconds.
      5. Keep only the model columns and rows without missing values.

    Parameters
    ----------
    race_year : int
        Season passed to ``fastf1.get_session``.
    race_name : str
        Event name passed to ``fastf1.get_session``.

    Returns
    -------
    pandas.DataFrame
        Available feature columns followed by the ``LapTime`` target (seconds).
    """
    # Load the race session
    race = fastf1.get_session(race_year, race_name, 'R')
    race.load()
    laps_data = race.laps

    # Plain DataFrame with a positional index so per-lap weather rows can be
    # aligned by position below.
    df = pd.DataFrame(laps_data).reset_index(drop=True)

    # Weather values are not lap columns; fetch them per lap when the laps
    # object exposes the accessor. Values are coerced to float so they are
    # treated as numeric features downstream (Rainfall is a boolean flag).
    weather_cols = ['TrackTemp', 'AirTemp', 'Rainfall', 'WindSpeed']
    missing_weather = [c for c in weather_cols if c not in df.columns]
    if missing_weather and hasattr(laps_data, 'get_weather_data'):
        weather = laps_data.get_weather_data().reset_index(drop=True)
        if len(weather) == len(df):
            for col in missing_weather:
                if col in weather.columns:
                    df[col] = pd.to_numeric(weather[col], errors='coerce').astype(float)

    # The lap table names the tyre compound column 'Compound'; expose it under
    # the feature name used by the model so it is not silently dropped.
    if 'TyreCompound' not in df.columns and 'Compound' in df.columns:
        df['TyreCompound'] = df['Compound']

    # --- Feature engineering (before outlier removal) ---
    # 1. Tyre age
    df['TyreAge'] = df['TyreLife']

    # 2. Position change versus the driver's previous lap
    df['PositionChange'] = df.groupby('Driver')['Position'].diff().fillna(0)

    # 3. Sector times in seconds, converting each column only if it is present
    #    and still stored as a timedelta
    for sector_col in ('Sector1Time', 'Sector2Time', 'Sector3Time'):
        if sector_col in df.columns and pd.api.types.is_timedelta64_dtype(df[sector_col]):
            df[sector_col] = df[sector_col].dt.total_seconds()

    # 4. Fuel load proxy: linear burn-off over the race distance
    max_lap = df['LapNumber'].max()
    df['FuelLoad'] = 1 - (df['LapNumber'] / max_lap)

    # --- Outlier removal ---
    # Thresholds are a simple quantile band; rows with a missing lap time fail
    # the comparison and are dropped as well. ``.copy()`` avoids assigning into
    # a view of the unfiltered frame.
    lower = df['LapTime'].quantile(0.05)
    upper = df['LapTime'].quantile(0.95)
    df = df[df['LapTime'].between(lower, upper)].copy()

    # Convert LapTime to seconds (if it comes as a timedelta)
    if pd.api.types.is_timedelta64_dtype(df['LapTime']):
        df['LapTime'] = df['LapTime'].dt.total_seconds()

    # Candidate model variables; keep only the ones that actually exist
    candidates = ['TyreCompound', 'TrackTemp', 'AirTemp', 'TyreAge',
                  'PositionChange', 'FuelLoad', 'Driver',
                  'Rainfall', 'WindSpeed']
    features = [f for f in candidates if f in df.columns]

    # Keep only rows with complete data
    model_df = df[features + ['LapTime']].dropna()

    return model_df



In [None]:
def preprocess_data(df, test_size=0.2, random_state=42):
    """
    Split the modelling table and build an (unfitted) preprocessing transformer.

    Numeric columns (any integer/float width, plus booleans cast to float) are
    standardised; ``object``/``category`` columns are one-hot encoded. Columns
    of any other dtype are not listed in either group and are therefore
    dropped by the ``ColumnTransformer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the ``LapTime`` target column.
    test_size : float
        Fraction of rows held out for testing (default 0.2).
    random_state : int
        Seed for the train/test split (default 42).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, preprocessor)``.
    """
    # Separate features and target
    X = df.drop(columns='LapTime')
    y = df['LapTime']

    # Boolean flags are cast to float so they are scaled like other numeric
    # features instead of being skipped by the dtype selection.
    bool_cols = X.select_dtypes(include='bool').columns.tolist()
    if bool_cols:
        X = X.astype({col: 'float64' for col in bool_cols})

    # Identify categorical and numeric columns. The numeric test accepts every
    # integer/float width (not only int64/float64) and rejects timedeltas.
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    num_cols = [col for col in X.columns
                if col not in cat_cols and pd.api.types.is_numeric_dtype(X[col])]

    # Per-group transformers
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    # Combine them column-wise
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)
        ])

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test, preprocessor








In [None]:
def train_xgboost(X_train, X_test, y_train, y_test, preprocessor):
    """
    Tune and fit an XGBoost regressor behind the given preprocessor.

    A 3-fold grid search (scored by negative MSE, using all cores) runs over
    number of trees, learning rate, tree depth and minimum child weight. The
    best pipeline is evaluated on the test split and the metrics are printed.

    Returns
    -------
    tuple
        ``(best_model, y_pred)``: the best fitted pipeline and its test
        predictions.
    """
    # Hyper-parameter grid, addressed through the pipeline step name
    search_space = {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.01, 0.1],
        'regressor__max_depth': [3, 5, 7],
        'regressor__min_child_weight': [1, 3],
    }

    # Preprocessing and regressor chained in one estimator
    regressor = xgb.XGBRegressor(objective='reg:squarederror')
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ])

    searcher = GridSearchCV(
        pipeline,
        search_space,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)

    print("Mejores parámetros XGBoost:", searcher.best_params_)

    # Predict on the held-out split with the refitted best pipeline
    best_model = searcher.best_estimator_
    y_pred = best_model.predict(X_test)

    # Evaluate
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )
    print(f"XGBoost - MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

    return best_model, y_pred


In [None]:

def _as_dense_float32(matrix):
    """Return ``matrix`` as a contiguous dense float32 ndarray.

    Accepts either a sparse matrix (anything exposing ``toarray``) or an
    array-like; the preprocessor may return either depending on the data.
    """
    if hasattr(matrix, 'toarray'):
        matrix = matrix.toarray()
    return np.ascontiguousarray(np.asarray(matrix, dtype=np.float32))


def train_pytorch(X_train, X_test, y_train, y_test, preprocessor,
                  num_epochs=100, batch_size=32, lr=0.001):
    """
    Train a ``LapTimeNN`` regressor on preprocessed features.

    The preprocessor is fitted on the training split only and then applied to
    the test split. Training minimises MSE with Adam; the average batch loss
    is printed every 10 epochs and test metrics are printed at the end.

    Parameters
    ----------
    X_train, X_test : feature tables accepted by ``preprocessor``.
    y_train, y_test : array-likes of target values.
    preprocessor : object with ``fit_transform`` / ``transform``.
    num_epochs : int
        Number of passes over the training data (default 100).
    batch_size : int
        Mini-batch size (default 32).
    lr : float
        Adam learning rate (default 0.001).

    Returns
    -------
    tuple
        ``(model, y_pred)``: the trained network and a 1-D array of test
        predictions.

    Raises
    ------
    ValueError
        If the training split is empty.
    """
    # Apply preprocessing. The output can be sparse or dense, so it is
    # densified only when needed rather than calling .toarray() blindly.
    X_train_processed = _as_dense_float32(preprocessor.fit_transform(X_train))
    X_test_processed = _as_dense_float32(preprocessor.transform(X_test))

    if X_train_processed.shape[0] == 0:
        raise ValueError("train_pytorch requires at least one training sample")

    # Convert to tensors
    X_train_tensor = torch.as_tensor(X_train_processed)
    y_train_tensor = torch.as_tensor(
        np.asarray(y_train, dtype=np.float32)).reshape(-1, 1)
    X_test_tensor = torch.as_tensor(X_test_processed)

    # Dataset and loader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Initialise the model with the post-encoding feature width
    input_size = X_train_processed.shape[1]
    model = LapTimeNN(input_size)

    # Loss and optimiser
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        if (epoch+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')

    # Evaluate with dropout disabled and no gradient tracking
    model.eval()
    with torch.no_grad():
        y_pred_tensor = model(X_test_tensor)
        y_pred = y_pred_tensor.numpy().flatten()

    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"PyTorch NN - MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

    return model, y_pred


In [None]:
def visualize_results(y_test, y_pred_xgb, y_pred_nn=None):
    """
    Plot predicted versus actual lap times and save the figure.

    One scatter panel is drawn for the XGBoost predictions and, when
    ``y_pred_nn`` is given, a second panel for the neural network. Each panel
    includes the dashed identity line spanning the range of ``y_test``. The
    figure is written to ``outputs/week3/prediction_results.png`` (the
    directory is created if needed) and then shown.

    Parameters
    ----------
    y_test : array-like with ``min()`` / ``max()``
        True target values.
    y_pred_xgb : array-like
        XGBoost predictions aligned with ``y_test``.
    y_pred_nn : array-like, optional
        Neural-network predictions aligned with ``y_test``.
    """
    panels = [('XGBoost: Predicciones vs Reales', y_pred_xgb)]
    if y_pred_nn is not None:
        panels.append(('Neural Network: Predicciones vs Reales', y_pred_nn))

    # savefig does not create missing directories, so make sure it exists.
    output_path = 'outputs/week3/prediction_results.png'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # squeeze=False keeps ``axes`` two-dimensional even for a single panel.
    fig, axes = plt.subplots(1, len(panels), figsize=(12, 6), squeeze=False)
    low, high = y_test.min(), y_test.max()

    for ax, (title, predictions) in zip(axes[0], panels):
        ax.scatter(y_test, predictions, alpha=0.5)
        ax.plot([low, high], [low, high], 'r--')  # perfect-prediction line
        ax.set_xlabel('Tiempo real (s)')
        ax.set_ylabel('Tiempo predicho (s)')
        ax.set_title(title)

    fig.tight_layout()
    fig.savefig(output_path)
    plt.show()

In [None]:
def save_models(xgb_model, nn_model=None):
    """
    Persist the trained models under ``models/week3``.

    The XGBoost model is always dumped with joblib. When ``nn_model`` is
    provided, its ``state_dict`` is additionally saved with ``torch.save``.
    """
    import os

    # Make sure the target folder exists before writing anything
    os.makedirs('models/week3', exist_ok=True)

    # XGBoost model
    joblib.dump(xgb_model, 'models/week3/xgboost_laptime.joblib')

    # Nothing more to do when no neural network was trained
    if nn_model is None:
        return

    # PyTorch model: only the weights are stored
    weights = nn_model.state_dict()
    torch.save(weights, 'models/week3/nn_laptime.pth')

In [None]:
def main(train_nn=None):
    """
    Run the whole pipeline: load data, explore, train, plot and save.

    Parameters
    ----------
    train_nn : bool, optional
        Whether to also train the neural network. When ``None`` (default) the
        user is asked interactively, as before; pass ``True``/``False`` for a
        non-interactive, reproducible run.
    """
    # Figures are written below; savefig does not create missing directories.
    os.makedirs('outputs/week3', exist_ok=True)

    # Load and prepare data
    print("Cargando datos...")
    data = load_and_prepare_data()

    # Basic exploratory analysis
    print("\nResumen de datos:")
    print(data.describe())

    # Correlations between numeric columns (any integer/float width)
    print("\nMatriz de correlación:")
    numeric_data = data.select_dtypes(include='number')
    plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')
    plt.savefig('outputs/week3/correlation_matrix.png')

    # Preprocess data
    print("\nPreprocesando datos...")
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data)

    # Train the XGBoost model
    print("\nEntrenando modelo XGBoost...")
    xgb_model, y_pred_xgb = train_xgboost(X_train, X_test, y_train, y_test, preprocessor)

    # Train the PyTorch model (optional)
    if train_nn is None:
        answer = input("¿Desea entrenar también un modelo de red neuronal? (s/n): ")
        train_nn = answer.strip().lower() == 's'
    if train_nn:
        print("\nEntrenando modelo de red neuronal...")
        nn_model, y_pred_nn = train_pytorch(X_train, X_test, y_train, y_test, preprocessor)
    else:
        nn_model, y_pred_nn = None, None

    # Visualise results
    print("\nVisualizando resultados...")
    visualize_results(y_test, y_pred_xgb, y_pred_nn)

    # Save models
    print("\nGuardando modelos...")
    save_models(xgb_model, nn_model)

    print("\n¡Proceso completado!")

# Entry point: run the full pipeline when this code is executed directly
# (as a script, or as a notebook cell where __name__ is "__main__").
if __name__ == "__main__":
    main()