In [1]:
import time
import traceback

import numpy as np
import optuna
import pandas as pd
import psutil
import torch
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def log_system_resources():
    """Print a snapshot of memory usage, CPU utilisation and the current time."""
    vm = psutil.virtual_memory()
    used_gb, total_gb = vm.used / 1e9, vm.total / 1e9
    print(f"\nMemory Usage: {used_gb:.2f}GB / {total_gb:.2f}GB ({vm.percent}%)")

    cpu_pct = psutil.cpu_percent()
    print(f"CPU Usage: {cpu_pct}%")

    clock = time.strftime('%H:%M:%S')
    print(f"Time: {clock}")


In [2]:
# Notebook-wide state, filled in by prepare_features():
#   label_encoders       -> fitted encoder per categorical column name
#   categorical_features -> names of the categorical feature columns
#   numerical_features   -> names of the remaining (non-target) feature columns
label_encoders, categorical_features, numerical_features = {}, [], []

In [3]:
def load_data(train_path='train.csv', test_path='test.csv', val_path='val.csv'):
    """Read the train, test and validation CSV files and report their shapes.

    Parameters
    ----------
    train_path, test_path, val_path : str or path-like
        Locations of the three CSV files. The defaults are the file names
        that used to be hard-coded, so ``load_data()`` behaves as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df, val_df)`` -- test comes before validation.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when one of the files is missing.
    """
    print("Loading datasets...")
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    val_df = pd.read_csv(val_path)

    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")
    print(f"Validation shape: {val_df.shape}")

    return train_df, test_df, val_df

In [4]:
def exploratory_analysis(train_df):
    """Print a short EDA report for the training frame.

    The report contains summary statistics of ``Lap_Time_Seconds``, the ten
    columns with the most missing values, and the ten columns whose absolute
    correlation with the target is highest.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain a numeric ``Lap_Time_Seconds`` column.

    Returns
    -------
    None
        Everything is printed.
    """
    print("\n=== EXPLORATORY DATA ANALYSIS ===")
    print("Target variable stats:")
    print(train_df['Lap_Time_Seconds'].describe())

    print("\nMissing values in train:")
    print(train_df.isnull().sum().sort_values(ascending=False).head(10))

    print("\nCorrelation with target (top 10):")
    # DataFrame.corr() raises on text columns in pandas >= 2.0, so restrict the
    # frame to numeric/bool columns before correlating.
    numeric_df = train_df.select_dtypes(include=['number', 'bool'])
    corr_target = numeric_df.corr()['Lap_Time_Seconds'].abs().sort_values(ascending=False)
    print(corr_target.head(10))

In [5]:
def feature_engineering(df, is_train=True):
    """Impute missing values and add derived features to a copy of ``df``.

    Each derived feature is only created when all of its source columns are
    present, so the function works on frames with differing schemas.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is not modified; a copy is returned.
    is_train : bool, default True
        Kept for backward compatibility. It no longer changes the output:
        the rider-experience features used to be created only when this was
        True, which left the test frame without columns that the feature list
        (derived from the training frame) later selects.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with missing values filled and the extra columns added.
    """
    df = df.copy()

    def has(*cols):
        """Return True when every named column exists in the frame."""
        return all(col in df.columns for col in cols)

    # Handle missing values: text columns get the literal 'Unknown', every
    # other column gets its own median (computed on this frame only).
    for col in df.columns:
        series = df[col]
        # Also recognise pandas' dedicated string dtype, not only 'object';
        # otherwise text columns would reach .median() and raise.
        is_text = (pd.api.types.is_object_dtype(series)
                   or pd.api.types.is_string_dtype(series))
        if is_text:
            df[col] = series.fillna('Unknown')
        else:
            df[col] = series.fillna(series.median())

    # Core performance features
    if has('Circuit_Length_km', 'Laps'):
        df['Total_Distance'] = df['Circuit_Length_km'] * df['Laps']
        # The small epsilon guards against division by zero.
        df['Inverse_Circuit_Length'] = 1 / (df['Circuit_Length_km'] + 1e-6)

    # Speed dynamics
    if has('Avg_Speed_kmh'):
        df['Speed_Squared'] = df['Avg_Speed_kmh'] ** 2
        if has('Circuit_Length_km'):
            df['Speed_to_Length_Ratio'] = df['Avg_Speed_kmh'] / (df['Circuit_Length_km'] + 1e-6)

    # Position and championship features
    if has('Grid_Position'):
        df['Grid_Advantage'] = 1 / (df['Grid_Position'] + 1)
        df['Top_3_Grid'] = (df['Grid_Position'] <= 3).astype(int)

    if has('Championship_Points', 'Championship_Position'):
        df['Points_per_Position'] = df['Championship_Points'] / (df['Championship_Position'] + 1)
        df['Is_Champion'] = (df['Championship_Position'] == 1).astype(int)

    # Weather interactions
    if has('Weather_Condition'):
        weather_map = {'Dry': 0, 'Wet': 1, 'Mixed': 0.5, 'Unknown': 0.25}
        # Conditions that are not in the map fall back to 0.
        df['Weather_Numeric'] = df['Weather_Condition'].map(weather_map).fillna(0)

        if has('Avg_Speed_kmh'):
            df['Speed_Weather_Interaction'] = df['Avg_Speed_kmh'] * (1 - df['Weather_Numeric'])

    # Tire strategy
    if has('Tire_Compound_Front', 'Tire_Compound_Rear'):
        df['Same_Tire_Compound'] = (df['Tire_Compound_Front'] == df['Tire_Compound_Rear']).astype(int)

    # Track characteristics
    if has('Corners_per_Lap', 'Circuit_Length_km'):
        df['Corner_Density'] = df['Corners_per_Lap'] / (df['Circuit_Length_km'] + 1e-6)

    # Rider experience: built for every frame so train/val/test end up with the
    # same columns. Each feature checks its own inputs ('Finishes' used to be
    # read without a presence check).
    if has('Years_active', 'Starts'):
        df['Experience_Factor'] = np.log1p(df['Years_active']) * np.log1p(df['Starts'])
    if has('Finishes', 'Starts'):
        df['Finish_Rate'] = df['Finishes'] / (df['Starts'] + 1e-6)

    return df

In [6]:
def prepare_features(train_df, test_df, val_df):
    """Engineer features for all three frames and label-encode text columns.

    Side effects: rebinds the module-level ``categorical_features`` and
    ``numerical_features`` lists and stores one fitted ``LabelEncoder`` per
    categorical column in ``label_encoders``.

    Returns the processed ``(train, test, val)`` frames in that order.
    """
    global label_encoders, categorical_features, numerical_features

    print("\n=== FEATURE PREPARATION ===")

    # Feature engineering is applied here; only the test frame is flagged
    # with is_train=False.
    train_out = feature_engineering(train_df, is_train=True)
    test_out = feature_engineering(test_df, is_train=False)
    val_out = feature_engineering(val_df, is_train=True)
    frames = (train_out, test_out, val_out)

    # Split the non-target columns of the training frame by dtype.
    target = 'Lap_Time_Seconds'
    cat_names, num_names = [], []
    for name in train_out.columns:
        if name == target:
            continue
        bucket = cat_names if train_out[name].dtype == 'object' else num_names
        bucket.append(name)

    categorical_features = cat_names
    numerical_features = num_names

    print(f"Categorical features: {len(cat_names)}")
    print(f"Numerical features: {len(num_names)}")

    # Fit each encoder on the union of all three frames so that every frame
    # shares the same string -> integer mapping.
    for name in cat_names:
        encoder = LabelEncoder()
        stacked = pd.concat([frame[name] for frame in frames])
        encoder.fit(stacked.astype(str))
        for frame in frames:
            frame[name] = encoder.transform(frame[name].astype(str))
        label_encoders[name] = encoder

    return train_out, test_out, val_out


In [7]:
def objective(trial, X, y, cv_folds=5):
    """Optuna objective: mean K-fold RMSE of a CatBoost regressor.

    ``trial`` supplies the sampled hyperparameters, ``X`` / ``y`` are the
    feature frame and target (indexed positionally via ``.iloc``), and
    ``cv_folds`` is the number of shuffled folds. The module-level
    ``categorical_features`` list is passed to CatBoost as ``cat_features``.
    """
    search_space = dict(
        iterations=trial.suggest_int('iterations', 800, 2000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        depth=trial.suggest_int('depth', 4, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
        border_count=trial.suggest_int('border_count', 32, 255),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0, 1),
        random_strength=trial.suggest_float('random_strength', 0, 1),
        od_type='Iter',
        od_wait=50,
        random_seed=42,
        verbose=False,
    )

    splitter = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    fold_rmse = []

    for fit_rows, holdout_rows in splitter.split(X):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        regressor = CatBoostRegressor(**search_space)
        # The hold-out fold doubles as the eval_set during fitting.
        regressor.fit(
            X_fit, y_fit,
            eval_set=(X_holdout, y_holdout),
            cat_features=categorical_features,
            verbose=False
        )

        holdout_pred = regressor.predict(X_holdout)
        fold_rmse.append(np.sqrt(mean_squared_error(y_holdout, holdout_pred)))

    return np.mean(fold_rmse)

In [8]:
def cross_validate(X, y, cv_folds=5):
    """Run shuffled K-fold CV with a fixed CatBoost configuration.

    Prints RMSE and MAE for every fold plus their mean and standard
    deviation, and returns ``(rmse_per_fold, mae_per_fold)`` as two lists.
    The module-level ``categorical_features`` list is passed to CatBoost.
    """
    print(f"\n=== {cv_folds}-FOLD CROSS VALIDATION ===")

    splitter = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    rmse_per_fold, mae_per_fold = [], []

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X), start=1):
        print(f"Training fold {fold_no}/{cv_folds}...")

        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        regressor = CatBoostRegressor(
            iterations=1000,
            learning_rate=0.1,
            depth=6,
            l2_leaf_reg=3,
            random_seed=42,
            verbose=False
        )
        regressor.fit(
            X_fit, y_fit,
            eval_set=(X_holdout, y_holdout),
            cat_features=categorical_features,
            verbose=False
        )

        holdout_pred = regressor.predict(X_holdout)
        fold_rmse = np.sqrt(mean_squared_error(y_holdout, holdout_pred))
        fold_mae = mean_absolute_error(y_holdout, holdout_pred)

        rmse_per_fold.append(fold_rmse)
        mae_per_fold.append(fold_mae)
        print(f"Fold {fold_no} RMSE: {fold_rmse:.4f}, MAE: {fold_mae:.4f}")

    print("\nCV Results:")
    print(f"Mean RMSE: {np.mean(rmse_per_fold):.4f} ± {np.std(rmse_per_fold):.4f}")
    print(f"Mean MAE: {np.mean(mae_per_fold):.4f} ± {np.std(mae_per_fold):.4f}")

    return rmse_per_fold, mae_per_fold

In [9]:
def train_model(X_train, y_train, optimize_hyperparams=True):
    """Train a single CatBoost regressor, optionally tuning it with Optuna.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame (rows are selected positionally with ``.iloc``).
    y_train : pandas.Series
        Target values aligned with ``X_train``.
    optimize_hyperparams : bool, default True
        When True, run an Optuna study (at most 15 trials / 30 minutes,
        2-fold CV on a 20% sample of the data). When False, use a fixed
        default configuration.

    Returns
    -------
    tuple
        ``(model, feature_importance)`` where ``feature_importance`` is the
        result of ``model.get_feature_importance(prettified=True)``.

    Notes
    -----
    The module-level ``categorical_features`` list is passed to CatBoost as
    ``cat_features``. The final model is fitted on an 80% split; the other
    20% is used as the early-stopping evaluation set.
    """
    print("\n=== MODEL TRAINING ===")

    # Settings that are never tuned. They are kept separately because
    # study.best_params only contains values produced by trial.suggest_*;
    # without merging them back, the final model would silently lose the
    # grow policy, loss, seed and task type used during tuning.
    fixed_params = {
        'grow_policy': 'Depthwise',
        'loss_function': 'RMSE',
        'random_seed': 42,
        'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
    }

    if optimize_hyperparams:
        print("Starting hyperparameter optimization...")

        # Use smaller subset for faster tuning
        X_tune, _, y_tune, _ = train_test_split(
            X_train, y_train,
            train_size=0.2,  # Use 20% of data for tuning
            random_state=42
        )

        def objective(trial):
            """Return the mean 2-fold RMSE for one sampled configuration."""
            # Progress reporting
            if trial.number % 5 == 0:
                print(f"Running trial {trial.number}...")

            params = {
                'iterations': trial.suggest_int('iterations', 800, 1200),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
                'depth': trial.suggest_int('depth', 6, 8),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 5),
                'random_strength': trial.suggest_float('random_strength', 0.1, 0.3),
                'bagging_temperature': trial.suggest_float('bagging_temperature', 0.5, 0.8),
                'border_count': trial.suggest_int('border_count', 64, 128),
                **fixed_params,
                'eval_metric': 'RMSE',
                'verbose': False,
            }

            # Faster 2-fold CV
            kf = KFold(n_splits=2, shuffle=True, random_state=42)
            cv_scores = []

            for train_idx, val_idx in kf.split(X_tune):
                X_train_fold, X_val_fold = X_tune.iloc[train_idx], X_tune.iloc[val_idx]
                y_train_fold, y_val_fold = y_tune.iloc[train_idx], y_tune.iloc[val_idx]

                model = CatBoostRegressor(**params)
                model.fit(
                    X_train_fold, y_train_fold,
                    # Early stopping needs a validation set to monitor;
                    # without eval_set the early_stopping_rounds argument
                    # has nothing to act on.
                    eval_set=(X_val_fold, y_val_fold),
                    cat_features=categorical_features,
                    verbose=False,
                    early_stopping_rounds=20
                )

                y_pred = model.predict(X_val_fold)
                rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
                cv_scores.append(rmse)

            return np.mean(cv_scores)

        # Seed the sampler so repeated runs explore the same configurations.
        study = optuna.create_study(
            direction='minimize',
            sampler=optuna.samplers.TPESampler(seed=42)
        )
        study.optimize(objective, n_trials=15, timeout=1800)  # 15 trials max, 30 min timeout

        best_params = {**study.best_params, **fixed_params}
        print(f"Best parameters: {best_params}")
        print(f"Best CV RMSE: {study.best_value:.4f}")
    else:
        # Default parameters
        best_params = {
            'iterations': 1000,
            'learning_rate': 0.05,
            'depth': 7,
            'l2_leaf_reg': 3,
            'random_strength': 0.2,
            'bagging_temperature': 0.7,
            'border_count': 128,
            **fixed_params,
        }

    # Fit the final model on 80% of the data; the held-out 20% drives early
    # stopping and best-iteration selection.
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    final_params = {
        **best_params,
        'early_stopping_rounds': 50,
        'verbose': 100
    }

    model = CatBoostRegressor(**final_params)
    model.fit(
        X_train_split, y_train_split,
        eval_set=(X_val_split, y_val_split),
        cat_features=categorical_features,
        use_best_model=True
    )

    # Feature importance
    feature_importance = model.get_feature_importance(prettified=True)
    print("\nTop 20 most important features:")
    print(feature_importance.head(20))

    return model, feature_importance

In [10]:
def validate_model(model, X_val, y_val):
    """Score ``model`` on a validation set and print RMSE and MAE.

    NOTE: this function is defined again in a later cell, and that later
    definition is the one in effect after a top-to-bottom run. This copy is
    kept behaviourally identical to it (including the section header) so the
    printed output does not depend on which cell was executed last; one of
    the two definitions should eventually be deleted.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_val : array-like
        Validation features.
    y_val : array-like
        True target values for ``X_val``.

    Returns
    -------
    tuple
        ``(val_pred, val_rmse, val_mae)``.
    """
    print("\n=== VALIDATION SET PERFORMANCE ===")

    val_pred = model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    val_mae = mean_absolute_error(y_val, val_pred)

    print(f"Validation RMSE: {val_rmse:.4f}")
    print(f"Validation MAE: {val_mae:.4f}")

    return val_pred, val_rmse, val_mae


 

In [11]:
def validate_model(model, X_val, y_val):
    """Predict on the validation set, print RMSE/MAE and return all three.

    Returns ``(predictions, rmse, mae)``.
    """
    print("\n=== VALIDATION SET PERFORMANCE ===")

    predictions = model.predict(X_val)
    mae = mean_absolute_error(y_val, predictions)
    rmse = np.sqrt(mean_squared_error(y_val, predictions))

    for label, score in (("RMSE", rmse), ("MAE", mae)):
        print(f"Validation {label}: {score:.4f}")

    return predictions, rmse, mae

In [12]:
def predict_test(model, X_test):
    """Announce the prediction step and return ``model.predict(X_test)``."""
    print("\n=== GENERATING TEST PREDICTIONS ===")
    return model.predict(X_test)

def create_submission(test_pred, sample_submission_path='sample_submission.csv'):
    """Write ``submission.csv`` from the test predictions.

    The sample submission file is used as a template: its second column is
    replaced by ``test_pred`` and everything else is kept. If the template
    cannot be used (missing file, fewer than two columns, length mismatch,
    ...), a basic two-column file with a running ``id`` is written instead and
    the reason is printed.

    Parameters
    ----------
    test_pred : array-like
        One prediction per test row, in template row order.
    sample_submission_path : str, default 'sample_submission.csv'
        Path of the template CSV.

    Returns
    -------
    None
        The result is written to ``submission.csv`` in the working directory.
    """
    try:
        sample_sub = pd.read_csv(sample_submission_path)
        # Replace the whole column instead of assigning through .iloc so a
        # float prediction is never forced into an integer placeholder column;
        # np.asarray keeps the assignment positional.
        target_col = sample_sub.columns[1]  # Assuming second column is the target
        sample_sub[target_col] = np.asarray(test_pred)
        sample_sub.to_csv('submission.csv', index=False)
        print("Submission file created: submission.csv")
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; the fallback stays best-effort but now says why.
        print(f"Could not use '{sample_submission_path}' "
              f"({type(exc).__name__}: {exc}); writing a basic submission instead.")
        submission = pd.DataFrame({
            'id': range(len(test_pred)),
            'Lap_Time_Seconds': test_pred
        })
        submission.to_csv('submission.csv', index=False)
        print("Basic submission file created: submission.csv")

In [13]:
def run_competition_pipeline():
    """Run the whole pipeline: load, prepare, tune, validate, predict, save.

    Steps: load the three CSV files, engineer and encode features, train one
    CatBoost model with hyperparameter optimisation, score it on the
    validation set, predict the test set and write
    ``competition_submission.csv`` keyed by the test frame's ``Unique ID``.

    Returns
    -------
    dict
        Keys ``model``, ``val_rmse``, ``val_mae`` and ``test_predictions``.
    """
    print("🏁 STARTING COMPETITION PIPELINE 🏁")

    # 1. Load and prepare data
    print("\n=== LOADING DATA ===")
    train_df, test_df, val_df = load_data()

    # 2-3. Feature engineering, encoding and selection.
    # prepare_features() applies feature_engineering() to each frame itself,
    # so the raw frames are passed straight in; engineering them here first
    # only repeated the same work on every frame.
    print("\n=== FEATURE ENGINEERING ===")
    print("\n=== FEATURE PREPARATION ===")
    train_final, test_final, val_final = prepare_features(train_df, test_df, val_df)

    # Prepare datasets
    feature_cols = categorical_features + numerical_features
    X_train = train_final[feature_cols]
    y_train = train_final['Lap_Time_Seconds']
    X_val = val_final[feature_cols]
    y_val = val_final['Lap_Time_Seconds']
    X_test = test_final[feature_cols]

    # 4. Train single CatBoost model
    print("\n=== TRAINING SINGLE MODEL ===")
    model, feature_importance = train_model(X_train, y_train, optimize_hyperparams=True)

    # 5. Validate model
    print("\n=== MODEL VALIDATION ===")
    val_pred, val_rmse, val_mae = validate_model(model, X_val, y_val)

    # 6. Make test predictions
    print("\n=== TEST PREDICTIONS ===")
    test_pred = model.predict(X_test)

    # 7. Create submission (ids come from the raw test frame)
    print("\n=== CREATING SUBMISSION ===")
    submission = pd.DataFrame({
        'Unique ID': test_df['Unique ID'],
        'Lap_Time_Seconds': test_pred
    })
    submission.to_csv('competition_submission.csv', index=False)

    print("\n🏆 COMPETITION PIPELINE COMPLETE! 🏆")
    print(f"Final validation RMSE: {val_rmse:.4f}")
    print("Submission saved as 'competition_submission.csv'")

    return {
        'model': model,
        'val_rmse': val_rmse,
        'val_mae': val_mae,
        'test_predictions': test_pred
    }

In [14]:
def quick_test():
    """Run the pipeline end to end without hyperparameter optimisation.

    Loads the data, prints the EDA report, prepares features, runs 3-fold
    cross validation, trains one model with the default parameters, scores
    it on the validation set and writes ``submission.csv``.

    Returns
    -------
    dict or None
        On success a dict with keys ``model``, ``cv_rmse_mean``,
        ``cv_rmse_std``, ``val_rmse``, ``val_mae``, ``feature_importance``
        and ``test_predictions``. ``None`` if any step raised; the error and
        its traceback are printed in that case.
    """
    print("🏍️ QUICK TEST MODE - MOTOGP LAP TIME PREDICTION 🏍️")

    try:
        # Load data
        train_df, test_df, val_df = load_data()

        # EDA
        exploratory_analysis(train_df)

        # Prepare features
        train_processed, test_processed, val_processed = prepare_features(
            train_df, test_df, val_df
        )

        # Prepare training data
        feature_cols = categorical_features + numerical_features
        X_train = train_processed[feature_cols]
        y_train = train_processed['Lap_Time_Seconds']

        X_test = test_processed[feature_cols]
        X_val = val_processed[feature_cols]
        y_val = val_processed['Lap_Time_Seconds']

        print(f"\nFinal feature set: {len(feature_cols)} features")

        # Quick cross validation (3 folds)
        cv_scores, mae_scores = cross_validate(X_train, y_train, cv_folds=3)

        # Train model (no hyperparameter optimization)
        model, feature_importance = train_model(X_train, y_train, optimize_hyperparams=False)

        # Validate
        val_pred, val_rmse, val_mae = validate_model(model, X_val, y_val)

        # Predict test set
        test_pred = predict_test(model, X_test)

        # Create submission
        create_submission(test_pred)

        print("\n🏆 QUICK TEST COMPLETED SUCCESSFULLY! 🏆")
        print(f"Final validation RMSE: {val_rmse:.4f}")
        print("Check 'submission.csv' for your predictions!")

        return {
            'model': model,
            'cv_rmse_mean': np.mean(cv_scores),
            'cv_rmse_std': np.std(cv_scores),
            'val_rmse': val_rmse,
            'val_mae': val_mae,
            'feature_importance': feature_importance,
            'test_predictions': test_pred
        }

    except Exception as e:
        print(f"Error during testing: {str(e)}")
        # Any step above can fail, not only data loading; the message alone
        # (e.g. a bare column name from a KeyError) does not say which one,
        # so show the traceback as well.
        traceback.print_exc()
        print("Please check your data files and try again.")
        return None



In [15]:
def full_test():
    """Run the pipeline end to end with hyperparameter optimisation.

    Loads the data, prints the EDA report, prepares features, runs 5-fold
    cross validation, trains one model with Optuna tuning enabled, scores it
    on the validation set and writes ``submission.csv``.

    Returns
    -------
    dict or None
        On success a dict with keys ``model``, ``cv_rmse_mean``,
        ``cv_rmse_std``, ``val_rmse``, ``val_mae``, ``feature_importance``
        and ``test_predictions``. ``None`` if any step raised; the error and
        its traceback are printed in that case.
    """
    print("🏍️ FULL TEST MODE - MOTOGP LAP TIME PREDICTION 🏍️")

    try:
        # Load data
        train_df, test_df, val_df = load_data()

        # EDA
        exploratory_analysis(train_df)

        # Prepare features
        train_processed, test_processed, val_processed = prepare_features(
            train_df, test_df, val_df
        )

        # Prepare training data
        feature_cols = categorical_features + numerical_features
        X_train = train_processed[feature_cols]
        y_train = train_processed['Lap_Time_Seconds']

        X_test = test_processed[feature_cols]
        X_val = val_processed[feature_cols]
        y_val = val_processed['Lap_Time_Seconds']

        print(f"\nFinal feature set: {len(feature_cols)} features")

        # Full cross validation
        cv_scores, mae_scores = cross_validate(X_train, y_train, cv_folds=5)

        # Train model with hyperparameter optimization
        model, feature_importance = train_model(X_train, y_train, optimize_hyperparams=True)

        # Validate
        val_pred, val_rmse, val_mae = validate_model(model, X_val, y_val)

        # Predict test set
        test_pred = predict_test(model, X_test)

        # Create submission
        create_submission(test_pred)

        print("\n🏆 FULL TEST COMPLETED SUCCESSFULLY! 🏆")
        print(f"Final validation RMSE: {val_rmse:.4f}")
        print("Check 'submission.csv' for your predictions!")

        return {
            'model': model,
            'cv_rmse_mean': np.mean(cv_scores),
            'cv_rmse_std': np.std(cv_scores),
            'val_rmse': val_rmse,
            'val_mae': val_mae,
            'feature_importance': feature_importance,
            'test_predictions': test_pred
        }

    except Exception as e:
        print(f"Error during testing: {str(e)}")
        # Any step above can fail, not only data loading; the message alone
        # (e.g. a bare column name from a KeyError) does not say which one,
        # so show the traceback as well.
        traceback.print_exc()
        print("Please check your data files and try again.")
        return None

In [17]:
# Step 1 - read the train / test / validation CSV files
train_df, test_df, val_df = load_data()

# Step 2 - derive the engineered features (only the test frame uses is_train=False)
train_processed = feature_engineering(train_df, is_train=True)
test_processed = feature_engineering(test_df, is_train=False)
val_processed = feature_engineering(val_df, is_train=True)

# Step 3 - encode categoricals and record the feature lists.
# NOTE(review): prepare_features() runs feature_engineering() on its inputs
# again, so step 2 is repeated inside this call.
train_final, test_final, val_final = prepare_features(
    train_processed, test_processed, val_processed
)

# Step 4 - select the model inputs for the test set
feature_cols = [*categorical_features, *numerical_features]
X_test = test_final[feature_cols]

# Step 5 - fit the model with the default hyperparameters
model, _ = train_model(
    train_final[feature_cols],
    train_final['Lap_Time_Seconds'],
    optimize_hyperparams=False,
)

# Step 6 - predict lap times for the test rows
test_pred = model.predict(X_test)

# Step 7 - write the submission, keyed by the original test ids
submission = test_df[['Unique ID']].assign(Lap_Time_Seconds=test_pred)
submission.to_csv('submission.csv', index=False)

Loading datasets...
Train shape: (1914056, 45)
Test shape: (546874, 44)
Validation shape: (273437, 45)

=== FEATURE PREPARATION ===
Categorical features: 13
Numerical features: 41

=== MODEL TRAINING ===
0:	learn: 11.5035512	test: 11.5162518	best: 11.5162518 (0)	total: 1.07s	remaining: 17m 48s
100:	learn: 11.0224320	test: 11.0023837	best: 11.0023837 (100)	total: 56.6s	remaining: 8m 24s
200:	learn: 10.6516039	test: 10.6356449	best: 10.6356449 (200)	total: 1m 47s	remaining: 7m 9s
300:	learn: 10.2718925	test: 10.2626600	best: 10.2626600 (300)	total: 2m 47s	remaining: 6m 28s
400:	learn: 9.9194596	test: 9.9177826	best: 9.9177826 (400)	total: 3m 44s	remaining: 5m 35s
500:	learn: 9.5906057	test: 9.5945687	best: 9.5945687 (500)	total: 4m 40s	remaining: 4m 39s
600:	learn: 9.2861783	test: 9.2933390	best: 9.2933390 (600)	total: 5m 37s	remaining: 3m 43s
700:	learn: 8.9951090	test: 9.0051753	best: 9.0051753 (700)	total: 6m 33s	remaining: 2m 47s
800:	learn: 8.7123601	test: 8.7246278	best: 8.7246278 