In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor, plot_importance
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from scipy.stats import randint, uniform

# Load and preprocess data
def load_data(filepath, dtype_dict, na_values):
    data = pd.read_csv(filepath, dtype=dtype_dict, low_memory=False, na_values=na_values)
    # Drop specified columns if they exist
    columns_to_drop = ['url_x', 'resultId', 'racerId', 'driverId', 'url', 'positionText_x', 'positionOrder', 'statusId', 'driver_code', 'url_y', 'positionText_y']
    data.drop(columns=[col for col in columns_to_drop if col in data.columns], inplace=True)
    return data

def add_features(df):
    # Driver experience
    df['driver_experience'] = df['year'] - df.groupby('constructorId')['year'].transform('min')
    
    # Average lap time
    df['avg_lap_time'] = df['timetaken_in_millisec'] / df['laps']
    
    # Lap time statistics
    df['lap_time_std'] = df.groupby('driverId')['timetaken_in_millisec'].transform('std')
    df['lap_time_max'] = df.groupby('driverId')['timetaken_in_millisec'].transform('max')
    df['lap_time_median'] = df.groupby('driverId')['timetaken_in_millisec'].transform('median')
    
    # Speed metrics
    df['avg_speed'] = df['max_speed'] / df['laps']
    df['speed_var'] = df.groupby('driverId')['max_speed'].transform('var')
    
    # Grid position changes
    df['grid_pos_change'] = df['grid'] - df.groupby('driverId')['grid'].transform('mean')
    
    # Race duration
    df['race_duration'] = df['laps'] * df['avg_lap_time']
    
    # Race completion rate
    df['completion_rate'] = df['laps'] / df.groupby('driverId')['laps'].transform('max')
    
    # Interaction features
    df['interaction_grid_laps'] = df['grid'] * df['laps']
    
    # Log transformations
    df['log_avg_lap_time'] = np.log1p(df['avg_lap_time'])
    df['log_race_duration'] = np.log1p(df['race_duration'])
    
    # Date-Time features (Assuming there are date-time fields, if not, this can be skipped)
    # df['race_month'] = pd.to_datetime(df['date']).dt.month
    # df['race_day'] = pd.to_datetime(df['date']).dt.day
    # df['race_hour'] = pd.to_datetime(df['date']).dt.hour
    
    return df

# Setup pipeline
def get_pipeline(numeric_features, categorical_features):
    numeric_transformer = Pipeline([
        ('imputer', IterativeImputer(random_state=0)),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(random_state=42, objective='reg:squarederror', n_jobs=-1))
    ])

# Perform training and evaluation
def train_and_evaluate(X_train, y_train, X_val, y_val):
    param_distributions = {
        'regressor__n_estimators': randint(100, 200),
        'regressor__learning_rate': uniform(0.01, 0.1),
        'regressor__max_depth': randint(3, 10)
    }
    search = RandomizedSearchCV(pipeline, param_distributions, n_iter=50, cv=3, n_jobs=-1, scoring='neg_mean_squared_error', random_state=42)
    search.fit(X_train, y_train)

    best_model = search.best_estimator_

    # Validate the model
    y_val_pred = best_model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f'Validation RMSE: {val_rmse}')

    return best_model

# Plot feature importances
def plot_features(model):
    plt.figure(figsize=(10, 6))
    plot_importance(model.named_steps['regressor'], max_num_features=10)
    plt.show()

# Load datasets
dtype_dict = {
    'number': 'str',
    'time_x': 'str',
    'timetaken_in_millisec': 'float',
    'fastestLap': 'str',
    'result_driver_standing': 'str'
}
na_values = ['\\N']

train_df = load_data('train.csv', dtype_dict, na_values)
test_df = load_data('test.csv', dtype_dict, na_values)
validation_df = load_data('validation.csv', dtype_dict, na_values)

train_df = add_features(train_df)
test_df = add_features(test_df)
validation_df = add_features(validation_df)

# Separate features and target for training and validation
features = ['constructorId', 'grid', 'points', 'laps', 'fastestLap', 'rank',
            'max_speed', 'year', 'round', 'circuitId', 'driver_experience', 'avg_lap_time',
            'lap_time_std', 'lap_time_max', 'lap_time_median', 'avg_speed', 'speed_var', 'grid_pos_change',
            'race_duration', 'completion_rate', 'interaction_grid_laps', 'log_avg_lap_time', 'log_race_duration']
X_train = train_df[features]
y_train = train_df['position_x'].astype(float)

X_val = validation_df[features]
y_val = validation_df['position_x'].astype(float)

# Impute missing values in the target variable
y_train.fillna(y_train.median(), inplace=True)
y_val.fillna(y_val.median(), inplace=True)

# Define numerical and categorical features
numeric_features = ['grid', 'points', 'laps', 'rank', 'max_speed', 'year', 'round', 'driver_experience', 'avg_lap_time',
                    'lap_time_std', 'lap_time_max', 'lap_time_median', 'avg_speed', 'speed_var', 'grid_pos_change',
                    'race_duration', 'completion_rate', 'interaction_grid_laps', 'log_avg_lap_time', 'log_race_duration']
categorical_features = ['constructorId', 'fastestLap', 'circuitId']

# Get the pipeline
pipeline = get_pipeline(numeric_features, categorical_features)
best_model = train_and_evaluate(X_train, y_train, X_val, y_val)

# Plot feature importance
plot_features(best_model)

# Prepare test data
X_test = test_df[features]
y_test_pred = best_model.predict(X_test)
y_test_pred = np.round(y_test_pred).astype(int)  # Convert predictions to integer

# Generate submission file
submission = pd.DataFrame({
    'Position': y_test_pred,
    'Driver Standings': test_df['result_driver_standing']
})
submission.to_csv('train.csv/submission.csv', index=False)
print("Submission file created successfully.")

KeyError: 'driverId'