In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import KNNImputer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Columns discarded right after loading; none of them is referenced later
# in this notebook.
columns_to_drop = [
    'fastestLap', 'rank', 'fastestLapTime', 'time_y',
    'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time',
    'quali_date', 'quali_time', 'sprint_date', 'sprint_time',
    'driver_num', 'driver_code', 'resultId', 'driverId', 'constructorId',
    'number', 'grand_prix', 'date',
]

# Load the CSV, turn the literal '\N' placeholder into NaN, drop the unused
# columns and keep at most the first 50,000 rows.
df = (
    pd.read_csv('train.csv', parse_dates=['dob', 'date'])
    .replace('\\N', np.nan)
    .drop(columns=columns_to_drop)
    .iloc[:50000]
)

In [None]:
# Feature engineering: every statement below adds one derived column to `df`.

# Per-driver row count and ratios built on it.
df['experience'] = df.groupby('driverRef')['round'].transform('count')
df['win_ratio'] = df['wins'] / df['experience']
# NOTE(review): this ratio (and 'qualification_performance' below) groups by
# 'racerId' while every other per-driver feature groups by 'driverRef' --
# confirm that 'racerId' exists in train.csv and is the intended key.
df['laps_completed_ratio'] = df['laps'] / df.groupby('racerId')['laps'].transform('max')
df['points_per_race'] = df['points'] / df['experience']
df['qualification_performance'] = df['grid'] / df.groupby('racerId')['grid'].transform('max')
df['constructor_performance'] = df.groupby('constructorRef')['points'].transform('mean')
df['track_familiarity'] = df.groupby(['driverRef', 'circuitId'])['round'].transform('count')

# Running statistics are order-sensitive: a cumulative sum or rolling mean
# follows whatever order the rows are in. Compute them on a view sorted by
# (year, round) so they follow race order; assigning the result back aligns on
# the index, so the row order of `df` itself is unchanged.
race_order = df[['driverRef', 'year', 'round', 'points']].sort_values(
    ['year', 'round'], kind='mergesort'
)
df['season_performance'] = race_order.groupby(['driverRef', 'year'])['points'].cumsum()
df['recent_form'] = race_order.groupby('driverRef')['points'].transform(
    lambda pts: pts.rolling(window=5, min_periods=1).mean()
)

# NOTE(review): this column is computed from 'position', which a later cell
# uses as the prediction target -- it must not be fed to the model as an input.
df['grid_position_diff'] = df['grid'] - df['position']
df['points_to_laps_ratio'] = df['points'] / (df['laps'] + 1)  # Adding 1 to avoid division by zero
# df['avg_speed'] = df['distance'] / (df['timetaken_in_millisec'] / 3600000 + 1e-6)  # Adding small value to avoid division by zero

# Handling time features
df['timetaken_in_millisec'] = pd.to_numeric(df['timetaken_in_millisec'], errors='coerce')
df['timetaken_in_seconds'] = df['timetaken_in_millisec'] / 1000
# Rows with zero laps have no meaningful lap time. Dividing by `laps + 1e-6`
# would inflate them by a factor of a million, so they are set to NaN instead.
df['avg_laptime'] = df['timetaken_in_seconds'] / df['laps'].where(df['laps'] > 0)

In [None]:
# Partition the column labels by dtype: numeric columns on one side,
# everything else (strings, datetimes, ...) on the other.
numeric_features = df.select_dtypes(include='number').columns
categorical_features = df.select_dtypes(exclude='number').columns

# Handle outliers with a more robust method
def robust_cap_outliers(df, column, factor=1.5):
    """Clip one column of ``df`` to ``[Q1 - factor*IQR, Q3 + factor*IQR]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : label
        Name of the numeric column to cap.
    factor : float, default 1.5
        Multiple of the interquartile range used for both fences.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    A column whose interquartile range is zero (or NaN, e.g. an all-NaN
    column) is left untouched. With ``IQR == 0`` both fences collapse onto a
    single value, and clipping would overwrite every observation with that
    constant.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    # `not iqr > 0` is true for both a zero and a NaN spread.
    if not iqr > 0:
        return df

    df[column] = df[column].clip(lower=q1 - factor * iqr, upper=q3 + factor * iqr)
    return df

# Cap each numeric column in turn; the helper returns the frame it was given,
# so `df` is simply rebound to the same object on every pass.
for numeric_col in numeric_features:
    df = robust_cap_outliers(df, column=numeric_col)

In [None]:
# Integer-encode every non-numeric column. Values are cast to str first, so
# missing entries are encoded as their own string category rather than
# staying NaN. The single `le` is refit for each column, so after the loop it
# only holds the mapping of the last column processed.
le = LabelEncoder()
for col in categorical_features:
    df[col] = le.fit_transform(df[col].astype(str))

# Select initial features
# 'grid_position_diff' (= grid - position) is deliberately NOT listed: together
# with 'grid' it reveals the target 'position', and it cannot be computed for
# rows whose position is unknown.
# NOTE(review): 'points', 'laps', 'status' and 'timetaken_in_seconds' look like
# race outcomes as well -- confirm they are available at prediction time.
initial_features = ['grid', 'points', 'laps', 'timetaken_in_seconds', 'max_speed',
                    'experience', 'win_ratio', 'laps_completed_ratio',
                    'points_per_race', 'qualification_performance', 'constructor_performance',
                    'track_familiarity', 'season_performance', 'recent_form',
                    'avg_laptime', 'driverRef', 'status', 'round',
                    'points_to_laps_ratio']

X = df[initial_features]
y = df['position']

In [None]:
# Fill missing feature values from the 5 nearest rows.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows influence the imputed training values. KNN imputation is also
# likely the slow step of this notebook on a frame this size -- confirm it is
# worth the cost.
imputer = KNNImputer(n_neighbors=5)
# Keep X's index so the imputed frame stays row-aligned with `y`.
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Feature selection
# With k='all' every column is kept; the selector only provides the mask that
# defines `selected_features`.
selector = SelectKBest(score_func=f_regression, k='all')
X_selected = selector.fit_transform(X_imputed, y)
selected_features = X.columns[selector.get_support()].tolist()

# Add polynomial features
# With interaction_only=True and include_bias=False the output holds the
# selected columns themselves followed by their pairwise products.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_poly = poly.fit_transform(X_imputed[selected_features])

# Combine original and polynomial features
# X_poly already contains every selected original column, so stacking all of
# X_imputed in front of it would duplicate them. Only columns the selector
# rejected (none while k='all') are passed through separately.
passthrough_features = [name for name in X_imputed.columns if name not in selected_features]
X_combined = np.hstack((X_imputed[passthrough_features].to_numpy(dtype=float), X_poly))


KeyboardInterrupt: 

In [None]:
# Hold out 20% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise with statistics learned from the training split only, then
# apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyper-parameter candidates: 3 values for each of 5 parameters.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_iter=[300, 500, 700],
    max_depth=[5, 10, 15],
    min_samples_leaf=[10, 20, 30],
    l2_regularization=[0.1, 1.0, 10.0],
)

# Exhaustive 5-fold search over the grid, scored by negative RMSE and run on
# all available cores.
hgbr = HistGradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(
    hgbr,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Best estimator found by the search, used to score the held-out split.
best_hgbr = grid_search.best_estimator_
y_pred = best_hgbr.predict(X_test_scaled)

# Hold-out metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"Best parameters: {grid_search.best_params_}",
    f"Mean Squared Error: {mse:.4f}",
    f"R^2 Score: {r2:.4f}",
    f"Root Mean Squared Error: {rmse:.4f}",
    sep="\n",
)