In [2]:
import pandas as pd
import numpy as np
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load the test data
test_df = pd.read_csv('test.csv', parse_dates=['dob', 'date'])

# Replace '\\N' with NaN
test_df = test_df.replace('\\N', np.nan)

# Drop unnecessary columns
columns_to_drop = ['fastestLap', 'rank', 'fastestLapTime', 'time_y', 'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time', 'driver_num', 'driver_code', 'resultId', 'driverId', 'constructorId', 'number', 'grand_prix', 'date']
test_df = test_df.drop(columns=columns_to_drop)

# Create derived features
test_df['experience'] = test_df.groupby('driverRef')['round'].transform('count')
test_df['win_ratio'] = test_df['wins'] / test_df['experience']
test_df['laps_completed_ratio'] = test_df['laps'] / test_df.groupby('racerId')['laps'].transform('max')
test_df['points_per_race'] = test_df['points'] / test_df['experience']
test_df['qualification_performance'] = test_df['grid'] / test_df.groupby('racerId')['grid'].transform('max')
test_df['constructor_performance'] = test_df.groupby('constructorRef')['points'].transform('mean')
test_df['track_familiarity'] = test_df.groupby(['driverRef', 'circuitId'])['round'].transform('count')
test_df['season_performance'] = test_df.groupby(['driverRef', 'year'])['points'].transform('cumsum')
test_df['recent_form'] = test_df.groupby('driverRef')['points'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())

test_df['timetaken_in_millisec'] = pd.to_numeric(test_df['timetaken_in_millisec'], errors='coerce')
test_df['timetaken_in_seconds'] = test_df['timetaken_in_millisec'] / 1000
test_df['avg_laptime'] = test_df['timetaken_in_seconds'] / test_df['laps']

# Handle missing values using IterativeImputer for numeric and SimpleImputer for categorical
numeric_features = test_df.select_dtypes(include=[np.number]).columns
categorical_features = test_df.select_dtypes(exclude=[np.number]).columns

numeric_imputer = IterativeImputer(max_iter=10, random_state=0)
test_df[numeric_features] = numeric_imputer.fit_transform(test_df[numeric_features])

categorical_imputer = SimpleImputer(strategy='most_frequent')
test_df[categorical_features] = categorical_imputer.fit_transform(test_df[categorical_features])

# Encode categorical features
le = LabelEncoder()
for col in categorical_features:
    test_df[col] = le.fit_transform(test_df[col].astype(str))

# Select features for the model
features = ['grid', 'points', 'laps', 'timetaken_in_seconds', 'max_speed', 
            'experience', 'win_ratio', 'laps_completed_ratio',
            'points_per_race', 'qualification_performance', 'constructor_performance',
            'track_familiarity', 'season_performance', 'recent_form',
            'avg_laptime', 'driverRef', 'nationality', 'constructorRef', 
            'status', 'round', 'year']

X_test = test_df[features]

# Load the scaler and model
scaler = joblib.load('scaler.pkl')
model = joblib.load('model.pkl')

# Scale the test features
X_test_scaled = scaler.transform(X_test)

# Predict positions
test_df['position'] = model.predict(X_test_scaled)

# Ensure result_driver_standing column exists
test_df['result_driver_standing'] = le.fit_transform(test_df['result_driver_standing'].astype(str))

# Prepare submission DataFrame
submission_df = test_df[['position', 'result_driver_standing']].rename(columns={
    'position': 'Position',
    'result_driver_standing': 'Driver Standings'
})
submission_df.to_csv('test_predictions.csv', index=False)

# Print first few rows of the prediction DataFrame
print(submission_df.head())




    Position  Driver Standings
0  13.119203            329756
1  13.119203            329861
2  13.119203            329963
3  13.119203            330066
4  13.119203            330166
