In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
# Explicit dtypes passed to pd.read_csv for the columns listed below.
# Columns are grouped by target dtype: plain strings first, then float64.
dtype_spec = {
    **dict.fromkeys(
        ['number', 'time_x', 'fastestLapTime', 'url_x', 'url_y', 'url'],
        str,
    ),
    **dict.fromkeys(
        ['grid', 'position_x', 'positionOrder', 'timetaken_in_millisec', 'max_speed'],
        'float64',
    ),
}

In [3]:
# Extra tokens that pd.read_csv should parse as missing values.
na_values = [
    '\\N',   # literal backslash followed by N
    'null',
    'None',
    '',      # empty field
]

In [4]:
# Load both splits with identical parsing options so their columns share dtypes.
train_df, test_df = (
    pd.read_csv(csv_path, dtype=dtype_spec, na_values=na_values, low_memory=False)
    for csv_path in ('train.csv', 'test.csv')
)

In [5]:
def time_to_seconds(time_str):
    """Convert a clock-style time string to a number of seconds.

    Accepted string layouts are ``"H:M:S"``, ``"M:S"`` and ``"S"``, where the
    last field may carry a fractional part (e.g. ``"1:23.456"``).

    Parameters
    ----------
    time_str : str, number, or missing value
        The value to convert. Missing values (anything ``pd.isna`` reports as
        missing) yield ``np.nan``. Values that are already numeric are
        returned as ``float`` unchanged, so applying this function to a column
        that has already been converted is a no-op rather than an error.

    Returns
    -------
    float
        Total seconds, or ``np.nan`` for missing input.

    Raises
    ------
    ValueError
        If the string has more than three ``:``-separated fields, or a field
        cannot be parsed as a number.
    """
    if pd.isna(time_str):
        return np.nan
    # Already numeric (e.g. the conversion cell was executed a second time).
    if isinstance(time_str, (int, float, np.number)):
        return float(time_str)

    parts = str(time_str).strip().split(':')
    if len(parts) > 3:
        # Previously such input silently returned only the first field.
        raise ValueError(f"Unrecognized time format: {time_str!r}")

    # Fields before the last one are whole minutes, then whole hours, counted
    # from the right. Sum them as integers first, then add the float seconds.
    whole_seconds = sum(
        int(field) * multiplier
        for field, multiplier in zip(reversed(parts[:-1]), (60, 3600))
    )
    return whole_seconds + float(parts[-1])

In [6]:
# Replace the string-typed time columns with their value in seconds,
# for the training frame first and then the test frame.
for frame in (train_df, test_df):
    for column in ('time_x', 'fastestLapTime'):
        frame[column] = frame[column].apply(time_to_seconds)

In [7]:
# Forward-fill missing values: each NaN takes the last valid value above it in
# the same column. Leading NaNs (with no valid value above them) stay missing.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')` spelling,
# and rebinding the name avoids in-place mutation.
# NOTE(review): the fill depends on row order in the CSV — confirm that carrying
# values across adjacent rows is intended for every column.
train_df = train_df.ffill()
test_df = test_df.ffill()

In [8]:
# Integer-encode the categorical columns. Codes are learned from the training
# frame and then reused for the test frame so both share one mapping.
categorical_cols = ['driverRef', 'constructorRef', 'grand_prix', 'status']
for col in categorical_cols:
    # A real name is used for the uniques instead of `_`, which in IPython is
    # the "last output" variable and should not be overwritten.
    train_df[col], train_categories = pd.factorize(train_df[col])
    # Test values absent from the training categories receive code -1, the
    # same code pd.factorize assigns to missing values in the training frame.
    test_df[col] = pd.Categorical(test_df[col], categories=train_categories).codes

In [9]:
# Add an `age` column to both frames: calendar year of `date` minus calendar
# year of `dob` (month and day are not taken into account).
for frame in (train_df, test_df):
    event_year = pd.to_datetime(frame['date']).dt.year
    birth_year = pd.to_datetime(frame['dob']).dt.year
    frame['age'] = event_year - birth_year

In [10]:
# Model inputs: the numeric columns below followed by the integer-encoded
# categorical columns.
features = [
    'grid',
    'points',
    'laps',
    'time_x',
    'timetaken_in_millisec',
    'fastestLap',
    'rank',
    'fastestLapTime',
    'max_speed',
    'age',
    *categorical_cols,
]

In [11]:
# Feature matrix and regression target taken from the training frame.
X, y = train_df[features], train_df['position']

In [12]:
# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Wrap both splits as LightGBM datasets; the validation set references the
# training set.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(
    data=X_valid,
    label=y_valid,
    reference=train_data,
)

In [14]:
# LightGBM training parameters shared by cross-validation and the final fit.
params = dict(
    objective='regression',
    metric='rmse',
    verbosity=-1,
    seed=42,
    boosting_type='gbdt',
    learning_rate=0.01,
    num_leaves=31,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
)

In [15]:
from lightgbm import early_stopping

In [16]:
# 5-fold cross-validation on the training dataset, used to choose the number
# of boosting rounds. Both callbacks are referenced through the `lgb`
# namespace so this cell only needs the top-of-notebook `import lightgbm as lgb`.
cv_results = lgb.cv(
    params,
    train_data,
    num_boost_round=1000,
    nfold=5,
    metrics='rmse',
    callbacks=[
        # Stop when the aggregated validation score has not improved for 100 rounds.
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        # Print the aggregated score every 100 rounds.
        lgb.log_evaluation(period=100),
    ],
    seed=42,
)



Training until validation scores don't improve for 100 rounds
[100]	cv_agg's valid rmse: 8.44286 + 0.00472558
[200]	cv_agg's valid rmse: 8.09301 + 0.0052032
[300]	cv_agg's valid rmse: 7.91895 + 0.00716199
[400]	cv_agg's valid rmse: 7.81046 + 0.00801938
[500]	cv_agg's valid rmse: 7.74085 + 0.00847994
[600]	cv_agg's valid rmse: 7.69578 + 0.0103916
[700]	cv_agg's valid rmse: 7.65948 + 0.0107798
[800]	cv_agg's valid rmse: 7.62912 + 0.00982691
[900]	cv_agg's valid rmse: 7.60236 + 0.0102815
[1000]	cv_agg's valid rmse: 7.57883 + 0.0106389
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid rmse: 7.57883 + 0.0106389


In [17]:
# The number of entries recorded for a metric in the CV results is used as the
# number of boosting rounds. Use the 'valid rmse-mean' entry when present,
# otherwise fall back to the first entry in the results.
rounds_key = 'valid rmse-mean' if 'valid rmse-mean' in cv_results else next(iter(cv_results))
best_num_boost_round = len(cv_results[rounds_key])

print(f'Best number of boosting rounds: {best_num_boost_round}')

Best number of boosting rounds: 1000


In [18]:
# Fit the final booster on the training dataset for the chosen number of
# rounds, logging the held-out validation score every 100 rounds.
final_model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=best_num_boost_round,
    valid_sets=[valid_data],
    callbacks=[lgb.log_evaluation(period=100)],
)

[100]	valid_0's rmse: 8.44978
[200]	valid_0's rmse: 8.09456
[300]	valid_0's rmse: 7.92091
[400]	valid_0's rmse: 7.8091
[500]	valid_0's rmse: 7.73586
[600]	valid_0's rmse: 7.68954
[700]	valid_0's rmse: 7.65606
[800]	valid_0's rmse: 7.62274
[900]	valid_0's rmse: 7.59661
[1000]	valid_0's rmse: 7.57112


In [19]:
# Score the final model on the held-out validation split.
y_pred = final_model.predict(X_valid)
# Take the square root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f'Validation RMSE: {rmse}')

Validation RMSE: 7.571120134222269


In [20]:
# Predict on the test frame using the same feature columns as training.
X_test = test_df.loc[:, features]
test_predictions = final_model.predict(X_test)

In [24]:
# Assemble the submission: raw predictions in `position`, followed by the
# `result_driver_standing` column copied from the test frame.
submission = test_df[['result_driver_standing']].copy()
submission.insert(0, 'position', test_predictions)

In [25]:
# Round predictions to whole positions, then bound them to the range 1..20.
# NOTE(review): the upper bound of 20 is hard-coded — confirm it matches the
# largest position that can occur in the target.
submission['position'] = (
    submission['position']
    .round()
    .astype(int)
    .clip(1, 20)
)
# Cast the standing column to integers as well.
submission['result_driver_standing'] = (
    submission['result_driver_standing'].astype(int)
)

In [26]:
# Write the submission without the index column and confirm on stdout.
submission.to_csv(path_or_buf='submission.csv', index=False)
print("Submission file created successfully.")

Submission file created successfully.
