In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
dtype_spec = {
    'number': str,
    'grid': 'float64',
    'position_x': 'float64',
    'positionOrder': 'float64',
    'time_x': str,
    'timetaken_in_millisec': 'float64',
    'fastestLapTime': str,
    'max_speed': 'float64',
    'url_x': str,
    'url_y': str,
    'url': str,
}

In [3]:
na_values = ['\\N', 'null', 'None', '']

In [4]:
train_df = pd.read_csv('train.csv', dtype=dtype_spec, na_values=na_values, low_memory=False)
test_df = pd.read_csv('test.csv', dtype=dtype_spec, na_values=na_values, low_memory=False)

In [5]:
def time_to_seconds(time_str):
    if pd.isna(time_str):
        return np.nan
    parts = time_str.split(':')
    if len(parts) == 3:
        return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
    elif len(parts) == 2:
        return int(parts[0]) * 60 + float(parts[1])
    else:
        return float(parts[0])

In [6]:
train_df['time_x'] = train_df['time_x'].apply(time_to_seconds)
train_df['fastestLapTime'] = train_df['fastestLapTime'].apply(time_to_seconds)
test_df['time_x'] = test_df['time_x'].apply(time_to_seconds)
test_df['fastestLapTime'] = test_df['fastestLapTime'].apply(time_to_seconds)

In [7]:
train_df.fillna(method='ffill', inplace=True)
test_df.fillna(method='ffill', inplace=True)

  train_df.fillna(method='ffill', inplace=True)
  test_df.fillna(method='ffill', inplace=True)


In [8]:
categorical_cols = ['driverRef', 'constructorRef', 'grand_prix', 'status']
for col in categorical_cols:
    train_df[col], _ = pd.factorize(train_df[col])
    test_df[col] = pd.Categorical(test_df[col], categories=_).codes

In [9]:
train_df['age'] = pd.to_datetime(train_df['date']).dt.year - pd.to_datetime(train_df['dob']).dt.year
test_df['age'] = pd.to_datetime(test_df['date']).dt.year - pd.to_datetime(test_df['dob']).dt.year

  test_df['age'] = pd.to_datetime(test_df['date']).dt.year - pd.to_datetime(test_df['dob']).dt.year
  test_df['age'] = pd.to_datetime(test_df['date']).dt.year - pd.to_datetime(test_df['dob']).dt.year


In [10]:
features = [
    'grid', 'points', 'laps', 'time_x', 'timetaken_in_millisec', 
    'fastestLap', 'rank', 'fastestLapTime', 'max_speed', 'age'
] + categorical_cols

In [11]:
X = train_df[features]
y = train_df['position']

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

In [14]:
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'seed': 42,
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5
}

In [15]:
from lightgbm import early_stopping

cv_results = lgb.cv(
    params,
    train_data,
    num_boost_round=1000,
    nfold=5,
    metrics='rmse',
    callbacks=[
        early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(period=100)
    ],
    seed=42
)

if 'valid rmse-mean' in cv_results:
    best_num_boost_round = len(cv_results['valid rmse-mean'])
else:
    best_num_boost_round = len(next(iter(cv_results.values())))

print(f'Best number of boosting rounds: {best_num_boost_round}')

final_model = lgb.train(
    params,
    train_data,
    num_boost_round=best_num_boost_round,
    valid_sets=[valid_data],
    callbacks=[lgb.log_evaluation(period=100)]
)



Training until validation scores don't improve for 100 rounds
[100]	cv_agg's valid rmse: 8.44286 + 0.00472558
[200]	cv_agg's valid rmse: 8.09301 + 0.0052032
[300]	cv_agg's valid rmse: 7.91895 + 0.00716199
[400]	cv_agg's valid rmse: 7.81046 + 0.00801938
[500]	cv_agg's valid rmse: 7.74085 + 0.00847994
[600]	cv_agg's valid rmse: 7.69578 + 0.0103916
[700]	cv_agg's valid rmse: 7.65948 + 0.0107798
[800]	cv_agg's valid rmse: 7.62912 + 0.00982691
[900]	cv_agg's valid rmse: 7.60236 + 0.0102815
[1000]	cv_agg's valid rmse: 7.57883 + 0.0106389
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid rmse: 7.57883 + 0.0106389
Best number of boosting rounds: 1000
[100]	valid_0's rmse: 8.44978
[200]	valid_0's rmse: 8.09456
[300]	valid_0's rmse: 7.92091
[400]	valid_0's rmse: 7.8091
[500]	valid_0's rmse: 7.73586
[600]	valid_0's rmse: 7.68954
[700]	valid_0's rmse: 7.65606
[800]	valid_0's rmse: 7.62274
[900]	valid_0's rmse: 7.59661
[1000]	valid_0's rmse: 7.57112


In [17]:
y_pred = final_model.predict(X_valid)
rmse = mean_squared_error(y_valid, y_pred, squared=False)
print(f'Validation RMSE: {rmse}')

X_test = test_df[features]
test_predictions = final_model.predict(X_test)

submission = pd.DataFrame({
    'position': test_predictions,
    'result_driver_standing': test_df['result_driver_standing'] 
})

submission['position'] = submission['position'].round().astype(int)

submission['position'] = submission['position'].clip(1, 20)

submission['result_driver_standing'] = submission['result_driver_standing'].astype(int)

submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")

Validation RMSE: 7.571120134222269
Submission file created successfully.


In [18]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor

In [19]:
# Initialize the models
rand_model = RandomForestRegressor()
tree_model = DecisionTreeRegressor()
xgb_model = xgb.XGBRegressor()
adb_model = AdaBoostRegressor()

In [20]:
# Create an imputer object with a strategy to fill missing values
imputer = SimpleImputer(strategy='mean')

In [21]:
# Fit the imputer on the training data
X_train_imputed = imputer.fit_transform(X_train)
X_valid_imputed = imputer.transform(X_valid)

In [22]:
# Fit the models
rand_model.fit(X_train_imputed, y_train)
tree_model.fit(X_train_imputed, y_train)
xgb_model.fit(X_train_imputed, y_train)
adb_model.fit(X_train_imputed, y_train)

In [23]:
# Predict and evaluate the models
y_pred_rand = rand_model.predict(X_valid_imputed)
y_pred_tree = tree_model.predict(X_valid_imputed)
y_pred_xgb = xgb_model.predict(X_valid_imputed)
y_pred_adb = adb_model.predict(X_valid_imputed)

In [24]:
# Calculate RMSE for each model
rmse_rand = mean_squared_error(y_valid, y_pred_rand, squared=False)
rmse_tree = mean_squared_error(y_valid, y_pred_tree, squared=False)
rmse_xgb = mean_squared_error(y_valid, y_pred_xgb, squared=False)
rmse_adb = mean_squared_error(y_valid, y_pred_adb, squared=False)

print(f'Validation RMSE (Random Forest): {rmse_rand}')
print(f'Validation RMSE (Decision Tree): {rmse_tree}')
print(f'Validation RMSE (XGBoost): {rmse_xgb}')
print(f'Validation RMSE (AdaBoost): {rmse_adb}')

Validation RMSE (Random Forest): 7.313971880329701
Validation RMSE (Decision Tree): 7.314245315065247
Validation RMSE (XGBoost): 7.395562015265371
Validation RMSE (AdaBoost): 9.949862097143258


In [25]:
from sklearn import metrics

In [29]:
accuracy_rand = compute_accuracy(y_valid, y_pred_rand, acceptable_deviation)
accuracy_tree = compute_accuracy(y_valid, y_pred_tree, acceptable_deviation)
accuracy_xgb = compute_accuracy(y_valid, y_pred_xgb, acceptable_deviation)
accuracy_adb = compute_accuracy(y_valid, y_pred_adb, acceptable_deviation)

print(f'Validation Accuracy within ±{acceptable_deviation} positions (Random Forest): {accuracy_rand:.2f}%')
print(f'Validation Accuracy within ±{acceptable_deviation} positions (Decision Tree): {accuracy_tree:.2f}%')
print(f'Validation Accuracy within ±{acceptable_deviation} positions (XGBoost): {accuracy_xgb:.2f}%')
print(f'Validation Accuracy within ±{acceptable_deviation} positions (AdaBoost): {accuracy_adb:.2f}%')


Validation Accuracy within ±1 positions (Random Forest): 17.00%
Validation Accuracy within ±1 positions (Decision Tree): 17.03%
Validation Accuracy within ±1 positions (XGBoost): 16.58%
Validation Accuracy within ±1 positions (AdaBoost): 11.37%


In [31]:
from sklearn.preprocessing import StandardScaler

In [33]:
from sklearn.model_selection import GridSearchCV

In [39]:
!pip install optuna



In [41]:
import optuna

In [43]:
# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)
test_df[features] = scaler.transform(test_df[features])

# Split data into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Use a smaller subset of the data for hyperparameter tuning
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

# Define objective function for Optuna
def objective(trial):
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10)
    }

    train_data = lgb.Dataset(X_train_sample, label=y_train_sample)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    model = lgb.train(
        param,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=True),
            lgb.log_evaluation(period=100)
        ]
    )

    y_pred = model.predict(X_valid)
    rmse = mean_squared_error(y_valid, y_pred, squared=False)
    return rmse

# Optimize hyperparameters using Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Get best parameters
best_params = study.best_params
best_params['objective'] = 'regression'
best_params['metric'] = 'rmse'
best_params['verbosity'] = -1

# Train the final model with the best parameters
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

final_model = lgb.train(
    best_params,
    train_data,
    num_boost_round=1000,
    valid_sets=[valid_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(period=100)
    ]
)

# Evaluate the final model on the validation set
y_pred = final_model.predict(X_valid)
rmse = mean_squared_error(y_valid, y_pred, squared=False)
print(f'Validation RMSE: {rmse}')

# Make predictions on the test set
X_test = test_df[features]
test_predictions = final_model.predict(X_test)

# Create a submission DataFrame
submission = pd.DataFrame({
    'position': test_predictions
    'result_driver_standing': test_df['result_driver_standing'],  # Assuming 'resultId' is the index or a column in test_df
})

# Save submission to CSV
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")

[I 2024-06-29 01:16:05,303] A new study created in memory with name: no-name-7b82d3de-bb9e-4e7f-9056-9a48a74275b8


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 7.65291
[200]	valid_0's rmse: 7.51839
[300]	valid_0's rmse: 7.45195
[400]	valid_0's rmse: 7.42233
[500]	valid_0's rmse: 7.40894
[600]	valid_0's rmse: 7.39962
[700]	valid_0's rmse: 7.39761
[800]	valid_0's rmse: 7.39705
[900]	valid_0's rmse: 7.3978
Early stopping, best iteration is:
[822]	valid_0's rmse: 7.39621


[I 2024-06-29 01:16:45,990] Trial 0 finished with value: 7.39620660161211 and parameters: {'learning_rate': 0.0597217211555161, 'num_leaves': 38, 'feature_fraction': 0.8736476579531497, 'bagging_fraction': 0.8912787916342938, 'bagging_freq': 5}. Best is trial 0 with value: 7.39620660161211.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 7.84762
[200]	valid_0's rmse: 7.55962
[300]	valid_0's rmse: 7.48628
[400]	valid_0's rmse: 7.44816
[500]	valid_0's rmse: 7.42881
[600]	valid_0's rmse: 7.41251
[700]	valid_0's rmse: 7.40092
[800]	valid_0's rmse: 7.39285
[900]	valid_0's rmse: 7.39076
[1000]	valid_0's rmse: 7.38901
Did not meet early stopping. Best iteration is:
[980]	valid_0's rmse: 7.38894


[I 2024-06-29 01:18:05,968] Trial 1 finished with value: 7.388939842762505 and parameters: {'learning_rate': 0.01763453624838673, 'num_leaves': 90, 'feature_fraction': 0.9816623731942931, 'bagging_fraction': 0.8986541043398814, 'bagging_freq': 5}. Best is trial 1 with value: 7.388939842762505.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 7.77985
[200]	valid_0's rmse: 7.61045
[300]	valid_0's rmse: 7.52966
[400]	valid_0's rmse: 7.47998
[500]	valid_0's rmse: 7.44855
[600]	valid_0's rmse: 7.42533
[700]	valid_0's rmse: 7.41057
[800]	valid_0's rmse: 7.39935
[900]	valid_0's rmse: 7.39128
[1000]	valid_0's rmse: 7.38626
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 7.38626


[I 2024-06-29 01:19:17,476] Trial 2 finished with value: 7.386255261152265 and parameters: {'learning_rate': 0.03454246810533098, 'num_leaves': 42, 'feature_fraction': 0.9088724634051651, 'bagging_fraction': 0.9915773646309484, 'bagging_freq': 4}. Best is trial 2 with value: 7.386255261152265.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 8.18958
[200]	valid_0's rmse: 7.81628
[300]	valid_0's rmse: 7.66254
[400]	valid_0's rmse: 7.59562
[500]	valid_0's rmse: 7.55852
[600]	valid_0's rmse: 7.52754
[700]	valid_0's rmse: 7.50149
[800]	valid_0's rmse: 7.47873
[900]	valid_0's rmse: 7.46035
[1000]	valid_0's rmse: 7.44655
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 7.44655


[I 2024-06-29 01:20:23,991] Trial 3 finished with value: 7.446545324166264 and parameters: {'learning_rate': 0.01230093696586767, 'num_leaves': 56, 'feature_fraction': 0.8514181222671509, 'bagging_fraction': 0.9755788858020719, 'bagging_freq': 1}. Best is trial 2 with value: 7.386255261152265.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 7.69974
[200]	valid_0's rmse: 7.54563
[300]	valid_0's rmse: 7.47898
[400]	valid_0's rmse: 7.44596
[500]	valid_0's rmse: 7.42263
[600]	valid_0's rmse: 7.4042
[700]	valid_0's rmse: 7.39773
[800]	valid_0's rmse: 7.39406
[900]	valid_0's rmse: 7.38828
[1000]	valid_0's rmse: 7.38715
Did not meet early stopping. Best iteration is:
[948]	valid_0's rmse: 7.38517


[I 2024-06-29 01:21:27,192] Trial 4 finished with value: 7.385171403190093 and parameters: {'learning_rate': 0.033580729761321725, 'num_leaves': 56, 'feature_fraction': 0.996510617015941, 'bagging_fraction': 0.928083358701578, 'bagging_freq': 10}. Best is trial 4 with value: 7.385171403190093.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 7.71077
[200]	valid_0's rmse: 7.57337
[300]	valid_0's rmse: 7.5043
[400]	valid_0's rmse: 7.45514
[500]	valid_0's rmse: 7.43373
[600]	valid_0's rmse: 7.42072
[700]	valid_0's rmse: 7.41152
[800]	valid_0's rmse: 7.40955
[900]	valid_0's rmse: 7.40934
Early stopping, best iteration is:
[820]	valid_0's rmse: 7.40812


[I 2024-06-29 01:22:12,293] Trial 5 finished with value: 7.408117278856206 and parameters: {'learning_rate': 0.051493779750945726, 'num_leaves': 36, 'feature_fraction': 0.9068244120431425, 'bagging_fraction': 0.8302083991638234, 'bagging_freq': 4}. Best is trial 4 with value: 7.385171403190093.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 7.47481
[200]	valid_0's rmse: 7.40956
[300]	valid_0's rmse: 7.40257
Early stopping, best iteration is:
[248]	valid_0's rmse: 7.40116


[I 2024-06-29 01:22:33,604] Trial 6 finished with value: 7.401156439313183 and parameters: {'learning_rate': 0.07099669422380556, 'num_leaves': 84, 'feature_fraction': 0.7569041219054111, 'bagging_fraction': 0.803480815777288, 'bagging_freq': 4}. Best is trial 4 with value: 7.385171403190093.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 7.91535
[200]	valid_0's rmse: 7.73898
[300]	valid_0's rmse: 7.63937
[400]	valid_0's rmse: 7.57581
[500]	valid_0's rmse: 7.5277
[600]	valid_0's rmse: 7.49347
[700]	valid_0's rmse: 7.47199
[800]	valid_0's rmse: 7.45267
[900]	valid_0's rmse: 7.44151
[1000]	valid_0's rmse: 7.43182
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 7.43182


[I 2024-06-29 01:23:20,773] Trial 7 finished with value: 7.43181852175245 and parameters: {'learning_rate': 0.042337199195333367, 'num_leaves': 22, 'feature_fraction': 0.9736736945728879, 'bagging_fraction': 0.8663488968580109, 'bagging_freq': 2}. Best is trial 4 with value: 7.385171403190093.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 7.47071
[200]	valid_0's rmse: 7.402
[300]	valid_0's rmse: 7.39387
[400]	valid_0's rmse: 7.39553
Early stopping, best iteration is:
[318]	valid_0's rmse: 7.3916


[I 2024-06-29 01:23:49,073] Trial 8 finished with value: 7.391598144039277 and parameters: {'learning_rate': 0.06624631167926337, 'num_leaves': 86, 'feature_fraction': 0.9776283944184022, 'bagging_fraction': 0.9111148014974881, 'bagging_freq': 10}. Best is trial 4 with value: 7.385171403190093.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 8.06466
[200]	valid_0's rmse: 7.76618
[300]	valid_0's rmse: 7.64815
[400]	valid_0's rmse: 7.59543
[500]	valid_0's rmse: 7.55791
[600]	valid_0's rmse: 7.52891
[700]	valid_0's rmse: 7.50379
[800]	valid_0's rmse: 7.48317
[900]	valid_0's rmse: 7.46527
[1000]	valid_0's rmse: 7.4531
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 7.4531


[I 2024-06-29 01:24:53,574] Trial 9 finished with value: 7.453095518753085 and parameters: {'learning_rate': 0.017139611191592758, 'num_leaves': 44, 'feature_fraction': 0.8107275383128932, 'bagging_fraction': 0.737106423859892, 'bagging_freq': 3}. Best is trial 4 with value: 7.385171403190093.


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 7.68755
[200]	valid_0's rmse: 7.51956
[300]	valid_0's rmse: 7.44144
[400]	valid_0's rmse: 7.39533
[500]	valid_0's rmse: 7.36536
[600]	valid_0's rmse: 7.34162
[700]	valid_0's rmse: 7.32815
[800]	valid_0's rmse: 7.31709
[900]	valid_0's rmse: 7.30944
[1000]	valid_0's rmse: 7.3034
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 7.3034
Validation RMSE: 7.303401791828516
Submission file created successfully.
