In [3]:
import pandas as pd
import lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [4]:

# Load your data
data = pd.read_csv('race_predictor_data.csv')

# Define features
continuous_features = ['averageCumRacerPoints', 'qualifyingALPC', 'averageDriverExpYears', 'avgDriverRaceCount', 'FRALPC',
                       'absolute_position_diff', 'absGridDelta']
categorical_features = ['topTenDiversity', 'circuitId', 'countryId']
all_features = continuous_features + categorical_features
target = 'totalDNFs'
target_alpc = 'ALPC'

# Ensure categorical dtype
for col in categorical_features:
    data[col] = data[col].astype('category')

X = data[all_features]
y = data[target]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

In [6]:
# Set up KFold
kf = KFold(n_splits=7, shuffle=True, random_state=7)

mse_scores = []
r2_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\n--- Fold {fold + 1} ---")
    
    X_kf_train, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_kf_train, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = LGBMRegressor(
        objective='regression',
        n_estimators=1000,
        random_state=42
    )

    model.fit(
        X_kf_train,
        y_kf_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[early_stopping(stopping_rounds=77)],
        categorical_feature=categorical_features
    )
    
    y_val_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)

    print(f"Validation MSE: {mse:.4f}")
    print(f"Validation R²: {r2:.4f}")

    mse_scores.append(mse)
    r2_scores.append(r2)

print("\n=== Cross-Validation Results ===")
print(f"Average Validation MSE: {sum(mse_scores)/len(mse_scores):.4f}")
print(f"Average Validation R²: {sum(r2_scores)/len(r2_scores):.4f}")


--- Fold 1 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004842 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 894
[LightGBM] [Info] Number of data points in the train set: 483, number of used features: 10
[LightGBM] [Info] Start training from score 6.594203
Training until validation scores don't improve for 77 rounds
Early stopping, best iteration is:
[29]	valid_0's rmse: 1.98737	valid_0's l2: 3.94964
Validation MSE: 3.9496
Validation R²: 0.8088

--- Fold 2 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 896
[LightGBM] [Info] Number of data points in the train set: 483, number of used features: 10
[LightGBM] [Info] Start training from score 6.774327
Training until validation scores don't improve for 77 rounds
Early stopping, best iteration is:
[2

In [7]:
# Results summary
print("\n=== Cross-Validation Results ===")
print(f"Average MSE: {np.mean(mse_scores):.4f}")
print(f"Average R²: {np.mean(r2_scores):.4f}")


=== Cross-Validation Results ===
Average MSE: 4.6694
Average R²: 0.7604


In [8]:
# Train the model on the Training Set
dnf_model = LGBMRegressor(
    objective='regression',
    n_estimators=77,
    random_state=7
)

dnf_model.fit(
    X_train,
    y_train,
    categorical_feature=categorical_features
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000112 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1008
[LightGBM] [Info] Number of data points in the train set: 564, number of used features: 10
[LightGBM] [Info] Start training from score 6.696809


In [9]:
# Step 4: Evaluate on the held-out test set
y_test_pred = dnf_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\n=== Final Test Set Performance ===")
print(f"Test Set MSE: {test_mse:.4f}")
print(f"Test Set R²: {test_r2:.4f}")



=== Final Test Set Performance ===
Test Set MSE: 5.1407
Test Set R²: 0.7184


In [10]:
data['predicted_DNFs'] = dnf_model.predict(data[all_features])

In [11]:
alpc_features = ['averageCumRacerPoints', 'qualifyingALPC', 'averageDriverExpYears', 
                 'avgDriverRaceCount', 'FRALPC', 'predicted_DNFs'] #+ categorical_features

X_alpc = data[alpc_features]
y_alpc = data[target_alpc]

In [12]:
X_train_alpc, X_test_alpc, y_train_alpc, y_test_alpc = train_test_split(
    X_alpc, y_alpc, test_size=0.2, random_state=7
)

In [13]:
kf = KFold(n_splits=7, shuffle=True, random_state=7)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_alpc)):
    print(f"\n--- Fold {fold + 1} ---")
    X_kf_train, X_val = X_train_alpc.iloc[train_idx], X_train_alpc.iloc[val_idx]
    y_kf_train, y_val = y_train_alpc.iloc[train_idx], y_train_alpc.iloc[val_idx]

    alpc_model = LGBMRegressor(n_estimators=1000, random_state=7)
    alpc_model.fit(
        X_kf_train, y_kf_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[early_stopping(stopping_rounds=50)]
        #categorical_feature=categorical_features
    )

    y_val_pred = alpc_model.predict(X_val)
    mse = mean_squared_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)

    print(f"Validation MSE: {mse:.4f}")
    print(f"Validation R²: {r2:.4f}")


--- Fold 1 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 744
[LightGBM] [Info] Number of data points in the train set: 483, number of used features: 6
[LightGBM] [Info] Start training from score 2.277162
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[12]	valid_0's rmse: 0.78583	valid_0's l2: 0.617529
Validation MSE: 0.6175
Validation R²: 0.1006

--- Fold 2 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 746
[LightGBM] [Info] Number of data points in the train set: 483, number of used features: 6
[LightGBM] [Info] Start training from score 2.228782
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[31

In [14]:
final_alpc_model = LGBMRegressor(n_estimators=5, random_state=7)
final_alpc_model.fit(
    X_train_alpc, y_train_alpc
    #categorical_feature=categorical_features
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 848
[LightGBM] [Info] Number of data points in the train set: 564, number of used features: 6
[LightGBM] [Info] Start training from score 2.245112


In [15]:
y_test_pred = final_alpc_model.predict(X_test_alpc)
test_mse = mean_squared_error(y_test_alpc, y_test_pred)
test_r2 = r2_score(y_test_alpc, y_test_pred)

print("\n=== Final ALPC Test Set Performance ===")
print(f"Test Set MSE: {test_mse:.4f}")
print(f"Test Set R²: {test_r2:.4f}")


=== Final ALPC Test Set Performance ===
Test Set MSE: 0.9395
Test Set R²: 0.1121
