In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [21]:
# Load the dataset (path is resolved relative to the notebook's working directory)
df = pd.read_csv("race_predictor_data.csv")

# Define features and target
features = ['averageCumRacerPoints', 'qualifyingALPC', 'averageDriverExpYears',
            'avgDriverRaceCount', 'FRALPC', 'absolute_position_diff', 'topTenDiversity', 'circuitId', 'countryId']
target = 'totalDNFs'

# Take an explicit copy of the feature columns: X is modified later (categorical
# encoding), and writing into a selection of `df` raises SettingWithCopyWarning
# on pandas versions without Copy-on-Write. The copy also guarantees `df` itself
# is never altered by edits to X.
X = df[features].copy()
y = df[target]

In [22]:
# Encode the categorical feature columns as integer codes with LabelEncoder.
# NOTE(review): the resulting codes are arbitrary ordinals; a tree model can split
# on them, but one-hot encoding may be preferable for other estimators.

# X was selected from `df`, so detach it first: writing into the selection
# directly can raise SettingWithCopyWarning.
X = X.copy()

for col in ['topTenDiversity', 'circuitId', 'countryId']:
    le = LabelEncoder()
    # Plain column assignment replaces the column, so it takes the encoder's
    # integer dtype. `X.loc[:, col] = ...` sets values in place on pandas 2.x and
    # can leave an object-dtype column as object.
    X[col] = le.fit_transform(X[col])

In [24]:
# 7-fold cross-validation; shuffling with a fixed seed keeps fold membership repeatable.
kf = KFold(n_splits=7, shuffle=True, random_state=7)

# Per-fold metrics, appended in fold order.
mse_scores, r2_scores = [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n--- Fold {fold + 1} ---")

    # kf.split yields positional indices, hence .iloc for both features and target.
    X_train = X.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_train = y.iloc[train_idx]
    y_val = y.iloc[val_idx]

    # A fresh tree is fitted for every fold so no state carries over between folds.
    model = DecisionTreeRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Score the held-out part of this fold.
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")

    mse_scores.append(mse)
    r2_scores.append(r2)

# Unweighted mean of the per-fold metrics.
avg_mse = sum(mse_scores) / len(mse_scores)
avg_r2 = sum(r2_scores) / len(r2_scores)

print("\n=== Average Performance Across Folds ===")
print(f"Average MSE: {avg_mse:.4f}")
print(f"Average R²: {avg_r2:.4f}")


--- Fold 1 ---
MSE: 9.1782
R²: 0.4934

--- Fold 2 ---
MSE: 7.8416
R²: 0.5952

--- Fold 3 ---
MSE: 8.7426
R²: 0.5403

--- Fold 4 ---
MSE: 7.6337
R²: 0.6307

--- Fold 5 ---
MSE: 9.6238
R²: 0.4838

--- Fold 6 ---
MSE: 8.8300
R²: 0.5728

--- Fold 7 ---
MSE: 11.0900
R²: 0.4345

=== Average Performance Across Folds ===
Average MSE: 8.9914
Average R²: 0.5358


In [25]:
# Hold out 20% of the rows as a test set; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

# Fit a decision tree regressor on the training portion only.
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)

# Predictions for the held-out rows; they are scored in the following cell.
y_pred = tree_model.predict(X_test)

In [26]:
# Score the hold-out predictions against the true test targets.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Test Set MSE: {mse:.4f}")
print(f"Test Set R²: {r2:.4f}")

Test Set MSE: 8.8298
Test Set R²: 0.5163
