# **SVM**

## Imports

In [None]:
import numpy as np
import pandas as pd

# Model selection & evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)

# Preprocessing & pipelines
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

## Configuration and Data Loading

In this section, we:

- Define a few global **configuration variables** (e.g., `RANDOM_STATE`, `DATA_PATH`) to keep experiments consistent across models.
- **Load** the pre-cleaned F1 dataset from disk.
- Do a quick **sanity check** of the data shape and the target distribution for `target_finish` (DNF vs finished).

In [None]:
# --- Configuration ---
RANDOM_STATE = 42                 # seed reused by the SVC estimators in later cells
DATA_PATH = "processed_data.csv"  # adjust to your actual path

# --- Load the pre-processed dataset ---
df = pd.read_csv(DATA_PATH)

# --- Quick sanity checks: shape, column names, target balance ---
column_names = df.columns.tolist()
print("Dataset shape:", df.shape)
print("\nColumns:\n", column_names)

# One value_counts call per view: raw counts first, then normalised shares.
for heading, as_share in (("counts", False), ("proportions", True)):
    print(f"\nTarget distribution ({heading}):")
    print(df["target_finish"].value_counts(normalize=as_share))

Dataset shape: (10000, 37)

Columns:
 ['year', 'round', 'grid', 'alt', 'target_finish', 'constructorRef_brabham', 'constructorRef_ferrari', 'constructorRef_ligier', 'constructorRef_mclaren', 'constructorRef_red_bull', 'constructorRef_renault', 'constructorRef_sauber', 'constructorRef_team_lotus', 'constructorRef_tyrrell', 'constructorRef_williams', 'circuitRef_hockenheimring', 'circuitRef_hungaroring', 'circuitRef_interlagos', 'circuitRef_monaco', 'circuitRef_monza', 'circuitRef_nurburgring', 'circuitRef_red_bull_ring', 'circuitRef_silverstone', 'circuitRef_spa', 'circuitRef_villeneuve', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'target_dnf']

Target distribution (counts):
target_finish
0    7105
1    2895
Name: count, dtype: int64

Target distribution (proportions):
target_finish
0    0.7105
1    0.2895
Name: proportion, dtype: float64


## Train/Test Split (Time-based, aligned with main pipeline)

We follow the global preprocessing pipeline and split the data by year:

- **Train set**: races up to and including 2015  
- **Test set**: races after 2015  

The feature matrix `X` excludes both `target_finish` and `target_dnf`, and the
label vector `y` uses `target_dnf` (1 = DNF, 0 = finish).

In [31]:
# Time-based train/test split (aligned with the main pipeline):
# rows with year <= cutoff_year form the training set, later years the test set.
cutoff_year = 2015
label_col = "target_dnf"
non_feature_cols = ["target_finish", "target_dnf"]

# Split the loaded dataframe (df) by year; .copy() yields independent frames.
race_year = df["year"]
train_data = df.loc[race_year.le(cutoff_year)].copy()
test_data = df.loc[race_year.gt(cutoff_year)].copy()

# Features drop both target columns; the label is the DNF indicator.
X_train, y_train = train_data.drop(columns=non_feature_cols), train_data[label_col]
X_test, y_test = test_data.drop(columns=non_feature_cols), test_data[label_col]

print("Training size (X, y):", X_train.shape, ",", len(y_train))
print("Test size (X, y):", X_test.shape, ",", len(y_test))

# Leakage check: neither target column may remain among the features.
print("\nChecking that targets are not in X columns:")
for col in non_feature_cols:
    print(f"{col} in X_train? {col in X_train.columns}")

Training size (X, y): (8550, 35) , 8550
Test size (X, y): (1450, 35) , 1450

Checking that targets are not in X columns:
target_finish in X_train? False
target_dnf in X_train? False


## SVM Baseline Model

We start with a simple Support Vector Machine classifier using:
- `StandardScaler` for feature normalization
- default `SVC` settings


We evaluate performance using Accuracy, Precision, Recall, and F1 (with DNF = 1 as the positive class).

In [32]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Baseline: standardise the features, then an SVC with default hyperparameters.
svm_baseline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", SVC(random_state=RANDOM_STATE)),
    ]
)

# Train on the training years, then predict the held-out years.
svm_baseline.fit(X_train, y_train)
y_pred = svm_baseline.predict(X_test)

# Report each metric with the same label padding and 4-decimal formatting.
baseline_metrics = (
    ("Accuracy:   ", accuracy_score),
    ("Precision:  ", precision_score),
    ("Recall:     ", recall_score),
    ("F1 Score:   ", f1_score),
)

print("=== Baseline SVM Performance (DNF = 1) ===")
for metric_label, metric_fn in baseline_metrics:
    print(f"{metric_label}{metric_fn(y_test, y_pred):.4f}")

=== Baseline SVM Performance (DNF = 1) ===
Accuracy:   0.6303
Precision:  0.5785
Recall:     0.7585
F1 Score:   0.6564


## SVM Hyperparameter Search (Grid Search)

In this section, we perform a hyperparameter search for the SVM model using `GridSearchCV`.

We vary:
- `kernel` (linear, rbf, poly)
- `C` (regularization strength)
- `gamma` (kernel coefficient for rbf and poly)
- optionally `class_weight` (to account for class imbalance)

We use 5-fold cross-validation on the training data and optimize for **F1 score** with DNF = 1 as the positive class.

In [36]:
# Grid search over SVM hyperparameters.
# GridSearchCV, Pipeline and StandardScaler come from the import cell at the top;
# SVC is imported in the baseline cell.

# Define SVM pipeline: scaling is part of the pipeline, so it is re-fitted on the
# training portion of every CV fold.
svm_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVC(random_state=RANDOM_STATE))
])

# Hyperparameter grid.
# `gamma` has no effect on the linear kernel, so the linear kernel gets its own
# sub-grid without it. Crossing it with gamma would fit every linear candidate
# three times and fill cv_results_ with exact-tie duplicates.
c_values = [0.1, 1, 10]
param_grid = [
    {
        "model__kernel": ["linear"],
        "model__C": c_values,
    },
    {
        "model__kernel": ["rbf", "poly"],
        "model__C": c_values,
        "model__gamma": ["scale", 0.1, 0.01],  # gamma used for rbf/poly
    },
]
# Optional, to account for class imbalance: add
#   "model__class_weight": [None, "balanced"]
# to each sub-grid above.

# Grid search object (no print statements here).
# scoring="f1" is the binary F1 of the positive class (label 1 = DNF).
# NOTE(review): the recorded outputs show CV F1 of about 0.87 against test F1 of
# about 0.65. cv=5 builds folds without regard to year while the hold-out is
# strictly later years -- consider a time-aware splitter; confirm before changing.
svm_grid = GridSearchCV(
    estimator=svm_pipe,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=0   # turn off console spam
)

# Fit on training data (silent); as the last expression, the fitted search object
# is rendered as the cell output.
svm_grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__C': [0.1, 1, ...], 'model__gamma': ['scale', 0.1, ...], 'model__kernel': ['linear', 'rbf', ...]}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,0.01
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


## Top-K SVM Models: Test Set Performance

Instead of only looking at the single best SVM model, we also inspect the
top **K** hyperparameter configurations from the grid search.

For each of the top-K models (ranked by mean CV F1 score), we:
- refit the model on the full training set
- evaluate it on the test set
- report Accuracy, Precision, Recall, and F1 (DNF = 1 as the positive class)

In [37]:
from sklearn.base import clone
from sklearn.dummy import DummyClassifier

TOP_K = 5  # number of top SVM models you want to inspect


def _test_metrics(y_true, y_hat):
    """Return the four test-set metrics shown in the comparison table.

    Precision, recall and F1 use scikit-learn's binary default, i.e. the
    positive class is label 1 (DNF). `zero_division=0` is applied to every
    row, so a model that never predicts the positive class scores 0 without
    emitting an undefined-metric warning.

    Parameters
    ----------
    y_true : array-like of true labels.
    y_hat : array-like of predicted labels.

    Returns
    -------
    dict with keys `test_accuracy`, `test_precision`, `test_recall`, `test_f1`.
    """
    return {
        "test_accuracy": accuracy_score(y_true, y_hat),
        "test_precision": precision_score(y_true, y_hat, zero_division=0),
        "test_recall": recall_score(y_true, y_hat, zero_division=0),
        "test_f1": f1_score(y_true, y_hat, zero_division=0),
    }


cv_results = pd.DataFrame(svm_grid.cv_results_)

# Sort by mean_test_score (which corresponds to CV F1 because scoring="f1").
# A stable sort keeps candidates with identical scores in grid order, so the
# ranking is deterministic across runs.
cv_results_sorted = cv_results.sort_values(
    by="mean_test_score", ascending=False, kind="stable"
).reset_index(drop=True)

top_k_rows = cv_results_sorted.head(TOP_K)

records = []

# ==== 1) Add Dummy Baseline as Rank 0 ====
# Always predicts the most frequent training label; it has no CV scores.

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
y_dummy = dummy_clf.predict(X_test)

records.append({
    "rank": 0,
    "params": "Dummy(most_frequent)",
    "cv_mean_f1": None,
    "cv_std_f1": None,
    **_test_metrics(y_test, y_dummy),
})

# ==== 2) Add Top-K SVM Models ====
# The test scores are for inspection only: picking a configuration by its test
# score would turn the test set into a second validation set.
# Predictions stay local to this loop so the baseline cell's `y_pred` is not
# overwritten.

for rank, row in enumerate(top_k_rows.to_dict("records"), start=1):
    # Fresh, unfitted copy of the base pipeline with this candidate's settings
    candidate = clone(svm_grid.estimator).set_params(**row["params"])

    # Fit on full training data
    candidate.fit(X_train, y_train)

    records.append({
        "rank": rank,
        "params": row["params"],
        "cv_mean_f1": row["mean_test_score"],
        "cv_std_f1": row["std_test_score"],
        **_test_metrics(y_test, candidate.predict(X_test)),
    })

# Convert to DataFrame
comparison_df = pd.DataFrame(records)

# Show the full `params` dicts for this table only, without changing the
# notebook-wide pandas display options.
with pd.option_context("display.max_colwidth", None):
    display(comparison_df)

Unnamed: 0,rank,params,cv_mean_f1,cv_std_f1,test_accuracy,test_precision,test_recall,test_f1
0,0,Dummy(most_frequent),,,0.465517,0.465517,1.0,0.635294
1,1,"{'model__C': 10, 'model__gamma': 0.01, 'model__kernel': 'rbf'}",0.871087,0.005852,0.632414,0.585749,0.718519,0.645376
2,2,"{'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'rbf'}",0.870956,0.004408,0.63931,0.610145,0.623704,0.61685
3,3,"{'model__C': 1, 'model__gamma': 'scale', 'model__kernel': 'poly'}",0.868662,0.005802,0.605517,0.555917,0.758519,0.641604
4,4,"{'model__C': 10, 'model__gamma': 0.01, 'model__kernel': 'poly'}",0.868447,0.003263,0.608276,0.557465,0.768889,0.646326
5,5,"{'model__C': 1, 'model__gamma': 'scale', 'model__kernel': 'rbf'}",0.86826,0.005652,0.630345,0.578531,0.758519,0.65641
