# **SVM**

## Imports

In [8]:
import numpy as np
import pandas as pd

# Model selection & evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Models
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.base import clone

# Preprocessing & pipelines
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

## Configuration and Data Loading

In this section, we:

- Define a few global **configuration variables** (e.g., `RANDOM_STATE`, `TEST_SIZE`) to keep experiments consistent across models.
- **Load** the pre-cleaned F1 dataset from disk.
- Do a quick **sanity check** of the data shape and the target distribution for `target_finish` (DNF vs finished).

In [12]:
# Configuration
RANDOM_STATE = 42
DATA_PATH = "processed_data.csv"

# Helper function to print evaluation metrics
def evaluate_model(name, y_true, y_pred):
    print(f"=== {name} ===")
    print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall   : {recall_score(y_true, y_pred):.4f}")
    print(f"F1       : {f1_score(y_true, y_pred):.4f}")

# Load dataset
df = pd.read_csv(DATA_PATH)

# Quick sanity checks
print("Dataset shape:", df.shape)
print("\nColumns:\n", df.columns.tolist())

print("\nTarget distribution (counts):")
print(df["target_finish"].value_counts())

print("\nTarget distribution (proportions):")
print(df["target_finish"].value_counts(normalize=True))

Dataset shape: (10000, 37)

Columns:
 ['year', 'round', 'grid', 'alt', 'target_finish', 'constructorRef_brabham', 'constructorRef_ferrari', 'constructorRef_ligier', 'constructorRef_mclaren', 'constructorRef_red_bull', 'constructorRef_renault', 'constructorRef_sauber', 'constructorRef_team_lotus', 'constructorRef_tyrrell', 'constructorRef_williams', 'circuitRef_hockenheimring', 'circuitRef_hungaroring', 'circuitRef_interlagos', 'circuitRef_monaco', 'circuitRef_monza', 'circuitRef_nurburgring', 'circuitRef_red_bull_ring', 'circuitRef_silverstone', 'circuitRef_spa', 'circuitRef_villeneuve', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'target_dnf']

Target distribution (counts):
target_finish
0    7105
1    2895
Name: count, dtype: int64

Target distribution (proportions):
target_finish
0    0.7105
1    0.2895
Name: proportion, dtype: float64


## Train/Test Split (Time-based, aligned with main pipeline)

We follow the global preprocessing pipeline and split the data by year:

- **Train set**: races up to and including 2015  
- **Test set**: races after 2015  

The feature matrix `X` excludes both `target_finish` and `target_dnf`, and the
label vector `y` uses `target_dnf` (1 = DNF, 0 = finish).

In [4]:
## Train/Test Split (Time-based, aligned with main pipeline)

cutoff_year = 2015

# Use the loaded processed dataframe (df) instead of encoded_data
train_data = df[df["year"] <= cutoff_year].copy()
test_data  = df[df["year"] >  cutoff_year].copy()

# Feature matrix X excludes both target columns
X_train = train_data.drop(columns=["target_finish", "target_dnf"])
y_train = train_data["target_dnf"]

X_test  = test_data.drop(columns=["target_finish", "target_dnf"])
y_test  = test_data["target_dnf"]

print("Training size (X, y):", X_train.shape, ",", len(y_train))
print("Test size (X, y):", X_test.shape, ",", len(y_test))

print("\nChecking that targets are not in X columns:")
for col in ["target_finish", "target_dnf"]:
    print(f"{col} in X_train? {col in X_train.columns}")

Training size (X, y): (8550, 35) , 8550
Test size (X, y): (1450, 35) , 1450

Checking that targets are not in X columns:
target_finish in X_train? False
target_dnf in X_train? False


## SVM Baseline Model

We start with a simple Support Vector Machine classifier using:
- `StandardScaler` for feature normalization
- default `SVC` settings


We evaluate performance using Accuracy, Precision, Recall, and F1 (with DNF = 1 as the positive class).

In [None]:
# Baseline SVM pipeline: StandardScaler + default SVC
svm_baseline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVC(random_state=RANDOM_STATE))
])

# Fit model
svm_baseline.fit(X_train, y_train)

# Predict test set
y_pred_baseline = svm_baseline.predict(X_test)

# Evaluate
evaluate_model("Baseline SVM (DNF = 1)", y_test, y_pred_baseline)

=== Baseline SVM (DNF = 1) ===
Accuracy : 0.6303
Precision: 0.5785
Recall   : 0.7585
F1       : 0.6564


## SVM Hyperparameter Search (Grid Search)

In this section, we perform a hyperparameter search for the SVM model using `GridSearchCV`.

We vary:
- `kernel` (linear, rbf, poly)
- `C` (regularization strength)
- `gamma` (kernel coefficient for rbf and poly)
- optionally `class_weight` (to account for class imbalance)

We use 5-fold cross-validation on the training data and optimize for **F1 score** with DNF = 1 as the positive class.

In [None]:
# Define SVM pipeline
svm_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVC(random_state=RANDOM_STATE))
])

# Hyperparameter grid
param_grid = {
    "model__kernel": ["linear", "rbf", "poly"],
    "model__C": [0.1, 1, 10],
    "model__gamma": ["scale", 0.1, 0.01],  # gamma used for rbf/poly
}

# Grid search object
svm_grid = GridSearchCV(
    estimator=svm_pipe,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=0,
)

# Fit on training data
svm_grid.fit(X_train, y_train)

print("Best params from grid search:")
print(svm_grid.best_params_)
print(f"Best CV F1: {svm_grid.best_score_:.4f}")

Best params from grid search:
{'model__C': 10, 'model__gamma': 0.01, 'model__kernel': 'rbf'}
Best CV F1: 0.8711


## Top-K SVM Models: Test Set Performance

Instead of only looking at the single best SVM model, we also inspect the
top **K** hyperparameter configurations from the grid search.

For each of the top-K models (ranked by mean CV F1 score), we:
- refit the model on the full training set
- evaluate it on the test set
- report Accuracy, Precision, Recall, and F1 (DNF = 1 as the positive class)

In [9]:
TOP_K = 10  # number of top SVM models you want to inspect

# Extract and sort by mean_test_score (which corresponds to CV F1 because scoring="f1")
cv_results = pd.DataFrame(svm_grid.cv_results_)
cv_results_sorted = cv_results.sort_values(
    by="mean_test_score", ascending=False
).reset_index(drop=True)

top_k_rows = cv_results_sorted.head(TOP_K)

records = []

# ==== 1) Add Dummy Baseline as Rank 0 ====

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
y_dummy = dummy_clf.predict(X_test)

records.append({
    "rank": 0,
    "params": "Dummy(most_frequent)",
    "cv_mean_f1": None,
    "cv_std_f1": None,
    "test_accuracy": accuracy_score(y_test, y_dummy),
    "test_precision": precision_score(y_test, y_dummy, zero_division=0),
    "test_recall": recall_score(y_test, y_dummy, zero_division=0),
    "test_f1": f1_score(y_test, y_dummy, zero_division=0),
})

# ==== 2) Add Top-K SVM Models ====

for idx, row in top_k_rows.iterrows():
    params = row["params"]

    # Clone base pipeline and set hyperparameters
    model = clone(svm_grid.estimator)
    model.set_params(**params)

    # Fit on full training data
    model.fit(X_train, y_train)

    # Evaluate on test data
    y_pred = model.predict(X_test)

    rec = {
        "rank": idx + 1,
        "params": params,
        "cv_mean_f1": row["mean_test_score"],
        "cv_std_f1": row["std_test_score"],
        "test_accuracy": accuracy_score(y_test, y_pred),
        "test_precision": precision_score(y_test, y_pred, zero_division=0),
        "test_recall": recall_score(y_test, y_pred, zero_division=0),
        "test_f1": f1_score(y_test, y_pred, zero_division=0),
    }
    records.append(rec)

# Convert to DataFrame
comparison_df = pd.DataFrame(records)

pd.set_option("display.max_colwidth", None)
display(comparison_df)

Unnamed: 0,rank,params,cv_mean_f1,cv_std_f1,test_accuracy,test_precision,test_recall,test_f1
0,0,Dummy(most_frequent),,,0.465517,0.465517,1.0,0.635294
1,1,"{'model__C': 10, 'model__gamma': 0.01, 'model__kernel': 'rbf'}",0.871087,0.005852,0.631724,0.585248,0.717037,0.644474
2,2,"{'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'rbf'}",0.870956,0.004408,0.63931,0.610145,0.623704,0.61685
3,3,"{'model__C': 1, 'model__gamma': 'scale', 'model__kernel': 'poly'}",0.868662,0.005802,0.605517,0.555917,0.758519,0.641604
4,4,"{'model__C': 10, 'model__gamma': 0.01, 'model__kernel': 'poly'}",0.868447,0.003263,0.608276,0.557465,0.768889,0.646326
5,5,"{'model__C': 1, 'model__gamma': 'scale', 'model__kernel': 'rbf'}",0.86826,0.005652,0.630345,0.578531,0.758519,0.65641
6,6,"{'model__C': 1, 'model__gamma': 0.01, 'model__kernel': 'rbf'}",0.868021,0.004663,0.628966,0.575691,0.771852,0.659494
7,7,"{'model__C': 0.1, 'model__gamma': 0.1, 'model__kernel': 'poly'}",0.86779,0.003505,0.634483,0.588523,0.714074,0.645248
8,8,"{'model__C': 10, 'model__gamma': 'scale', 'model__kernel': 'poly'}",0.867469,0.003387,0.633103,0.595716,0.659259,0.625879
9,9,"{'model__C': 1, 'model__gamma': 0.01, 'model__kernel': 'linear'}",0.866362,0.003448,0.606897,0.565543,0.671111,0.613821


## SVM Feature Subset Experiments
Using the best SVM hyperparameters found by the grid search (`kernel`, `C`, and `gamma`),  
we investigate how performance changes when we restrict the model to fewer input features.

We compute the absolute Pearson correlation between each **numeric** feature and the target
   label `target_dnf`.
We define four feature sets:
   - **all_features** – all columns used in the original SVM
   - **top_15_corr** – the 15 most correlated features with `target_dnf`
   - **top_10_corr** – the 10 most correlated features
   - **top_5_corr**  – the 5 most correlated features

For each feature set we:
   - train the same “best” SVM (same kernel, `C`, and `gamma`),
   - evaluate Accuracy, Precision, Recall, and F1 on the test set.


In [None]:
# Use the best hyperparameters from the grid search
best_params = svm_grid.best_params_
print("Best SVM params:", best_params)

# Compute correlations with the target for numeric features
corr = df.corr(numeric_only=True)["target_dnf"].abs().sort_values(ascending=False)

# Drop the target itself
corr = corr.drop(labels=["target_dnf", "target_finish"], errors="ignore")

print("\nTop 15 features by |correlation with target_dnf|:")
print(corr.head(15))

# Define different feature subsets
feature_sets = {
    "all_features": X_train.columns.tolist(),
    "top_15_corr": corr.head(15).index.tolist(),
    "top_10_corr": corr.head(10).index.tolist(),
    "top_5_corr":  corr.head(5).index.tolist(),
}

# Train the same best SVM on each subset and evaluate
feature_records = []

for name, cols in feature_sets.items():
    print(f"\n=== Training best SVM on {name} ({len(cols)} features) ===")
    
    X_train_sub = X_train[cols]
    X_test_sub  = X_test[cols]
    
    svm_best = Pipeline([
        ("scaler", StandardScaler()),
        ("model", SVC(random_state=RANDOM_STATE)),
    ])
    svm_best.set_params(**best_params)
    
    svm_best.fit(X_train_sub, y_train)
    y_pred_sub = svm_best.predict(X_test_sub)
    
    feature_records.append({
        "features": name,
        "n_features": len(cols),
        "test_accuracy":  accuracy_score(y_test, y_pred_sub),
        "test_precision": precision_score(y_test, y_pred_sub, zero_division=0),
        "test_recall":    recall_score(y_test, y_pred_sub, zero_division=0),
        "test_f1":        f1_score(y_test, y_pred_sub, zero_division=0),
    })

feature_results_df = pd.DataFrame(feature_records)
display(feature_results_df)

Best SVM params: {'model__C': 10, 'model__gamma': 0.01, 'model__kernel': 'rbf'}

Top 15 features by |correlation with target_dnf|:
grid                       0.344964
year                       0.276936
constructorRef_ferrari     0.183788
constructorRef_red_bull    0.179034
constructorRef_mclaren     0.145330
round                      0.123115
constructorRef_williams    0.107466
constructorRef_ligier      0.076682
month_11                   0.059469
constructorRef_tyrrell     0.050984
constructorRef_brabham     0.049485
month_5                    0.047390
circuitRef_monaco          0.044537
circuitRef_spa             0.043378
alt                        0.043222
Name: target_dnf, dtype: float64

=== Training best SVM on all_features (35 features) ===

=== Training best SVM on top_15_corr (15 features) ===

=== Training best SVM on top_10_corr (10 features) ===

=== Training best SVM on top_5_corr (5 features) ===


Unnamed: 0,features,n_features,test_accuracy,test_precision,test_recall,test_f1
0,all_features,35,0.631724,0.585248,0.717037,0.644474
1,top_15_corr,15,0.626207,0.566969,0.834074,0.67506
2,top_10_corr,10,0.626207,0.567104,0.832593,0.67467
3,top_5_corr,5,0.624138,0.563725,0.851852,0.678466


## Uncertainty via Distance to the Decision Boundary
We take the best SVM pipeline from the grid search and compute
  `decision_function(X_test)`, which returns a signed score proportional to the distance
  of each test example from the decision boundary.
- We look at the **absolute value** of these scores (the margin) and summarize them by:
  - median |margin|,
  - 10th percentile |margin| (examples close to the boundary, more uncertain),
  - 90th percentile |margin| (examples far from the boundary, more confident).

In [None]:
# Best full pipeline from grid search (includes scaler + SVC)
best_svm = svm_grid.best_estimator_

# decision_function gives signed distance-like scores to the decision boundary
decision_scores = best_svm.decision_function(X_test)
abs_scores = np.abs(decision_scores)

print("Number of test examples:", len(abs_scores))
print(f"Median |margin|          : {np.median(abs_scores):.4f}")
print(f"10th percentile |margin| : {np.percentile(abs_scores, 10):.4f}")
print(f"90th percentile |margin| : {np.percentile(abs_scores, 90):.4f}")

Number of test examples: 1450
Median |margin|          : 0.7052
10th percentile |margin| : 0.1773
90th percentile |margin| : 1.6067
