In [None]:
%load_ext autoreload
%autoreload 2

import gc
import sys
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Make the project's src/ package importable.
# The parent of the current working directory is used as the project root, so
# the notebook is expected to be launched from a folder directly below it.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    # Guarded so that re-running this cell does not keep appending duplicates.
    sys.path.append(project_root)
from src import config, preprocessing, evaluation, utils

# Initialize folder to save results for SVM Within-dataset scenario
exp_paths = utils.setup_experiment_folder()
print(f"Saving SVM results to: {exp_paths['root']}")

In [None]:
dataset_year = '2018'  # Dataset year to load; edit this value to run on a different year
print(f"\n--- 1. LOADING & SAMPLING DATASET ({dataset_year}) ---")

df, y = preprocessing.load_single_dataset_year(dataset_year, binary_mode=True)

# Features and labels are paired by row position below, so they must line up.
if len(df) != len(y):
    raise ValueError(
        f"Feature/label length mismatch: {len(df)} rows vs {len(y)} labels"
    )

# Sampling strategy to allow SVM to run on 16GB RAM
SAMPLE_SIZE = 100000
if len(df) > SAMPLE_SIZE:
    print(f"Sampling {SAMPLE_SIZE} records from {len(df)} total records...")
    # Draw row *positions* once and apply them to both the features and the
    # labels. Indexing the labels with `.iloc[df_sample.index]` would treat
    # index labels as positions, which is only correct when `df` carries a
    # default 0..n-1 index; otherwise labels are silently misaligned (or an
    # IndexError is raised).
    sampled_positions = (
        pd.Series(np.arange(len(df)))
        .sample(n=SAMPLE_SIZE, random_state=config.RANDOM_STATE)
        .to_numpy()
    )
    df_sample = df.iloc[sampled_positions]
    y_sample = pd.Series(y).iloc[sampled_positions].values
else:
    # Small enough to use as-is.
    df_sample = df
    y_sample = y

print(f"Final Shape for SVM: {df_sample.shape}")
print(f"Label Distribution:\n{pd.Series(y_sample).value_counts()}")

# Drop the references to the full dataset so its memory can be reclaimed.
del df, y
gc.collect()

In [None]:
print("\n--- 2. PREPROCESSING & SCALING ---")

TOP_K_FEATURES = 25  # number of leading mRMR-ranked features to keep
TEST_SIZE = 0.2      # fraction of the sample held out for testing

# Get index of Top-25 mRMR features
all_features = config.SELECTED_FEATURES
# Note: You should define MRMR_TOP_25 in config.py or slice list here
mrmr_25_list = config.mRMR_FEATURES[:TOP_K_FEATURES]
# Column positions are looked up in SELECTED_FEATURES.
# NOTE(review): this assumes df_sample's columns follow the same order as
# config.SELECTED_FEATURES - confirm against the loader.
indices = [all_features.index(f) for f in mrmr_25_list]

# Select the columns first, then convert: this avoids materialising a dense
# copy of the whole frame just to keep a subset of its columns.
X = df_sample.iloc[:, indices].to_numpy()

# Split Train/Test by 80/20 ratio (stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_sample,
    test_size=TEST_SIZE,
    random_state=config.RANDOM_STATE,
    stratify=y_sample,
)

# Scaling (Use training set to avoid data leakage)
scaler = preprocessing.get_scaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train set: {X_train_scaled.shape}, Test set: {X_test_scaled.shape}")

In [None]:
print("\n--- 3. TRAINING SVM (Linear/RBF Kernel) ---")

# Kernel cache size in MB. scikit-learn's default is 200 MB; its practical-use
# notes recommend a larger cache for big problems when RAM allows. This only
# affects training time, not the fitted model.
SVM_CACHE_MB = 1000

# Initialize SVM with probability (probability=True) for plotting AUROC
svm_model = SVC(
    kernel='rbf',
    C=1.0,
    probability=True,
    cache_size=SVM_CACHE_MB,
    random_state=config.RANDOM_STATE,
    verbose=True,
)

svm_model.fit(X_train_scaled, y_train)

# Save model
# NOTE(review): only the classifier is persisted here; the fitted scaler is
# not, so a reloaded model needs inputs scaled the same way as in training.
model_save_path = exp_paths['models'] / f"svm_{dataset_year}_within.joblib"
joblib.dump(svm_model, model_save_path)
print(f"SVM Model saved to: {model_save_path}")

In [None]:
print("\n--- 4. EVALUATION ---")

# Score the fitted SVM on the held-out split. The figures directory of the
# experiment folder is handed to the evaluation helper as `save_dir`.
figures_dir = exp_paths['figures']
run_label = f"SVM Within-Dataset ({dataset_year})"
metrics = evaluation.evaluate_model(
    svm_model, X_test_scaled, y_test,
    save_dir=figures_dir,
    dataset_name=run_label,
)

# Headline numbers for the baseline run.
print("\n=== SVM RESULTS (Baseline) ===")
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"MCC:      {metrics['mcc']:.4f}")
# Macro F1 / recall reporting is currently disabled.
# NOTE(review): confirm these keys exist in `metrics` before re-enabling.
#print(f"F1-score: {metrics['f1_macro']:.4f}")
#print(f"Recall:   {metrics['recall_macro']:.4f}")