# Support Vector Machines - SVM

In [2]:
from pandas import DataFrame
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler

def drop_empty_items(data_frame: DataFrame) -> DataFrame:
    data_frame = data_frame.drop(['Insulin'], axis=1)
    return data_frame[(data_frame['Glucose'] != 0) & (data_frame['BloodPressure'] != 0) & (data_frame['BMI'] != 0)]


def scale_features(X_train, X_test):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test


def print_score(y_test, y_pred) -> None:
    # Calculate other evaluation metrics for test set
    print(f"SIMPLE SVM")
    print("===========================================================================")
    print("\nClassification Report:\n",
          classification_report(y_test, y_pred, target_names=['non-diabetic', 'diabetic']))
    print("SVM Model Evaluation:")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Accuracy: ", accuracy)
    print(f"Precision (Weighted): {precision:.6f}")
    print(f"Recall (Weighted): {recall:.6f}")
    print(f"F1-Score (Weighted): {f1:.6f}")

    print("Class distribution before SMOTE:", y_train.value_counts().to_dict())
    print("Class distribution after SMOTE:", pd.Series(y_train_smote).value_counts().to_dict())

    roc_auc = roc_auc_score(y_test, y_pred)
    print(f"ROC-AUC: {roc_auc:.6f}")

    # Print confusion matrix with class labels
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_names = ['non-diabetic', 'diabetic']
    print("\nConfusion Matrix with Class Labels:")
    print(pd.DataFrame(conf_matrix, index=class_names, columns=class_names))

    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp)
    print("Specificity: ", specificity)
    sensitivity = tp / (tp + fn)
    print("Sensitivity: ", sensitivity)

In [4]:
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC

features = ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'SkinThickness', 'DiabetesPedigreeFunction', 'Age']

df = pd.read_csv('data/diabetes.csv')
df = drop_empty_items(df)

# Feature/Output Separation
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Test Train Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale Features
X_train, X_test = scale_features(X_train, X_test)

# SMOTE - Artificial Data Creation
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto', 0.1],
}

grid_search = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit Grid Search on SMOTE-balanced training data
grid_search.fit(X_train_smote, y_train_smote)

# Print best parameters and score
print("\nBest Hyperparameters:")
print(grid_search.best_params_)
print(f"Best Cross-Validation F1-Weighted Score: {grid_search.best_score_:.4f}")

# Train best model on full SMOTE training data
best_rf = grid_search.best_estimator_
best_rf.fit(X_train_smote, y_train_smote)

# Evaluate best model on test set
y_pred = best_rf.predict(X_test)
print_score(y_test, y_pred)

Fitting 5 folds for each of 18 candidates, totalling 90 fits

Best Hyperparameters:
{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best Cross-Validation F1-Weighted Score: 0.8006
SIMPLE SVM

Classification Report:
               precision    recall  f1-score   support

non-diabetic       0.88      0.75      0.81       102
    diabetic       0.56      0.74      0.64        43

    accuracy                           0.75       145
   macro avg       0.72      0.75      0.73       145
weighted avg       0.78      0.75      0.76       145

SVM Model Evaluation:
Accuracy:  0.7517241379310344
Precision (Weighted): 0.782002
Recall (Weighted): 0.751724
F1-Score (Weighted): 0.759956
Class distribution before SMOTE: {0: 373, 1: 206}
Class distribution after SMOTE: {0: 373, 1: 373}
ROC-AUC: 0.749544

Confusion Matrix with Class Labels:
              non-diabetic  diabetic
non-diabetic            77        25
diabetic                11        32
Specificity:  0.7549019607843137
Sensitivity:  0.744186