In [18]:
#This is the code for SVM 

In [19]:
import pandas as pd
import numpy as np
import os
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

In [20]:
train_files = [
    "One-Hot/Train_Orig_OH.csv",
    "One-Hot/Scaled/Train_Scaled_Cont_OH.csv",
    "One-Hot/MinMax/Train_MM_OH.csv",
    "One-Hot/MinMax/train_OH_MM_PCA15.csv",
    "One-Hot/MinMax/train_OH_MM_PCA20.csv",
    "One-Hot/MinMax/train_OH_MM_PCA25.csv",
    "One-Hot/MinMax/train_OH_MM_PCA30.csv",
    "One-Hot/MinMax/train_OH_MM_PCA35.csv",
    "IntClasses/Train_Orig_Int.csv",
    "IntClasses/Scaled/Train_Scaled_Cont_Int.csv",
    "IntClasses/MinMax/Train_MM_Int.csv",
    "IntClasses/MinMax/train_Int_MM_PCA10.csv",
    "IntClasses/MinMax/train_Int_MM_PCA15.csv",
    "IntClasses/MinMax/train_Int_MM_PCA20.csv",
    "IntClasses/MinMax/train_Int_MM_PCA25.csv",
]

In [22]:
#First up lets start with Polynomial Kernel SVM 
#(we won't do linear as data is complex and not linearly separable)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_metrics(y_true, y_pred, set_name):
    print(f"Performance Metrics on {set_name} Data:")
    cm = confusion_matrix(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='binary', pos_label=1)
    rec = recall_score(y_true, y_pred, average='binary', pos_label=1)
    f1 = f1_score(y_true, y_pred, average='binary', pos_label=1)

    print("Confusion Matrix:")
    print(cm)
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print("-" * 50)

    return acc, prec, rec, f1


In [17]:
#i. Polynomial Kernel SVM
from sklearn.svm import SVC

results = []

for path in train_files:
    print("\n" + "="*80)
    print(f"Processing dataset: {path}")
    
    train_df = pd.read_csv(path)
    valid_path = path.replace("Train", "Valid").replace("train", "valid")
    test_path = path.replace("Train", "Test").replace("train", "test")

    valid_df = pd.read_csv(valid_path)
    test_df = pd.read_csv(test_path)

    is_pca = 'PCA' in path
    target_col = -1 if is_pca else -2

    X_train = train_df.iloc[:, :target_col]
    y_train = train_df.iloc[:, target_col]

    X_valid = valid_df.iloc[:, :target_col]
    y_valid = valid_df.iloc[:, target_col]

    X_test = test_df.iloc[:, :target_col]
    y_test = test_df.iloc[:, target_col]

    # Train RBF SVM
    model = SVC(kernel='poly', C=1.0, gamma='scale')
    model.fit(X_train, y_train)

    name = os.path.basename(path)
    evaluate_metrics(y_valid, model.predict(X_valid), "Validation")
    acc, prec, rec, f1 = evaluate_metrics(y_test, model.predict(X_test), "Test")


    results.append({
        "Dataset": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1
    })

# Final result summary
results_df = pd.DataFrame(results)
results_df



Processing dataset: One-Hot/Train_Orig_OH.csv
Performance Metrics on Validation Data:
Confusion Matrix:
[[2962   82]
 [ 444  512]]
Accuracy: 0.8685
Precision: 0.8620
Recall: 0.5356
F1-score: 0.6606
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      3044
           1       0.86      0.54      0.66       956

    accuracy                           0.87      4000
   macro avg       0.87      0.75      0.79      4000
weighted avg       0.87      0.87      0.86      4000

--------------------------------------------------
Performance Metrics on Test Data:
Confusion Matrix:
[[2960   84]
 [ 445  511]]
Accuracy: 0.8678
Precision: 0.8588
Recall: 0.5345
F1-score: 0.6589
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      3044
           1       0.86      0.53      0.66       956

    accuracy                           0.87      4000
   macro avg 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Performance Metrics on Test Data:
Confusion Matrix:
[[3044    0]
 [ 956    0]]
Accuracy: 0.7610
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000
Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      1.00      0.86      3044
         1.0       0.00      0.00      0.00       956

    accuracy                           0.76      4000
   macro avg       0.38      0.50      0.43      4000
weighted avg       0.58      0.76      0.66      4000

--------------------------------------------------

Processing dataset: One-Hot/MinMax/train_OH_MM_PCA15.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Performance Metrics on Validation Data:
Confusion Matrix:
[[3029   15]
 [ 928   28]]
Accuracy: 0.7642
Precision: 0.6512
Recall: 0.0293
F1-score: 0.0561
Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      1.00      0.87      3044
         1.0       0.65      0.03      0.06       956

    accuracy                           0.76      4000
   macro avg       0.71      0.51      0.46      4000
weighted avg       0.74      0.76      0.67      4000

--------------------------------------------------
Performance Metrics on Test Data:
Confusion Matrix:
[[3033   11]
 [ 931   25]]
Accuracy: 0.7645
Precision: 0.6944
Recall: 0.0262
F1-score: 0.0504
Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      1.00      0.87      3044
         1.0       0.69      0.03      0.05       956

    accuracy                           0.76      4000
   macro avg       0.73      0.51      0.46      4000
weight

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Performance Metrics on Test Data:
Confusion Matrix:
[[3044    0]
 [ 956    0]]
Accuracy: 0.7610
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000
Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      1.00      0.86      3044
         1.0       0.00      0.00      0.00       956

    accuracy                           0.76      4000
   macro avg       0.38      0.50      0.43      4000
weighted avg       0.58      0.76      0.66      4000

--------------------------------------------------

Processing dataset: IntClasses/MinMax/train_Int_MM_PCA10.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Performance Metrics on Validation Data:
Confusion Matrix:
[[3010   34]
 [ 865   91]]
Accuracy: 0.7752
Precision: 0.7280
Recall: 0.0952
F1-score: 0.1684
Classification Report:
              precision    recall  f1-score   support

         0.0       0.78      0.99      0.87      3044
         1.0       0.73      0.10      0.17       956

    accuracy                           0.78      4000
   macro avg       0.75      0.54      0.52      4000
weighted avg       0.77      0.78      0.70      4000

--------------------------------------------------
Performance Metrics on Test Data:
Confusion Matrix:
[[3015   29]
 [ 863   93]]
Accuracy: 0.7770
Precision: 0.7623
Recall: 0.0973
F1-score: 0.1725
Classification Report:
              precision    recall  f1-score   support

         0.0       0.78      0.99      0.87      3044
         1.0       0.76      0.10      0.17       956

    accuracy                           0.78      4000
   macro avg       0.77      0.54      0.52      4000
weight

Unnamed: 0,Dataset,Accuracy,Precision,Recall,F1
0,Train_Orig_OH.csv,0.86775,0.858824,0.534519,0.65893
1,Train_Scaled_Cont_OH.csv,0.942,0.94802,0.801255,0.868481
2,Train_MM_OH.csv,0.761,0.0,0.0,0.0
3,train_OH_MM_PCA15.csv,0.7645,0.694444,0.026151,0.050403
4,train_OH_MM_PCA20.csv,0.77625,0.638009,0.14749,0.239592
5,train_OH_MM_PCA25.csv,0.91925,0.879042,0.767782,0.819654
6,train_OH_MM_PCA30.csv,0.92175,0.875146,0.784519,0.827358
7,train_OH_MM_PCA35.csv,0.9895,0.982068,0.973849,0.977941
8,Train_Orig_Int.csv,0.8685,0.858333,0.538703,0.661954
9,Train_Scaled_Cont_Int.csv,0.936,0.928922,0.792887,0.85553


In [23]:
# Final result summary of Polynomial Kernel SVM
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Dataset,Accuracy,Precision,Recall,F1
0,Train_Orig_OH.csv,0.86775,0.858824,0.534519,0.65893
1,Train_Scaled_Cont_OH.csv,0.942,0.94802,0.801255,0.868481
2,Train_MM_OH.csv,0.761,0.0,0.0,0.0
3,train_OH_MM_PCA15.csv,0.7645,0.694444,0.026151,0.050403
4,train_OH_MM_PCA20.csv,0.77625,0.638009,0.14749,0.239592
5,train_OH_MM_PCA25.csv,0.91925,0.879042,0.767782,0.819654
6,train_OH_MM_PCA30.csv,0.92175,0.875146,0.784519,0.827358
7,train_OH_MM_PCA35.csv,0.9895,0.982068,0.973849,0.977941
8,Train_Orig_Int.csv,0.8685,0.858333,0.538703,0.661954
9,Train_Scaled_Cont_Int.csv,0.936,0.928922,0.792887,0.85553


In [12]:
#verifying if C=1 is the best choice (it is!)
for c in [0.1, 1, 10]:
    model = SVC(kernel='rbf', C=c, gamma='scale')
    model.fit(X_train, y_train)
    acc = accuracy_score(y_valid, model.predict(X_valid))
    print(f"C={c}, Accuracy={acc:.4f}")

C=0.1, Accuracy=0.9908
C=1, Accuracy=0.9998
C=10, Accuracy=0.9998


In [16]:
#ii. Now for the RBF (Gaussian) Kernel SVM
from sklearn.svm import SVC

results = []

for path in train_files:
    print("\n" + "="*80)
    print(f"Processing dataset: {path}")
    
    train_df = pd.read_csv(path)
    valid_path = path.replace("Train", "Valid").replace("train", "valid")
    test_path = path.replace("Train", "Test").replace("train", "test")

    valid_df = pd.read_csv(valid_path)
    test_df = pd.read_csv(test_path)

    is_pca = 'PCA' in path
    target_col = -1 if is_pca else -2

    X_train = train_df.iloc[:, :target_col]
    y_train = train_df.iloc[:, target_col]

    X_valid = valid_df.iloc[:, :target_col]
    y_valid = valid_df.iloc[:, target_col]

    X_test = test_df.iloc[:, :target_col]
    y_test = test_df.iloc[:, target_col]

    # Train RBF SVM
    model = SVC(kernel='rbf', C=1.0, gamma='scale')
    model.fit(X_train, y_train)

    name = os.path.basename(path)
    evaluate_metrics(y_valid, model.predict(X_valid), "Validation")
    acc, prec, rec, f1 = evaluate_metrics(y_test, model.predict(X_test), "Test")


    results.append({
        "Dataset": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1
    })

# Final result summary
results_df = pd.DataFrame(results)
results_df



Processing dataset: One-Hot/Train_Orig_OH.csv
Performance Metrics on Validation Data:
Confusion Matrix:
[[2893  151]
 [ 335  621]]
Accuracy: 0.8785
Precision: 0.8044
Recall: 0.6496
F1-score: 0.7188
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      3044
           1       0.80      0.65      0.72       956

    accuracy                           0.88      4000
   macro avg       0.85      0.80      0.82      4000
weighted avg       0.87      0.88      0.87      4000

--------------------------------------------------
Performance Metrics on Test Data:
Confusion Matrix:
[[2897  147]
 [ 335  621]]
Accuracy: 0.8795
Precision: 0.8086
Recall: 0.6496
F1-score: 0.7204
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      3044
           1       0.81      0.65      0.72       956

    accuracy                           0.88      4000
   macro avg 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Performance Metrics on Test Data:
Confusion Matrix:
[[3044    0]
 [ 956    0]]
Accuracy: 0.7610
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000
Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      1.00      0.86      3044
         1.0       0.00      0.00      0.00       956

    accuracy                           0.76      4000
   macro avg       0.38      0.50      0.43      4000
weighted avg       0.58      0.76      0.66      4000

--------------------------------------------------

Processing dataset: One-Hot/MinMax/train_OH_MM_PCA15.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Performance Metrics on Validation Data:
Confusion Matrix:
[[3003   41]
 [ 850  106]]
Accuracy: 0.7772
Precision: 0.7211
Recall: 0.1109
F1-score: 0.1922
Classification Report:
              precision    recall  f1-score   support

         0.0       0.78      0.99      0.87      3044
         1.0       0.72      0.11      0.19       956

    accuracy                           0.78      4000
   macro avg       0.75      0.55      0.53      4000
weighted avg       0.77      0.78      0.71      4000

--------------------------------------------------
Performance Metrics on Test Data:
Confusion Matrix:
[[3007   37]
 [ 834  122]]
Accuracy: 0.7823
Precision: 0.7673
Recall: 0.1276
F1-score: 0.2188
Classification Report:
              precision    recall  f1-score   support

         0.0       0.78      0.99      0.87      3044
         1.0       0.77      0.13      0.22       956

    accuracy                           0.78      4000
   macro avg       0.78      0.56      0.55      4000
weight

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Performance Metrics on Test Data:
Confusion Matrix:
[[3044    0]
 [ 956    0]]
Accuracy: 0.7610
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000
Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      1.00      0.86      3044
         1.0       0.00      0.00      0.00       956

    accuracy                           0.76      4000
   macro avg       0.38      0.50      0.43      4000
weighted avg       0.58      0.76      0.66      4000

--------------------------------------------------

Processing dataset: IntClasses/MinMax/train_Int_MM_PCA10.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Performance Metrics on Validation Data:
Confusion Matrix:
[[2971   73]
 [ 474  482]]
Accuracy: 0.8632
Precision: 0.8685
Recall: 0.5042
F1-score: 0.6380
Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.92      3044
         1.0       0.87      0.50      0.64       956

    accuracy                           0.86      4000
   macro avg       0.87      0.74      0.78      4000
weighted avg       0.86      0.86      0.85      4000

--------------------------------------------------
Performance Metrics on Test Data:
Confusion Matrix:
[[2972   72]
 [ 489  467]]
Accuracy: 0.8598
Precision: 0.8664
Recall: 0.4885
F1-score: 0.6247
Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.91      3044
         1.0       0.87      0.49      0.62       956

    accuracy                           0.86      4000
   macro avg       0.86      0.73      0.77      4000
weight

Unnamed: 0,Dataset,Accuracy,Precision,Recall,F1
0,Train_Orig_OH.csv,0.8795,0.808594,0.649582,0.720418
1,Train_Scaled_Cont_OH.csv,0.962,0.932258,0.906904,0.919406
2,Train_MM_OH.csv,0.761,0.0,0.0,0.0
3,train_OH_MM_PCA15.csv,0.78225,0.767296,0.127615,0.218834
4,train_OH_MM_PCA20.csv,0.7895,0.706522,0.203975,0.316558
5,train_OH_MM_PCA25.csv,0.92925,0.893567,0.799163,0.843733
6,train_OH_MM_PCA30.csv,0.93475,0.89177,0.827406,0.858383
7,train_OH_MM_PCA35.csv,0.9965,0.992678,0.992678,0.992678
8,Train_Orig_Int.csv,0.87975,0.809648,0.649582,0.720836
9,Train_Scaled_Cont_Int.csv,0.95475,0.920738,0.887029,0.90357


In [24]:
# Final result summary of the RBF Kernel SVM
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Dataset,Accuracy,Precision,Recall,F1
0,Train_Orig_OH.csv,0.86775,0.858824,0.534519,0.65893
1,Train_Scaled_Cont_OH.csv,0.942,0.94802,0.801255,0.868481
2,Train_MM_OH.csv,0.761,0.0,0.0,0.0
3,train_OH_MM_PCA15.csv,0.7645,0.694444,0.026151,0.050403
4,train_OH_MM_PCA20.csv,0.77625,0.638009,0.14749,0.239592
5,train_OH_MM_PCA25.csv,0.91925,0.879042,0.767782,0.819654
6,train_OH_MM_PCA30.csv,0.92175,0.875146,0.784519,0.827358
7,train_OH_MM_PCA35.csv,0.9895,0.982068,0.973849,0.977941
8,Train_Orig_Int.csv,0.8685,0.858333,0.538703,0.661954
9,Train_Scaled_Cont_Int.csv,0.936,0.928922,0.792887,0.85553
