In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_validate
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import make_scorer
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
import seaborn as sns
import time # For time complexity analysis

### Reading the data of HIGGS.csv

In [2]:
# Read the dataset
file_path = "E:\ML\HIGGS.csv\HIGGS.csv"
df = pd.read_csv(file_path, header=None)

# Sample 5% of the dataset
df_sample = df.sample(frac=0.01, random_state=39)
# print(df_sample.head())

  file_path = "E:\ML\HIGGS.csv\HIGGS.csv"


### Implementing Linear SVM

In [3]:
# Analyzing the data

# Check for any missing values
# print("\nMissing values in each column:\n", df_sample.isnull().sum()

# Assuming the first column as target column
X = df_sample.drop(columns=[0])
y = df_sample[0]

# Data Preprocessing and Feature Selection
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply polynomial feature generation
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Check the shape to see the new number of features
print("Original number of features:", X_scaled.shape)
print("New number of features with polynomial features:", X_poly.shape)

# Feature Selection: SelectKBest to reduce dimensionality
select_k = SelectKBest(f_classif, k=10) 
X_selected = select_k.fit_transform(X_scaled, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=39)

# 2. Linear SVM Implementation
linear_svm = SVC(kernel='linear', random_state=39)

# Cross-validation on training data
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

cv = cross_validate(linear_svm, X_train, y_train, cv=5, scoring=scoring)
print("Cross-Validation Results (Linear SVM):")
print("Accuracy:", np.mean(cv['test_accuracy']))
print("Precision:", np.mean(cv['test_precision']))
print("Recall:", np.mean(cv['test_recall']))
print("F1 Score:", np.mean(cv['test_f1']))
print("Area_ROC:", np.mean(cv['test_roc_auc']))

# Fitting and Cross-validation on testing data
start_time = time.time()
linear_svm.fit(X_train, y_train)
train_time_linear = time.time() - start_time

start_time = time.time()
y_predict_linear = linear_svm.predict(X_test)
predict_time_linear = time.time() - start_time

# Evaluation on test set
print("\nTest Set Evaluation (Linear SVM):")
print("Accuracy:", accuracy_score(y_test, y_predict_linear))
print("Precision:", precision_score(y_test, y_predict_linear))
print("Recall:", recall_score(y_test, y_predict_linear))
print("F1 Score:", f1_score(y_test, y_predict_linear))
print("AUC:", roc_auc_score(y_test, y_predict_linear))

print(f"Linear SVM Training Time: {train_time_linear:.2f} seconds")
print(f"Linear SVM Prediction Time: {predict_time_linear:.2f} seconds")

Original number of features: (110000, 28)
New number of features with polynomial features: (110000, 406)
Cross-Validation Results (Linear SVM):
Accuracy: 0.6351038961038962
Precision: 0.6182215449337651
Recall: 0.8035944788491746
F1 Score: 0.6988199165461573
Area_ROC: 0.6736944022470748

Test Set Evaluation (Linear SVM):
Accuracy: 0.6358484848484849
Precision: 0.6193308550185873
Recall: 0.8104040288428522
F1 Score: 0.702099705000124
AUC: 0.6248980082382495
Linear SVM Training Time: 297.96 seconds
Linear SVM Prediction Time: 32.35 seconds


### Implementing Stochastic Gradeient Descent 

In [4]:
sgd_svm = SGDClassifier(loss='hinge', random_state=39, max_iter=1000, tol=1e-3)

# Cross-validation for SGDClassifier
cv_sgd = cross_validate(sgd_svm, X_train, y_train, cv=5, scoring=scoring)
print("\nCross-Validation Results (SGDClassifier):")
print("Accuracy:", np.mean(cv_sgd['test_accuracy']))
print("Precision:", np.mean(cv_sgd['test_precision']))
print("Recall:", np.mean(cv_sgd['test_recall']))
print("F1 Score:", np.mean(cv_sgd['test_f1']))
print("Area under ROC:", np.mean(cv_sgd['test_roc_auc']))

# Cross-Validation on testing data
start_time = time.time()
sgd_svm.fit(X_train, y_train)
train_time_sgd = time.time() - start_time

start_time = time.time()
y_predict_sgd = sgd_svm.predict(X_test)
predict_time_sgd = time.time() - start_time

print("\nTest Set Evaluation (SGDClassifier):")
print("Accuracy:", accuracy_score(y_test, y_predict_sgd))
print("Precision:", precision_score(y_test, y_predict_sgd))
print("Recall:", recall_score(y_test, y_predict_sgd))
print("F1 Score:", f1_score(y_test, y_predict_sgd))
print("AUC:", roc_auc_score(y_test, y_predict_sgd))

# Hyperparameter Tuning using GridSearchCV for SGDClassifier
param_grid = {'alpha': [0.0001, 0.001, 0.01], 'max_iter': [1000, 2000]}
grid_search = GridSearchCV(SGDClassifier(loss='hinge'), param_grid, cv=3)
grid_search.fit(X_train, y_train)

print("\nBest Parameters for SGD SVM:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

print(f"SGD SVM Training Time: {train_time_sgd:.2f} seconds")
print(f"SGD SVM Prediction Time: {predict_time_sgd:.2f} seconds")


Cross-Validation Results (SGDClassifier):
Accuracy: 0.6305454545454545
Precision: 0.6237754671295578
Recall: 0.7544368457033552
F1 Score: 0.6824685529530137
Area under ROC: 0.6677014500057961

Test Set Evaluation (SGDClassifier):
Accuracy: 0.6339393939393939
Precision: 0.6136823469903895
Recall: 0.8331807256495365
F1 Score: 0.706781882615661
AUC: 0.6214402919758697

Best Parameters for SGD SVM: {'alpha': 0.001, 'max_iter': 2000}
Best Cross-Validation Score: 0.6343507003327082
SGD SVM Training Time: 0.37 seconds
SGD SVM Prediction Time: 0.00 seconds


### Implementing polynomial SVM 
In this, polynomial SVM is implemented using RBF kernel and a custom kernel

In [None]:
results = []
# Function to calculate the metrics for a model with a certain degree
def evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    start_time = time.time()
    y_pred = model.predict(X_test)
    predict_time = time.time() - start_time

    # Calculation of metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    return accuracy, precision, recall, f1, auc, train_time, predict_time

# Polynomial Kernel with degrees (2,3,4)
for degree in [2, 3, 4]:
    svm_polynomial = SVC(kernel='poly', degree=degree, random_state=39)
    metrics = evaluate_model(svm_polynomial, X_train, X_test, y_train, y_test)
    print(f"\nPolynomial SVM (degree={degree}) Accuracy:", metrics[0])
    results.append({
        'Kernel': 'Polynomial',
        'Degree': degree,
        'Accuracy': metrics[0],
        'Precision': metrics[1],
        'Recall': metrics[2],
        'F1 Score': metrics[3],
        'AUC': metrics[4],
        'Training Time (s)': metrics[5],
        'Prediction Time (s)': metrics[6]
    })

# Tuning the C value for polynomial kernel and for each degree
param_grid_poly = {'C': [0.1, 1, 10], 'degree': [2, 3, 4]}
grid_search_poly = GridSearchCV(SVC(kernel='poly'), param_grid_poly, cv=3)
grid_search_poly.fit(X_train, y_train)

best_poly = grid_search_poly.best_estimator_
metrics_best_poly = evaluate_model(best_poly, X_train, X_test, y_train, y_test)
print("\nBest Parameters for Polynomial SVM:", grid_search_poly.best_params_)
print("Best Polynomial SVM Accuracy:", metrics_best_poly[0])

results.append({
    'Kernel': 'Polynomial',
    'Degree': grid_search_poly.best_params_['degree'],
    'Accuracy': metrics_best_poly[0],
    'Precision': metrics_best_poly[1],
    'Recall': metrics_best_poly[2],
    'F1 Score': metrics_best_poly[3],
    'AUC': metrics_best_poly[4],
    'Training Time (s)': metrics_best_poly[5],
    'Prediction Time (s)': metrics_best_poly[6]
})

# RBF Kernel with Gamma Tuning
param_grid_rbf = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.5, 1, 'scale']}
grid_search_rbf = GridSearchCV(SVC(kernel='rbf'), param_grid_rbf, cv=3)
grid_search_rbf.fit(X_train, y_train)

best_rbf = grid_search_rbf.best_estimator_
metrics_rbf = evaluate_model(best_rbf, X_train, X_test, y_train, y_test)

print("\nBest Parameters for RBF SVM:", grid_search_rbf.best_params_)
print("RBF SVM Accuracy:", metrics_rbf[0])
print(f"Precision: {metrics_rbf[1]}, Recall: {metrics_rbf[2]}, F1 Score: {metrics_rbf[3]}, AUC: {metrics_rbf[4]}")

results.append({
    'Kernel': 'RBF',
    'Degree': 'N/A',
    'Accuracy': metrics_rbf[0],
    'Precision': metrics_rbf[1],
    'Recall': metrics_rbf[2],
    'F1 Score': metrics_rbf[3],
    'AUC': metrics_rbf[4],
    'Training Time (s)': metrics_rbf[5],
    'Prediction Time (s)': metrics_rbf[6]
})

# Implementation of Custom Sigmoid Kernel
def sigmoid_kernel(X, Y, a=1, b=1):
    # a is the scaling term
    # b is the offset term 
    # changing them can create new sigmoid kernel
    return np.tanh(a * np.dot(X, Y.T) + b)

# Tuning C for Custom Sigmoid Kernel
param_grid_custom = {'C': [0.1, 1, 10]}
grid_search_custom = GridSearchCV(SVC(kernel=sigmoid_kernel), param_grid_custom, cv=3)
grid_search_custom.fit(X_train, y_train)

best_custom = grid_search_custom.best_estimator_
metrics_custom = evaluate_model(best_custom, X_train, X_test, y_train, y_test)

print("\nBest Parameters for Custom Sigmoid SVM:", grid_search_custom.best_params_)
print("Custom Sigmoid Kernel SVM Accuracy:", metrics_custom[0])
print(f"Precision: {metrics_custom[1]}, Recall: {metrics_custom[2]}, F1 Score: {metrics_custom[3]}, AUC: {metrics_custom[4]}")

results.append({
    'Kernel': 'Custom Sigmoid',
    'Degree': 'N/A',
    'Accuracy': metrics_custom[0],
    'Precision': metrics_custom[1],
    'Recall': metrics_custom[2],
    'F1 Score': metrics_custom[3],
    'AUC': metrics_custom[4],
    'Training Time (s)': metrics_custom[5],
    'Prediction Time (s)': metrics_custom[6]
})

# Comparisons
results_df = pd.DataFrame(results)
print("\nKernel Result Comparison:")
print(results_df)


### Hyperparameter Tuning
Using random search to tune the hyperparameters (C, degree) for polynomial kernel, (C, gamma) for RBF kernel, C for custom kernel

In [6]:

# Hyperparameter Tuning for Polynomial Kernel using RandomizedSearchCV
param_dist_poly = {'C': [0.1, 1, 10], 'degree': [2, 3, 4]}
random_search_poly = RandomizedSearchCV(SVC(kernel='poly'), param_distributions=param_dist_poly, n_iter=10, cv=3, random_state=42)
random_search_poly.fit(X_train, y_train)

best_poly_random = random_search_poly.best_estimator_
metrics_best_poly_random = evaluate_model(best_poly_random, X_test, y_test)
print("\nBest Parameters for Polynomial SVM (Random Search):", random_search_poly.best_params_)
print("Best Polynomial SVM Accuracy:", metrics_best_poly_random[0])

# Hyperparameter Tuning for RBF Kernel using RandomizedSearchCV
param_dist_rbf = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.5, 1, 'scale']}
random_search_rbf = RandomizedSearchCV(SVC(kernel='rbf'), param_distributions=param_dist_rbf, n_iter=10, cv=3, random_state=42)
random_search_rbf.fit(X_train, y_train)

best_rbf_random = random_search_rbf.best_estimator_
metrics_rbf_random = evaluate_model(best_rbf_random, X_test, y_test)
print("\nBest Parameters for RBF SVM (Random Search):", random_search_rbf.best_params_)
print("RBF SVM Accuracy:", metrics_rbf_random[0])

# Hyperparameter Tuning for Custom Sigmoid Kernel using RandomizedSearchCV
param_dist_custom = {'C': [0.1, 1, 10]}
random_search_custom = RandomizedSearchCV(SVC(kernel=sigmoid_kernel), param_distributions=param_dist_custom, n_iter=10, cv=3, random_state=42)
random_search_custom.fit(X_train, y_train)

best_custom_random = random_search_custom.best_estimator_
metrics_custom_random = evaluate_model(best_custom_random, X_test, y_test)
print("\nBest Parameters for Custom Sigmoid SVM (Random Search):", random_search_custom.best_params_)
print("Custom Sigmoid Kernel SVM Accuracy:", metrics_custom_random[0])



NameError: name 'evaluate_model' is not defined

### Hyperparameter senstivity analysis
Visulaizing the results of SVM performance using heatmaps

In [None]:
# Create a grid of hyperparameters for sensitivity analysis
c_values = [0.1, 1, 10]
gamma_values = [0.1, 0.5, 1, 'scale']
results = []

# Iterate over C and gamma values for RBF SVM
for c in c_values:
    for gamma in gamma_values:
        svm = SVC(kernel='rbf', C=c, gamma=gamma)
        cv_score = cross_val_score(svm, X_train, y_train, cv=5, scoring='accuracy')
        results.append((c, gamma, np.mean(cv_score)))

# Convert results to DataFrame for heatmap
results_df = pd.DataFrame(results, columns=['C', 'Gamma', 'Accuracy'])
pivot_table = results_df.pivot('C', 'Gamma', 'Accuracy')

# Plotting heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(pivot_table, annot=True, cmap='viridis')
plt.title('Hyperparameter Sensitivity Analysis for RBF Kernel')
plt.xlabel('Gamma')
plt.ylabel('C')
plt.show()


### Explainability and Interpretability

In [None]:
import shap

# Initialize SHAP explainer on the best-performing model
explainer = shap.KernelExplainer(best_rbf.predict, X_train[:100])  
shap_values = explainer.shap_values(X_test[:100])

# Plotting summaries
shap.summary_plot(shap_values, X_test[:100], plot_type="bar")