# SVM for Breast Cancer Detection that will not only be created for utility, but for comparison to a QSVM of each ones efficiency.

## Niko Tsiolas, July 2024

In [1]:
#Import libraries and read the CSV file

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score,f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV


data = pd.read_csv('Breast_cancer_data.csv')

print(data.head())
print(data.info())


In [63]:
#Checking for missing values and data types

print("Columns:", data.columns.tolist())

print("Number of Rows:" ,data.shape[0])
print("Number of Columns:" ,data.shape[1])

print("\nMissing Values:")
print(data.isnull().sum())

print("\nData Types:")
print(data.dtypes)


Columns: ['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness', 'diagnosis']
Number of Rows: 569
Number of Columns: 6

Missing Values:
mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64

Data Types:
mean_radius        float64
mean_texture       float64
mean_perimeter     float64
mean_area          float64
mean_smoothness    float64
diagnosis            int64
dtype: object


In [64]:
#seperate the features(X) and the target(y)

X = data.drop('diagnosis', axis=1)
y = data['diagnosis']


print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

#Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=30)

#Seeing the distribution of the the split
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X: (569, 5)
Shape of y: (569,)
Shape of X_train: (455, 5)
Shape of X_test: (114, 5)
Shape of y_train: (455,)
Shape of y_test: (114,)


In [65]:
#Initialize Scaler and Scale the features

scaler = StandardScaler()

#scaling the data for the model of test and train
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Train the SVM Model 
svm_model = SVC(kernel='rbf', random_state=30)
svm_model.fit(X_train_scaled, y_train)

#predictions
y_pred = svm_model.predict(X_test_scaled)

#rules for the paramater grid.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

#Creates a tool for best combo 
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best cross validation Score: {grid_search.best_score_}")

best_svm = grid_search.best_estimator_
y_pred_best = best_svm.predict(X_test_scaled)



Best Parameters: {'C': 100, 'gamma': 1, 'kernel': 'poly'}
Best cross validation Score: 0.9472527472527472


In [66]:

#Creates a Recursive Feature Elimination Cross Validation object, uses a linear SVM as the estimator
rfe_selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)

#applies the RFECV process to the scaled training data
rfe_selector = rfe_selector.fit(X_train_scaled, y_train)

#Identifies which features were selected by the RFECV process, .support being a boolean array indicating selected features. 
selected_features = X_train.columns[rfe_selector.support_]
print("Number of selected features:", len(selected_features))
print("selected features:", selected_features)

#Creates new versions of the training and test sets. ONLY include ones that were selected by the RFECV.
X_train_scaled_selected = X_train_scaled[:, rfe_selector.support_]
X_test_scaled_selected = X_test_scaled[:, rfe_selector.support_]

#Creates a new SVM model using the best params from the earlier gridsearchCV. 
svm_selected = SVC(**grid_search.best_params_)




Number of selected features: 4
selected features: Index(['mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness'], dtype='object')


In [68]:

#trains the new svm model using only the selected features 
svm_selected = SVC(**grid_search.best_params_)
svm_selected.fit(X_train_scaled_selected, y_train)

y_pred_selected = svm_selected.predict(X_test_scaled_selected)

In [70]:


#Evaluating the model
print("\n Best Original SVM Model results:")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"ROC AUC:" ,roc_auc_score(y_test, y_pred))

print("SVM with Selected Features Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_selected):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_selected):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_selected))

print("\nHyperparameter-tuned SVM (all features) Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_best):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))



 Best Original SVM Model results:
Accuracy: 0.8860

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.80      0.84        44
           1       0.88      0.94      0.91        70

    accuracy                           0.89       114
   macro avg       0.89      0.87      0.88       114
weighted avg       0.89      0.89      0.88       114

ROC AUC: 0.8691558441558441
SVM with Selected Features Results:
Accuracy: 0.9123
ROC AUC: 0.8906

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.80      0.88        44
           1       0.88      0.99      0.93        70

    accuracy                           0.91       114
   macro avg       0.93      0.89      0.90       114
weighted avg       0.92      0.91      0.91       114


Hyperparameter-tuned SVM (all features) Results:
Accuracy: 0.9123
ROC AUC: 0.8948

Classification Report:
              precision    recall  f1-scor