# SVM for Breast Cancer Detection that will not only be created for utility, but for comparison to a QSVM of each ones efficiency.

## Niko Tsiolas, July 2024

In [29]:
#Import libraries and read the CSV file

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

data = pd.read_csv('Breast_cancer_data.csv')

print(data.head())
print(data.info())


   mean_radius  mean_texture  mean_perimeter  mean_area  mean_smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   diagnosis  
0          0  
1          0  
2          0  
3          0  
4          0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mean_radius      569 non-null    float64
 1   mean_texture     569 non-null    float64
 2   mean_perimeter   569 non-null    float64
 3   mean_area        569 non-null    float64
 4   mean_smoothness  569 non-null    float64
 5   diagnosis        569 n

In [30]:
#Checking for missing values and data types

print("Columns:", data.columns.tolist())

print("Number of Rows:" ,data.shape[0])
print("Number of Columns:" ,data.shape[1])

print("\nMissing Values:")
print(data.isnull().sum())

print("\nData Types:")
print(data.dtypes)


Columns: ['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness', 'diagnosis']
Number of Rows: 569
Number of Columns: 6

Missing Values:
mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64

Data Types:
mean_radius        float64
mean_texture       float64
mean_perimeter     float64
mean_area          float64
mean_smoothness    float64
diagnosis            int64
dtype: object


In [31]:
#Seperate the featrues(X) and the target(y)

X = data.drop('diagnosis', axis=1)
y = data['diagnosis']


print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

#Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=30)

#Seeing the distribution of the the split
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X: (569, 5)
Shape of y: (569,)
Shape of X_train: (455, 5)
Shape of X_test: (114, 5)
Shape of y_train: (455,)
Shape of y_test: (114,)


In [35]:
#Initialize Scaler and Scale the features

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Train the SVM Model 
svm_model = SVC(kernel='rbf', random_state=30)
svm_model.fit(X_train_scaled, y_train)

#predictions
y_pred = svm_model.predict(X_test_scaled)

#rules for the paramater grid.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

#Creates a tool for best combo 
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best cross validation Score: {grid_search.best_score_}")

best_svm = grid_search.best_estimator_
y_pred_best = best_svm.predict(X_test_scaled)



Best Parameters: {'C': 100, 'gamma': 1, 'kernel': 'poly'}
Best cross validation Score: 0.9472527472527472


In [36]:
print("\n Best SVM Model results:")

#Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print(f"Roc_Auc:" ,roc_auc_score(y_test, y_pred))



 Best SVM Model results:
Accuracy: 0.8860

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.80      0.84        44
           1       0.88      0.94      0.91        70

    accuracy                           0.89       114
   macro avg       0.89      0.87      0.88       114
weighted avg       0.89      0.89      0.88       114

Roc_Auc: 0.8691558441558441
