# SVM

This notebook is the classification part after Image Segmentation and Feature Extraction. Let us import all libraries first:


In [1]:
import pandas as pd
import os
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

It is time to load our features, as well as our classes.

In [2]:
# Load features DataFrame
features_path_train = "./data/extracted_features/df_features_train.csv"
features_path_test = ".data/extracted_features/df_features_test.csv"
features_path_val = "./data/extracted_features/df_features_val.csv"

df_train_features = pd.read_csv(features_path_train)
df_test_features = pd.read_csv(features_path_test)
df_val_features = pd.read_csv(features_path_val)


We will need to ensure that the variables are the same when we run our SVM model, so after inspection we will remove these variables:

In [3]:
columns_to_remove = ['k1_mean', 'k1_var', 'k2_mean', 'k2_var', 'k3_mean', 'k3_var', 'k4_mean', 'k4_var',
                     'k5_mean', 'k5_var', 'k6_mean', 'k6_var', 'k7_mean', 'k7_var', 'k8_mean', 'k8_var',
                     'k9_mean', 'k9_var', 'k10_mean', 'k10_var', 'k11_mean', 'k11_var', 'k12_mean', 'k12_var',
                     'k13_mean', 'k13_var', 'k14_mean', 'k14_var', 'k15_mean', 'k15_var', 'k16_mean', 'k16_var']

#df_train_features = df_train_features.drop(columns=columns_to_remove, axis=1)
#df_test_features = df_test_features.drop(columns=columns_to_remove, axis=1)
#df_val_features = df_val_features.drop(columns=columns_to_remove, axis=1)



df_test_features = df_test_features.iloc[:, [0] + list(range(33, 607))]
df_val_features = df_val_features.iloc[:, [0] + list(range(33, 607))]

In [4]:
df_train_classes = df_train_features[['image_id', 'class']]
df_test_classes = df_test_features[['image_id', 'class']]
df_val_classes = df_val_features[['image_id', 'class']]

In [5]:
df_val_features

Unnamed: 0,image_id,lpb_1,lpb_2,lpb_3,lpb_4,lpb_5,lpb_6,lpb_7,lpb_8,lpb_9,...,l_skew,u_skew,v_skew,r_kurt,g_kurt,b_kurt,l_kurt,u_kurt,v_kurt,class
0,ISIC_0,0.086853,0.044006,0.033081,0.014160,0.014099,0.008667,0.006958,0.006042,0.004456,...,-2.569194,3.398034,2.857902,4.138339,6.467677,12.250507,9.410343,10.119021,7.484356,2
1,ISIC_1,0.085083,0.045288,0.032043,0.017395,0.012207,0.007080,0.005188,0.006348,0.005066,...,-3.178749,-2.395177,0.100351,3.142048,11.707215,8.798235,20.767852,4.067166,2.644779,2
2,ISIC_10,0.089111,0.048035,0.034546,0.013977,0.010071,0.006042,0.004456,0.004333,0.003296,...,-2.307191,-1.709324,1.613100,0.984231,14.658242,0.985889,12.311486,1.123818,0.890010,2
3,ISIC_100,0.080566,0.043457,0.034729,0.016174,0.018494,0.009766,0.008301,0.009094,0.007385,...,-4.415226,-1.395196,-1.609557,12.006543,23.894998,42.670430,28.515325,2.923531,2.981911,2
4,ISIC_101,0.091736,0.051025,0.030701,0.013916,0.010254,0.006104,0.004944,0.004395,0.003540,...,-1.694701,-0.278146,-0.273656,1.003198,3.676215,40.213509,5.644897,-0.980380,-1.454769,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,ISIC_95,0.090881,0.042969,0.031189,0.017761,0.016357,0.010315,0.008423,0.008606,0.006958,...,-3.297447,0.904524,0.057155,8.012008,23.893977,102.840999,31.137972,1.203015,3.134727,1
146,ISIC_96,0.087952,0.045288,0.035645,0.015442,0.013489,0.009399,0.005005,0.005798,0.005005,...,-1.990749,-0.471905,-0.903106,3.967960,4.670544,16.732005,5.727336,1.139545,0.439204,1
147,ISIC_97,0.085205,0.043579,0.039917,0.017395,0.016296,0.009583,0.006714,0.007324,0.004517,...,-2.981933,0.488490,0.130845,7.322186,11.292971,26.226746,13.114619,1.979465,0.981510,1
148,ISIC_98,0.084656,0.041260,0.032043,0.017090,0.013794,0.009399,0.007568,0.008118,0.005859,...,-7.139651,0.276576,0.106804,48.807618,66.954655,107.508700,78.632685,0.913655,4.552788,2


In [6]:
df_train_features = df_train_features.drop('class', axis=1)
df_test_features = df_test_features.drop('class', axis=1)
df_val_features = df_val_features.drop('class', axis=1)

In [7]:
# Sort DataFrames based on 'image_id'
df_train_features.sort_values('image_id', inplace=True)
df_train_classes = df_train_classes.sort_values('image_id').copy(deep=True)
df_test_features.sort_values('image_id', inplace=True)
df_test_classes = df_test_classes.sort_values('image_id').copy(deep=True)
df_val_features.sort_values('image_id', inplace=True)
df_val_classes = df_val_classes.sort_values('image_id').copy(deep=True)

# Reset the index for both DataFrames
df_train_features.reset_index(drop=True, inplace=True)
df_train_classes.reset_index(drop=True, inplace=True)
df_test_features.reset_index(drop=True, inplace=True)
df_test_classes.reset_index(drop=True, inplace=True)
df_val_features.reset_index(drop=True, inplace=True)
df_val_classes.reset_index(drop=True, inplace=True)

For experimentation, we will need to select a random sample of 100 samples per class and see the difference for comparison. Let us now create our sample dataset:

In [8]:
# Group the data by the target class
grouped = df_train_classes.groupby('class')

# Randomly select 100 samples for each class while maintaining the same 'image_id'
selected_samples = []
for _, group in grouped:
    samples = group.sample(n=100, random_state=42)  # Randomly select 100 samples
    selected_samples.append(samples)


In [9]:
# Concatenate the selected samples for each class
selected_data = pd.concat(selected_samples)

# Merge with the original features dataset based on 'image_id'
selected_data = pd.merge(selected_data, df_train_features, on='image_id', how='left')

# Split selected data into features and classes
selected_features = selected_data.drop(['class'], axis=1)
selected_classes = selected_data[['image_id', 'class']]

In [10]:
# Save selected features and classes into separate CSV files
selected_features.to_csv('selected_features.csv', index=False)
selected_classes.to_csv('selected_classes.csv', index=False)

In [11]:
save_directory = "./data/extracted_features"
features_file_path = os.path.join(save_directory, "selected_features.csv")
classes_file_path = os.path.join(save_directory, "selected_classes.csv")


In [12]:
df_features1 = pd.read_csv(features_file_path)
df_classes1 = pd.read_csv(classes_file_path)

In [13]:
# Sort both DataFrames based on 'image_id'
df_features1.sort_values('image_id', inplace=True)
df_classes1 = df_classes1.sort_values('image_id').copy(deep=True)



In [14]:
# Reset the index for both DataFrames
df_features1.reset_index(drop=True, inplace=True)
df_classes1.reset_index(drop=True, inplace=True)

With the data preparation complete, let us now integrate the SVM classifier accordingly.

# WHOLE DATASET

In [15]:
# Prepare the data
X = df_train_features.drop('image_id', axis=1)  # Exclude the image_id column from features
y = df_train_classes['class']   # Select the target variable columns

In [16]:
X_train = df_train_features.drop('image_id', axis=1)
X_val = df_val_features.drop('image_id', axis=1)
X_test = df_test_features.drop('image_id', axis=1)
y_train = df_train_classes['class']
y_val = df_val_classes['class']
y_test = df_test_classes['class']

From here, we will do 2 methods: one without hyperparameter tuning and one with hyperparameter tuning. We will use GridSearch to tune. Let us first establish the parameter grid:

In [17]:
# Train the SVM model
model = svm.SVC(kernel='rbf')
model.fit(X_train, y_train)

In [18]:
#Make predictions on the validation set
y_pred = model.predict(X_val)

In [19]:
# Make predictions on the test set
y_pred1 = model.predict(X_test)

In [20]:
# View Classification Metrics
print(classification_report(y_val,y_pred))

# Performance Evaluation
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)

              precision    recall  f1-score   support

           0       1.00      0.03      0.06        30
           1       0.00      0.00      0.00        42
           2       0.52      1.00      0.69        78

    accuracy                           0.53       150
   macro avg       0.51      0.34      0.25       150
weighted avg       0.47      0.53      0.37       150

Accuracy: 0.5266666666666666


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# View Classification Metrics
print(classification_report(y_test,y_pred1))

# Performance Evaluation
accuracy = accuracy_score(y_test, y_pred1)
print("Accuracy:", accuracy)

              precision    recall  f1-score   support

           0       0.83      0.04      0.08       117
           1       0.00      0.00      0.00        90
           2       0.66      1.00      0.80       393

    accuracy                           0.66       600
   macro avg       0.50      0.35      0.29       600
weighted avg       0.60      0.66      0.54       600

Accuracy: 0.6633333333333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


It can be seen from here that the results heavily lean towards class 2 (Nevus), and the likely reason is imbalanced data, which creates bias. With this in mind, let us create a dataset where in every class has equal samples.

# SAMPLE DATASET

In [22]:
# Prepare the data
X1_train = df_features1.drop('image_id', axis=1)  # Exclude the image_id column from features
y1_train = df_classes1['class']   # Select the target variable columns

In [23]:
# Split the data into training and testing sets
#X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

In [24]:
# Train the SVM model
model1 = svm.SVC(kernel='rbf')
model1.fit(X1_train, y1_train)

In [25]:
#Make predictions on the validation set
y1_pred = model1.predict(X_val)

In [26]:
#Make predictions on the test set
y1_pred1 = model1.predict(X_test)

In [27]:
# VALIDATION

print(classification_report(y_val,y1_pred))

# Evaluate the performance
accuracy1 = accuracy_score(y_val, y1_pred)
print("Accuracy:", accuracy1)

              precision    recall  f1-score   support

           0       0.23      0.50      0.32        30
           1       0.30      0.36      0.33        42
           2       0.75      0.35      0.47        78

    accuracy                           0.38       150
   macro avg       0.43      0.40      0.37       150
weighted avg       0.52      0.38      0.40       150

Accuracy: 0.38


In [28]:
# TEST

print(classification_report(y_test,y1_pred1))

# Evaluate the performance
accuracy2 = accuracy_score(y_test, y1_pred1)
print("Accuracy:", accuracy2)

              precision    recall  f1-score   support

           0       0.24      0.62      0.34       117
           1       0.23      0.46      0.30        90
           2       0.87      0.25      0.39       393

    accuracy                           0.36       600
   macro avg       0.44      0.44      0.35       600
weighted avg       0.65      0.35      0.37       600

Accuracy: 0.355


There is definitely a huge improvement in both the precision and recall this time.

NOTE: As of writing, the dataset was split within the training dataset due to complications in the feature extraction process (i.e. differing number of features extracted). I will investigate the cause and will make adjustments accordingly in the code.

# GRIDSEARCH

## WHOLE DATASET

In [29]:
# GridSearch CV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
# Define the parameter grid for GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': [0.1, 1, 10]}

svm = SVC()
# Create the GridSearchCV object
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='roc_auc')

In [30]:
# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

Traceback (most recent call last):
  File "C:\Users\Patrick\Desktop\Capstone\U-Net\UNet_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Patrick\Desktop\Capstone\U-Net\UNet_venv\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Patrick\Desktop\Capstone\U-Net\UNet_venv\lib\site-packages\sklearn\metrics\_scorer.py", line 367, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "C:\Users\Patrick\Desktop\Capstone\U-Net\UNet_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Patrick\Desktop\Capstone\U-Net\UNet_venv\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Pat

In [42]:
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}


In [35]:
# Make predictions on the testing data
y_pred11 = best_model.predict(X_test)
y_pred12 = best_model.predict(X_val)

In [36]:
# VALIDATION

print(classification_report(y_val,y_pred12))

# Evaluate the performance
accuracy12 = accuracy_score(y_val, y_pred12)
print("Accuracy:", accuracy12)

              precision    recall  f1-score   support

           0       1.00      0.03      0.06        30
           1       0.88      0.36      0.51        42
           2       0.58      0.97      0.72        78

    accuracy                           0.61       150
   macro avg       0.82      0.45      0.43       150
weighted avg       0.75      0.61      0.53       150

Accuracy: 0.6133333333333333


In [37]:
# VALIDATION

print(classification_report(y_test,y_pred11))

# Evaluate the performance
accuracy11 = accuracy_score(y_test, y_pred11)
print("Accuracy:", accuracy11)

              precision    recall  f1-score   support

           0       0.40      0.02      0.03       117
           1       0.82      0.30      0.44        90
           2       0.69      0.98      0.81       393

    accuracy                           0.69       600
   macro avg       0.64      0.43      0.43       600
weighted avg       0.65      0.69      0.60       600

Accuracy: 0.6916666666666667


## SAMPLE DATASET

In [38]:
# Fit the GridSearchCV object to the training data
grid_search.fit(X1_train, y1_train)

# Get the best hyperparameters and model
best_params1 = grid_search.best_params_
best_model1 = grid_search.best_estimator_

Traceback (most recent call last):
  File "C:\Users\Patrick\Desktop\Capstone\U-Net\UNet_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Patrick\Desktop\Capstone\U-Net\UNet_venv\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Patrick\Desktop\Capstone\U-Net\UNet_venv\lib\site-packages\sklearn\metrics\_scorer.py", line 367, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "C:\Users\Patrick\Desktop\Capstone\U-Net\UNet_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Patrick\Desktop\Capstone\U-Net\UNet_venv\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\Pat

In [39]:
# Make predictions on the testing data
y_pred21 = best_model1.predict(X_test)
y_pred22 = best_model1.predict(X_val)

In [40]:
# VALIDATION

print(classification_report(y_val,y_pred22))

# Evaluate the performance
accuracy22 = accuracy_score(y_val, y_pred22)
print("Accuracy:", accuracy22)

              precision    recall  f1-score   support

           0       0.35      0.57      0.43        30
           1       0.56      0.76      0.65        42
           2       0.80      0.45      0.57        78

    accuracy                           0.56       150
   macro avg       0.57      0.59      0.55       150
weighted avg       0.64      0.56      0.57       150

Accuracy: 0.56


In [41]:
# TEST

print(classification_report(y_test,y_pred21))

# Evaluate the performance
accuracy21 = accuracy_score(y_test, y_pred21)
print("Accuracy:", accuracy21)

              precision    recall  f1-score   support

           0       0.29      0.60      0.39       117
           1       0.34      0.78      0.47        90
           2       0.87      0.34      0.49       393

    accuracy                           0.45       600
   macro avg       0.50      0.57      0.45       600
weighted avg       0.68      0.45      0.46       600

Accuracy: 0.4533333333333333
