# Classifier based on the metadata of the BCC image set

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from utils import split_data, get_metrics, get_classifier

Main module did not execute


### Config

In [2]:
SEED = 1369  # random seed shared by the data split and by every classifier below
SPLITS = 10  # number of folds the data is split into
SPLIT_STRATEGY = 'StratifiedKFold'  # strategy used to split train/test data

# Display names and estimators, kept in matching order.
# Each estimator receives random_state=SEED so that its internal randomness
# is fixed across runs instead of changing on every execution.
NAMES = ['SVM', 'Random Forest', 'AdaBoost', 'XGBoost']
CLASSIFIERS = [
    SVC(probability=True, random_state=SEED),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    XGBClassifier(learning_rate=0.02, n_estimators=600, silent=True, nthread=1, random_state=SEED),
]

### Load data

In [3]:
# Load the prepared dataset; the first CSV column is used as the row index.
df = pd.read_csv('prepared_dataset.csv', index_col=0)

# Features are the columns 'Sex' through 'Age 90+'; labels are the columns
# 'Basosquamous' through 'Low Risk' (label-based slices, both ends inclusive).
# Explicit copies are taken so that later column assignments modify these
# frames themselves rather than a slice of df (SettingWithCopyWarning).
train_df = df.loc[:, 'Sex':'Age 90+'].copy()
labels_df = df.loc[:, 'Basosquamous':'Low Risk'].copy()

In [4]:
# Preview the first rows of the feature frame.
train_df.head()

Unnamed: 0,Sex,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,...,Upper Extremities,Lower Extremities,Age 0-30,Age 30-40,Age 40-50,Age 50-60,Age 60-70,Age 70-80,Age 80-90,Age 90+
0,0,0,1,0,1,0,1,1,1,0.0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,1,0,0,1.0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,1,1,0,0,0,1.0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0,1.0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,1,0,1,1.0,...,0,0,0,1,0,0,0,0,0,0


In [5]:
# Summarise the feature columns: non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 842 entries, 0 to 841
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Sex                842 non-null    int64  
 1   Feature 0          842 non-null    int64  
 2   Feature 1          842 non-null    int64  
 3   Feature 2          842 non-null    int64  
 4   Feature 3          842 non-null    int64  
 5   Feature 4          842 non-null    int64  
 6   Feature 5          842 non-null    int64  
 7   Feature 6          842 non-null    int64  
 8   Feature 7          842 non-null    int64  
 9   Feature 8          842 non-null    float64
 10  Feature 9          842 non-null    int64  
 11  Head/Neck          842 non-null    int64  
 12  Trunk              842 non-null    int64  
 13  Upper Extremities  842 non-null    int64  
 14  Lower Extremities  842 non-null    int64  
 15  Age 0-30           842 non-null    int64  
 16  Age 30-40          842 non

In [6]:
# Convert 'Feature 8' to an integer dtype.
# Guard the cast first: astype(int) would silently truncate fractional values.
assert (train_df['Feature 8'] % 1 == 0).all(), "'Feature 8' contains non-integer values"
# Build a new frame rather than assigning into a column of a frame that was
# sliced out of df; this avoids pandas' SettingWithCopyWarning.
train_df = train_df.astype({'Feature 8': int})
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 842 entries, 0 to 841
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Sex                842 non-null    int64
 1   Feature 0          842 non-null    int64
 2   Feature 1          842 non-null    int64
 3   Feature 2          842 non-null    int64
 4   Feature 3          842 non-null    int64
 5   Feature 4          842 non-null    int64
 6   Feature 5          842 non-null    int64
 7   Feature 6          842 non-null    int64
 8   Feature 7          842 non-null    int64
 9   Feature 8          842 non-null    int32
 10  Feature 9          842 non-null    int64
 11  Head/Neck          842 non-null    int64
 12  Trunk              842 non-null    int64
 13  Upper Extremities  842 non-null    int64
 14  Lower Extremities  842 non-null    int64
 15  Age 0-30           842 non-null    int64
 16  Age 30-40          842 non-null    int64
 17  Age 40-50       

In [7]:
# Preview the first rows of the label frame.
labels_df.head()

Unnamed: 0,Basosquamous,Infiltrating,Micronodular,Morphea,Superficial,Low Risk
0,0,0,0,0,0,1
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,1,0
4,0,0,0,0,0,1


In [8]:
# Summarise the label columns: non-null counts and dtypes.
labels_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 842 entries, 0 to 841
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Basosquamous  842 non-null    int64
 1   Infiltrating  842 non-null    int64
 2   Micronodular  842 non-null    int64
 3   Morphea       842 non-null    int64
 4   Superficial   842 non-null    int64
 5   Low Risk      842 non-null    int64
dtypes: int64(6)
memory usage: 46.0 KB


### Create a second label dataframe by combining the high-risk subtypes

In [9]:
# The first four label columns (Basosquamous, Infiltrating, Micronodular,
# Morphea) are merged into a single 'High Risk' class.
cols = labels_df.columns[0:4].values

# A sample is High Risk when any of those columns equals 1; the remaining two
# label columns are carried over unchanged.
reduced_labels_df = pd.DataFrame({
    'High Risk': labels_df[cols].eq(1).any(axis=1).astype(int),
    'Superficial': labels_df['Superficial'],
    'Low Risk': labels_df['Low Risk'],
})
reduced_labels_df.describe()

Unnamed: 0,High Risk,Superficial,Low Risk
count,842.0,842.0,842.0
mean,0.133017,0.225653,0.64133
std,0.339795,0.41826,0.479895
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,1.0
75%,0.0,0.0,1.0
max,1.0,1.0,1.0


### Split the data into k folds and evaluate each classifier

In [14]:
# Scratch check of the column-wise averaging idiom: zip(*rows) transposes the
# nested list, each column is summed, and the total is divided by 2.
list1 = [[1, 2, 3, 4], [5, 6, 7, 8]]
[column_total / 2 for column_total in map(sum, zip(*list1))]

[3.0, 4.0, 5.0, 6.0]

In [16]:
# Collapse the one-hot label frame into one integer class per sample:
# 0 = 'Low Risk', 1 = 'Superficial', 2 = any other column (i.e. 'High Risk').
reduced_labels = reduced_labels_df.idxmax(axis=1)
reduced_labels = reduced_labels.apply(lambda x: 0 if x == 'Low Risk' else (1 if x == 'Superficial' else 2))

# Per-model mean and standard deviation of every metric across the folds.
scores = dict()
stdev = dict()

for name, clf in zip(NAMES, CLASSIFIERS):
    print(f"\nModel {name}")
    print("==================")
    # split_data comes from the project's utils module; it is iterated below as
    # ((train_idx, test_idx), i) pairs. The split strategy is taken from the
    # config constant instead of being repeated here as a literal.
    skf = split_data(train_df, reduced_labels, split_strategy=SPLIT_STRATEGY, n_splits=SPLITS, shuffle=True, random_state=SEED)
    fold_scores = []
    for (train_idx, test_idx), i in skf:
        X_train, y_train = train_df.iloc[train_idx], reduced_labels.iloc[train_idx]
        X_test, y_test = train_df.iloc[test_idx], reduced_labels.iloc[test_idx]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        fold_scores.append(get_metrics(y_test, y_pred, verbose=False, filename=None))

    # Transpose fold_scores so each entry holds one metric's values over all
    # folds, then average over the folds actually evaluated (not a fixed SPLITS).
    per_metric = list(zip(*fold_scores))
    scores[name] = [sum(values) / len(values) for values in per_metric]
    stdev[name] = [np.std(values) for values in per_metric]
    # NOTE(review): the labels below assume get_metrics returns its metrics in
    # this order — confirm against utils.
    print("Averages of: Precision | Recall | F1 score | Accuracy | Balanced Accuracy")
    print(scores[name])
    print("Standard Deviation of: Precision | Recall | F1 score | Accuracy | Balanced Accuracy")
    print(stdev[name])



Model SVM
Averages of: Precision | Recall | F1 score |Accuracy | Balanced Accuracy
[0.5995589671784274, 0.6864425770308125, 0.6175092661743735, 0.6864425770308125, 0.4386614684860299]
Standard Deviation of: Precision | Recall | F1 score |Accuracy | Balanced Accuracy
[0.034541341525979145, 0.024697464854146742, 0.024175239161867503, 0.024697464854146742, 0.02746271321589327]
Model Random Forest
Averages of: Precision | Recall | F1 score |Accuracy | Balanced Accuracy
[0.5666873507661787, 0.6531932773109245, 0.5419050222239001, 0.6531932773109245, 0.36111111111111105]
Standard Deviation of: Precision | Recall | F1 score |Accuracy | Balanced Accuracy
[0.05280665400397441, 0.014115646116703223, 0.021049159670088604, 0.014115646116703223, 0.017788377357967466]
Model AdaBoost
Averages of: Precision | Recall | F1 score |Accuracy | Balanced Accuracy
[0.5836791828436233, 0.667422969187675, 0.6166470542134627, 0.667422969187675, 0.456214188670329]
Standard Deviation of: Precision | Recall | F1 sc