In [1]:
import pandas as pd
import numpy as np
import sklearn

wbcd_dataset = pd.read_csv('./dataset/wbcd.data', header=None)
wdbc_dataset = pd.read_csv('./dataset/wdbc.data', header=None)

random_state = 0

In [2]:
wbcd_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
wdbc_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Data preprocessing

In [4]:
# wbcd_dataset
wbcd_dataset = wbcd_dataset.drop(0, axis=1) # drop the id column
# if record contains ? value for any column (feature incomplete), delete the record
incomplete_records = []
for index, row in wbcd_dataset.iterrows():
    if '?' in row.values:
        incomplete_records.append(index)
wbcd_dataset = wbcd_dataset.drop(incomplete_records, axis=0)
print(f'removed {len(incomplete_records)} incomplete records: {incomplete_records}')

removed 16 incomplete records: [23, 40, 139, 145, 158, 164, 235, 249, 275, 292, 294, 297, 315, 321, 411, 617]


In [7]:
# wbcd partitioning
# 50-50
train_50 = wbcd_dataset.sample(frac=0.5, random_state=random_state)
test_50 = wbcd_dataset.drop(train_50.index)
# 60-40
train_60 = wbcd_dataset.sample(frac=0.6, random_state=random_state)
test_60 = wbcd_dataset.drop(train_60.index)
# 10-CV
train_10cv = wbcd_dataset.copy()
test_10cv = []
for i in range(10):
    test_10cv.append(train_10cv.sample(frac=0.1, random_state=(random_state+i)))

wbcd_partitioned = {
    '50-50': {
        'train': train_50,
        'test': test_50
    },
    '60-40': {
        'train': train_60,
        'test': test_60
    },
    '10-CV': {
        'train': train_10cv,
        'test': test_10cv
    }
}

def show_wbcd_statistic_data(dataset) -> None:
    print(f'number of records: {len(dataset)}')
    print(f'B: {len(dataset[dataset[10] == 2])}')
    print(f'M: {len(dataset[dataset[10] == 4])}')
    
for key, val in wbcd_partitioned.items():
    if key == '10-CV':
        print(f'10-CV')
        for i in range(10):
            print(f'fold {i+1}')
            show_wbcd_statistic_data(val['test'][i])
    else:
        print(key)
        print('Train set')
        show_wbcd_statistic_data(val['train'])
        print('Test set')
        show_wbcd_statistic_data(val['test'])

50-50
Train set
number of records: 342
B: 219
M: 123
Test set
number of records: 341
B: 225
M: 116
60-40
Train set
number of records: 410
B: 268
M: 142
Test set
number of records: 273
B: 176
M: 97
10-CV
fold 1
number of records: 68
B: 46
M: 22
fold 2
number of records: 68
B: 43
M: 25
fold 3
number of records: 68
B: 42
M: 26
fold 4
number of records: 68
B: 38
M: 30
fold 5
number of records: 68
B: 45
M: 23
fold 6
number of records: 68
B: 46
M: 22
fold 7
number of records: 68
B: 43
M: 25
fold 8
number of records: 68
B: 44
M: 24
fold 9
number of records: 68
B: 42
M: 26
fold 10
number of records: 68
B: 37
M: 31


## Basic SVM

In [8]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

def basic_svm_fit(partition: dict, sigma = None) -> tuple[float, float, float, float]:
    train = partition['train']
    test = partition['test']
    train_x = train.drop(10, axis=1)
    train_y = train[10]
    test_x = test.drop(10, axis=1)
    test_y = test[10]
    if sigma is not None:
        gamma = 1 / (sigma**2)
        svm = SVC(kernel='rbf', random_state=random_state, gamma=gamma)
    else:
        svm = SVC(kernel='rbf', random_state=random_state)
    svm.fit(train_x, train_y)
    pred_y = svm.predict(test_x)
    acc = accuracy_score(test_y, pred_y)
    tp, fp, fn, tn = confusion_matrix(test_y, pred_y).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    auc = roc_auc_score(test_y, pred_y)
    return acc, sensitivity, specificity, auc

# def basic_svm_fit_10cv(partition: dict) -> tuple[float, float, float, float]:

print('50-50')
acc, sensitivity, specificity, auc = basic_svm_fit(wbcd_partitioned['50-50'])
print(f'accuracy: {acc}, sensitivity: {sensitivity}, specificity: {specificity}, auc: {auc}')

50-50
accuracy: 0.9794721407624634, sensitivity: 0.9866071428571429, specificity: 0.9658119658119658, auc: 0.9781800766283524


## Whales Optimization Algorithm

In [None]:
# Initialize the parameters
# a. Foraging of prey
population_size = 50
max_iteration = 50
num_classes = 2