In [1]:
# if "Intel" in cpuinfo.get_cpu_info()['brand_raw']:
from sklearnex import patch_sklearn
patch_sklearn()

import os
os.chdir("..")

import random
from multiprocessing.dummy import Pool
from pathlib import Path
from typing import Counter

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from skin_lesion_cad.data.BOVW import (BagofWords, ColorDescriptor,
                                       DenseDescriptor, DescriptorsTransformer,
                                       LBPDescriptor)
from skin_lesion_cad.features.colour import (ColorFeaturesDescriptor,
                                             ColorFeaturesExtractor)
from skin_lesion_cad.features.texture import get_glcm, glcm_features, lbph
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics import (accuracy_score, classification_report,
                             cohen_kappa_score, confusion_matrix, f1_score,
                             precision_score)
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.svm import SVC
from tqdm import tqdm

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
KP_SIZE = 25
color_spaces = {'bgr':cv2.COLOR_RGB2BGR, 'hsv':cv2.COLOR_RGB2HSV, 'YCrCb':cv2.COLOR_RGB2YCrCb}

root_path = Path().resolve()

chall2_train = root_path/"data/processed/chall2/train"
mel_imgs_all = np.array(list((chall2_train/'mel').glob("*inpaint_1.png")))
bcc_imgs_all = np.array(list((chall2_train/'bcc').glob("*inpaint_1.png")))
scc_imgs_all = np.array(list((chall2_train/'scc').glob("*inpaint_1.png")))
test_imgs_train = np.concatenate([mel_imgs_all, bcc_imgs_all, scc_imgs_all])

chall2_val = root_path/"data/processed/chall2/val"
mel_imgs_all = np.array(list((chall2_val/'mel').glob("*inpaint_1.png")))
bcc_imgs_all = np.array(list((chall2_val/'bcc').glob("*inpaint_1.png")))
scc_imgs_all = np.array(list((chall2_val/'scc').glob("*inpaint_1.png")))
test_imgs_val = np.concatenate([mel_imgs_all, bcc_imgs_all, scc_imgs_all])

chall2_test = root_path/"data/processed/chall2"
test_imgs_TEST = np.array(list((chall2_test/'testX').glob("*inpaint_1.png")))

print(f'Train images {len(test_imgs_train)}\nValidation images {len(test_imgs_val)}')
print(f'Test images {len(test_imgs_TEST)}')

Train images 5082
Validation images 1270
Test images 2121


## Getting Color Descriptors for Images for BoW

Either rung the section below or load the saved pickle file.from one below it.

### Calculate features from scratch

Use gaussian random sampling from the whole image with the centre in the image centre

In [3]:
# define descriptors used for BoW
# use gaussian sampling
dense_desc = DenseDescriptor(descriptor=None, min_keypoints=100,
                             max_keypoints=500, kp_size=KP_SIZE,
                             sample_method='gaussian')


dense_color = ColorDescriptor(dense_desc, color_spaces, meanshift=None,
                              min_keypoints=100, max_keypoints=500,
                              kp_size=25,
                              sample_method='gaussian')

In [4]:
def _load_and_extract_des_color(x):
    image_path, descriptor = x
    im = cv2.imread(str(image_path))
    fov = cv2.imread(str(image_path).replace('inpaint', 'fov'), cv2.IMREAD_GRAYSCALE)
    mask = 255*(fov == 0).astype(np.uint8)
    mcx, mcy = mask.shape[0]//2, mask.shape[1]//2
    if not np.any(mask[mcx - 10:mcx + 10, mcy - 10:mcy + 10]):
        mask = np.zeros(mask.shape, dtype=np.uint8)
        mask[im.shape[0]//4:im.shape[0]*3//4, im.shape[1]//4:im.shape[1]*3//4] = 255
    if 'bcc' in str(image_path):
        img_cls = 1
    elif 'mel' in str(image_path):
        img_cls = 0
    else:
        img_cls = 2
        
    _, des = descriptor.detectAndCompute(im, mask)
    return (des, img_cls, image_path.name)

In [5]:
descriptors_color_train = []
img_classes_color_train = []
img_names_color_train = []

# COLOR FOR TRAIN
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(_load_and_extract_des_color,
                                       zip(test_imgs_train,
                                           [dense_color]*len(test_imgs_train))), total=len(test_imgs_train)):
        descriptors_color_train.append(des)
        img_classes_color_train.append(img_cls)
        img_names_color_train.append(img_name)
        
pd.to_pickle((descriptors_color_train, img_classes_color_train, img_names_color_train), 'descriptors_color_train_all.pkl')

  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
100%|██████████| 5082/5082 [30:41<00:00,  2.76it/s]  


In [6]:
descriptors_color_val = []
img_classes_color_val = []
img_names_color_val = []

# COLOR FOR VALIDATION
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(_load_and_extract_des_color,
                                       zip(test_imgs_val,
                                           [dense_color]*len(test_imgs_val))), total=len(test_imgs_val)):
        descriptors_color_val.append(des)
        img_classes_color_val.append(img_cls)
        img_names_color_val.append(img_name)
        
pd.to_pickle((descriptors_color_val, img_classes_color_val, img_names_color_val), 'descriptors_color_val_all.pkl')

100%|██████████| 1270/1270 [07:47<00:00,  2.71it/s]


In [7]:
descriptors_color_TEST = []
img_classes_color_TEST = []
img_names_color_TEST = []

# COLOR FOR TESTIDATION
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(_load_and_extract_des_color,
                                       zip(test_imgs_TEST,
                                           [dense_color]*len(test_imgs_TEST))), total=len(test_imgs_TEST)):
        descriptors_color_TEST.append(des)
        img_classes_color_TEST.append(img_cls)
        img_names_color_TEST.append(img_name)
        
pd.to_pickle((descriptors_color_TEST, img_classes_color_TEST, img_names_color_TEST), 'descriptors_color_TESTX.pkl')

100%|██████████| 2121/2121 [22:17<00:00,  1.59it/s]


### Load the dump

In [8]:
# descriptors_color_train, img_classes_color_train, img_names_color_train = pd.read_pickle('descriptors_color_train_all.pkl')
# descriptors_color_val, img_classes_color_val, img_names_color_val = pd.read_pickle('descriptors_color_val_all.pkl')
# descriptors_color_TEST, img_classes_color_TEST, img_names_color_TEST = pd.read_pickle('descriptors_color_TESTX.pkl')

# print(len(descriptors_color_train), len(descriptors_color_val), len(descriptors_color_TEST))

## Creating and calculating BoW

In [9]:
descriptors_color_train = np.asarray(descriptors_color_train)
descriptors_color_val = np.asarray(descriptors_color_val)


# Fitting and transforming the descriptors to a BoW model for train and val evaluation first
bow_color = BagofWords(n_words=100, n_jobs=-1, random_state=42)
train_BoWed = bow_color.fit_transform(descriptors_color_train, img_classes_color_train) 
val_BoWed = bow_color.transform(descriptors_color_val)

# creating datafrains that contain the BoW features and the image class for all images
bow_train = pd.DataFrame(train_BoWed.toarray(), columns=[f'bow_{i}' for i in range(train_BoWed.shape[1])])
bow_val = pd.DataFrame(val_BoWed.toarray(), columns=[f'bow_{i}' for i in range(val_BoWed.shape[1])])

bow_train['class'] = img_classes_color_train
bow_val['class'] = img_classes_color_val

bow_train['name'] = img_names_color_train
bow_val['name'] = img_names_color_val


# classification and evaluation
svc = SVC(kernel='rbf', C=1, random_state=42, probability=False, class_weight='balanced')
svc.fit(bow_train.drop(['class', 'name'], axis=1), bow_train['class'])
y_pred = svc.predict(bow_val.drop(['class', 'name'], axis=1))
y_pred_train = svc.predict(bow_train.drop(['class', 'name'], axis=1))

print(confusion_matrix(bow_val['class'], y_pred))
print('Train f1_weighted',f1_score(bow_train['class'], y_pred_train, average='weighted'))
print('Validation f1_weighted', f1_score(bow_val['class'], y_pred, average='weighted'))
print('Train kappa',cohen_kappa_score(bow_train['class'], y_pred_train))
print('Validation kappa', cohen_kappa_score(bow_val['class'], y_pred))

  descriptors_color_train = np.asarray(descriptors_color_train)
  descriptors_color_val = np.asarray(descriptors_color_val)


[[549 128   1]
 [ 92 405   1]
 [ 34  60   0]]
Train f1_weighted 0.7814170329131336
Validation f1_weighted 0.7243712470219058
Train kappa 0.6368262876137128
Validation kappa 0.5332120574866124


Making final BOW on train+val

In [17]:
ALL_descrps = np.concatenate([descriptors_color_train, descriptors_color_val])
ALL_img_classes = np.hstack([img_classes_color_train, img_classes_color_val])

print(descriptors_color_train.shape, descriptors_color_val.shape, ALL_descrps.shape, ALL_img_classes.shape)

bow_color = BagofWords(n_words=100, n_jobs=-1, random_state=42)
train_BoWed = bow_color.fit_transform(ALL_descrps, ALL_img_classes) 


TEST_BoWed = bow_color.transform(descriptors_color_TEST)
bow_TEST = pd.DataFrame(TEST_BoWed.toarray(), columns=[f'bow_{i}' for i in range(val_BoWed.shape[1])])
bow_TEST['name'] = img_names_color_TEST

(5082,) (1270,) (6352,) (6352,)


## Extracting color features (global)

In [18]:
def extract_color(image_path):
    cfe = ColorFeaturesExtractor(color_spaces)
    im = cv2.imread(str(image_path))  
    
    # im = im[im.shape[0]//4:im.shape[0]*3//4, im.shape[1]//4:im.shape[1]*3//4, :]
    fov = cv2.imread(str(image_path).replace('inpaint', 'fov'), cv2.IMREAD_GRAYSCALE)
    mask = 255*(fov == 0).astype(np.uint8)
    mcx, mcy = mask.shape[0]//2, mask.shape[1]//2
    if not np.any(mask[mcx - 10:mcx + 10, mcy - 10:mcy + 10]):
        mask = np.zeros(mask.shape, dtype=np.uint8)
        mask[im.shape[0]//4:im.shape[0]*3//4, im.shape[1]//4:im.shape[1]*3//4] = 255

    if 'bcc' in str(image_path):
        img_cls = 1
    elif 'mel' in str(image_path):
        img_cls = 0
    else:
        img_cls = 2

    des = cfe.extract_masked(im, mask)

    return (des, img_cls, image_path.name)

In [19]:
colf_train = []
colc_train = []
colnam_train = []
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(extract_color,
                                       test_imgs_train), total=len(test_imgs_train)):
        colf_train.append(des)
        colc_train.append(img_cls)
        colnam_train.append(img_name)

100%|██████████| 5082/5082 [12:29<00:00,  6.78it/s]


In [20]:
colf_val = []
colc_val = []
colnam_val = []
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(extract_color,
                                       test_imgs_val), total=len(test_imgs_val)):
        colf_val.append(des)
        colc_val.append(img_cls)
        colnam_val.append(img_name)

100%|██████████| 1270/1270 [02:13<00:00,  9.49it/s]


In [21]:
colf_TEST = []
colc_TEST = []
colnam_TEST = []
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(extract_color,
                                       test_imgs_TEST), total=len(test_imgs_TEST)):
        colf_TEST.append(des)
        colc_TEST.append(img_cls)
        colnam_TEST.append(img_name)

100%|██████████| 2121/2121 [03:44<00:00,  9.44it/s]


In [22]:
color_df_train = pd.DataFrame(colf_train)
color_df_train['name'] = colnam_train
color_df_train['class'] = colc_train

color_df_val = pd.DataFrame(colf_val)
color_df_val['name'] = colnam_val
color_df_val['class'] = colc_val

color_df_TEST = pd.DataFrame(colf_TEST)
color_df_TEST['name'] = colnam_TEST

# Evaluating global color features performance
scaler = DescriptorsTransformer(imputation=None)
X_train = scaler.fit_transform(color_df_train.drop(['class', 'name'], axis=1))
X_test = scaler.transform(color_df_val.drop(['class', 'name'], axis=1))

svc = SVC(kernel='rbf', C=1, random_state=42, probability=False, class_weight='balanced')
svc.fit(X_train, color_df_train['class'])
y_pred = svc.predict(X_test)
y_pred_train = svc.predict(X_train)

print(confusion_matrix(color_df_val['class'], y_pred))
print('Train f1_weighted',f1_score(color_df_train['class'], y_pred_train, average='weighted'))
print('Validation f1_weighted', f1_score(color_df_val['class'], y_pred, average='weighted'))
print('Train kappa',cohen_kappa_score(color_df_train['class'], y_pred_train))
print('Validation kappa', cohen_kappa_score(color_df_val['class'], y_pred))

[[613  64   1]
 [ 64 433   1]
 [ 29  59   6]]
Train f1_weighted 0.8229139027603319
Validation f1_weighted 0.8038029645253443
Train kappa 0.7129753255194526
Validation kappa 0.6767889496983395


```
Preprocessed 

[[435  61   2]
 [ 62 616   0]
 [ 57  31   6]]
Train f1_weighted 0.8190181201191215
Validation f1_weighted 0.8075351154802809
Train kappa 0.706336521578109
Validation kappa 0.6840692709452578

```

## Extracting texture features

### GLCM

In [24]:
def extract_text(image_path):
    im = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
    
    fov = cv2.imread(str(image_path).replace('inpaint', 'fov'), cv2.IMREAD_GRAYSCALE)
    mask = 255*(fov == 0).astype(np.uint8)
    
    fov_mask = 255*(fov == 0).astype(np.uint8)
    mcx, mcy = mask.shape[0]//2, mask.shape[1]//2
    if not np.any(fov_mask[mcx - 10:mcx + 10, mcy - 10:mcy + 10]):
        mask = np.zeros(mask.shape, dtype=np.uint8)
        mask[im.shape[0]//4:im.shape[0]*3//4, im.shape[1]//4:im.shape[1]*3//4] = 255
    
    if np.any(fov):
        cpoint_x, cpointy = np.nonzero(mask)
        im = im[min(cpoint_x):max(cpoint_x), min(cpointy):max(cpointy)]


    if 'bcc' in str(image_path):
        img_cls = 1
    elif 'mel' in str(image_path):
        img_cls = 0
    else:
        img_cls = 2

    des = glcm_features(im)

    return (des, img_cls, image_path.name)

txtf_train = []
txtcl_train = []
txtnam_train = []
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(extract_text,
                                       test_imgs_train), total=len(test_imgs_train)):
        txtf_train.append(des)
        txtcl_train.append(img_cls)
        txtnam_train.append(img_name)
        
txtf_val = []
txtcl_val = []
txtnam_val = []
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(extract_text,
                                       test_imgs_val), total=len(test_imgs_val)):
        txtf_val.append(des)
        txtcl_val.append(img_cls)
        txtnam_val.append(img_name)
        
txtf_TEST = []
txtcl_TEST = []
txtnam_TEST = []
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(extract_text,
                                       test_imgs_TEST), total=len(test_imgs_TEST)):
        txtf_TEST.append(des)
        txtcl_TEST.append(img_cls)
        txtnam_TEST.append(img_name)

100%|██████████| 5082/5082 [07:27<00:00, 11.37it/s]
100%|██████████| 1270/1270 [01:51<00:00, 11.39it/s]
100%|██████████| 2121/2121 [03:08<00:00, 11.26it/s]


In [25]:
# unrevelling the dataframe
txtf_exp_train = []
for i in range(len(txtf_train)):
    res = dict()
    for feat in txtf_train[i].keys():
        for featvalidx, featval in enumerate(txtf_train[i][feat][0]):
            res[f'{feat}_{featvalidx}'] = featval
    txtf_exp_train.append(res)
    
txtf_exp_val = []
for i in range(len(txtf_val)):
    res = dict()
    for feat in txtf_val[i].keys():
        for featvalidx, featval in enumerate(txtf_val[i][feat][0]):
            res[f'{feat}_{featvalidx}'] = featval
    txtf_exp_val.append(res)

    
txtf_exp_TEST = []
for i in range(len(txtf_TEST)):
    res = dict()
    for feat in txtf_TEST[i].keys():
        for featTESTidx, featTEST in enumerate(txtf_TEST[i][feat][0]):
            res[f'{feat}_{featTESTidx}'] = featTEST
    txtf_exp_TEST.append(res)


glcm_df_train = pd.DataFrame(txtf_exp_train)
glcm_df_train['name'] = txtnam_train
glcm_df_train['class'] = txtcl_train

glcm_df_val = pd.DataFrame(txtf_exp_val)
glcm_df_val['name'] = txtnam_val
glcm_df_val['class'] = txtcl_val

glcm_df_TEST = pd.DataFrame(txtf_exp_TEST)
glcm_df_TEST['name'] = txtnam_TEST

In [26]:
scaler = DescriptorsTransformer(None)
X_train = scaler.fit_transform(glcm_df_train.drop(['class', 'name'], axis=1))
X_test = scaler.transform(glcm_df_val.drop(['class', 'name'], axis=1))

svc = SVC(kernel='rbf', C=1, random_state=42, probability=False, class_weight='balanced')
svc.fit(X_train, glcm_df_train['class'])
y_pred = svc.predict(X_test)
y_pred_train = svc.predict(X_train)

print(confusion_matrix(glcm_df_val['class'], y_pred))
print('Train f1_weighted',f1_score(glcm_df_train['class'], y_pred_train, average='weighted'))
print('Validation f1_weighted', f1_score(glcm_df_val['class'], y_pred, average='weighted'))
print('Train kappa',cohen_kappa_score(glcm_df_train['class'], y_pred_train))
print('Validation kappa', cohen_kappa_score(glcm_df_val['class'], y_pred))

[[537 141   0]
 [163 335   0]
 [ 39  55   0]]
Train f1_weighted 0.6711111416221535
Validation f1_weighted 0.6599522920162395
Train kappa 0.4243155292983415
Validation kappa 0.40353071676382424


Processed

```
[[347 151   0]
 [147 531   0]
 [ 60  34   0]]
Train f1_weighted 0.6709994708774802
Validation f1_weighted 0.6653965548931365
Train kappa 0.42462073093515984
Validation kappa 0.41537883413969656
```

### LBP

In [28]:
def extract_lbp(image_path):
    im = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
    
    # select central half of the image
    # im = im[im.shape[0]//4:im.shape[0]*3//4, im.shape[1]//4:im.shape[1]*3//4]
    
    fov = cv2.imread(str(image_path).replace('inpaint', 'fov'), cv2.IMREAD_GRAYSCALE)
    mask = 255*(fov == 0).astype(np.uint8)
    
    fov_mask = 255*(fov == 0).astype(np.uint8)
    mcx, mcy = mask.shape[0]//2, mask.shape[1]//2
    if not np.any(fov_mask[mcx - 10:mcx + 10, mcy - 10:mcy + 10]):
        mask = np.zeros(mask.shape, dtype=np.uint8)
        mask[im.shape[0]//4:im.shape[0]*3//4, im.shape[1]//4:im.shape[1]*3//4] = 255
    
    if np.any(fov):
        cpoint_x, cpointy = np.nonzero(mask)
        im = im[min(cpoint_x):max(cpoint_x), min(cpointy):max(cpointy)]

    if 'bcc' in str(image_path):
        img_cls = 1
    elif 'mel' in str(image_path):
        img_cls = 0
    else:
        img_cls = 2

    des = lbph(im)

    return (des, img_cls, image_path.name)

txtf_train = []
txtcl_train = []
txtnam_train = []
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(extract_lbp,
                                       test_imgs_train), total=len(test_imgs_train)):
        txtf_train.append(des)
        txtcl_train.append(img_cls)
        txtnam_train.append(img_name)
        
txtf_val = []
txtcl_val = []
txtnam_val = []
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(extract_lbp,
                                       test_imgs_val), total=len(test_imgs_val)):
        txtf_val.append(des)
        txtcl_val.append(img_cls)
        txtnam_val.append(img_name)
        
   
txtf_TEST = []
txtcl_TEST = []
txtnam_TEST = []
with Pool(8) as pool:
    for (des, img_cls, img_name) in tqdm(pool.imap(extract_lbp,
                                       test_imgs_TEST), total=len(test_imgs_TEST)):
        txtf_TEST.append(des)
        txtcl_TEST.append(img_cls)
        txtnam_TEST.append(img_name)

100%|██████████| 5082/5082 [32:16<00:00,  2.62it/s]
100%|██████████| 1270/1270 [10:12<00:00,  2.07it/s]
100%|██████████| 2121/2121 [30:03<00:00,  1.18it/s]


In [29]:
lbp_df_train = pd.DataFrame(txtf_train, columns=[f'lbp_{i}' for i in range(len(txtf_train[0]))])
lbp_df_train['name'] = txtnam_train
lbp_df_train['class'] = txtcl_train

lbp_df_val = pd.DataFrame(txtf_val, columns=[f'lbp_{i}' for i in range(len(txtf_train[0]))])
lbp_df_val['name'] = txtnam_val
lbp_df_val['class'] = txtcl_val

lbp_df_TEST = pd.DataFrame(txtf_TEST, columns=[f'lbp_{i}' for i in range(len(txtf_train[0]))])
lbp_df_TEST['name'] = txtnam_TEST

In [30]:
scaler = DescriptorsTransformer(None)
X_train = scaler.fit_transform(lbp_df_train.drop(['class', 'name'], axis=1))
X_test = scaler.transform(lbp_df_val.drop(['class', 'name'], axis=1))

# X_train = lbp_df_train.drop(['class', 'name'], axis=1)
# X_test = lbp_df_val.drop(['class', 'name'], axis=1)

svc = SVC(kernel='rbf', C=1, random_state=42, probability=False, class_weight='balanced')
svc.fit(X_train, lbp_df_train['class'])
y_pred = svc.predict(X_test)
y_pred_train = svc.predict(X_train)

print(confusion_matrix(lbp_df_val['class'], y_pred))
print('Train f1_weighted',f1_score(lbp_df_train['class'], y_pred_train, average='weighted'))
print('Validation f1_weighted', f1_score(lbp_df_val['class'], y_pred, average='weighted'))
print('Train kappa',cohen_kappa_score(lbp_df_train['class'], y_pred_train))
print('Validation kappa', cohen_kappa_score(lbp_df_val['class'], y_pred))

[[552 126   0]
 [183 315   0]
 [ 49  45   0]]
Train f1_weighted 0.6625153147977768
Validation f1_weighted 0.6541886421966431
Train kappa 0.4060556066606835
Validation kappa 0.3902087404089025


# Merging BoW and global faetures

In [31]:
merged_df_train = pd.merge(color_df_train.drop(columns=['class']), glcm_df_train, on='name', how='inner')
merged_df_train = pd.merge(merged_df_train, lbp_df_train.drop(columns=['class']), on='name', how='inner').set_index('name')
merged_df_train = pd.merge(merged_df_train, bow_train.drop(columns=['class']), on='name', how='inner').set_index('name')

merged_df_val = pd.merge(color_df_val.drop(columns=['class']), glcm_df_val, on='name', how='inner')
merged_df_val = pd.merge(merged_df_val, lbp_df_val.drop(columns=['class']), on='name', how='inner').set_index('name')
merged_df_val = pd.merge(merged_df_val, bow_val.drop(columns=['class']), on='name', how='inner').set_index('name')

merged_df_TEST = pd.merge(color_df_TEST, glcm_df_TEST, on='name', how='inner')
merged_df_TEST = pd.merge(merged_df_TEST, lbp_df_TEST, on='name', how='inner').set_index('name')
merged_df_TEST = pd.merge(merged_df_TEST, bow_TEST, on='name', how='inner').set_index('name')


In [36]:
merged_df_TEST.to_pickle('merged_df_TEST.pkl')
merged_df_val.to_pickle('merged_df_val.pkl')
merged_df_train.to_pickle('merged_df_train.pkl')

In [33]:
merged_df_val

Unnamed: 0_level_0,bgr_b_mean,bgr_b_std,bgr_b_skew,bgr_b_kurt,bgr_b_max,bgr_b_min,bgr_b_entrp,bgr_b_unq,bgr_g_mean,bgr_g_std,...,bow_90,bow_91,bow_92,bow_93,bow_94,bow_95,bow_96,bow_97,bow_98,bow_99
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mel02863_inpaint_1.png,213.906067,22.297472,-1.648822,3.045142,255.0,82.0,13.080577,164,116.886780,44.209274,...,0.000000,0.000000,0.381687,0.0,0.050918,0.000000,0.0,0.0,0.0,0.056047
mel02970_inpaint_1.png,159.960159,8.882860,-7.665990,111.063321,221.0,8.0,13.451581,213,135.450317,8.764379,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
mel03315_inpaint_1.png,190.249619,18.089682,-2.055556,4.938194,224.0,33.0,13.844849,189,160.242584,25.798092,...,0.110595,0.049976,0.000000,0.0,0.000000,0.084368,0.0,0.0,0.0,0.000000
mel02862_inpaint_1.png,158.277832,58.892693,-0.028834,-1.333697,255.0,23.0,13.715636,231,139.639923,59.441696,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
mel03012_inpaint_1.png,176.563614,15.976261,-1.037247,1.644206,214.0,75.0,13.858693,138,149.150848,24.324524,...,0.000000,0.000000,0.000000,0.0,0.000000,0.215282,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
scc00413_inpaint_1.png,203.829132,21.426785,-2.228764,7.141194,252.0,45.0,12.500071,193,174.749237,36.083839,...,0.000000,0.076279,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
scc00377_inpaint_1.png,192.310074,24.545528,-1.447687,4.134878,234.0,20.0,12.437281,215,155.088867,23.281540,...,0.000000,0.150171,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
scc00420_inpaint_1.png,166.541733,14.910087,-1.720465,8.061281,226.0,22.0,13.858644,202,99.227730,22.154819,...,0.000000,0.071286,0.037736,0.0,0.035239,0.000000,0.0,0.0,0.0,0.659405
scc00445_inpaint_1.png,211.114243,15.796206,-0.519468,0.087055,255.0,113.0,12.503333,129,152.496689,17.798973,...,0.000000,0.045371,0.000000,0.0,0.639199,0.000000,0.0,0.0,0.0,0.000000


Evaluating the performance of merged features

In [38]:
X_train = merged_df_train.drop(columns=['class'])
X_test = merged_df_val.drop(columns=['class'])
y_train = merged_df_train['class']
y_test = merged_df_val['class']

scaler = DescriptorsTransformer(None)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
svc = SVC(kernel='rbf', C=1, random_state=42, probability=True, class_weight='balanced')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
y_pred_train = svc.predict(X_train)

print(confusion_matrix(glcm_df_val['class'], y_pred))
print('Train f1_weighted',f1_score(glcm_df_train['class'], y_pred_train, average='weighted'))
print('Validation f1_weighted', f1_score(glcm_df_val['class'], y_pred, average='weighted'))
print('Train kappa',cohen_kappa_score(glcm_df_train['class'], y_pred_train))
print('Validation kappa', cohen_kappa_score(glcm_df_val['class'], y_pred))

[[615  63   0]
 [ 52 443   3]
 [ 23  68   3]]
Train f1_weighted 0.8429171459061254
Validation f1_weighted 0.8085342887716412
Train kappa 0.7486235697694401
Validation kappa 0.6908802511809042


```
[[449  47   2]
 [ 74 603   1]
 [ 56  27  11]]
Train f1_weighted 0.8551313288300627
Validation f1_weighted 0.8171845388739531
Train kappa 0.7644150306930817
Validation kappa 0.6958122549859067
```