In [48]:
import torch 
import torchvision
from torchvision import transforms
import torchvision.datasets as datasets
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import os
from glob import glob
from PIL import Image
from torch.nn import Conv2d, MaxPool2d, Parameter


### Data Loading

In [49]:
df_data = pd.read_csv("HAM10000_metadata")
df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern


In [50]:

img_path = {os.path.splitext(os.path.basename(x))[0]: x for x in glob((os.path.join('*', '*.jpg')))}
df_data['img_path'] = df_data['image_id'].map(img_path.get)

#drop rows with no image path
df_data.dropna(inplace=True)
df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg
8,HAM_0005132,ISIC_0025837,bkl,histo,70.0,female,back,vidir_modern,HAM10000_images_part_1/ISIC_0025837.jpg


In [51]:
def load_image(image_path):
    try:
        image = Image.open(image_path)
        return np.asarray(image.resize((32, 32)))
    except Exception as e:
        print(f"Error loading image '{image_path}': {e}")
        return None

In [52]:
def transpose(img):
    return np.transpose(img, (2, 0 ,1))

In [53]:
df_data['img'] = df_data['img_path'].map(load_image)
df_data['img'] = df_data['img'].apply(transpose)
df_data.dropna()
df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."
8,HAM_0005132,ISIC_0025837,bkl,histo,70.0,female,back,vidir_modern,HAM10000_images_part_1/ISIC_0025837.jpg,"[[[122, 158, 179, 184, 191, 188, 194, 195, 199..."


In [54]:
print(df_data.shape)
print(df_data['img'].iloc[10].shape)

(8039, 10)
(3, 32, 32)


Assigning each of the dx values a numerical value

In [55]:
df_data.dx.unique()

array(['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec'], dtype=object)

In [56]:

categories = {
    "bkl"   : 0,
    "nv" : 1,
    "df" : 2,
    "mel" : 3,
    "vasc" : 4,
    "bcc": 5,
    "akiec" :6
}

df_data = df_data[df_data["dx"].apply(lambda x: x in categories.keys())]
df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."
8,HAM_0005132,ISIC_0025837,bkl,histo,70.0,female,back,vidir_modern,HAM10000_images_part_1/ISIC_0025837.jpg,"[[[122, 158, 179, 184, 191, 188, 194, 195, 199..."


In [57]:
df_data["dx"] = df_data["dx"].apply(categories.get)
df_data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
1,HAM_0000118,ISIC_0025030,0,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,0,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,0,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,0,histo,75.0,male,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."
8,HAM_0005132,ISIC_0025837,0,histo,70.0,female,back,vidir_modern,HAM10000_images_part_1/ISIC_0025837.jpg,"[[[122, 158, 179, 184, 191, 188, 194, 195, 199..."
...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,6,histo,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033084.jpg,"[[[181, 179, 180, 183, 185, 191, 215, 219, 223..."
10011,HAM_0002867,ISIC_0033550,6,histo,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033550.jpg,"[[[4, 24, 101, 128, 133, 126, 139, 152, 160, 1..."
10012,HAM_0002867,ISIC_0033536,6,histo,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033536.jpg,"[[[132, 157, 177, 174, 190, 197, 211, 210, 210..."
10013,HAM_0000239,ISIC_0032854,6,histo,80.0,male,face,vidir_modern,HAM10000_images_part_2/ISIC_0032854.jpg,"[[[160, 163, 166, 168, 170, 174, 177, 181, 182..."


Now doing that for other non-numeric variables that will be used in the analyses

In [58]:
df_data.dx_type.unique()

array(['histo', 'consensus', 'confocal', 'follow_up'], dtype=object)

In [59]:
categories = {
    "histo"   : 0,
    "consensus" : 1,
    "confocal" : 2,
    "follow_up" : 3
}

df_data = df_data[df_data["dx_type"].apply(lambda x: x in categories.keys())]
df_data["dx_type"] = df_data["dx_type"].apply(categories.get)
df_data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
1,HAM_0000118,ISIC_0025030,0,0,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,0,0,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,0,0,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,0,0,75.0,male,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."
8,HAM_0005132,ISIC_0025837,0,0,70.0,female,back,vidir_modern,HAM10000_images_part_1/ISIC_0025837.jpg,"[[[122, 158, 179, 184, 191, 188, 194, 195, 199..."
...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,6,0,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033084.jpg,"[[[181, 179, 180, 183, 185, 191, 215, 219, 223..."
10011,HAM_0002867,ISIC_0033550,6,0,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033550.jpg,"[[[4, 24, 101, 128, 133, 126, 139, 152, 160, 1..."
10012,HAM_0002867,ISIC_0033536,6,0,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033536.jpg,"[[[132, 157, 177, 174, 190, 197, 211, 210, 210..."
10013,HAM_0000239,ISIC_0032854,6,0,80.0,male,face,vidir_modern,HAM10000_images_part_2/ISIC_0032854.jpg,"[[[160, 163, 166, 168, 170, 174, 177, 181, 182..."


In [60]:
df_data.groupby("dx_type").size() / len(df_data)

dx_type
0    0.549820
1    0.089439
2    0.006717
3    0.354024
dtype: float64

Now with sex

In [61]:
df_data.sex.unique()

array(['male', 'female', 'unknown'], dtype=object)

In [62]:
categories = {
    "male"   : 0,
    "female" : 1,
    "unknown" : 99
}

df_data = df_data[df_data["sex"].apply(lambda x: x in categories.keys())]
df_data["sex"] = df_data["sex"].apply(categories.get)
df_data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
1,HAM_0000118,ISIC_0025030,0,0,80.0,0,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,0,0,80.0,0,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,0,0,80.0,0,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,0,0,75.0,0,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."
8,HAM_0005132,ISIC_0025837,0,0,70.0,1,back,vidir_modern,HAM10000_images_part_1/ISIC_0025837.jpg,"[[[122, 158, 179, 184, 191, 188, 194, 195, 199..."
...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,6,0,40.0,0,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033084.jpg,"[[[181, 179, 180, 183, 185, 191, 215, 219, 223..."
10011,HAM_0002867,ISIC_0033550,6,0,40.0,0,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033550.jpg,"[[[4, 24, 101, 128, 133, 126, 139, 152, 160, 1..."
10012,HAM_0002867,ISIC_0033536,6,0,40.0,0,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033536.jpg,"[[[132, 157, 177, 174, 190, 197, 211, 210, 210..."
10013,HAM_0000239,ISIC_0032854,6,0,80.0,0,face,vidir_modern,HAM10000_images_part_2/ISIC_0032854.jpg,"[[[160, 163, 166, 168, 170, 174, 177, 181, 182..."


In [63]:
df_data.groupby("sex").size() / len(df_data)

sex
0     0.542107
1     0.456649
99    0.001244
dtype: float64

Now for localization

In [64]:
df_data.localization.unique()

array(['scalp', 'ear', 'back', 'trunk', 'chest', 'upper extremity',
       'face', 'abdomen', 'unknown', 'lower extremity', 'genital', 'neck',
       'hand', 'foot', 'acral'], dtype=object)

In [65]:
categories = {
    "scalp"  : 0,
    "ear" : 1,
    "face" : 2,
    "back" : 3,
    "trunk" : 4,
    "chest": 5,
    "upper extremity" :6,
    "abdomen" : 7,
    "lower extremity" : 8,
    "genital" : 9,
    "neck": 10,
    "hand" :11,
    "foot" : 12,
    "acral": 13,
    "unknown" :99,

}

df_data = df_data[df_data["localization"].apply(lambda x: x in categories.keys())]
df_data["localization"] = df_data["localization"].apply(categories.get)
df_data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
1,HAM_0000118,ISIC_0025030,0,0,80.0,0,0,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,0,0,80.0,0,0,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,0,0,80.0,0,0,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,0,0,75.0,0,1,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."
8,HAM_0005132,ISIC_0025837,0,0,70.0,1,3,vidir_modern,HAM10000_images_part_1/ISIC_0025837.jpg,"[[[122, 158, 179, 184, 191, 188, 194, 195, 199..."
...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,6,0,40.0,0,7,vidir_modern,HAM10000_images_part_2/ISIC_0033084.jpg,"[[[181, 179, 180, 183, 185, 191, 215, 219, 223..."
10011,HAM_0002867,ISIC_0033550,6,0,40.0,0,7,vidir_modern,HAM10000_images_part_2/ISIC_0033550.jpg,"[[[4, 24, 101, 128, 133, 126, 139, 152, 160, 1..."
10012,HAM_0002867,ISIC_0033536,6,0,40.0,0,7,vidir_modern,HAM10000_images_part_2/ISIC_0033536.jpg,"[[[132, 157, 177, 174, 190, 197, 211, 210, 210..."
10013,HAM_0000239,ISIC_0032854,6,0,80.0,0,2,vidir_modern,HAM10000_images_part_2/ISIC_0032854.jpg,"[[[160, 163, 166, 168, 170, 174, 177, 181, 182..."


### Logistic Regression using factors **other than** the image

Splitting the code into training vs test:

- Including all numeric predictor variables in the training data
- Also including an identifying variable: "lesion_id" 

- Including just "dx" to the test dataset

In [66]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_data[['age', 'sex', 'localization', 'dx_type', 'lesion_id']], df_data['dx'], test_size=0.2, random_state=42)
X_train

Unnamed: 0,age,sex,localization,dx_type,lesion_id
1156,60.0,0,6,1,HAM_0000827
373,75.0,0,8,0,HAM_0001389
5382,45.0,0,6,3,HAM_0002885
6162,25.0,0,4,3,HAM_0002309
7735,30.0,1,3,0,HAM_0006567
...,...,...,...,...,...
6604,50.0,0,4,3,HAM_0003747
6832,55.0,1,6,0,HAM_0000464
1081,70.0,1,4,1,HAM_0003715
9457,35.0,0,7,1,HAM_0000675


I want *lesion_id* to be in the training dataset, but I don't want to use it in my analyses 

1. because it is a non-numeric value
 
    and 

2. because it's associated with the answer

In [224]:
# this is the exact dataset I want to train on, training dataset without lesion_id
X_train.iloc[:,0:4]

Unnamed: 0,age,sex,localization,dx_type
2401,45.0,1,7,1
449,55.0,1,8,0
2496,85.0,0,8,0
715,80.0,1,8,0
809,50.0,1,2,2
...,...,...,...,...
5746,45.0,1,7,3
5203,45.0,0,7,3
5402,45.0,1,10,3
860,55.0,1,5,1


In [237]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
f = LR.fit(X_train.iloc[:,0:4], y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [238]:
LR.score(X_train.iloc[:,0:4], y_train)


0.6771277931207632

In [240]:
LR.score(X_test.iloc[:,0:4], y_test)

0.6842369477911646

This model scores a bit better on the testing than the training data but I chalk that up to chance.

#### Finding the baseline frequency

In [228]:
#df_data.dx.unique()
df_data.groupby("dx").size() / len(df_data)

dx
0    0.109359
1    0.668809
2    0.011549
3    0.111569
4    0.014260
5    0.051617
6    0.032838
dtype: float64

On the training data, this model has an accuracy of about 67%. The baseline accuracy of this data is 67% so... we really aren't doing that much better than randomly guessing.

### Logistic Regression Model using images

First, I need to recreate training and testing data so that the training data uses the images

In [67]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_data[['lesion_id', 'img']], df_data['dx'], test_size=0.2, random_state=42)
X_train

Unnamed: 0,lesion_id,img
1156,HAM_0000827,"[[[168, 176, 178, 179, 178, 185, 184, 184, 191..."
373,HAM_0001389,"[[[140, 148, 154, 163, 165, 171, 172, 172, 173..."
5382,HAM_0002885,"[[[242, 234, 240, 244, 244, 245, 247, 246, 246..."
6162,HAM_0002309,"[[[232, 233, 236, 236, 237, 237, 239, 239, 240..."
7735,HAM_0006567,"[[[183, 182, 184, 185, 187, 190, 190, 191, 192..."
...,...,...
6604,HAM_0003747,"[[[225, 225, 223, 222, 224, 228, 228, 229, 232..."
6832,HAM_0000464,"[[[160, 165, 169, 172, 172, 174, 177, 183, 185..."
1081,HAM_0003715,"[[[215, 218, 220, 217, 214, 218, 219, 221, 222..."
9457,HAM_0000675,"[[[164, 168, 172, 175, 175, 175, 177, 180, 182..."


In [74]:
n, p = X_train.shape[0], X_train.shape[1] - 1

img_tensors = [torch.Tensor(img) for img in X_train['img'].values]
X_train['img_tensor'] = img_tensors
print(X_train.shape)

img_tensors_tst = [torch.Tensor(img) for img in X_test['img'].values]
X_test['img_tensor'] = img_tensors_tst
print(X_test.shape)

X_train_flat = torch.stack([img.flatten() for img in X_train['img_tensor']])
X_train_flat.shape

X_test_flat = torch.stack([img.flatten() for img in X_test['img_tensor']])



(6431, 3)
(1608, 3)


In [69]:
X_train_flat[1].size()

torch.Size([3072])

In [70]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
f = LR.fit(X_train_flat, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [71]:
X_train_flat

tensor([[168., 176., 178.,  ...,  90.,  56.,  31.],
        [140., 148., 154.,  ..., 124., 109.,  93.],
        [242., 234., 240.,  ..., 151., 147., 145.],
        ...,
        [215., 218., 220.,  ..., 118., 116., 109.],
        [164., 168., 172.,  ..., 153., 139., 137.],
        [145., 153., 154.,  ..., 166., 166., 159.]])

In [72]:
LR.score(X_train_flat, y_train)

0.7017571139791634

Cool! Logistic regression scores about 70% accuracy. This is better than the logistic regression using non-image variables, but not by much.

In [75]:
LR.score(X_test_flat, y_test)

0.6660447761194029

On test data, this score around 69%

### Pixel-By-Pixel Prediction

In [76]:
X_test_flat = torch.stack([img.flatten() for img in X_test['img_tensor']])
X_test_flat.shape


torch.Size([1608, 3072])

In [77]:

LR.score(X_test_flat, y_test)

0.6660447761194029

This scores similarly, though a bit worse, than the previous model, at 69% accuracy

Incorporating a pipeline function

In [78]:
def vectorization_experiment(pipeline = lambda x: x, return_confusion_matrix = False):
    print(f"Number of features = {X_train_flat.size(1)}")

    LR = LogisticRegression() 
    LR.fit(X_train_flat, y_train)
    print(f"Training accuracy = {LR.score(X_train_flat, y_train):.2f}")

    print(f"Testing accuracy  = {LR.score(X_test_flat, y_test):.2f}")

    if return_confusion_matrix: 
        y_test_pred = LR.predict(X_test_flat)
        return confusion_matrix(y_test, y_test_pred, normalize = "true")

In [79]:
vectorization_experiment() # same experiment as above

Number of features = 3072
Training accuracy = 0.70
Testing accuracy  = 0.67


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Combining image and numerical variables:
Clearly separately, the image and other variables models do... not too great. I wonder if combining them will allow for improvement

Splitting the training and testing data

In [80]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_data[['age', 'sex', 'localization', 'dx_type', 'lesion_id', 'img']], df_data['dx'], test_size=0.2, random_state=42)
X_train

Unnamed: 0,age,sex,localization,dx_type,lesion_id,img
1156,60.0,0,6,1,HAM_0000827,"[[[168, 176, 178, 179, 178, 185, 184, 184, 191..."
373,75.0,0,8,0,HAM_0001389,"[[[140, 148, 154, 163, 165, 171, 172, 172, 173..."
5382,45.0,0,6,3,HAM_0002885,"[[[242, 234, 240, 244, 244, 245, 247, 246, 246..."
6162,25.0,0,4,3,HAM_0002309,"[[[232, 233, 236, 236, 237, 237, 239, 239, 240..."
7735,30.0,1,3,0,HAM_0006567,"[[[183, 182, 184, 185, 187, 190, 190, 191, 192..."
...,...,...,...,...,...,...
6604,50.0,0,4,3,HAM_0003747,"[[[225, 225, 223, 222, 224, 228, 228, 229, 232..."
6832,55.0,1,6,0,HAM_0000464,"[[[160, 165, 169, 172, 172, 174, 177, 183, 185..."
1081,70.0,1,4,1,HAM_0003715,"[[[215, 218, 220, 217, 214, 218, 219, 221, 222..."
9457,35.0,0,7,1,HAM_0000675,"[[[164, 168, 172, 175, 175, 175, 177, 180, 182..."


In [81]:
img_tensors = [torch.Tensor(img) for img in X_train['img'].values]
X_train['img_tensor'] = img_tensors
print(X_train.shape)

img_tensors_tst = [torch.Tensor(img) for img in X_test['img'].values]
X_test['img_tensor'] = img_tensors_tst
print(X_test.shape)

X_train_flat = torch.stack([img.flatten() for img in X_train['img_tensor']])
X_train_flat.shape

(6431, 7)
(1608, 7)


torch.Size([6431, 3072])

In [82]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
f = LR.fit(X_train_flat, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [83]:
LR.score(X_train_flat, y_train)

0.7017571139791634

In [84]:
LR.score(X_test_flat, y_test)

0.6660447761194029

The model acts pretty similarly when its run using all variables available...

### Creating a confusion matrix

In [85]:
preds = LR.predict(X_test_flat)

In [86]:
from sklearn.metrics import confusion_matrix as cm
cm(y_test, preds)

array([[ 25, 121,   0,  17,   0,  16,   4],
       [ 15, 990,   0,  32,   0,  15,   0],
       [  5,   4,   0,   1,   0,   2,   0],
       [  8, 167,   0,  25,   2,   2,   1],
       [  1,  16,   0,   2,   1,   2,   0],
       [  6,  38,   0,   2,   0,  27,   1],
       [  9,  35,   0,   2,   0,  11,   3]])

### Classifying between malignant and benign cancers

Using all available variables and the images, I classiyf the data as malignant or non malignant skin lesions. I consider malignant skin lesions melanoma (MEL) and Bowen's disease (akiec).

In [98]:
#creating malignant column that is 1 if the row corresponds to a malignant skin lesion akiec or mel

df_data['malignant'] = (df_data['dx'] == 3) | (df_data['dx'] == 6)


df_data.groupby("malignant").size() / len(df_data)


malignant
False    0.849733
True     0.150267
dtype: float64

In [99]:
# splitting training and test groups
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_data[['age', 'sex', 'localization', 'dx_type', 'img']], df_data['malignant'], test_size=0.2, random_state=42)
X_train

Unnamed: 0,age,sex,localization,dx_type,img
1156,60.0,0,6,1,"[[[168, 176, 178, 179, 178, 185, 184, 184, 191..."
373,75.0,0,8,0,"[[[140, 148, 154, 163, 165, 171, 172, 172, 173..."
5382,45.0,0,6,3,"[[[242, 234, 240, 244, 244, 245, 247, 246, 246..."
6162,25.0,0,4,3,"[[[232, 233, 236, 236, 237, 237, 239, 239, 240..."
7735,30.0,1,3,0,"[[[183, 182, 184, 185, 187, 190, 190, 191, 192..."
...,...,...,...,...,...
6604,50.0,0,4,3,"[[[225, 225, 223, 222, 224, 228, 228, 229, 232..."
6832,55.0,1,6,0,"[[[160, 165, 169, 172, 172, 174, 177, 183, 185..."
1081,70.0,1,4,1,"[[[215, 218, 220, 217, 214, 218, 219, 221, 222..."
9457,35.0,0,7,1,"[[[164, 168, 172, 175, 175, 175, 177, 180, 182..."


In [100]:
img_tensors = [torch.Tensor(img) for img in X_train['img'].values]
X_train['img_tensor'] = img_tensors
print(X_train.shape)

img_tensors_tst = [torch.Tensor(img) for img in X_test['img'].values]
X_test['img_tensor'] = img_tensors_tst
print(X_test.shape)

X_train_flat = torch.stack([img.flatten() for img in X_train['img_tensor']])
X_train_flat.shape

(6431, 6)
(1608, 6)


torch.Size([6431, 3072])

In [101]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
f = LR.fit(X_train_flat, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [102]:
LR.score(X_train_flat, y_train)

0.869382677655108

In [103]:
LR.score(X_test_flat, y_test)

0.8165422885572139

Not bad! But where are the majority of these identifications going to?

In [104]:
preds = LR.predict(X_test_flat)

In [105]:
from sklearn.metrics import confusion_matrix as cm
cm(y_test, preds)

array([[1286,   57],
       [ 238,   27]])

In [135]:
#getting the proportion of correctly identified malignant images
27/(238 + 27)

0.1018867924528302

This correctly identifies about 10% of malignant skin lesions, likely because they are so many more non malignant images in the dataset (85%)

### Applying class weights

In [109]:
class_counts = df_data['malignant'].value_counts().sort_index()
class_counts_list = class_counts.values.tolist()

# confirmed: these are in the correct order as the label encoder!
class_counts

malignant
False    6831
True     1208
Name: count, dtype: int64

In [112]:
#determing the number of non malignant images to 1 malignant image
maligToNonmalig = df_data['malignant'].value_counts()[0]/df_data['malignant'].value_counts()[1]
maligToNonmalig

  maligToNonmalig = df_data['malignant'].value_counts()[0]/df_data['malignant'].value_counts()[1]


5.654801324503311

In [126]:
#generating weights
weights = {False:1.0 , True:maligToNonmalig}
weights

{False: 1.0, True: 5.654801324503311}

In [127]:
#defining a LR model with weights
model = LogisticRegression(solver='lbfgs', class_weight = weights)
f = model.fit(X_train_flat, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [128]:
model.score(X_train_flat, y_train)

0.7406313170580003

In [129]:
model.score(X_test_flat, y_test)

0.7083333333333334

In [130]:
preds = model.predict(X_test_flat)

In [131]:
from sklearn.metrics import confusion_matrix as cm
cm(y_test, preds)

array([[989, 354],
       [115, 150]])

In [133]:
#getting proportion of correctly identified malignant images
150/(150+115)

0.5660377358490566

This does a much better job at detecting malignant images at 57% accuracy! From just 10% accuracy, this is great to now correctly identify over half of malignant images. There is also minimal overfitting which suggests that this is generalizable. 

Of course, 57% identifiability is far from where we would ideally be.