In [2]:
import torch 
import torchvision
from torchvision import transforms
import torchvision.datasets as datasets
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import os
from glob import glob
from PIL import Image
from torch.nn import Conv2d, MaxPool2d, Parameter


  Referenced from: <2D1B8D5C-7891-3680-9CF9-F771AE880676> /Users/breannaguo/anaconda3/envs/ml-0451/lib/python3.9/site-packages/torchvision/image.so
  warn(


### Data Loading

In [3]:
df_data = pd.read_csv("HAM10000_metadata")
df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern


In [4]:

img_path = {os.path.splitext(os.path.basename(x))[0]: x for x in glob((os.path.join('*', '*.jpg')))}
df_data['img_path'] = df_data['image_id'].map(img_path.get)

#drop rows with no image path
df_data.dropna(inplace=True)
df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0027419.jpg
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg


In [8]:
def load_image(image_path):
    try:
        image = Image.open(image_path)
        return np.asarray(image.resize((32, 32)))
    except Exception as e:
        print(f"Error loading image '{image_path}': {e}")
        return None

In [9]:
def transpose(img):
    return np.transpose(img, (2, 0 ,1))

In [10]:
df_data['img'] = df_data['img_path'].map(load_image)
df_data['img'] = df_data['img'].apply(transpose)
df_data.dropna()
df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0027419.jpg,"[[[191, 194, 196, 199, 205, 209, 208, 208, 213..."
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."


In [11]:
print(df_data.shape)
print(df_data['img'].iloc[10].shape)

(9958, 10)
(3, 32, 32)


Assigning each of the dx values a numerical value

In [206]:
df_data.dx.unique()


array(['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec'], dtype=object)

In [16]:

categories = {
    "bkl"   : 0,
    "nv" : 1,
    "df" : 2,
    "mel" : 3,
    "vasc" : 4,
    "bcc": 5,
    "akiec" :6
}

df_data = df_data[df_data["dx"].apply(lambda x: x in categories.keys())]
df_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0027419.jpg,"[[[191, 194, 196, 199, 205, 209, 208, 208, 213..."
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."


In [17]:
df_data["dx"] = df_data["dx"].apply(categories.get)
df_data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
0,HAM_0000118,ISIC_0027419,0,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0027419.jpg,"[[[191, 194, 196, 199, 205, 209, 208, 208, 213..."
1,HAM_0000118,ISIC_0025030,0,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,0,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,0,histo,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,0,histo,75.0,male,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."
...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,6,histo,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033084.jpg,"[[[181, 179, 180, 183, 185, 191, 215, 219, 223..."
10011,HAM_0002867,ISIC_0033550,6,histo,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033550.jpg,"[[[4, 24, 101, 128, 133, 126, 139, 152, 160, 1..."
10012,HAM_0002867,ISIC_0033536,6,histo,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033536.jpg,"[[[132, 157, 177, 174, 190, 197, 211, 210, 210..."
10013,HAM_0000239,ISIC_0032854,6,histo,80.0,male,face,vidir_modern,HAM10000_images_part_2/ISIC_0032854.jpg,"[[[160, 163, 166, 168, 170, 174, 177, 181, 182..."


Now doing that for other non-numeric variables that will be used in the analyses

In [209]:
df_data.dx_type.unique()

array(['histo', 'consensus', 'confocal', 'follow_up'], dtype=object)

In [18]:
categories = {
    "histo"   : 0,
    "consensus" : 1,
    "confocal" : 2,
    "follow_up" : 3
}

df_data = df_data[df_data["dx_type"].apply(lambda x: x in categories.keys())]
df_data["dx_type"] = df_data["dx_type"].apply(categories.get)
df_data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
0,HAM_0000118,ISIC_0027419,0,0,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0027419.jpg,"[[[191, 194, 196, 199, 205, 209, 208, 208, 213..."
1,HAM_0000118,ISIC_0025030,0,0,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,0,0,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,0,0,80.0,male,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,0,0,75.0,male,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."
...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,6,0,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033084.jpg,"[[[181, 179, 180, 183, 185, 191, 215, 219, 223..."
10011,HAM_0002867,ISIC_0033550,6,0,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033550.jpg,"[[[4, 24, 101, 128, 133, 126, 139, 152, 160, 1..."
10012,HAM_0002867,ISIC_0033536,6,0,40.0,male,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033536.jpg,"[[[132, 157, 177, 174, 190, 197, 211, 210, 210..."
10013,HAM_0000239,ISIC_0032854,6,0,80.0,male,face,vidir_modern,HAM10000_images_part_2/ISIC_0032854.jpg,"[[[160, 163, 166, 168, 170, 174, 177, 181, 182..."


In [19]:
df_data.groupby("dx_type").size() / len(df_data)

dx_type
0    0.535449
1    0.085660
2    0.006929
3    0.371962
dtype: float64

Now with sex

In [20]:
df_data.sex.unique()

array(['male', 'female', 'unknown'], dtype=object)

In [21]:
categories = {
    "male"   : 0,
    "female" : 1,
    "unknown" : 99
}

df_data = df_data[df_data["sex"].apply(lambda x: x in categories.keys())]
df_data["sex"] = df_data["sex"].apply(categories.get)
df_data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
0,HAM_0000118,ISIC_0027419,0,0,80.0,0,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0027419.jpg,"[[[191, 194, 196, 199, 205, 209, 208, 208, 213..."
1,HAM_0000118,ISIC_0025030,0,0,80.0,0,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,0,0,80.0,0,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,0,0,80.0,0,scalp,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,0,0,75.0,0,ear,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."
...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,6,0,40.0,0,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033084.jpg,"[[[181, 179, 180, 183, 185, 191, 215, 219, 223..."
10011,HAM_0002867,ISIC_0033550,6,0,40.0,0,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033550.jpg,"[[[4, 24, 101, 128, 133, 126, 139, 152, 160, 1..."
10012,HAM_0002867,ISIC_0033536,6,0,40.0,0,abdomen,vidir_modern,HAM10000_images_part_2/ISIC_0033536.jpg,"[[[132, 157, 177, 174, 190, 197, 211, 210, 210..."
10013,HAM_0000239,ISIC_0032854,6,0,80.0,0,face,vidir_modern,HAM10000_images_part_2/ISIC_0032854.jpg,"[[[160, 163, 166, 168, 170, 174, 177, 181, 182..."


In [22]:
df_data.groupby("sex").size() / len(df_data)

sex
0     0.542278
1     0.456718
99    0.001004
dtype: float64

Now for localization

In [23]:
df_data.localization.unique()

array(['scalp', 'ear', 'face', 'back', 'trunk', 'chest',
       'upper extremity', 'abdomen', 'unknown', 'lower extremity',
       'genital', 'neck', 'hand', 'foot', 'acral'], dtype=object)

In [24]:
categories = {
    "scalp"  : 0,
    "ear" : 1,
    "face" : 2,
    "back" : 3,
    "trunk" : 4,
    "chest": 5,
    "upper extremity" :6,
    "abdomen" : 7,
    "lower extremity" : 8,
    "genital" : 9,
    "neck": 10,
    "hand" :11,
    "foot" : 12,
    "acral": 13,
    "unknown" :99,

}

df_data = df_data[df_data["localization"].apply(lambda x: x in categories.keys())]
df_data["localization"] = df_data["localization"].apply(categories.get)
df_data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,img_path,img
0,HAM_0000118,ISIC_0027419,0,0,80.0,0,0,vidir_modern,HAM10000_images_part_1/ISIC_0027419.jpg,"[[[191, 194, 196, 199, 205, 209, 208, 208, 213..."
1,HAM_0000118,ISIC_0025030,0,0,80.0,0,0,vidir_modern,HAM10000_images_part_1/ISIC_0025030.jpg,"[[[24, 56, 106, 143, 167, 173, 177, 178, 185, ..."
2,HAM_0002730,ISIC_0026769,0,0,80.0,0,0,vidir_modern,HAM10000_images_part_1/ISIC_0026769.jpg,"[[[190, 199, 200, 205, 207, 207, 209, 201, 199..."
3,HAM_0002730,ISIC_0025661,0,0,80.0,0,0,vidir_modern,HAM10000_images_part_1/ISIC_0025661.jpg,"[[[35, 83, 128, 161, 174, 180, 191, 192, 199, ..."
4,HAM_0001466,ISIC_0031633,0,0,75.0,0,1,vidir_modern,HAM10000_images_part_2/ISIC_0031633.jpg,"[[[155, 188, 210, 220, 228, 233, 235, 234, 238..."
...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,6,0,40.0,0,7,vidir_modern,HAM10000_images_part_2/ISIC_0033084.jpg,"[[[181, 179, 180, 183, 185, 191, 215, 219, 223..."
10011,HAM_0002867,ISIC_0033550,6,0,40.0,0,7,vidir_modern,HAM10000_images_part_2/ISIC_0033550.jpg,"[[[4, 24, 101, 128, 133, 126, 139, 152, 160, 1..."
10012,HAM_0002867,ISIC_0033536,6,0,40.0,0,7,vidir_modern,HAM10000_images_part_2/ISIC_0033536.jpg,"[[[132, 157, 177, 174, 190, 197, 211, 210, 210..."
10013,HAM_0000239,ISIC_0032854,6,0,80.0,0,2,vidir_modern,HAM10000_images_part_2/ISIC_0032854.jpg,"[[[160, 163, 166, 168, 170, 174, 177, 181, 182..."


### Logistic Regression using factors **other than** the image

Splitting the code into training vs test:

- Including all numeric predictor variables in the training data
- Also including an identifying variable: "lesion_id" 

- Including just "dx" to the test dataset

In [236]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_data[['age', 'sex', 'localization', 'dx_type', 'lesion_id']], df_data['dx'], test_size=0.2, random_state=42)
X_train

Unnamed: 0,age,sex,localization,dx_type,lesion_id
2401,45.0,1,7,1,HAM_0006852
449,55.0,1,8,0,HAM_0006128
2496,85.0,0,8,0,HAM_0000687
715,80.0,1,8,0,HAM_0004695
809,50.0,1,2,2,HAM_0005896
...,...,...,...,...,...
5746,45.0,1,7,3,HAM_0006930
5203,45.0,0,7,3,HAM_0005432
5402,45.0,1,10,3,HAM_0005836
860,55.0,1,5,1,HAM_0001053


I want *lesion_id* to be in the training dataset, but I don't want to use it in my analyses 

1. because it is a non-numeric value
 
    and 

2. because it's associated with the answer

In [224]:
# this is the exact dataset I want to train on, training dataset without lesion_id
X_train.iloc[:,0:4]

Unnamed: 0,age,sex,localization,dx_type
2401,45.0,1,7,1
449,55.0,1,8,0
2496,85.0,0,8,0
715,80.0,1,8,0
809,50.0,1,2,2
...,...,...,...,...
5746,45.0,1,7,3
5203,45.0,0,7,3
5402,45.0,1,10,3
860,55.0,1,5,1


In [237]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
f = LR.fit(X_train.iloc[:,0:4], y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [238]:
LR.score(X_train.iloc[:,0:4], y_train)


0.6771277931207632

In [240]:
LR.score(X_test.iloc[:,0:4], y_test)

0.6842369477911646

This model scores a bit better on the testing than the training data but I chalk that up to chance.

#### Finding the baseline frequency

In [228]:
#df_data.dx.unique()
df_data.groupby("dx").size() / len(df_data)

dx
0    0.109359
1    0.668809
2    0.011549
3    0.111569
4    0.014260
5    0.051617
6    0.032838
dtype: float64

On the training data, this model has an accuracy of about 68%. The baseline accuracy of this data is 67% so... we really aren't doing that much better than randomly guessing.

### Logistic Regression Model using images

First, I need to recreate training and testing data so that the training data uses the images

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_data[['lesion_id', 'img']], df_data['dx'], test_size=0.2, random_state=42)
X_train

Unnamed: 0,lesion_id,img
2401,HAM_0006852,"[[[185, 189, 191, 195, 196, 197, 197, 198, 199..."
449,HAM_0006128,"[[[185, 152, 174, 193, 191, 190, 192, 192, 192..."
2496,HAM_0000687,"[[[174, 185, 185, 197, 201, 199, 196, 211, 221..."
715,HAM_0004695,"[[[213, 213, 212, 215, 213, 212, 213, 216, 215..."
809,HAM_0005896,"[[[167, 167, 172, 175, 178, 181, 182, 183, 182..."
...,...,...
5746,HAM_0006930,"[[[226, 228, 226, 224, 227, 225, 224, 224, 224..."
5203,HAM_0005432,"[[[239, 239, 240, 240, 241, 243, 244, 244, 245..."
5402,HAM_0005836,"[[[219, 221, 225, 225, 225, 226, 227, 231, 232..."
860,HAM_0001053,"[[[27, 28, 25, 22, 42, 88, 128, 145, 150, 152,..."


In [13]:
n, p = X_train.shape[0], X_train.shape[1] - 1

img_tensors = [torch.Tensor(img) for img in X_train['img'].values]
X_train['img_tensor'] = img_tensors
print(X_train.shape)

img_tensors_tst = [torch.Tensor(img) for img in X_test['img'].values]
X_test['img_tensor'] = img_tensors_tst
print(X_test.shape)

X_train_flat = torch.stack([img.flatten() for img in X_train['img_tensor']])
X_train_flat.shape

  img_tensors = [torch.Tensor(img) for img in X_train['img'].values]


(7966, 3)
(1992, 3)


torch.Size([7966, 3072])

In [15]:
X_train_flat[1].size()

torch.Size([3072])

In [232]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
f = LR.fit(X_train_flat, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [233]:
X_train_flat

tensor([[185., 189., 191.,  ..., 165., 166., 166.],
        [185., 152., 174.,  ..., 183., 184., 181.],
        [174., 185., 185.,  ..., 148.,  89.,  43.],
        ...,
        [219., 221., 225.,  ..., 154., 152., 148.],
        [ 27.,  28.,  25.,  ..., 106.,  67.,  31.],
        [191., 196., 199.,  ..., 150., 154., 155.]])

In [234]:
LR.score(X_train_flat, y_train)

0.6988450916394677

Cool! Logistic regression scores about 70% accuracy. This is better than the logistic regression using non-image variables, but not by much.

In [235]:
LR.score(X_test_flat, y_test)

0.6932730923694779

On test data, this score around 69%

### Pixel-By-Pixel Prediction

In [33]:
X_test_flat = torch.stack([img.flatten() for img in X_test['img_tensor']])
X_test_flat.shape


torch.Size([1992, 3072])

In [None]:

LR.score(X_test_flat, y_test)

This scores similarly, though a bit worse, than the previous model, at 69% accuracy

Incorporating a pipeline function

In [119]:
def vectorization_experiment(pipeline = lambda x: x, return_confusion_matrix = False):
    print(f"Number of features = {X_train_flat.size(1)}")

    LR = LogisticRegression() 
    LR.fit(X_train_flat, y_train)
    print(f"Training accuracy = {LR.score(X_train_flat, y_train):.2f}")

    print(f"Testing accuracy  = {LR.score(X_test_flat, y_test):.2f}")

    if return_confusion_matrix: 
        y_test_pred = LR.predict(X_test_flat)
        return confusion_matrix(y_test, y_test_pred, normalize = "true")

In [120]:
vectorization_experiment() # same experiment as above

Number of features = 3072
Training accuracy = 0.70
Testing accuracy  = 0.69


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Combining image and numerical variables:
Clearly separately, the image and other variables models do... not too great. I wonder if combining them will allow for improvement

Splitting the training and testing data

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_data[['age', 'sex', 'localization', 'dx_type', 'lesion_id', 'img']], df_data['dx'], test_size=0.2, random_state=42)
X_train

Unnamed: 0,age,sex,localization,dx_type,lesion_id,img
2401,45.0,1,7,1,HAM_0006852,"[[[185, 189, 191, 195, 196, 197, 197, 198, 199..."
449,55.0,1,8,0,HAM_0006128,"[[[185, 152, 174, 193, 191, 190, 192, 192, 192..."
2496,85.0,0,8,0,HAM_0000687,"[[[174, 185, 185, 197, 201, 199, 196, 211, 221..."
715,80.0,1,8,0,HAM_0004695,"[[[213, 213, 212, 215, 213, 212, 213, 216, 215..."
809,50.0,1,2,2,HAM_0005896,"[[[167, 167, 172, 175, 178, 181, 182, 183, 182..."
...,...,...,...,...,...,...
5746,45.0,1,7,3,HAM_0006930,"[[[226, 228, 226, 224, 227, 225, 224, 224, 224..."
5203,45.0,0,7,3,HAM_0005432,"[[[239, 239, 240, 240, 241, 243, 244, 244, 245..."
5402,45.0,1,10,3,HAM_0005836,"[[[219, 221, 225, 225, 225, 226, 227, 231, 232..."
860,55.0,1,5,1,HAM_0001053,"[[[27, 28, 25, 22, 42, 88, 128, 145, 150, 152,..."


In [28]:
img_tensors = [torch.Tensor(img) for img in X_train['img'].values]
X_train['img_tensor'] = img_tensors
print(X_train.shape)

img_tensors_tst = [torch.Tensor(img) for img in X_test['img'].values]
X_test['img_tensor'] = img_tensors_tst
print(X_test.shape)

X_train_flat = torch.stack([img.flatten() for img in X_train['img_tensor']])
X_train_flat.shape

(7966, 7)
(1992, 7)


torch.Size([7966, 3072])

In [29]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
f = LR.fit(X_train_flat, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
LR.score(X_train_flat, y_train)

0.6988450916394677

In [34]:
LR.score(X_test_flat, y_test)

0.6932730923694779

The model acts pretty similarly when its run using all variables available...

IF TIME ALLOWS: MAKE CONFUSION MATRIX W WHERE PREDICTED VALUES ARE GOING- ARE THEY ALL GOING TO DX = NV?