In [1]:
import os
import pandas as pd
import numpy as np
from glob import glob

# Root folder that holds HAM10000_metadata.csv and the image sub-folders.
# The HAM10000_DIR environment variable overrides it so the notebook can run on
# another machine; the fallback is the original location.
base_skin_dir = os.environ.get(
    "HAM10000_DIR",
    "/Users/csengeszoke/programming/Thesis/Code/Fairness-Rectification/Data",
)

# image_id (file name without extension) -> full path, for every .jpg that sits
# exactly one directory below the root.
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(base_skin_dir, '*', '*.jpg'))}

# Short diagnosis code (the `dx` column) -> human-readable lesion name.
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

metadata_path = os.path.join(base_skin_dir, 'HAM10000_metadata.csv')

tile_df = pd.read_csv(metadata_path)

tile_df['path'] = tile_df['image_id'].map(imageid_path_dict.get)
tile_df['cell_type'] = tile_df['dx'].map(lesion_type_dict.get)
# Integer class index. pd.Categorical sorts the category names, so the codes
# follow the alphabetical order of `cell_type`, not the order of lesion_type_dict.
tile_df['cell_type_idx'] = pd.Categorical(tile_df['cell_type']).codes

# Rows whose image was not found get path=None from the dict lookup; report them
# here instead of failing later when the image is opened.
missing_image_count = int(tile_df['path'].isna().sum())
if missing_image_count:
    print(f"WARNING: {missing_image_count} metadata rows have no matching .jpg under {base_skin_dir}")

tile_df.sample(3)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx
3986,HAM_0002555,ISIC_0032398,nv,follow_up,55.0,male,trunk,/Users/csengeszoke/programming/Thesis/Code/Fai...,Melanocytic nevi,4
1129,HAM_0005492,ISIC_0032138,df,histo,35.0,female,lower extremity,/Users/csengeszoke/programming/Thesis/Code/Fai...,Dermatofibroma,3
5983,HAM_0004837,ISIC_0030735,nv,follow_up,50.0,female,lower extremity,/Users/csengeszoke/programming/Thesis/Code/Fai...,Melanocytic nevi,4


In [2]:
# Two-column view of the metadata: the image path and the class index rendered
# as a string.
df = pd.DataFrame({
    'filepaths': tile_df['path'],
    'labels': tile_df['cell_type_idx'].map(str),
})
df.head()

Unnamed: 0,filepaths,labels
0,/Users/csengeszoke/programming/Thesis/Code/Fai...,2
1,/Users/csengeszoke/programming/Thesis/Code/Fai...,2
2,/Users/csengeszoke/programming/Thesis/Code/Fai...,2
3,/Users/csengeszoke/programming/Thesis/Code/Fai...,2
4,/Users/csengeszoke/programming/Thesis/Code/Fai...,2


In [3]:
# Frame used for the fairness evaluation: image path, integer class index and
# the two demographic attributes (age, sex).
df2 = pd.DataFrame({
    'filepaths': tile_df['path'],
    'labels': tile_df['cell_type_idx'],
    'age': tile_df['age'],
    'sex': tile_df['sex'],
})
df2.head()  # not the last expression of the cell, so nothing is rendered for it

# Share of each `sex` value over all rows.
gender_counts = df2['sex'].value_counts()
print(gender_counts.div(len(df2)))

# Absolute number of rows per recorded age.
age_counts = df2['age'].value_counts()
print(age_counts)

sex
male       0.539790
female     0.454518
unknown    0.005691
Name: count, dtype: float64
age
45.0    1299
50.0    1187
55.0    1009
40.0     985
60.0     803
70.0     756
35.0     753
65.0     731
75.0     618
30.0     464
80.0     404
85.0     290
25.0     247
20.0     169
5.0       86
15.0      77
10.0      41
0.0       39
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split

def split_df(df, trsize, column, random_state=42, valid_size=.50):
    """Split a frame into stratified train / validation / test subsets.

    The frame is first split into a training part and a remainder, then the
    remainder is split into validation and test. Both splits are shuffled and
    stratified on `column`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; must contain `column`.
    trsize : float or int
        Passed as `train_size` to the first split (training share).
    column : str
        Name of the column used for stratification.
    random_state : int, optional
        Seed used for both splits. Defaults to 42, the previously fixed value.
    valid_size : float or int, optional
        Passed as `train_size` to the second split, i.e. the share of the
        remainder that becomes the validation set. Defaults to .50.

    Returns
    -------
    tuple
        (train_df, valid_df, test_df)
    """
    train_df, temp_df = train_test_split(
        df, train_size=trsize, shuffle=True,
        random_state=random_state, stratify=df[column])
    valid_df, test_df = train_test_split(
        temp_df, train_size=valid_size, shuffle=True,
        random_state=random_state, stratify=temp_df[column])
    print('train_df length: ', len(train_df), '  test_df length: ', len(test_df), '  valid_df length: ', len(valid_df))
    return train_df, valid_df, test_df

# Stratify on the class index and keep 60 % of the rows for training.
train_df, valid_df, test_df = split_df(df2, trsize=.60, column='labels')
df2.shape[0]

train_df length:  6009   test_df length:  2003   valid_df length:  2003


10015

In [5]:
import torch
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset
from PIL import Image


class CustomImageDataset(Dataset):
    """Map-style dataset returning (image, label, age, sex) for one frame row.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must provide the columns 'filepaths', 'labels', 'age' and 'sex'.
        Rows are addressed positionally (`.iloc`), so the index is irrelevant.
    transform : callable, optional
        Applied to the RGB PIL image.
    target_transform : callable, optional
        Applied to the raw label before it is converted to a tensor.
    batch_size : int, optional
        Stored on the instance only; no method of this class reads it.
    """

    def __init__(self,data_frame, transform=None, target_transform=None,batch_size=32):
        self.data_frame = data_frame
        self.transform = transform
        self.batch_size = batch_size
        self.target_transform = target_transform

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Load row `idx` and return (image, label tensor, age, sex)."""
        img_path = self.data_frame['filepaths'].iloc[idx]
        # Context manager closes the file handle deterministically, also when
        # decoding fails; convert() returns an independent, fully loaded image.
        with Image.open(img_path) as source:
            image = source.convert("RGB")
        label = self.data_frame['labels'].iloc[idx]
        age = self.data_frame['age'].iloc[idx]
        gender = self.data_frame['sex'].iloc[idx]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, torch.tensor(label, dtype=torch.long), age, gender

In [6]:
from torchvision import transforms, models
import torchvision.transforms as T

# Preprocessing shared by all three splits: resize to 256x256 and convert the
# PIL image to a tensor.
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])

# Training batches are reshuffled every epoch; without shuffle=True the loader
# yields the rows in the same fixed order (and the same batch composition) each epoch.
train_dataloader = DataLoader(
    CustomImageDataset(data_frame=train_df, transform=transform),
    batch_size=32,
    shuffle=True,
)
# Validation and test keep their fixed order.
val_dataloader = DataLoader(
    CustomImageDataset(data_frame=valid_df, transform=transform),
    batch_size=32,
)
test_dataloader = DataLoader(
    CustomImageDataset(data_frame=test_df, transform=transform),
    batch_size=32,
)

In [7]:
from torchvision.models import resnet18
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Load a pre-trained ResNet-18 and run it on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `pretrained=True` is deprecated in favour of the `weights=` argument. Use the
# weights enum when this torchvision provides it and fall back to the old flag
# otherwise, so the cell keeps working on older installs.
if hasattr(models, "ResNet18_Weights"):
    model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
else:
    model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, 7)  # Adjusting output layer for 7 classes
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 20
top_accuracy = 0  # not updated anywhere in this cell
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    # Each batch also carries age and gender; they are not used for training.
    for images, labels, age, gender in train_dataloader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_dataloader):.4f}")



KeyboardInterrupt: 

In [8]:
# Sanity check of the preprocessing on the first training row.
image_path = train_df['filepaths'].iloc[0]
image = Image.open(image_path).convert("RGB")  # PIL image, forced to three channels
print(type(image))

image = transform(image)  # run the shared preprocessing pipeline
print(type(image))

<class 'PIL.Image.Image'>
<class 'torch.Tensor'>
