In [5]:
import os 
import numpy as np
import pandas as pd 
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets

from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, random_split
from torch.utils.data import Dataset
from PIL import Image

# PATHS

In [6]:
cwd = os.getcwd()
data_path = os.path.join(cwd, "data")
trainset_path = os.path.join(data_path, "trainset")
testset_path = os.path.join(data_path, "testset")

# TOOLS - Custom classes and functions 

## Data loaders 

In [21]:
class PatientDataset(Dataset):
    def __init__(self, root_dir, annotation: pd.DataFrame, patients: list, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.patients = patients
        self.df = annotation

    def __len__(self):
        return len(self.patients)

    def __getitem__(self, idx):
        patient_folder = os.path.join(self.root_dir, self.patients[idx])
        images = []
        for filename in os.listdir(patient_folder):
            image_path = os.path.join(patient_folder, filename)
            image = Image.open(image_path).convert('RGB')
            if self.transform:
                image = self.transform(image)
            images.append(image)

        # get label 
        label = self.df[self.df.index == self.patients[idx]].LABEL
        label_tensor = torch.tensor([label], dtype=torch.float32)

        # get features 
        features = self.df[self.df.index == self.patients[idx]].drop("LABEL", axis=1).to_numpy()
        features_tensor = torch.tensor([features], dtype=torch.float32)
    
        return torch.stack(images), features_tensor, label_tensor, self.patients[idx]

In [8]:
class ImageLevelDataset(Dataset):
    def __init__(self, root_dir, list_images, labels, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.list_img = list_images
        self.labels = labels

    def __len__(self):
        return len(self.list_img)

    def __getitem__(self, idx):
        img_path, patient = self.list_img[idx]

        # get image and label
        label = self.labels[patient]
        image = Image.open(img_path).convert('RGB')
        
        if self.transform:  # transform image
                image = self.transform(image)

        # transfrom label to right format
        label_tensor = torch.tensor([label], dtype=torch.float32)
        return image, label_tensor

## Models

### Features extractor 

In [9]:
class Autoencoder(nn.Module):
    def __init__(self, latent_size=128):
        super(Autoencoder, self).__init__()

        # define latent size
        self.latent_size = latent_size

        # define encoder 
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # close to latent space
        self.fc_encoder = nn.Linear(256 * 28 * 28, latent_size)
        self.fc_decoder = nn.Linear(latent_size, 256 * 28 * 28)

        # decoder architecture
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1,output_padding=1),
            nn.ReLU(True),
            nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(True),
            nn.ConvTranspose2d(64, 3, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.Tanh()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = x.view(x.size(0), -1)
        x = self.fc_encoder(x)
        x = self.fc_decoder(x)
        x = x.view(x.size(0), 256, 28, 28)
        x = self.decoder(x)
        return x

    def encoder(self, x):
        x = self.encoder(x)
        x = self.fc_encoder(x)
        return x

### Classifier

In [10]:
class DeepSets(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        """
        Initialization of the deep set model. 

        :params input_dim: int
            Dimension of each element of the input. 
        :params hidden_dim: int 
            Dimension of hidden layers. 
        :parasm output_dim: int 
            Dimensionality of output
        """
        super(DeepSets, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        
        # Shared layers
        self.shared_layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),  
            nn.Sigmoid()
        )
        
        # Permutation-invariant layer
        self.invariant_layer = nn.Sequential(
            nn.Linear(hidden_dim, output_dim), 
            nn.Sigmoid()
        )

    def forward(self, x):
        """
        Foward function of deep set model that given images features and 
        tabular data return a prediction.

        :params x: tuple
            Tuples containing one 2d tensor and one 1d tensor.

        :return one value 
        """
        # seperate img features and tabular data
        img_features, tabular = x
        
        # Apply shared layers to each element of the set
        output = self.shared_layers(img_features)
        
        # Permutation-invariant aggregation
        mean = torch.mean(output, dim=1)  
        
        # concatenate tabular data and feature from bag of images
        conc = torch.concat([mean, tabular], dim=1)
        
        # Apply permutation-invariant layer
        output = self.invariant_layer(conc)
        
        return output

# Introduction 

In [11]:
annotation_file = os.path.join(data_path, "clinical_annotation.csv")
df_ann = pd.read_csv(annotation_file)
df_ann.drop("Unnamed: 0", axis=1, inplace=True)

### RAPID PREPROCESSING

# compute age 
def compute_age(x):
    year = int(x[-4:])
    return 2024 - year
    
df_ann["age"] = df_ann.DOB.apply(compute_age)
df_ann.drop("DOB", axis=1, inplace=True)

#encode gender
df_ann["GENDER"] = df_ann["GENDER"].replace('f', "F")
label_encoder = LabelEncoder()
df_ann['GENDER'] = label_encoder.fit_transform(df_ann['GENDER'])

df_ann.set_index("ID", inplace=True)

In [12]:
df_ann.head()

Unnamed: 0_level_0,LABEL,GENDER,LYMPH_COUNT,age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P26,1,1,11.2,91
P183,1,1,12.8,82
P89,1,1,9.6,89
P123,1,1,122.6,93
P61,1,0,11.6,93


In [13]:
sub_patients = list(df_ann.loc[df_ann.LABEL==-1].index)

## Train-test split 

Isolate as soon as possible our future test data

In [14]:
test_patients = list(df_ann[df_ann.LABEL==0].sample(15).index)
test_patients += list(df_ann[df_ann.LABEL==1].sample(15).index)
print("Test size: ", len(test_patients))

Test size:  30


Define submission, train and test set 

In [15]:
sub_df = df_ann[df_ann.index.isin(sub_patients)]  # submission patients
test_df = df_ann[df_ann.index.isin(test_patients)]  # test patients

# train patients are the others
train_patients = list(set(df_ann.index) - set(test_patients) - set(sub_patients))
train_df = df_ann[df_ann.index.isin(train_patients)]

# Feature extraction with Auto-encoder

## Data loader 

In [16]:
# preparation of data loader
labels = train_df.LABEL.to_dict()
transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

list_images = []
for pat in train_patients:
    for img in os.listdir(os.path.join(trainset_path, pat)):
        list_images.append((os.path.join(trainset_path, pat, img), pat))
        
print(list_images)

[('d:\\MOI\\CentraleSupelec\\Cours CS\\3A\\SDI\\DLMI\\challenge\\code\\DLMI---Lymphocytosis-classification\\data\\trainset\\P61\\000000.jpg', 'P61'), ('d:\\MOI\\CentraleSupelec\\Cours CS\\3A\\SDI\\DLMI\\challenge\\code\\DLMI---Lymphocytosis-classification\\data\\trainset\\P61\\000001.jpg', 'P61'), ('d:\\MOI\\CentraleSupelec\\Cours CS\\3A\\SDI\\DLMI\\challenge\\code\\DLMI---Lymphocytosis-classification\\data\\trainset\\P61\\000002.jpg', 'P61'), ('d:\\MOI\\CentraleSupelec\\Cours CS\\3A\\SDI\\DLMI\\challenge\\code\\DLMI---Lymphocytosis-classification\\data\\trainset\\P61\\000003.jpg', 'P61'), ('d:\\MOI\\CentraleSupelec\\Cours CS\\3A\\SDI\\DLMI\\challenge\\code\\DLMI---Lymphocytosis-classification\\data\\trainset\\P61\\000004.jpg', 'P61'), ('d:\\MOI\\CentraleSupelec\\Cours CS\\3A\\SDI\\DLMI\\challenge\\code\\DLMI---Lymphocytosis-classification\\data\\trainset\\P61\\000005.jpg', 'P61'), ('d:\\MOI\\CentraleSupelec\\Cours CS\\3A\\SDI\\DLMI\\challenge\\code\\DLMI---Lymphocytosis-classification

In [17]:
dataset = ImageLevelDataset("./data/trainset/", list_images, labels, transform) 
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

## Training 

In [None]:
# Check if GPU is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Initialize the model
model = Autoencoder(latent_size=128).to(device)

# Define loss function (here MSE)
criterion = torch.nn.MSELoss()

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, _ in train_loader:
        optimizer.zero_grad()
        images = images.to(device)
        outputs = model(images)
        loss = criterion(outputs, images)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

In [None]:
torch.save(model.state_dict(), 'ae_Mehdi.pt')

## Test auto-encoder performance on test set

In [None]:
test_labels = test_df.LABEL.to_dict()
list_images_test = []
for pat in test_patients:
    for img in os.listdir(os.path.join(trainset_path, pat)):
        list_images_test.append((os.path.join(trainset_path, pat, img), pat))
        
print(list_images_test[:3])

In [None]:
test_ae_dataset = ImageLevelDataset("./data/trainset/", list_images_test, test_labels, transform) 
test_loader = DataLoader(test_ae_dataset, batch_size=32, shuffle=False)

mse_array = np.zeros(len(test_loader))
with torch.no_grad():
    i = 0
    for images, labels in test_loader:
        print(images)
        images = images.to(device)
        outputs = model(images)
        mse_ = torch.nn.MSELoss()(outputs, images) * 32
        mse_array[i] = mse_
        i += 1
        break

print(f"Test MSE: {np.mean(mse_array) / 32}%")

# Classification using deep sets

In [22]:
dataset = PatientDataset(
    root_dir=trainset_path, 
    annotation=train_df,
    patients=train_patients, 
    transform=transform
    )
loader = DataLoader(dataset)

In [27]:
for data in loader:
    img, tab, labels, patient = data
    print(tab, labels, patient)
    # process image
    img = torch.squeeze(img, dim=0)

    # rabular data

tensor([[[[ 0.0000, 11.6000, 93.0000]]]]) tensor([[[1.]]]) ('P61',)
tensor([[[[ 1.0000, 87.9300, 87.0000]]]]) tensor([[[1.]]]) ('P59',)


KeyboardInterrupt: 

In [26]:
t = torch.tensor([[[0, 1, 2]]])
t.squeeze(dim=1)

tensor([[0, 1, 2]])

# For submission

In [None]:
sub_df.head()

In [None]:
sub_labels = sub_df.LABEL.to_dict()
sub_dataset = PatientDataset(testset_path, sub_labels, patients=sub_patients, transform=transform)
sub_loader = DataLoader(sub_dataset)

In [None]:
dico_sub = {}

with torch.no_grad():
    for data in sub_loader:
        images, labels, pat = data
        images = torch.squeeze(images, dim=0)
        outputs = model(images)
        predicted = outputs.float().mean()
        dico_sub[pat] = predicted

In [None]:
dico_sub

# TEST -- to be removed 

In [None]:
cwd = os.getcwd()


In [None]:
df_ann = pd.read_csv(annotation_file)
df_ann.drop("Unnamed: 0", axis=1, inplace=True)
"""
# compute age 
def compute_age(x):
    year = int(x[-4:])
    return 2024 - year
    
df_ann["age"] = df_ann.DOB.apply(compute_age)
df_ann.drop("DOB", axis=1, inplace=True)

#encode gender
df_ann["GENDER"] = df_ann["GENDER"].replace('f', "F")
label_encoder = LabelEncoder()
df_ann['GENDER'] = label_encoder.fit_transform(df_ann['GENDER'])

df_ann.set_index("ID", inplace=True)"""

df_ann.loc[df_ann.LABEL >= 0].groupby("LABEL").count()