Practical Work in AI - Concept Models
Tragler Thomas
====================


In [1]:
import sys

import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import random_split
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import wandb
from datetime import datetime
import torch.optim as optim
from sklearn.model_selection import KFold

import derm7pt_data
from derm7pt_data import Derm7pt_data

from importlib import reload


reload(derm7pt_data)

<module 'derm7pt_data' from 'D:\\Business\\Uni\\Practical Work\\PW_ConceptModels\\derm7pt_data.py'>

In [2]:
#Data loading
path = os.path.normpath('Data\\Derm7pt')

derm7pt = Derm7pt_data(path)
metadata = derm7pt.metadata
print(metadata.shape)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("device:", device)

(680, 20)
device: cuda


In [3]:
metadata.columns

Index(['case_num', 'diagnosis', 'seven_point_score', 'pigment_network',
       'streaks', 'pigmentation', 'regression_structures', 'dots_and_globules',
       'blue_whitish_veil', 'vascular_structures',
       'level_of_diagnostic_difficulty', 'elevation', 'location', 'sex',
       'clinic', 'derm', 'nums', 'is_cancer', 'abbrevs', 'info'],
      dtype='object')

In [4]:
#Torch CNN model with 3 Conv layers and 3 fully connected layers
class Net(nn.Module):
    def __init__(self, num_classes=1, image_size=(192, 128)):
        super(Net, self).__init__()
        mod = 1
        
        #conv Layers
        in_channels, out_channels = (3, 16)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=4, stride=4, padding=0)
        mod *= 4
        
        in_channels, out_channels = (out_channels, 2*out_channels)
        self.conv2 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        mod *= 2
        
        in_channels, out_channels = (out_channels, out_channels)
        self.conv3 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        mod *= 2
        
        # Fully connected layers
        self.first_linear_layer_size = out_channels * (image_size[0]//mod * image_size[1]//mod)
        self.fc1 = nn.Linear(self.first_linear_layer_size, 256)  # Adjust the input size based on your image dimensions
        self.fc2 = nn.Linear(256, 64)
        self.fc3 = nn.Linear(64, num_classes)  # Output layer with one neuron for binary classification
        
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # Convolutional layers with activation and pooling
        x = self.relu(self.conv1(x))
        x = self.pool1(x)
        x = self.relu(self.conv2(x))
        x = self.pool2(x)
        x = self.relu(self.conv3(x))
        x = self.pool3(x)

        # Flatten the output for fully connected layers
        x = x.view(-1, self.first_linear_layer_size)  # Corrected input size based on spatial dimensions

        # Fully connected layers with activation
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.softmax(self.fc3(x))

        return x
    
def majority_class_baseline(true_labels):
    # Find the most frequent class in the training set
    elems, counts = true_labels.unique(return_counts=True)
    majority_count = counts[counts.argmax()]
    majority_class = elems[counts.argmax()]
    #predictions = torch.full_like(true_labels, majority_class)
    accuracy = majority_count / len(true_labels)
    return majority_class, accuracy

In [5]:
#Training the model
# hyperparameters
n_epochs = 30
learning_rate = 0.00002
n_folds = 8
batch_size = 64

num_classes = derm7pt.diagnosis[derm7pt.model_columns["label"]].nunique()
#Categorical crossentropy loss
criterion = nn.CrossEntropyLoss()

wandb.init(
    # set the wandb project where this run will be logged
    project= "PracticalWork",
    
    # track hyperparameters and run metadata
    config={
    "learning_rate": learning_rate,
    "architecture": "SimpleCNN",
    "dataset": "derm7pt",
    "labels": derm7pt.model_columns["label"],
    "epochs": n_epochs,
    "batch_size": batch_size,
    "n_folds": n_folds,
    "device": device
    },
    name="run"+str(datetime.now())
)


simple_val_baseline = 0

# Training loop
kf = KFold(n_splits=n_folds, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(kf.split(derm7pt.metadata)):
    #Todo Hackfix to get the majority class of the validation set
    print("start validation baseline: ", datetime.now())
    if True:
        val_loader = DataLoader(
            dataset=derm7pt,
            batch_size=999999,
            sampler=torch.utils.data.SubsetRandomSampler(val_idx),
        )
        for i, batch in enumerate(val_loader, 0):
            inputs, labels = batch
            baseline, simple_val_baseline = majority_class_baseline(labels)
    print("end validation baseline:   ", datetime.now(), ", baseline: ", baseline, " percent ",  simple_val_baseline)
    
    train_loader = DataLoader(
        dataset=derm7pt,
        batch_size=batch_size,
        sampler=torch.utils.data.SubsetRandomSampler(train_idx),
    )
    val_loader = DataLoader(
        dataset=derm7pt,
        batch_size=batch_size,
        sampler=torch.utils.data.SubsetRandomSampler(val_idx),
    )
    
    # Instantiate the model
    model = Net(num_classes, derm7pt.image_size)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(n_epochs):
        running_loss = 0.0
        i = 0
        for i, batch in enumerate(train_loader, 0):
            inputs, labels = batch
            #ToDo one hot encoding in Dataloader
            labels = torch.eye(num_classes)[labels.squeeze().int()]
            inputs, labels = inputs.to(device), labels.to(device)
    
            # Zero the parameter gradients
            optimizer.zero_grad()
    
            # Forward pass, backward pass, and optimization
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    
            # Print statistics
            running_loss += loss.item()
            
        running_loss /= (i+1)        
        
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for i, batch in enumerate(val_loader, 0):
                images, labels = batch
                images, labels = images.to(device), labels.to(device)
                output = model(images)
                output = output.argmax(dim=1)
                total += labels.size(0)
                correct += (output == labels.squeeze()).sum().item()
                
        
        val_accuracy = correct/total
        wandb.log({"loss": running_loss, "validation_accuracy": val_accuracy})
        print('[%d, %5d] loss: %.4f, val_accuracy: %.4f, simple_baseline: %.4f' % (epoch + 1, i + 1, running_loss, val_accuracy, simple_val_baseline))   
        
        
        #ToDo early stopping
        if running_loss < 0.01:
            print("Early stopping!")
            break
    
    #ToDo only one fold for now
    break

wandb.finish()
print('Finished Training')

[34m[1mwandb[0m: Currently logged in as: [33mtraglert[0m ([33mnlp_ass3[0m). Use [1m`wandb login --relogin`[0m to force relogin


start validation baseline:  2024-03-31 19:26:05.661509
end validation baseline:    2024-03-31 19:26:06.494300 , baseline:  tensor(1)  percent  tensor(0.4941)
[1,     2] loss: 1.6031, val_accuracy: 0.4941, simple_baseline: 0.4941
[2,     2] loss: 1.5919, val_accuracy: 0.4941, simple_baseline: 0.4941
[3,     2] loss: 1.5750, val_accuracy: 0.4941, simple_baseline: 0.4941
[4,     2] loss: 1.5534, val_accuracy: 0.4941, simple_baseline: 0.4941
[5,     2] loss: 1.5248, val_accuracy: 0.2588, simple_baseline: 0.4941
[6,     2] loss: 1.5063, val_accuracy: 0.2471, simple_baseline: 0.4941
[7,     2] loss: 1.4881, val_accuracy: 0.2471, simple_baseline: 0.4941
[8,     2] loss: 1.4831, val_accuracy: 0.5882, simple_baseline: 0.4941
[9,     2] loss: 1.4668, val_accuracy: 0.5765, simple_baseline: 0.4941
[10,     2] loss: 1.4682, val_accuracy: 0.5765, simple_baseline: 0.4941
[11,     2] loss: 1.4623, val_accuracy: 0.4941, simple_baseline: 0.4941
[12,     2] loss: 1.4590, val_accuracy: 0.5294, simple_base

VBox(children=(Label(value='0.001 MB of 0.007 MB uploaded\r'), FloatProgress(value=0.1740900992618987, max=1.0…

0,1
loss,██▇▆▅▅▄▄▃▃▃▃▃▃▂▂▂▃▂▂▂▂▂▁▁▁▁▁▁▁
validation_accuracy,▆▆▆▆▁▁▁███▆▇▆█▇▆▆▆█▆▇▇▇▇▆▆▇▆▇▆

0,1
loss,1.40225
validation_accuracy,0.50588


Finished Training


In [6]:
print("done")

done
