In [1]:
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
# import imutils
from PIL import Image
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import hashlib
from collections import Counter



In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU instead.")

Using GPU: NVIDIA GeForce RTX 2070 SUPER


## Data Processing

In [3]:
def file_hash(filepath):
    with open(filepath, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()

def check_duplicates(set1, set2):
    hashes = {}
    duplicates = []

    # Process all files in both sets and store their hashes
    for dataset_path in [set1, set2]:
        for root, _, files in os.walk(dataset_path):
            for filename in files:
                if filename.endswith('jpg'):  # Add other file types if needed
                    file_path = os.path.join(root, filename)
                    filehash = file_hash(file_path)
                    if filehash in hashes:
                        duplicates.append((hashes[filehash], file_path))
                    else:
                        hashes[filehash] = file_path
    return duplicates

# Check for duplicates
duplicates = check_duplicates('raw_data/Training', 'raw_data/Testing')
if duplicates:
    print("Duplicates found:", len(duplicates))
    # for dup in duplicates:
    #     print(f"Duplicate: {dup[0]} and {dup[1]}")
else:
    print("No duplicates found.")

Duplicates found: 297


Our data is... really bad. There are 297 duplicates, and some of those duplicates are in the testing AND training sets. No wonder some of the 

In [4]:
label_map = {'notumor': 0, 'glioma': 1, 'meningioma': 2, 'pituitary': 3}
image_size = 150

simple_transform = transforms.Compose([
    transforms.ToTensor() # already normalizes the image
])

def preprocess_data(image, image_size):
    image_np = np.array(image)
    image_np = cv2.bilateralFilter(image_np, 2, 50, 50)
    image_np = cv2.resize(image_np, (image_size, image_size))
    return Image.fromarray(image_np)

def load_unique_images(base_path):
    images = []
    labels = []
    hashes = set()
    
    for partition in ('Training', 'Testing'):
        for label in label_map.keys():
            path = os.path.join(base_path, partition, label)
            for file in tqdm(os.listdir(path)):
                file_path = os.path.join(path, file)

                img_hash = file_hash(file_path)
                if img_hash in hashes:
                    continue
                hashes.add(img_hash)

                # process image
                image = Image.open(file_path).convert('L')
                image = preprocess_data(image, image_size)
                image = simple_transform(image)
                images.append(image)
                
                # (binary label, multiclass label)
                labels.append((label_map[label]))
    
    return images, labels

all_images, all_labels = load_unique_images('raw_data')

x_train, x_test, y_multi_train, y_multi_test = train_test_split(all_images, all_labels, test_size=0.2, random_state=69)

100%|██████████| 1595/1595 [00:02<00:00, 771.91it/s]
100%|██████████| 1321/1321 [00:02<00:00, 612.72it/s]
100%|██████████| 1339/1339 [00:02<00:00, 585.49it/s]
100%|██████████| 1457/1457 [00:02<00:00, 544.84it/s]
100%|██████████| 405/405 [00:00<00:00, 1083.81it/s]
100%|██████████| 300/300 [00:00<00:00, 627.17it/s]
100%|██████████| 306/306 [00:00<00:00, 692.13it/s]
100%|██████████| 300/300 [00:00<00:00, 598.31it/s]


In [5]:
x_train, x_test = torch.stack(x_train), torch.stack(x_test)

y_binary_train = [int(label != 0) for label in y_multi_train]
y_binary_test =  [int(label != 0) for label in y_multi_test]

print("Label Distribution")
print("    Binary:")
print("        Training", Counter(y_binary_train))
print("        Test", Counter(y_binary_test))
print("    Multiclass:")
print("        Training", Counter(y_multi_train))
print("        Test", Counter(y_multi_test))

y_binary_train, y_binary_test = torch.tensor(y_binary_train), torch.tensor(y_binary_test)
y_multi_train, y_multi_test = torch.tensor(y_multi_train), torch.tensor(y_multi_test)

binary_training = DataLoader(TensorDataset(x_train, y_binary_train), batch_size=32,
                             shuffle=True,
                             pin_memory=True,
                             num_workers=3)
binary_testing = DataLoader(TensorDataset(x_test, y_binary_test), batch_size=32,
                             shuffle=False,
                             pin_memory=True,
                             num_workers=3)
multi_training = DataLoader(TensorDataset(x_train, y_multi_train), batch_size=32,
                             shuffle=True,
                             pin_memory=True,
                             num_workers=3)
multi_testing = DataLoader(TensorDataset(x_test, y_multi_test), batch_size=32,
                             shuffle=False,
                             pin_memory=True,
                             num_workers=3)

Label Distribution
    Binary:
        Training Counter({1: 4005, 0: 1375})
        Test Counter({1: 990, 0: 356})
    Multiclass:
        Training Counter({3: 1408, 0: 1375, 2: 1309, 1: 1288})
        Test Counter({0: 356, 3: 332, 1: 332, 2: 326})


## Training Model

In [6]:
class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()
        # in_channels = the number of channels of the input to the convolutional layer (greyscale = 1, rgb = 3)
        # out_channels = the number of feature maps 
        # padding = (kernel_size-1) / 2
        # Use Conv2D since our image data is 2D.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)  # Change input channels if your images are not grayscale
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(128 * 75 * 75, 128)  # Adjust the size here based on the output of your last pooling layer
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(F.relu(self.conv3(x)), 2)
        x = x.view(x.size(0), -1)  # Flatten the output
        x = F.relu(self.fc1(x))
        x = self.fc2(x) # we are using BCEWithLogitsLoss
        return x

binary_model = BinaryClassification().to(device)
binary_criterion = nn.BCEWithLogitsLoss()
binary_optimizer = torch.optim.Adam(binary_model.parameters(), lr=0.001)

In [7]:
def train_binary_model(num_epochs):
    binary_model.train()  # Set the model to training mode
    
    for epoch in range(num_epochs):
        for images, labels in tqdm(binary_training):
            images, labels = images.to(device), labels.float().unsqueeze(1).to(device)
            
            binary_optimizer.zero_grad()
            outputs = binary_model(images)
            loss = binary_criterion(outputs, labels)
            loss.backward()
            binary_optimizer.step()
        
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
        evaluate_binary_model()

def evaluate_binary_model():
    binary_model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in binary_testing:
            images, labels = images.to(device), labels.float().unsqueeze(1).to(device)
            outputs = binary_model(images)
            probs = torch.sigmoid(outputs)  # Convert logits to probabilities
            predicted = probs.round()  # Convert probabilities to 0 or 1
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy}%\n')


train_binary_model(10)

100%|██████████| 169/169 [00:18<00:00,  8.92it/s]

Epoch 1/10, Loss: 0.004723594058305025





Test Accuracy: 96.2852897473997%



100%|██████████| 169/169 [00:16<00:00, 10.13it/s]

Epoch 2/10, Loss: 0.015162359923124313





Test Accuracy: 97.84546805349183%



100%|██████████| 169/169 [00:16<00:00, 10.24it/s]

Epoch 3/10, Loss: 0.0014109177282080054





Test Accuracy: 97.91976225854383%



100%|██████████| 169/169 [00:16<00:00, 10.17it/s]

Epoch 4/10, Loss: 3.740075044333935e-05





Test Accuracy: 96.87964338781575%



100%|██████████| 169/169 [00:16<00:00, 10.13it/s]

Epoch 5/10, Loss: 4.341838211985305e-05





Test Accuracy: 98.58841010401188%



100%|██████████| 169/169 [00:16<00:00, 10.17it/s]

Epoch 6/10, Loss: 1.1920927533992653e-07





Test Accuracy: 98.58841010401188%



100%|██████████| 169/169 [00:16<00:00, 10.07it/s]

Epoch 7/10, Loss: 0.0





Test Accuracy: 98.58841010401188%



100%|██████████| 169/169 [00:16<00:00, 10.23it/s]

Epoch 8/10, Loss: 2.3841794245527126e-06





Test Accuracy: 98.58841010401188%



100%|██████████| 169/169 [00:16<00:00, 10.09it/s]

Epoch 9/10, Loss: 4.470344947549165e-07





Test Accuracy: 98.58841010401188%



100%|██████████| 169/169 [00:16<00:00, 10.12it/s]

Epoch 10/10, Loss: 0.0





Test Accuracy: 98.58841010401188%



In [8]:
class MulticlassClassification(nn.Module):
    def __init__(self):
        super(MulticlassClassification, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(128 * 75 * 75, 128)  # Adjust if different size or pooling
        self.fc2 = nn.Linear(128, 4)  # Output layer for multiclass classification

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(F.relu(self.conv3(x)), 2)  # Pooling to reduce spatial dimensions
        x = x.view(x.size(0), -1)  # Flatten the output
        x = F.relu(self.fc1(x))
        x = self.fc2(x)  # No activation, raw logits for CrossEntropyLoss
        return x

# Example instantiation and setup
multiclass_model = MulticlassClassification().to(device)
multiclass_criterion = nn.CrossEntropyLoss()  # Suitable for multiclass
multiclass_optimizer = torch.optim.Adam(multiclass_model.parameters(), lr=0.001)

In [9]:
def train_multiclass_model(model, optimizer, criterion, num_epochs):
    model.train()  # Set the model to training mode
    
    for epoch in range(num_epochs):
        for images, labels in tqdm(multi_training):
            images, labels = images.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
        evaluate_multiclass_model(model)

def evaluate_multiclass_model(model):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in multi_testing:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)  # Get the index of the max log-probability
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy}%\n')

# Example of training the model
num_epochs = 10
train_multiclass_model(multiclass_model, multiclass_optimizer, multiclass_criterion, num_epochs)


100%|██████████| 169/169 [00:17<00:00,  9.93it/s]

Epoch 1/10, Loss: 1.8880287408828735





Test Accuracy: 83.8038632986627%



100%|██████████| 169/169 [00:16<00:00, 10.28it/s]

Epoch 2/10, Loss: 1.9556090831756592





Test Accuracy: 84.9182763744428%



100%|██████████| 169/169 [00:16<00:00, 10.25it/s]

Epoch 3/10, Loss: 0.2808113992214203





Test Accuracy: 89.67310549777118%



100%|██████████| 169/169 [00:16<00:00, 10.29it/s]

Epoch 4/10, Loss: 0.00995013676583767





Test Accuracy: 93.09063893016345%



100%|██████████| 169/169 [00:16<00:00, 10.22it/s]

Epoch 5/10, Loss: 0.06787999719381332





Test Accuracy: 91.08469539375929%



100%|██████████| 169/169 [00:16<00:00, 10.07it/s]

Epoch 6/10, Loss: 3.0217732273740694e-05





Test Accuracy: 92.79346210995543%



100%|██████████| 169/169 [00:16<00:00, 10.20it/s]

Epoch 7/10, Loss: 0.0008965736487880349





Test Accuracy: 91.38187221396731%



100%|██████████| 169/169 [00:16<00:00, 10.18it/s]

Epoch 8/10, Loss: 0.43215009570121765





Test Accuracy: 87.66716196136701%



100%|██████████| 169/169 [00:16<00:00, 10.29it/s]

Epoch 9/10, Loss: 0.0002233274863101542





Test Accuracy: 93.23922734026746%



100%|██████████| 169/169 [00:16<00:00, 10.23it/s]

Epoch 10/10, Loss: 3.146986637148075e-05





Test Accuracy: 93.53640416047548%

