## Import Libraries

In [1]:
import os

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from PIL import Image
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import TensorDataset
from torchvision import datasets, transforms
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor

## Import Dataset

In [2]:
# Root of the image set. The walk below descends two directory levels
# (<data_dir>/<group>/<class_folder>/) and treats every entry inside a
# class folder as one image file.
data_dir = '/home/NGS/HTA_Projects/group_5/lung_colon_image_set'

# Class-folder name -> human-readable label stored in the dataframe.
CLASS_FOLDER_LABELS = {
    'colon_aca': 'Colon Adenocarcinoma',
    'colon_n': 'Colon Benign Tissue',
    'lung_aca': 'Lung Adenocarcinoma',
    'lung_n': 'Lung Benign Tissue',
    'lung_scc': 'Lung Squamous Cell Carcinoma',
}

filepaths = []
labels = []

# Listings are sorted because os.listdir returns entries in arbitrary order;
# a stable row order keeps the seeded train/test split reproducible.
for fold in sorted(os.listdir(data_dir)):
    foldpath = os.path.join(data_dir, fold)
    if not os.path.isdir(foldpath):
        # Stray files at the top level would make os.listdir raise.
        continue

    for class_folder in sorted(os.listdir(foldpath)):
        class_label = CLASS_FOLDER_LABELS.get(class_folder)
        class_path = os.path.join(foldpath, class_folder)
        if class_label is None or not os.path.isdir(class_path):
            # Skip unknown folders entirely so every collected path has a
            # label and the two lists stay the same length.
            continue

        for file_name in sorted(os.listdir(class_path)):
            filepaths.append(os.path.join(class_path, file_name))
            labels.append(class_label)

# Concatenate data paths with labels into one dataframe
Fseries = pd.Series(filepaths, name='filepaths')
Lseries = pd.Series(labels, name='labels')
df = pd.concat([Fseries, Lseries], axis=1)

In [3]:
# NOTE: `le` is instantiated here but the mapping below does not use it.
le = LabelEncoder()

# Integer class index for each label string. Values that are not keys of this
# table (including already-encoded integers on a re-run) pass through unchanged.
_label_to_index = {
    'Colon Adenocarcinoma': 0,
    'Colon Benign Tissue': 1,
    'Lung Adenocarcinoma': 2,
    'Lung Squamous Cell Carcinoma': 3,
    'Lung Benign Tissue': 4,
}
df['labels'] = df['labels'].apply(lambda name: _label_to_index.get(name, name))


## Split Dataset

In [4]:
# Features are the image paths, targets are the encoded class indices.
x, y = df['filepaths'], df['labels']

# 70% of the rows go to training; the shuffle is seeded so the split repeats.
x_train_paths, x_test_paths, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=34,
)


## Convert Images to Tensors Using a Transform

In [5]:
# Preprocessing applied to every image: resize to 224x224, then convert to a tensor.
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])


def to_tensor(image_path):
    """Load one image from disk and return it as a preprocessed tensor.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The result of applying ``transform`` to the image.
    """
    # The context manager releases the file handle as soon as the pixels
    # have been read, instead of leaving it to garbage collection.
    with Image.open(image_path) as image:
        # Force three channels so every tensor has the same shape for
        # torch.stack and matches the network's 3-channel first convolution.
        return transform(image.convert('RGB'))

In [6]:
# Eagerly load and preprocess every training image into an in-memory list of tensors.
x_train = list(map(to_tensor, x_train_paths))

In [7]:
# Eagerly load and preprocess every test image into an in-memory list of tensors.
x_test = list(map(to_tensor, x_test_paths))

## Create Dataloaders

In [8]:
# Wrap the label values of each split in a tensor, in the same row order as the split.
y_train_tensor, y_test_tensor = (
    torch.tensor(split_labels.values) for split_labels in (y_train, y_test)
)

In [9]:
# Stack the per-image tensors of each split into a single tensor and pair it
# with that split's label tensor.
train_dataset, test_dataset = (
    TensorDataset(torch.stack(image_tensors), label_tensor)
    for image_tensors, label_tensor in (
        (x_train, y_train_tensor),
        (x_test, y_test_tensor),
    )
)

# Both loaders use batches of 120; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=120)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=120)

In [10]:
# Bare expression: the notebook displays the loader object's repr as the cell output.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f7618ef84d0>

In [11]:
# Sanity check: show the batch size the training loader was configured with.
train_loader.batch_size

120

## Model Building

In [12]:
class ConvolutionalNetwork(pl.LightningModule):
    """Small CNN image classifier wrapped as a Lightning module.

    The network is three 3x3 convolutions (3 -> 6 -> 16 -> 32 channels), each
    followed by ReLU and 2x2 max-pooling, then four fully connected layers.
    The first linear layer is sized for 32 x 26 x 26 feature maps, which is
    what a 224 x 224 input produces after the conv/pool stack.

    Args:
        num_classes: Number of output classes (size of the final layer).
        lr: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, num_classes=5, lr=0.004):
        super().__init__()
        self.lr = lr

        self.conv1 = nn.Conv2d(3, 6, 3, 1)
        self.conv2 = nn.Conv2d(6, 16, 3, 1)
        self.conv3 = nn.Conv2d(16, 32, 3, 1)

        self.fc1 = nn.Linear(32 * 26 * 26, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 20)
        self.fc4 = nn.Linear(20, num_classes)

    def forward(self, X):
        """Return per-class log-probabilities for a batch of images."""
        X = F.max_pool2d(F.relu(self.conv1(X)), 2, 2)
        X = F.max_pool2d(F.relu(self.conv2(X)), 2, 2)
        X = F.max_pool2d(F.relu(self.conv3(X)), 2, 2)

        # Flatten everything except the batch dimension. Unlike
        # view(-1, 32 * 26 * 26), this never folds a wrong-sized input into
        # the batch dimension; a size mismatch surfaces as an error in fc1.
        X = X.flatten(start_dim=1)

        X = F.relu(self.fc1(X))
        X = F.relu(self.fc2(X))
        X = F.relu(self.fc3(X))
        X = self.fc4(X)
        return F.log_softmax(X, dim=1)

    def configure_optimizers(self):
        """Create the Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def _shared_step(self, batch, stage):
        """Compute loss and accuracy for one batch and log them as ``<stage>_loss`` / ``<stage>_acc``."""
        X, y = batch
        log_probs = self(X)
        # forward() already applies log_softmax, so the matching loss is NLL.
        # (cross_entropy would apply log_softmax a second time.)
        loss = F.nll_loss(log_probs, y)
        # Kept as a tensor so logging does not force a host sync via .item().
        acc = (log_probs.argmax(dim=1) == y).float().mean()
        self.log(f"{stage}_loss", loss)
        self.log(f"{stage}_acc", acc)
        return loss

    def training_step(self, train_batch, batch_idx):
        """Log training loss/accuracy and return the loss for backpropagation."""
        return self._shared_step(train_batch, "train")

    def validation_step(self, val_batch, batch_idx):
        """Log validation loss/accuracy for one batch."""
        self._shared_step(val_batch, "val")

    def test_step(self, test_batch, batch_idx):
        """Log test loss/accuracy for one batch."""
        self._shared_step(test_batch, "test")

In [23]:
# Seed the Python, NumPy and PyTorch global RNGs before the weights are
# initialised, so that initialisation and later draws from those RNGs repeat
# across runs. 34 matches the random_state used for the train/test split.
pl.seed_everything(34, workers=True)
model = ConvolutionalNetwork()

In [26]:
# "auto" selects a GPU when one is available and otherwise falls back to CPU,
# so the notebook also runs on machines without CUDA.
trainer = pl.Trainer(max_epochs=30, accelerator="auto", devices=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [27]:
# NOTE(review): test_loader is passed as the validation loader, so the
# per-epoch validation metrics are computed on the same data that the
# evaluation cells use afterwards.
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=test_loader,
)

Missing logger folder: logs/ConvolutionalNetwork
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type   | Params
---------------------------------
0 | conv1 | Conv2d | 168   
1 | conv2 | Conv2d | 880   
2 | conv3 | Conv2d | 4.6 K 
3 | fc1   | Linear | 2.6 M 
4 | fc2   | Linear | 10.2 K
5 | fc3   | Linear | 1.7 K 
6 | fc4   | Linear | 105   
---------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.454    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/student/miniconda3/envs/pytorch/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.
/home/student/miniconda3/envs/pytorch/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.


## Evaluate model

In [49]:
# Collect predictions over the whole test loader and print per-class
# precision / recall / F1.
model.eval()
eval_device = next(model.parameters()).device  # wherever the weights currently live

batch_predictions = []
batch_targets = []

with torch.no_grad():
    # Loop names are batch_* so the `labels` list built while scanning the
    # dataset directory is not overwritten by a tensor.
    for batch_images, batch_labels in test_loader:
        outputs = model(batch_images.to(eval_device))
        batch_predictions.append(outputs.argmax(dim=1).cpu())
        batch_targets.append(batch_labels.cpu())

predicted_labels = torch.cat(batch_predictions).numpy()
true_labels = torch.cat(batch_targets).numpy()

report = classification_report(true_labels, predicted_labels)
print(report)


              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1519
           1       0.96      0.86      0.90      1466
           2       0.94      0.89      0.91      1481
           3       0.90      0.96      0.93      1493
           4       0.99      0.99      0.99      1541

    accuracy                           0.93      7500
   macro avg       0.93      0.93      0.93      7500
weighted avg       0.93      0.93      0.93      7500



In [50]:

# Overall accuracy on the test loader: correctly classified samples / all samples.
model.eval()
eval_device = next(model.parameters()).device  # wherever the weights currently live
correct = 0
total = 0

with torch.no_grad():
    # Loop names are batch_* so the `labels` list built while scanning the
    # dataset directory is not overwritten by a tensor.
    for batch_images, batch_labels in test_loader:
        batch_labels = batch_labels.to(eval_device)
        predicted = model(batch_images.to(eval_device)).argmax(dim=1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

accuracy = correct / total
print(f"Accuracy on test set: {accuracy * 100:.2f}%")


Accuracy on test set: 92.95%
