In [1]:
import zipfile
import os
import pandas as pd
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the dataset can be read by path.
drive.mount('/content/drive')
# Root data folder on the mounted Drive. The visible cells below do not
# reference this variable; they repeat the same prefix as string literals.
folder_path = '/content/drive/My Drive/Capstone/Data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Excel label sheets for the three splits; all of them live in the same Drive folder.
train_labels_path, test_labels_path, val_labels_path = (
    '/content/drive/My Drive/Capstone/Data/' + sheet_name
    for sheet_name in ('train.xlsx', 'test.xlsx', 'validation.xlsx')
)

In [4]:
# Read each split's label sheet into its own DataFrame (train, then test, then validation).
train_df, test_df, val_df = (
    pd.read_excel(sheet_path)
    for sheet_path in (train_labels_path, test_labels_path, val_labels_path)
)

In [5]:
# Per-split image folders.
# NOTE(review): these point directly under Capstone/, whereas the image-counting
# cell and the dataset-construction cell read from
# Capstone/Data/TrainTestVal-Images/... -- confirm which location is current.
trainImg_path, testImg_path, valImg_path = (
    '/content/drive/My Drive/Capstone/TrainTestVal-Images/' + split_name
    for split_name in ('Train', 'Test', 'Validation')
)

In [6]:
# import zipfile
# import os

# # Replace with the path to your zip file
# zip_file_path = '/content/drive/My Drive/Capstone/Data/TrainTestVal-Images.zip'

# # Replace with the path where you want to extract the contents of the zip file
# extract_to_path = '/content/drive/My Drive/Capstone/Data'

# # Ensure the extract path exists
# if not os.path.exists(extract_to_path):
#     os.makedirs(extract_to_path)

# # Unzip the file
# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_to_path)

# print(f"Extracted all files to {extract_to_path}")



In [7]:
import os

# Folder whose image files are counted (searched recursively)
images_folder_path = '/content/drive/My Drive/Capstone/Data/TrainTestVal-Images/Validation'

# File-name suffixes treated as images; matching below is case-insensitive
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']

# Accumulate the number of matching files one directory at a time.
# str.endswith accepts a tuple, so a single call tests every suffix.
image_count = 0
for _dirpath, _dirnames, filenames in os.walk(images_folder_path):
    image_count += sum(
        1
        for filename in filenames
        if filename.lower().endswith(tuple(image_extensions))
    )

print(f'The folder contains {image_count} image(s).')


The folder contains 23205 image(s).


In [8]:
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms

class CustomImageDataset(Dataset):
    """Image-classification dataset described by an Excel annotation sheet.

    Each row of the sheet is one sample: column 0 holds the class name and
    column 1 holds the image file name, which is joined onto ``img_dir``.

    Args:
        annotations_file: Path of the Excel file, read with ``pd.read_excel``.
        img_dir: Directory that the file names in column 1 are relative to.
        transform: Optional callable applied to the RGB image before it is
            returned.
        class_to_idx: Optional mapping from class name to integer label.
            When omitted, the module-level ``class_to_idx`` is looked up at
            item-access time, which is how the notebook originally worked.
    """

    def __init__(self, annotations_file, img_dir, transform=None, class_to_idx=None):
        self.img_labels = pd.read_excel(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.class_to_idx = class_to_idx

    def __len__(self):
        """Return the number of annotated samples (rows in the sheet)."""
        return len(self.img_labels)

    def _encode_label(self, label):
        """Translate a class name into its integer index.

        Raises:
            NameError: If no mapping was passed to the constructor and no
                module-level ``class_to_idx`` exists yet.
            KeyError: If ``label`` is missing from the mapping.
        """
        mapping = self.class_to_idx
        if mapping is None:
            try:
                # Backward-compatible fallback to the notebook-level mapping.
                mapping = class_to_idx
            except NameError:
                raise NameError(
                    "class_to_idx is not defined: pass class_to_idx=... to "
                    "CustomImageDataset or define a module-level class_to_idx "
                    "before reading items"
                ) from None
        return mapping[label]

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 1])
        # The context manager closes the file handle as soon as the pixels
        # have been copied into the converted RGB image.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        label = self._encode_label(self.img_labels.iloc[idx, 0])
        if self.transform:
            image = self.transform(image)
        return image, label


In [9]:
# Channel-wise normalisation shared by every split (the ImageNet mean/std that
# torchvision's pretrained models are documented to expect).
normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

# Training pipeline: random crop + horizontal flip act as data augmentation.
# The name `transform` is kept so any other cell that refers to it still works.
transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Evaluation pipeline: deterministic resize + centre crop. Random augmentation
# must not be applied to validation/test images, otherwise the reported
# metrics change from run to run and do not reflect the un-augmented data.
eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

# Create datasets: augmentation for training only, deterministic preprocessing elsewhere
train_dataset = CustomImageDataset(annotations_file='/content/drive/My Drive/Capstone/Data/train.xlsx',
                                   img_dir='/content/drive/My Drive/Capstone/Data/TrainTestVal-Images/Train',
                                   transform=transform)

test_dataset = CustomImageDataset(annotations_file='/content/drive/My Drive/Capstone/Data/test.xlsx',
                                  img_dir='/content/drive/My Drive/Capstone/Data/TrainTestVal-Images/Test',
                                  transform=eval_transform)

val_dataset = CustomImageDataset(annotations_file='/content/drive/My Drive/Capstone/Data/validation.xlsx',
                                 img_dir='/content/drive/My Drive/Capstone/Data/TrainTestVal-Images/Validation',
                                 transform=eval_transform)


In [10]:
# Number the class names in the order they first appear in the training sheet.
class_to_idx = dict(
    (class_name, index)
    for index, class_name in enumerate(pd.unique(train_dataset.img_labels['class']))
)
class_to_idx

{'Mass': 0,
 'Cardiomegaly': 1,
 'Atelectasis': 2,
 'Effusion': 3,
 'Pneumothorax': 4,
 'No Finding': 5,
 'Subcutaneous Emphysema': 6,
 'Nodule': 7,
 'Pleural Thickening': 8,
 'Edema': 9,
 'Pneumonia': 10,
 'Emphysema': 11,
 'Infiltration': 12,
 'Consolidation': 13,
 'Fibrosis': 14}

In [13]:
from torch.utils.data import DataLoader

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
# The two evaluation splits are read in sheet order, with the same batch size.
test_loader, val_loader = (
    DataLoader(eval_split, batch_size=128, shuffle=False)
    for eval_split in (test_dataset, val_dataset)
)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from torchsummary import summary



# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained ResNet model
# NOTE(review): torchvision >= 0.13 deprecates `pretrained=True` in favour of
# `weights=...`; confirm the installed version before switching.
model = models.resnet50(pretrained=True)

# Size the classification head from the label mapping instead of a hard-coded
# 15, so the model cannot drift out of sync with the classes in the data.
num_classes = len(class_to_idx)
num_ftrs = model.fc.in_features
# Replace the final fully connected layer with one output per class
model.fc = nn.Linear(num_ftrs, num_classes)

# Move the model to the GPU if available
model = model.to(device)

# Summary of the model (optional, it requires torchsummary to be installed)
# summary(model, input_size=(3, 224, 224))

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Training loop
def train_model(model, criterion, optimizer, num_epochs=25, dataloaders=None, device=None):
    """Train ``model`` and print loss/accuracy for each epoch and phase.

    Every epoch runs a ``'train'`` phase (gradients enabled, optimizer
    stepped once per batch) followed by a ``'val'`` phase (gradients
    disabled, weights untouched).

    Args:
        model: Network to optimise; updated in place.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that owns the parameters of ``model``.
        num_epochs: Number of train+val passes to run.
        dataloaders: Optional dict with ``'train'`` and ``'val'`` loaders.
            Each loader must expose ``.dataset`` so averages can be taken
            over the sample count. Defaults to the module-level
            ``train_loader`` and ``val_loader``.
        device: Device that batches are moved to. Defaults to the device of
            the model's first parameter (CPU if it has no parameters).

    Returns:
        The same ``model`` object. After at least one epoch it is left in
        evaluation mode, because the validation phase runs last.
    """
    if dataloaders is None:
        # Notebook default: the loaders created in the DataLoader cell.
        dataloaders = {'train': train_loader, 'val': val_loader}
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            # train(True) enables training mode; train(False) is the same as eval()
            model.train(is_training)
            dataloader = dataloaders[phase]

            running_loss = 0.0
            # A plain Python int: per-batch counts are already converted
            # with .item(), so no tensor accumulator is needed.
            running_corrects = 0

            for inputs, labels in dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Build the autograd graph only while training
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backward + optimize only if in training phase
                    if is_training:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # The loss is a batch mean, so weight it by the batch size
                # to obtain a per-sample average at the end of the phase.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()

            dataset_size = len(dataloader.dataset)
            epoch_loss = running_loss / dataset_size
            epoch_acc = running_corrects / dataset_size

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

        print()

    return model

# Fine-tune for 25 epochs. train_model returns the object it was given, so
# `model_trained` and `model` refer to the same network.
model_trained = train_model(model, criterion, optimizer, num_epochs=25)

# Optional: persist the fine-tuned weights to disk
# torch.save(model_trained.state_dict(), 'model_resnet50.pth')

# Put the network in evaluation mode before scoring the test set
model.eval()
# Then loop over your test_loader


Epoch 1/25
----------
