# Modeling

Here we'll build a computer vision model to detect if a lung disease is present or not.

In [14]:
import os
import pandas as pd

## Data Loading

Using the final results from the data_exploration_and_processing notebook, we'll load our data and labels.

In [15]:
def get_image_files(index_lst: list):
    image_files = []

    for i in index_lst:
        image_dir = f"D:\BigData\images_00{i}\images"
        
        image_files_for_one_folder = os.listdir(image_dir)
        
        image_files += image_files_for_one_folder
        
    return image_files
    
image_files = get_image_files([1])
len(image_files)

4999

In [16]:
df = pd.read_csv("../data/lung_disease_labels.csv")
df.head(3)

Unnamed: 0,image_index,follow-up_#,patient_age,patient_gender,view_position,finding_labels
0,00000001_000.png,0,58,0,0,1
1,00000001_001.png,1,58,0,0,1
2,00000001_002.png,2,58,0,0,1


In [17]:
subset_df = df[df["image_index"].isin(image_files)]
print(subset_df.shape)
subset_df.head()

(4999, 6)


Unnamed: 0,image_index,follow-up_#,patient_age,patient_gender,view_position,finding_labels
0,00000001_000.png,0,58,0,0,1
1,00000001_001.png,1,58,0,0,1
2,00000001_002.png,2,58,0,0,1
3,00000002_000.png,0,81,0,0,0
4,00000003_000.png,0,81,1,0,1


If it's missing images, it's likely just from the faulty age data we removed. This is fine, let's move on.

## Data Preprocessing

In [18]:
import cv2
import numpy as np
from PIL import Image

def opencv_preprocessing(image_path, final_size=(224, 224)):
    # Load the image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    image = cv2.equalizeHist(image)
    
    image = cv2.resize(image, final_size)
    
    # Optional: Clip pixel values to be in the valid range (0-255)
    image = np.clip(image, 0, 255).astype('uint8')
    
    # Convert from OpenCV image (NumPy array) to PIL image for Torch transformations
    pil_image = Image.fromarray(image)
    
    return pil_image


In [19]:
import torchvision.transforms as transforms

def torch_preprocessing():
    return transforms.Compose([
        transforms.RandomHorizontalFlip(),       # Apply random horizontal flip
        transforms.RandomRotation(10),           # Apply random rotation of ±10 degrees
        transforms.ToTensor(),                   # Convert image to tensor
        transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize the pixel values
    ])

In [33]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image

class ImageDataset(Dataset):
    def __init__(self, dataframe, image_dir, opencv_preprocess_fn, torch_transform=None):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame containing image names and corresponding labels.
            image_dir (str): Directory containing all images.
            opencv_preprocess_fn (function): Function to apply OpenCV preprocessing.
            torch_transform (function, optional): Torch transforms to be applied after OpenCV preprocessing.
        """
        self.dataframe = dataframe
        self.image_dir = image_dir  # Single image directory
        self.opencv_preprocess_fn = opencv_preprocess_fn
        self.transform = torch_transform

    def __len__(self):
        # The length of the dataset is the length of the dataframe
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.image_dir, self.dataframe.iloc[idx]['image_index'])
        image = Image.open(img_name).convert('RGB')
        label = self.dataframe.iloc[idx]['finding_labels']

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.float32)


In [34]:
train_dataset = ImageDataset(
    dataframe=subset_df, 
    image_dir="D:\BigData\images_001\images", 
    opencv_preprocess_fn=opencv_preprocessing,  # Use your preprocessing function here
    torch_transform=torch_preprocessing()  # Use your Torch transforms here
)

In [35]:
from torch.utils.data import random_split

dataset_size = len(train_dataset)
train_size = int(0.8 * dataset_size)
test_size = dataset_size - train_size

train_subset, test_subset = random_split(train_dataset, [train_size, test_size])

train_loader = DataLoader(train_subset, batch_size=8, shuffle=True, num_workers=0)
test_loader = DataLoader(test_subset, batch_size=8, shuffle=False, num_workers=0)


In [36]:
import time

start_time = time.time()

# Test loading one batch from train_loader
data_iter = iter(train_loader)
images, labels = next(data_iter)

end_time = time.time()
print(f"Time to load one batch: {end_time - start_time:.2f} seconds")


Time to load one batch: 0.35 seconds


In [39]:
import torch
import torch.nn as nn
import torchvision.models as models

model = models.resnet50(pretrained=True)
num_ftrs = model.fc.in_features
print(num_ftrs)



2048


In [None]:
num_classes = 1  # Remember we are starting with binary classification here for simply detecting if a disease is present or not
model.fc = nn.Sequential(
    nn.Linear(2048, num_classes),
    nn.Sigmoid()
)

# Move the model to the appropriate device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [40]:
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    epoch_start_time = time.time()

    for i, (images, labels) in enumerate(train_loader):
        batch_start_time = time.time()
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        if (i + 1) % 10 == 0:  # Print every 10 batches
            print(f"Epoch [{epoch+1}/{num_epochs}], "
                  f"Batch [{i+1}/{len(train_loader)}], "
                  f"Loss: {loss.item():.4f}, "
                  f"Batch Time: {time.time() - batch_start_time:.2f}s")
            
    epoch_loss = running_loss / len(train_loader)
    epoch_time = time.time() - epoch_start_time
    
    print(f"Epoch [{epoch+1}/{num_epochs}] completed, "
        f"Average Loss: {epoch_loss:.4f}, "
        f"Epoch Time: {epoch_time:.2f}s")

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for i, (images, labels) in enumerate(test_loader):
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            val_loss += criterion(outputs.squeeze(), labels).item()
            predicted = torch.round(torch.sigmoid(outputs.squeeze()))
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            if (i + 1) % 10 == 0:  # Print every 10 batches
                print(f"Validation Batch [{i+1}/{len(test_loader)}] processed")
            

    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {loss.item():.4f}, "
          f"Val Loss: {val_loss/len(test_loader):.4f}, "
          f"Val Accuracy: {100 * correct / total:.2f}%")

In [8]:
import os
from torch.utils.data import Dataset

class ImageDataset(Dataset):
    def __init__(self, dataframe, image_dirs, opencv_preprocess_fn, torch_transform=None):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame containing image names and corresponding labels.
            image_dirs (list of str): List of directories containing images.
            opencv_preprocess_fn (function): Function to apply OpenCV preprocessing.
            torch_transform (function, optional): Torch transforms to be applied after OpenCV preprocessing.
        """
        self.dataframe = dataframe
        self.image_dirs = image_dirs  # List of directories containing images
        self.opencv_preprocess_fn = opencv_preprocess_fn
        self.transform = torch_transform

    def __len__(self):
        # The length of the dataset is the length of the dataframe
        return len(self.dataframe)

    def find_image_path(self, image_name):
        # Search for the image in each directory
        for image_dir in self.image_dirs:
            image_path = os.path.join(image_dir, image_name)
            if os.path.exists(image_path):
                return image_path
        raise FileNotFoundError(f"Image {image_name} not found in any directory!")

    def __getitem__(self, idx):
        # Get the image name and label from the dataframe
        image_name = self.dataframe.iloc[idx]['image_index']  # Image file name
        label = self.dataframe.iloc[idx]['finding_labels']  # Corresponding label

        # Find the full image path by checking both directories
        image_path = self.find_image_path(image_name)

        # Apply OpenCV preprocessing to the image
        image = self.opencv_preprocess_fn(image_path)

        # Apply Torch transformations (if any)
        if self.transform:
            image = self.transform(image)

        return image, label


In [21]:
import torch
import torch.nn as nn
import torchvision.models as models

# Load a pretrained ResNet model for binary or multi-label classification
model = models.resnet50(pretrained=True)

In [22]:
num_classes = 1  # Remember we are starting with binary classification here for simply detecting if a disease is present or not
model.fc = nn.Sequential(
    nn.Linear(2048, num_classes),
    nn.Sigmoid()
)

# Move the model to the appropriate device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [13]:
image_dirs = [r"D:\BigData\images_003\images", r"D:\BigData\images_008\images"]
image_dirs = [r"D:\BigData\images_003\images"]

train_dataset = ImageDataset(
    dataframe=subset_df, 
    image_dirs=image_dirs, 
    opencv_preprocess_fn=opencv_preprocessing, 
    torch_transform=torch_preprocessing()
)

dataset_size = len(train_dataset)
train_size = int(0.8 * dataset_size)
test_size = dataset_size - train_size

train_subset, test_subset = random_split(train_dataset, [train_size, test_size])

In [14]:
train_loader = DataLoader(train_subset, batch_size=32, shuffle=True, num_workers=8)
test_loader = DataLoader(test_subset, batch_size=32, shuffle=False, num_workers=8)



In [15]:
import matplotlib.pyplot as plt

In [16]:
import torchvision.transforms.functional as F

In [17]:
# Get a single batch from the DataLoader
data_iter = iter(train_loader)
#images, labels = next(data_iter)  # Get the first batch of images and labels

#images.is_cuda




In [None]:
#images = images.cpu()
#labels = labels.cpu()


In [None]:
# Display the first image in the batch and its corresponding label
def show_image(image, label):
    # Convert the tensor image to a NumPy array for visualization
    image = image.permute(1, 2, 0).numpy()  # Convert from (C, H, W) to (H, W, C)
    
    # Undo any normalization, if necessary
    image = (image * 0.5) + 0.5  # Assuming your normalization was (mean=[0.5], std=[0.5])
    
    plt.imshow(image, cmap='gray')
    plt.title(f'Label: {label}')
    plt.axis('off')
    plt.show()

# Display the first few images in the batch
for i in range(4):  # Display first 4 images
    show_image(images[i], labels[i].item())  # Convert the label tensor to a scalar

In [19]:
from tqdm import tqdm

In [20]:
from tqdm import trange

num_epochs = 1

# Use trange to add a progress bar for epochs
for epoch in trange(num_epochs, desc="Training Progress"):
    model.train()  # Set model to training mode
    running_loss = 0.0

    # Iterate over batches with a progress bar
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")

    for images, labels in progress_bar:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")


Training Progress:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
num_epochs = 2

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0

    for images, labels in train_loader:
        # Move data to the GPU (if available)
        images = images.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate the loss
        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader)}")

# Optionally, save the trained model
torch.save(model.state_dict(), 'xray_classification_model.pth')


In [None]:
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Lists to store loss values
train_losses = []

num_epochs = 1  # Adjust the number of epochs based on your needs

# Start the training loop
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0

    for images, labels in train_loader:
        # Move the data to the appropriate device
        images = images.to(device)
        labels = labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate the loss
        running_loss += loss.item()

    # Store the loss value for this epoch
    avg_train_loss = running_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Plot the training loss
    clear_output(wait=True)  # Clear the output in the notebook
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.title(f"Epoch {epoch+1}/{num_epochs}")
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_train_loss:.4f}")


In [None]:
model.load_state_dict(torch.load('xray_classification_model.pth'))