In [1]:
#Adding sibling directory to path of current directory
import sys
import os
# Make the sibling ``dataset`` directory importable so that later cells can
# import modules that live there. The path is relative, so it is resolved
# against the working directory the notebook kernel runs in.
sibling_dir = "../dataset"
# Guard the insert so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if sibling_dir not in sys.path:
    sys.path.insert(1, sibling_dir)
import plotly.io as pio
# Use plotly's "iframe" renderer as the default for every figure displayed
# in this notebook (applies to all later .show() calls and figure outputs).
pio.renderers.default = "iframe"

# Exploratory Data Analysis Of Segmentation And Classification Data Of OCT Images

In [2]:
# General Imports
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import PIL.Image as Image
from plotly.subplots import make_subplots

## 1. Classification Data

In [3]:
# loading the classification data splits
from collections import defaultdict

from classificationData import classificationData

# Build one classificationData object per split, in the order val, train, test.
# The second positional argument is passed as None for every split; its
# meaning is defined by classificationData and is not visible in this notebook.
class_val, class_train, class_test = (
    classificationData(split_name, None)
    for split_name in ("val", "train", "test")
)

In [4]:
# Show the class names exposed by the training split, padded with a blank
# line above and below.
print(f"\nLabels: {class_train.classes}\n")


Labels: ['CNV', 'DME', 'DRUSEN', 'NORMAL']



## 1.1 Subset Distribution

In [5]:
# Compare how many images each split holds. The share of the total is printed
# on every bar so the split ratios can be read directly from the chart.
subset_names = ["Test", "Train", "Validation"]
subset_sizes = [len(class_test), len(class_train), len(class_val)]
total_images = sum(subset_sizes) or 1  # avoid dividing by zero if every split is empty

class_dist = go.Figure(
    go.Bar(
        x=subset_names,
        y=subset_sizes,
        text=[f"{size / total_images:.1%}" for size in subset_sizes],
        textposition="auto",
    )
)
class_dist.update_layout(
    title_text="Distribution Of Images Across Subsets",
    xaxis_title="Subset",
    yaxis_title="Number of images",
)
class_dist.show()

As we can see from the above graph, the distribution has the following split:
- Train Set ~= 70%
- Test Set ~= 10%
- Validation Set ~= 20%


Thus, there is no need for modifying the distribution of the subsets.

## 1.2 Class Distribution Analysis

### 1.2.1 Train Set

In [6]:
# Map each integer label index to its class name for the training split.
labels = {index: name for index, name in enumerate(class_train.classes)}

# Tally how many training images carry each class name.
class_train_counts = defaultdict(int)
for _path, label in class_train.labels.items():
    class_train_counts[labels[label]] += 1

# One bar per class, in the order the classes were first encountered.
train_bars = go.Bar(
    x=list(class_train_counts.keys()),
    y=list(class_train_counts.values()),
)
class_train_dist = go.Figure(train_bars)
# Kept as the last expression so the notebook renders the returned figure.
class_train_dist.update_layout(title_text="Class Distribution In Train Dataset")

### 1.2.2 Test Set

In [7]:
# Map label indices to class names using the TEST split's own class list, so
# the bars are named from the same dataset whose labels are being counted
# (the original cell borrowed the mapping from the training split).
labels = dict(enumerate(class_test.classes))

# Count how many test images fall into each class.
class_test_counts = defaultdict(int)
for _, label in class_test.labels.items():
    class_test_counts[labels[label]] += 1

class_test_dist = go.Figure(
    go.Bar(y=list(class_test_counts.values()), x=list(class_test_counts.keys()))
)
# Kept as the last expression so the notebook renders the returned figure.
class_test_dist.update_layout(title_text="Class Distribution In Test Dataset")

### 1.2.3 Validation Set

In [8]:
# Index -> class-name lookup taken from the validation split.
labels = {index: name for index, name in enumerate(class_val.classes)}

# Count validation images per class name.
class_val_counts = defaultdict(int)
for _path, label in class_val.labels.items():
    class_val_counts[labels[label]] += 1

# Bars follow the order in which classes were first encountered.
val_class_names = list(class_val_counts.keys())
val_class_totals = list(class_val_counts.values())
class_val_dist = go.Figure(go.Bar(x=val_class_names, y=val_class_totals))
# Kept as the last expression so the notebook renders the returned figure.
class_val_dist.update_layout(title_text="Class Distribution In Validation Dataset")

__As the bar charts above show, the proportion of images in each class remains the same across the train, test and validation splits, which can be verified by calculating the percentages for each class. Therefore, we conclude that the dataset splits are valid and do not need any further modification.__

## 1.3 Data Visualization

In [9]:

# For every split, remember the path of the first image encountered for each
# class label. Each result maps label -> image path.
train_images, val_images, test_images = {}, {}, {}

split_to_samples = (
    (class_train, train_images),
    (class_val, val_images),
    (class_test, test_images),
)
for split, samples in split_to_samples:
    for path, class_label in split.labels.items():
        # setdefault stores the path only when the label has no entry yet, so
        # later images of an already-seen class are ignored.
        samples.setdefault(class_label, path)

In [10]:
# Samples picked for each split, paired with that split's class-name list so
# every panel can be titled with a readable name instead of a bare label index.
split_samples = [
    ("Train", class_train.classes, train_images),
    ("Validation", class_val.classes, val_images),
    ("Test", class_test.classes, test_images),
]

# Combine selected images from train, val, and test sets (label, path pairs)
all_selected_images = [
    pair for _, _, samples in split_samples for pair in samples.items()
]

# One row per split and one column per class label, sized from the data
# rather than a hard-coded 4x4 grid.
class_ids = sorted({label for label, _ in all_selected_images})
n_rows = len(split_samples)
n_cols = max(len(class_ids), 1)  # make_subplots needs at least one column

# Titles are listed row by row; "" leaves a panel untitled when a split has
# no sample for that class.
titles = [
    f"{split_name}: {class_names[label]}" if label in samples else ""
    for split_name, class_names, samples in split_samples
    for label in class_ids
]
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles)

# Place each selected image in the panel for its (split, class) pair
for row, (_, _, samples) in enumerate(split_samples, start=1):
    for col, label in enumerate(class_ids, start=1):
        if label not in samples:
            continue
        # The context manager closes the file handle once the pixels have
        # been copied into the NumPy array.
        with Image.open(samples[label]) as img:
            img_array = np.array(img.convert("RGB"))

        # Add image trace
        fig.add_trace(go.Image(z=img_array), row=row, col=col)

# Update layout
fig.update_layout(
    height=800, width=800, title_text="Sample Images From Each Class",
    showlegend=False
)

# Show figure
fig.show()

## 1.4 Final Observations:
From the initial data exploration, we have observed that the data distribution is satisfactory across both classes and subsets. We can also see that the dataset contains images with random augmentations applied, such as image distortion, shearing, translation and cropping, which will help make our classifier model more robust and improve its generalization.

## 2. Segmentation Data

In [11]:
# Loading the segmentation data
# NOTE(review): segmentationData is presumably resolved through the dataset
# directory that the first cell adds to sys.path — confirm.
from segmentationData import segmentationData
# Constructed with no arguments. Later cells read len(seg_data),
# seg_data.image_paths and seg_data.gt_paths from this object.
seg_data = segmentationData()

In [13]:
# Report how many samples the segmentation dataset holds, padded with a blank
# line above and below.
print(f"\nTotal Images: {len(seg_data)}\n")


Total Images: 3859



In [18]:
# Number of (image, ground-truth) pairs to visualize
num_images = 6
selected_images = list(zip(seg_data.image_paths[:num_images], seg_data.gt_paths[:num_images]))

# One row per selected pair (fewer than num_images if the dataset is smaller),
# three columns: image, ground truth, overlay.
num_rows = max(len(selected_images), 1)  # make_subplots needs at least one row
fig = make_subplots(rows=num_rows, cols=3,
                    subplot_titles=["Image", "Ground Truth", "Overlay"] * num_rows)

for row, (img_path, gt_path) in enumerate(selected_images, start=1):  # rows are 1-based
    # Open both files in context managers so the handles are closed as soon
    # as the pixel data has been copied out by convert()/resize().
    with Image.open(img_path) as img_file, Image.open(gt_path) as gt_file:
        img = img_file.convert("RGB")
        # Mask as single-channel grayscale, resized to the image size with
        # nearest-neighbour sampling so no new intermediate values are created.
        gt = gt_file.convert("L").resize(img.size, Image.NEAREST)

    # Convert to NumPy arrays
    img_array = np.array(img)   # (H, W, 3)
    gt_array = np.array(gt)     # (H, W)

    # go.Image expects a colour value (3 or 4 channels) per pixel, so replicate
    # the grayscale mask across three channels before plotting it.
    gt_rgb = np.stack([gt_array] * 3, axis=-1)

    # Binary 0/255 mask for the overlay.
    # NOTE(review): assumes foreground mask pixels are brighter than 128 —
    # confirm against how the ground-truth files encode the mask.
    gt_overlay = (gt_array > 128).astype(np.uint8) * 255

    # Overlay: push the red channel to 255 wherever the mask is set
    overlay = img_array.copy()
    overlay[:, :, 0] = np.maximum(overlay[:, :, 0], gt_overlay)

    # Add images to plotly subplots
    fig.add_trace(go.Image(z=img_array), row=row, col=1)
    fig.add_trace(go.Image(z=gt_rgb), row=row, col=2)
    fig.add_trace(go.Image(z=overlay), row=row, col=3)

# Update layout (250 px per row, i.e. 1500 px for the default six rows)
fig.update_layout(
    height=250 * num_rows, width=1800, title_text="Segmentation Dataset Visualization",
    showlegend=False
)

fig.show()