# Dataset EDA

In [3]:
import os
# The dataset paths used later in this notebook are relative ("artifacts/..."),
# so the working directory is moved up one level before anything is read.
# NOTE(review): "../" is relative to the *current* directory, so re-running this
# cell moves up one more level each time — run it exactly once per kernel session.
os.chdir("../")  # Change to parent directory to access Data folder
%pwd

'c:\\projects\\HealthCare-Pulmonary-diagnosis'

In [18]:
from pathlib import Path

# Dataset split locations, relative to the current working directory.
TRAIN_DIR = Path("artifacts/data_ingestion/raw_data/Data/train") 
VAL_DIR = Path("artifacts/data_ingestion/raw_data/Data/valid")
# Lower-case file suffixes treated as images; callers lower-case the suffix before the lookup.
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}

def count_images_per_class(train_dir: Path, image_exts=None):
    """Count the image files found under each class sub-directory.

    Every immediate sub-directory of ``train_dir`` is treated as one class.
    Images are searched recursively inside it and matched by file suffix,
    case-insensitively.

    Args:
        train_dir: Directory of a dataset split (despite the name it works for
            any split, e.g. validation) containing one folder per class.
        image_exts: Optional iterable of suffixes (with leading dot) to count.
            Defaults to the notebook-level ``IMAGE_EXTS`` when omitted.

    Returns:
        dict mapping class-folder name to its image count, with keys inserted
        in sorted folder order so the report is stable across runs/platforms.
    """
    if image_exts is None:
        exts = IMAGE_EXTS
    else:
        exts = {ext.lower() for ext in image_exts}

    counts = {}

    # iterdir() yields entries in arbitrary order; sort for a deterministic report.
    for class_dir in sorted(train_dir.iterdir()):
        if not class_dir.is_dir():
            continue

        # is_file() keeps directories that merely *look* like images
        # (e.g. a folder named "scan.png") out of the count.
        counts[class_dir.name] = sum(
            1
            for p in class_dir.rglob("*")
            if p.is_file() and p.suffix.lower() in exts
        )

    return counts

# Report, for each dataset split, how many images every class has and what
# share of the split that represents.
dir_list = [TRAIN_DIR, VAL_DIR]
for split_dir in dir_list:  # not named `dir`, which would shadow the builtin for later cells
    print(f"Directory: {split_dir}")
    counts = count_images_per_class(split_dir)
    total_images = sum(counts.values())  # computed once instead of once per class
    print("Image count per class:")
    for class_name, n_images in counts.items():
        # An empty split would otherwise raise ZeroDivisionError.
        share = n_images / total_images if total_images else 0.0
        print(f"{class_name:<30} {share:.2%} ({n_images} images)")
    print(f"Total images: {total_images}")

Directory: artifacts\data_ingestion\raw_data\Data\train
Image count per class:
adenocarcinoma                 31.81% (195 images)
large.cell.carcinoma           18.76% (115 images)
normal                         24.14% (148 images)
squamous.cell.carcinoma        25.29% (155 images)
Total images: 613
Directory: artifacts\data_ingestion\raw_data\Data\valid
Image count per class:
adenocarcinoma                 31.94% (23 images)
large.cell.carcinoma           29.17% (21 images)
normal                         18.06% (13 images)
squamous.cell.carcinoma        20.83% (15 images)
Total images: 72


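- Class imbalance is not severe (each class holds roughly 18–32% of the images in both splits).
- The dataset is small (613 training images and 72 validation images).
- Image augmentation is needed for training.
- Transfer learning should be used.
- Hybrid learning (CNN + ML) should be considered.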



In [16]:
import random
from PIL import Image

def inspect_random_images(train_dir: Path, n_samples=5, seed=None):
    """Print format, size and colour mode for a random sample of images.

    Every immediate sub-directory of ``train_dir`` is searched recursively for
    files whose suffix (case-insensitive) is in ``IMAGE_EXTS``.

    Args:
        train_dir: Directory of a dataset split containing one folder per class.
        n_samples: Maximum number of images to inspect; fewer are shown when
            the split holds fewer images.
        seed: Optional seed for a reproducible sample. When ``None`` the global
            ``random`` state is used, as before.

    Returns:
        None. The report is written to stdout.
    """
    # Sorted so that a given seed always selects the same files, regardless of
    # the order in which the filesystem lists them. is_file() prevents a
    # directory with an image-like name from being handed to Image.open().
    all_images = sorted(
        p
        for class_dir in train_dir.iterdir()
        if class_dir.is_dir()
        for p in class_dir.rglob("*")
        if p.is_file() and p.suffix.lower() in IMAGE_EXTS
    )

    rng = random if seed is None else random.Random(seed)
    sample_paths = rng.sample(all_images, min(n_samples, len(all_images)))

    print("\nRandom sample inspection:")
    for p in sample_paths:
        # Context manager closes the underlying file handle after inspection.
        with Image.open(p) as img:
            print(f"Path: {p}")
            print(f"  Format: {img.format}")
            print(f"  Size: {img.size}")      # (width, height)
            print(f"  Mode: {img.mode}")      # RGB, L, etc.
            print("-" * 50)


# NOTE(review): the output recorded below lists files under Data\valid even though
# TRAIN_DIR is passed here — it appears to come from an earlier run with a different
# TRAIN_DIR value. Re-run after a kernel restart to refresh it.
inspect_random_images(TRAIN_DIR, n_samples=5)


Random sample inspection:
Path: artifacts\data_ingestion\raw_data\Data\valid\normal\4 - Copy (2).png
  Format: PNG
  Size: (615, 495)
  Mode: RGBA
--------------------------------------------------
Path: artifacts\data_ingestion\raw_data\Data\valid\large.cell.carcinoma\000126.png
  Format: PNG
  Size: (358, 220)
  Mode: RGBA
--------------------------------------------------
Path: artifacts\data_ingestion\raw_data\Data\valid\normal\7 - Copy (3).png
  Format: PNG
  Size: (940, 627)
  Mode: RGB
--------------------------------------------------
Path: artifacts\data_ingestion\raw_data\Data\valid\large.cell.carcinoma\000110 (2).png
  Format: PNG
  Size: (385, 251)
  Mode: RGBA
--------------------------------------------------
Path: artifacts\data_ingestion\raw_data\Data\valid\squamous.cell.carcinoma\000108 (3).png
  Format: PNG
  Size: (438, 256)
  Mode: RGBA
--------------------------------------------------


In [17]:
from collections import Counter
from PIL import Image
from pathlib import Path

# Tally the PIL colour mode (RGB, RGBA, L, ...) of every image under TRAIN_DIR.
# NOTE(review): the recorded output totals 72 images, which matches the validation
# split rather than the 613 training images — re-run after a kernel restart.
modes = Counter()

# TRAIN_DIR is already a Path; the shared IMAGE_EXTS set keeps this scan
# consistent with the counting and sampling cells above.
for p in TRAIN_DIR.rglob("*"):
    if p.is_file() and p.suffix.lower() in IMAGE_EXTS:
        # Opening only reads the header; the context manager closes the file.
        with Image.open(p) as img:
            modes[img.mode] += 1

print(modes)

Counter({'RGBA': 64, 'RGB': 8})


- All images should be converted to 3-channel RGB at loading time (the data contains both RGBA and RGB images).
- All images should be resized to the same size at loading time (image dimensions vary).

In [5]:
import tensorflow as tf

# Base model: ImageNet-pretrained EfficientNetB1 without its classification head.
# With include_top=False and pooling='avg', Keras applies global average pooling,
# so the output is a 2D (batch, channels) feature vector rather than a 4D feature map.
base_model = tf.keras.applications.EfficientNetB1(
    weights="imagenet",
    include_top=False,
    pooling='avg',
)

# Take the pooled feature tensor
x = base_model.output

# Project the pooled features to 512 dimensions with a ReLU Dense layer
# (no Flatten is applied; the pooled output is already 2D).
# NOTE(review): this Dense layer is created here, so its weights are not pretrained —
# confirm it is trained before its output is used as fixed features.
x = tf.keras.layers.Dense(512, activation="relu")(x)

# Build new model
model = tf.keras.Model(inputs=base_model.input, outputs=x)

model.summary()