In [2]:
# ==========================================
# STEP 1: INSTALL & DOWNLOAD DATA
# ==========================================
# 1. INSTALL THE LIBRARY FIRST (This fixes your error)
!pip install opendatasets --quiet

import os
import shutil
import random
import opendatasets as od

# 2. Download Dataset A (Adults/General)
print("‚¨áÔ∏è Downloading COVID-19 Radiography Database (Dataset A)...")
od.download('https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database')

# 3. Download Dataset B (Pediatric Booster)
print("‚¨áÔ∏è Downloading Chest X-Ray Pneumonia (Dataset B)...")
od.download('https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia')

print("\n‚úÖ Downloads Complete! You should see two folders on the left sidebar.")

‚¨áÔ∏è Downloading COVID-19 Radiography Database (Dataset A)...
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: Ranjeet0045
Your Kaggle Key: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
Dataset URL: https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database
Downloading covid19-radiography-database.zip to ./covid19-radiography-database


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 778M/778M [00:12<00:00, 64.1MB/s]







‚¨áÔ∏è Downloading Chest X-Ray Pneumonia (Dataset B)...
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: Ranjeet0045
Your Kaggle Key: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia
Downloading chest-xray-pneumonia.zip to ./chest-xray-pneumonia


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2.29G/2.29G [00:32<00:00, 75.5MB/s]




‚úÖ Downloads Complete! You should see two folders on the left sidebar.


In [3]:
# ==========================================
# STEP 2: MERGE, BALANCE & SPLIT (70/20/10)
# ==========================================
import os
import shutil
import random

# 1. Define Destination
BASE_DIR = 'Final_Dataset'
if os.path.exists(BASE_DIR): shutil.rmtree(BASE_DIR) # Clean reset

# Create folder structure
classes = ['NORMAL', 'PNEUMONIA']
for split in ['train', 'test', 'val']: # Note: test is 20%, val is 10%
    for cls in classes:
        os.makedirs(os.path.join(BASE_DIR, split, cls), exist_ok=True)

print("üìÇ Created folder structure. Gathering files...")

# 2. Collect All Normal Images (Adults + Kids)
normal_files = []
# From Dataset 1 (Adults)
d1_norm = 'covid19-radiography-database/COVID-19_Radiography_Dataset/Normal'
if os.path.exists(d1_norm):
    normal_files += [os.path.join(d1_norm, f) for f in os.listdir(d1_norm) if f.lower().endswith('png')]

# From Dataset 2 (Kids)
d2_norm_train = 'chest-xray-pneumonia/chest_xray/train/NORMAL'
d2_norm_test = 'chest-xray-pneumonia/chest_xray/test/NORMAL'
if os.path.exists(d2_norm_train):
    normal_files += [os.path.join(d2_norm_train, f) for f in os.listdir(d2_norm_train) if f.lower().endswith('jpeg')]
if os.path.exists(d2_norm_test):
    normal_files += [os.path.join(d2_norm_test, f) for f in os.listdir(d2_norm_test) if f.lower().endswith('jpeg')]

# 3. Collect All Pneumonia Images (Lung Opacity + Viral + COVID + Kids Pneu)
pneumonia_files = []

# D1: Lung Opacity (Bacterial)
d1_opaque = 'covid19-radiography-database/COVID-19_Radiography_Dataset/Lung_Opacity'
if not os.path.exists(d1_opaque): d1_opaque = 'covid19-radiography-database/COVID-19_Radiography_Dataset/Lung Opacity'
if os.path.exists(d1_opaque):
    pneumonia_files += [os.path.join(d1_opaque, f) for f in os.listdir(d1_opaque) if f.lower().endswith('png')]

# D1: Viral Pneumonia
d1_viral = 'covid19-radiography-database/COVID-19_Radiography_Dataset/Viral Pneumonia'
if os.path.exists(d1_viral):
    pneumonia_files += [os.path.join(d1_viral, f) for f in os.listdir(d1_viral) if f.lower().endswith('png')]

# D1: COVID (Treating as Pneumonia for training)
d1_covid = 'covid19-radiography-database/COVID-19_Radiography_Dataset/COVID'
if os.path.exists(d1_covid):
    pneumonia_files += [os.path.join(d1_covid, f) for f in os.listdir(d1_covid) if f.lower().endswith('png')]

# D2: Kids Pneumonia
d2_pneu_train = 'chest-xray-pneumonia/chest_xray/train/PNEUMONIA'
d2_pneu_test = 'chest-xray-pneumonia/chest_xray/test/PNEUMONIA'
if os.path.exists(d2_pneu_train):
    pneumonia_files += [os.path.join(d2_pneu_train, f) for f in os.listdir(d2_pneu_train) if f.lower().endswith('jpeg')]
if os.path.exists(d2_pneu_test):
    pneumonia_files += [os.path.join(d2_pneu_test, f) for f in os.listdir(d2_pneu_test) if f.lower().endswith('jpeg')]

print(f"üìä Found: {len(normal_files)} Normal vs {len(pneumonia_files)} Pneumonia")

# 4. Balance the Classes (Trim the larger one)
if len(pneumonia_files) > len(normal_files):
    print(f"‚öñÔ∏è Balancing: Trimming Pneumonia from {len(pneumonia_files)} to {len(normal_files)}")
    random.shuffle(pneumonia_files)
    pneumonia_files = pneumonia_files[:len(normal_files)]
elif len(normal_files) > len(pneumonia_files):
    print(f"‚öñÔ∏è Balancing: Trimming Normal from {len(normal_files)} to {len(pneumonia_files)}")
    random.shuffle(normal_files)
    normal_files = normal_files[:len(pneumonia_files)]

# 5. Distribute Files (70% Train, 20% Test, 10% Val)
def distribute(file_list, category):
    random.shuffle(file_list)
    total = len(file_list)

    # Calculate cut-off points
    train_end = int(0.70 * total)
    test_end = int(0.90 * total) # 70% + 20% = 90%
    # Remaining 10% (from 0.90 to 1.0) goes to Val

    for i, f in enumerate(file_list):
        if i < train_end: split = 'train'
        elif i < test_end: split = 'test'
        else: split = 'val'

        shutil.copy(f, os.path.join(BASE_DIR, split, category, os.path.basename(f)))

print("üöÄ Moving files... (This takes ~2-3 minutes)")
distribute(normal_files, 'NORMAL')
distribute(pneumonia_files, 'PNEUMONIA')

print("‚úÖ Data Merged & Split Successfully!")
print("   Check the 'Final_Dataset' folder on the left.")

üìÇ Created folder structure. Gathering files...
üìä Found: 1575 Normal vs 4265 Pneumonia
‚öñÔ∏è Balancing: Trimming Pneumonia from 4265 to 1575
üöÄ Moving files... (This takes ~2-3 minutes)
‚úÖ Data Merged & Split Successfully!
   Check the 'Final_Dataset' folder on the left.


In [4]:
# ==========================================
# STEP 3: BUILD & TRAIN (BINARY MODEL)
# ==========================================
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

# Configuration
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
DATA_DIR = 'Final_Dataset'

print("‚öôÔ∏è Setting up Data Generators...")

# 1. TRAIN GENERATOR (With Augmentation)
# We zoom and rotate images slightly so the model learns to recognize pneumonia from all angles
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    zoom_range=0.2,
    horizontal_flip=True
)

train_generator = train_datagen.flow_from_directory(
    f'{DATA_DIR}/train',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary', # Binary because we only have Normal vs Pneumonia
    shuffle=True
)

# 2. VALIDATION GENERATOR (No Augmentation, just rescaling)
val_datagen = ImageDataGenerator(rescale=1./255)

val_generator = val_datagen.flow_from_directory(
    f'{DATA_DIR}/val',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    shuffle=False
)

# 3. BUILD THE CNN MODEL
print("üèóÔ∏è Building the CNN Model...")
model = Sequential([
    # Block 1
    Conv2D(32, (3,3), activation='relu', input_shape=(224,224,3)),
    BatchNormalization(),
    MaxPooling2D(2,2),

    # Block 2
    Conv2D(64, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2,2),

    # Block 3
    Conv2D(128, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2,2),

    # Deep Features
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5), # Drops 50% of neurons to prevent memorization

    # OUTPUT LAYER: 1 Neuron (0 = Normal, 1 = Pneumonia)
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 4. START TRAINING
print("üöÄ Training Started... This will take about 10-15 minutes.")
print("   (You will see the accuracy increase with every Epoch)")

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10
)

# 5. SAVE THE MODEL
model.save('pneumonia_binary_model.h5')
print("\n‚úÖ SUCCESS! Model saved as 'pneumonia_binary_model.h5'")

‚öôÔ∏è Setting up Data Generators...
Found 2204 images belonging to 2 classes.
Found 316 images belonging to 2 classes.
üèóÔ∏è Building the CNN Model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


üöÄ Training Started... This will take about 10-15 minutes.
   (You will see the accuracy increase with every Epoch)


  self._warn_if_super_not_called()


Epoch 1/10
[1m69/69[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m69s[0m 858ms/step - accuracy: 0.7730 - loss: 2.7987 - val_accuracy: 0.5000 - val_loss: 46.5271
Epoch 2/10
[1m69/69[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m51s[0m 744ms/step - accuracy: 0.8275 - loss: 0.5173 - val_accuracy: 0.4684 - val_loss: 8.3447
Epoch 3/10
[1m69/69[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m51s[0m 736ms/step - accuracy: 0.8644 - loss: 0.3770 - val_accuracy: 0.5000 - val_loss: 19.9300
Epoch 4/10
[1m69/69[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m51s[0m 740ms/step - accuracy: 0.8625 - loss: 0.3394 - val_accuracy: 0.5000 - val_loss: 37.0925
Epoch 5/10
[1m69/69[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m51s[0m 745ms/step - accuracy: 0.8895 - loss: 0.2970 - val_accuracy: 0.5000 -




‚úÖ SUCCESS! Model saved as 'pneumonia_binary_model.h5'


In [6]:
# ==========================================
# NEW STEP 2: MERGE EVERYTHING (NO TRIMMING)
# ==========================================
import os
import shutil
import random
from tqdm import tqdm # Progress bar

BASE_DIR = 'Final_Dataset_Full'
if os.path.exists(BASE_DIR): shutil.rmtree(BASE_DIR)

# Create structure
for split in ['train', 'test', 'val']:
    for cls in ['NORMAL', 'PNEUMONIA']:
        os.makedirs(os.path.join(BASE_DIR, split, cls), exist_ok=True)

print("üïµÔ∏è SEARCHING FOR IMAGES...")

# --- 1. DEFINE SOURCES ---
# We map specific source folders to our target classes
sources = [
    # (Path to source folder, Target Class)

    # DATASET A (Adults)
    ('covid19-radiography-database/COVID-19_Radiography_Dataset/Normal', 'NORMAL'),
    ('covid19-radiography-database/COVID-19_Radiography_Dataset/Lung_Opacity', 'PNEUMONIA'),
    ('covid19-radiography-database/COVID-19_Radiography_Dataset/Viral Pneumonia', 'PNEUMONIA'),
    ('covid19-radiography-database/COVID-19_Radiography_Dataset/COVID', 'PNEUMONIA'),

    # DATASET B (Kids) - Train folder
    ('chest-xray-pneumonia/chest_xray/train/NORMAL', 'NORMAL'),
    ('chest-xray-pneumonia/chest_xray/train/PNEUMONIA', 'PNEUMONIA'),

    # DATASET B (Kids) - Test/Val folders (We merge them all to reshuffle)
    ('chest-xray-pneumonia/chest_xray/test/NORMAL', 'NORMAL'),
    ('chest-xray-pneumonia/chest_xray/test/PNEUMONIA', 'PNEUMONIA'),
    ('chest-xray-pneumonia/chest_xray/val/NORMAL', 'NORMAL'),
    ('chest-xray-pneumonia/chest_xray/val/PNEUMONIA', 'PNEUMONIA'),
]

# --- 2. COLLECT FILES ---
normal_files = []
pneumonia_files = []

for path, target_cls in sources:
    if os.path.exists(path):
        # Find all images
        images = [os.path.join(path, f) for f in os.listdir(path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        count = len(images)
        print(f"   Found {count} images in: {path} -> {target_cls}")

        if target_cls == 'NORMAL':
            normal_files.extend(images)
        else:
            pneumonia_files.extend(images)
    else:
        # Try fixing "Lung_Opacity" vs "Lung Opacity" naming issue
        if "Lung_Opacity" in path:
            alt_path = path.replace("Lung_Opacity", "Lung Opacity")
            if os.path.exists(alt_path):
                images = [os.path.join(alt_path, f) for f in os.listdir(alt_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
                print(f"   Found {len(images)} images in: {alt_path} -> {target_cls}")
                pneumonia_files.extend(images)
            else:
                print(f"‚ö†Ô∏è WARNING: Folder not found: {path}")
        else:
            print(f"‚ö†Ô∏è WARNING: Folder not found: {path}")

print("-" * 40)
print(f"üìä TOTAL NORMAL FOUND: {len(normal_files)}")
print(f"üìä TOTAL PNEUMONIA FOUND: {len(pneumonia_files)}")
print(f"üî• GRAND TOTAL: {len(normal_files) + len(pneumonia_files)}")
print("-" * 40)

# --- 3. DISTRIBUTE WITHOUT TRIMMING ---
def distribute(file_list, category):
    random.shuffle(file_list)
    total = len(file_list)
    train_end = int(0.70 * total)
    test_end = int(0.90 * total)

    print(f"üöÄ Moving {total} {category} images...")

    for i, f in enumerate(tqdm(file_list)):
        if i < train_end: split = 'train'
        elif i < test_end: split = 'test'
        else: split = 'val'

        shutil.copy(f, os.path.join(BASE_DIR, split, category, os.path.basename(f)))

distribute(normal_files, 'NORMAL')
distribute(pneumonia_files, 'PNEUMONIA')

print("\n‚úÖ SUCCESS! Full dataset prepared in 'Final_Dataset_Full'")

üïµÔ∏è SEARCHING FOR IMAGES...
   Found 0 images in: covid19-radiography-database/COVID-19_Radiography_Dataset/Normal -> NORMAL
   Found 0 images in: covid19-radiography-database/COVID-19_Radiography_Dataset/Lung_Opacity -> PNEUMONIA
   Found 0 images in: covid19-radiography-database/COVID-19_Radiography_Dataset/Viral Pneumonia -> PNEUMONIA
   Found 0 images in: covid19-radiography-database/COVID-19_Radiography_Dataset/COVID -> PNEUMONIA
   Found 1341 images in: chest-xray-pneumonia/chest_xray/train/NORMAL -> NORMAL
   Found 3875 images in: chest-xray-pneumonia/chest_xray/train/PNEUMONIA -> PNEUMONIA
   Found 234 images in: chest-xray-pneumonia/chest_xray/test/NORMAL -> NORMAL
   Found 390 images in: chest-xray-pneumonia/chest_xray/test/PNEUMONIA -> PNEUMONIA
   Found 8 images in: chest-xray-pneumonia/chest_xray/val/NORMAL -> NORMAL
   Found 8 images in: chest-xray-pneumonia/chest_xray/val/PNEUMONIA -> PNEUMONIA
----------------------------------------
üìä TOTAL NORMAL FOUND: 1583
ü

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1583/1583 [00:12<00:00, 130.19it/s]


üöÄ Moving 4273 PNEUMONIA images...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4273/4273 [00:01<00:00, 2751.09it/s]


‚úÖ SUCCESS! Full dataset prepared in 'Final_Dataset_Full'





In [7]:
# ==========================================
# FINAL FIX: AUTO-DISCOVERY MERGE (SMART SCAN)
# ==========================================
import os
import shutil
import random
from tqdm import tqdm

BASE_DIR = 'Final_Dataset_Full'
if os.path.exists(BASE_DIR): shutil.rmtree(BASE_DIR)

# Create destination folders
for split in ['train', 'test', 'val']:
    for cls in ['NORMAL', 'PNEUMONIA']:
        os.makedirs(os.path.join(BASE_DIR, split, cls), exist_ok=True)

print("üïµÔ∏è STARTING DEEP SCAN FOR ALL IMAGES...")

# Lists to hold file paths
normal_files = []
pneumonia_files = []

# We walk through every single folder in your Colab workspace
for root, dirs, files in os.walk('.'):
    # Skip the destination folder itself to avoid loops
    if BASE_DIR in root or '.config' in root:
        continue

    folder_name = os.path.basename(root).lower()

    # 1. IDENTIFY PNEUMONIA FOLDERS
    # We look for keywords: 'covid', 'lung_opacity', 'viral pneumonia', or just 'pneumonia'
    if folder_name in ['covid', 'lung_opacity', 'lung opacity', 'viral pneumonia', 'pneumonia']:
        # Found a sickness folder! Add images to Pneumonia list
        images = [os.path.join(root, f) for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        if len(images) > 0:
            print(f"   ‚úÖ Found PNEUMONIA data: {len(images)} images in '{root}'")
            pneumonia_files.extend(images)

    # 2. IDENTIFY NORMAL FOLDERS
    elif folder_name == 'normal':
        # Found a healthy folder! Add images to Normal list
        images = [os.path.join(root, f) for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        if len(images) > 0:
            print(f"   ‚úÖ Found NORMAL data:    {len(images)} images in '{root}'")
            normal_files.extend(images)

print("-" * 40)
print(f"üìä TOTAL NORMAL FOUND:    {len(normal_files)}")
print(f"üìä TOTAL PNEUMONIA FOUND: {len(pneumonia_files)}")
print(f"üî• GRAND TOTAL:           {len(normal_files) + len(pneumonia_files)}")
print("-" * 40)

if len(normal_files) < 5000:
    print("‚ö†Ô∏è WARNING: Still low on images. Did you run the 'Download' step?")
else:
    print("üöÄ SUCCESS! Starting merge...")

# --- DISTRIBUTE FILES ---
def distribute(file_list, category):
    random.shuffle(file_list)
    total = len(file_list)
    train_end = int(0.70 * total)
    test_end = int(0.90 * total)

    for i, f in enumerate(tqdm(file_list)):
        if i < train_end: split = 'train'
        elif i < test_end: split = 'test'
        else: split = 'val'

        # Handle duplicate filenames by renaming if necessary
        dest_path = os.path.join(BASE_DIR, split, category, os.path.basename(f))
        if os.path.exists(dest_path):
            filename, ext = os.path.splitext(os.path.basename(f))
            dest_path = os.path.join(BASE_DIR, split, category, f"{filename}_dup{i}{ext}")

        shutil.copy(f, dest_path)

distribute(normal_files, 'NORMAL')
distribute(pneumonia_files, 'PNEUMONIA')

print("\n‚úÖ FULL DATASET READY IN 'Final_Dataset_Full'")

üïµÔ∏è STARTING DEEP SCAN FOR ALL IMAGES...
   ‚úÖ Found PNEUMONIA data: 1102 images in './Final_Dataset/train/PNEUMONIA'
   ‚úÖ Found NORMAL data:    1102 images in './Final_Dataset/train/NORMAL'
   ‚úÖ Found PNEUMONIA data: 158 images in './Final_Dataset/val/PNEUMONIA'
   ‚úÖ Found NORMAL data:    158 images in './Final_Dataset/val/NORMAL'
   ‚úÖ Found PNEUMONIA data: 315 images in './Final_Dataset/test/PNEUMONIA'
   ‚úÖ Found NORMAL data:    315 images in './Final_Dataset/test/NORMAL'
   ‚úÖ Found PNEUMONIA data: 3875 images in './chest-xray-pneumonia/chest_xray/train/PNEUMONIA'
   ‚úÖ Found NORMAL data:    1341 images in './chest-xray-pneumonia/chest_xray/train/NORMAL'
   ‚úÖ Found PNEUMONIA data: 3875 images in './chest-xray-pneumonia/chest_xray/__MACOSX/chest_xray/train/PNEUMONIA'
   ‚úÖ Found NORMAL data:    1341 images in './chest-xray-pneumonia/chest_xray/__MACOSX/chest_xray/train/NORMAL'
   ‚úÖ Found PNEUMONIA data: 8 images in './chest-xray-pneumonia/chest_xray/__MACOSX/che

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6324/6324 [00:21<00:00, 296.10it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14394/14394 [00:12<00:00, 1175.44it/s]


‚úÖ FULL DATASET READY IN 'Final_Dataset_Full'





In [11]:
# ==========================================
# STEP 3: TRAIN ON FULL CLEAN DATASET (15k Images)
# ==========================================
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

# Configuration
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
DATA_DIR = 'Final_Dataset_Full' # Pointing to the clean, merged dataset

print(f"üöÄ Preparing to train on cleaned dataset: {DATA_DIR}")

# 1. DATA GENERATORS
# Augmentation helps the model generalize better on this large dataset
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    zoom_range=0.2,
    horizontal_flip=True
)

val_test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    f'{DATA_DIR}/train',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary'
)

val_generator = val_test_datagen.flow_from_directory(
    f'{DATA_DIR}/val',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary'
)

# 2. BUILD ROBUST CNN MODEL
model = Sequential([
    # Block 1
    Conv2D(32, (3,3), activation='relu', input_shape=(224,224,3)),
    BatchNormalization(),
    MaxPooling2D(2,2),

    # Block 2
    Conv2D(64, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2,2),

    # Block 3
    Conv2D(128, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2,2),

    # Block 4 (Deep Features)
    Conv2D(256, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2,2),

    # Classification Head
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5), # Prevents overfitting
    Dense(1, activation='sigmoid') # Binary Output (0=Normal, 1=Pneumonia)
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. START TRAINING
# Since we have ~15,000 images, 5 Epochs is plenty to get high accuracy without waiting hours.
print("üî• Starting Training... (This will take approx 20-30 mins)")

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=5
)

# 4. SAVE MODEL
model.save('pneumonia_mega_model.h5')
print("\n‚úÖ SUCCESS! Model saved as 'pneumonia_mega_model.h5'")

üöÄ Preparing to train on cleaned dataset: Final_Dataset_Full
Found 10373 images belonging to 2 classes.
Found 1482 images belonging to 2 classes.
üî• Starting Training... (This will take approx 20-30 mins)
Epoch 1/5
[1m325/325[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m239s[0m 711ms/step - accuracy: 0.8284 - loss: 1.6279 - val_accuracy: 0.6788 - val_loss: 18.7159
Epoch 2/5
[1m325/325[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m217s[0m 668ms/step - accuracy: 0.9079 - loss: 0.2412 - val_accuracy: 0.6862 - val_loss: 3.6997
Epoch 3/5
[1m325/325[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m216s[0m 662ms/step - accuracy: 0.9124 - loss: 0.2273 - val_accuracy: 0.9231 - val_loss: 0.2042
Epoch 4/5
[1m325/325[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m214s[0m 660ms/step - accuracy: 0.9316 - loss: 0.2070 - val_accura




‚úÖ SUCCESS! Model saved as 'pneumonia_mega_model.h5'


In [13]:
import tensorflow
import numpy
import PIL # This is Pillow
import opendatasets
import tqdm
import matplotlib

print("Copy these lines below:")
print("-" * 30)
print(f"tensorflow=={tensorflow.__version__}")
print(f"numpy=={numpy.__version__}")
print(f"Pillow=={PIL.__version__}")
print(f"opendatasets=={opendatasets.__version__}")
print(f"tqdm=={tqdm.__version__}")
print(f"matplotlib=={matplotlib.__version__}")
print("flask") # Flask is standard, version doesn't strictly matter
print("-" * 30)

Copy these lines below:
------------------------------
tensorflow==2.19.0
numpy==2.0.2
Pillow==11.3.0
opendatasets==0.1.22
tqdm==4.67.1
matplotlib==3.10.0
flask
------------------------------
