In [1]:
# ===============================
# 📌 Step 1: Mount Google Drive
# ===============================
from google.colab import drive
drive.mount('/content/drive')




Mounted at /content/drive


In [16]:
# Change this path to your ZIP dataset in Drive
zip_path = "/content/drive/Shareddrives/Eye Dataset (Mahin)/Datasets/Drive Dataset/DriveDataset.zip"

# ===============================
# 📌 Step 2: Unzip Dataset
# ===============================
!unzip -qo "$zip_path" -d /content/mydb

#dataset_path = "/content/dataset"  # extracted folder
train_path = "/content/mydb/DRIVE/test/images"
test_path = "/content/mydb/DRIVE/training/images"

In [17]:
# ✅ Install required libraries
!pip install python-docx opencv-python-headless

# ✅ Imports
import os
import cv2
import matplotlib.pyplot as plt
from docx import Document
from docx.shared import Inches
import seaborn as sns
import pandas as pd
import numpy as np
from PIL import Image
from IPython.display import display

# ✅ Initialize Word document
doc = Document()
doc.add_heading("DRIVE Eye Fundus Dataset Analysis", 0)

# ✅ Analyze images in a folder
def analyze_folder(folder_path, section_name):
    doc.add_heading(section_name, level=1)
    image_files = sorted([f for f in os.listdir(folder_path) if f.endswith(('.tif', '.jpg', '.png'))])
    doc.add_paragraph(f"Total images in {section_name}: {len(image_files)}")

    sizes = []
    channels = []

    for i, image_file in enumerate(image_files[:5]):
        img_path = os.path.join(folder_path, image_file)
        img = cv2.imread(img_path)
        if img is None:
            continue
        h, w, c = img.shape
        sizes.append((h, w))
        channels.append(c)

        # Save and insert preview image
        preview_path = f"/content/sample_{section_name.replace(' ', '_')}_{i}.jpg"
        cv2.imwrite(preview_path, img)
        doc.add_paragraph(f"Sample Image {i+1}: {image_file} - Size: {h}x{w}, Channels: {c}")
        doc.add_picture(preview_path, width=Inches(3.5))

    return image_files, sizes, channels

# ✅ Plot image size distribution
def plot_size_distribution(sizes, label):
    size_labels = [f"{h}x{w}" for h, w in sizes]
    size_counts = pd.Series(size_labels).value_counts()

    plt.figure(figsize=(8,4))
    sns.barplot(x=size_counts.index, y=size_counts.values, palette="viridis")
    plt.title(f"{label} - Image Size Distribution")
    plt.xticks(rotation=45)
    plt.tight_layout()

    chart_path = f"/content/{label.lower().replace(' ', '_')}_sizes.png"
    plt.savefig(chart_path)
    doc.add_picture(chart_path, width=Inches(5))
    plt.close()

# ✅ Plot pixel intensity histogram
def plot_intensity_histogram(folder_path, label):
    pixel_values = []
    image_files = [f for f in os.listdir(folder_path) if f.endswith(('.jpg', '.png', '.tif'))]
    for f in image_files[:10]:  # Limit to 10 images
        img = cv2.imread(os.path.join(folder_path, f))
        if img is None: continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        pixel_values.extend(gray.flatten())

    plt.figure(figsize=(6, 4))
    plt.hist(pixel_values, bins=50, color='skyblue')
    plt.title(f"{label} - Pixel Intensity Histogram")
    plt.xlabel("Pixel Value")
    plt.ylabel("Frequency")
    plt.tight_layout()

    hist_path = f"/content/{label.lower().replace(' ', '_')}_histogram.png"
    plt.savefig(hist_path)
    doc.add_picture(hist_path, width=Inches(5))
    plt.close()

# ✅ Analyze Training Set
train_files, train_sizes, train_channels = analyze_folder(train_path, "Training Set")

# ✅ Analyze Test Set
test_files, test_sizes, test_channels = analyze_folder(test_path, "Test Set")

# ✅ Add Summary
doc.add_heading("Summary", level=1)
doc.add_paragraph(f"Total Training Images: {len(train_files)}")
doc.add_paragraph(f"Total Test Images: {len(test_files)}")
doc.add_paragraph(f"Unique Training Image Sizes: {list(set(train_sizes))}")
doc.add_paragraph(f"Unique Test Image Sizes: {list(set(test_sizes))}")
doc.add_paragraph(f"Training Image Channels: {list(set(train_channels))}")
doc.add_paragraph(f"Test Image Channels: {list(set(test_channels))}")

# ✅ Add Charts
plot_size_distribution(train_sizes, "Training Set")
plot_size_distribution(test_sizes, "Test Set")
plot_intensity_histogram(train_path, "Training Set")
plot_intensity_histogram(test_path, "Test Set")

# ✅ Save DOCX Report
report_path = "/content/drive_dataset_analysis.docx"
doc.save(report_path)
print(f"✅ Word report saved to: {report_path}")





Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=size_counts.index, y=size_counts.values, palette="viridis")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=size_counts.index, y=size_counts.values, palette="viridis")


✅ Word report saved to: /content/drive_dataset_analysis.docx
