# Practical 1: Introduction to Python and Libraries
## Diabetic Retinopathy Image Dataset Analysis

### 1. Import Required Libraries

In [None]:
# Install required libraries (uncomment if needed)
# !pip install numpy pandas scikit-learn matplotlib seaborn pillow

import numpy as np
import pandas as pd
import os
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns

# Confirm the imports worked and report the NumPy / pandas versions in use
print(
    "Libraries imported successfully!",
    f"NumPy version: {np.__version__}",
    f"Pandas version: {pd.__version__}",
    sep="\n",
)

### 2. Load Dataset - Explore Image Files

In [None]:
# Root folder of the dataset; every immediate sub-folder is treated as one category
dataset_path = 'colored_images/colored_images/'

# Collect the category folders. os.listdir() returns entries in arbitrary order,
# so sort them to keep the category order (and everything derived from it, such
# as the order of the sample plots) reproducible between runs and machines.
categories = sorted(
    d for d in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, d))
)
print("Categories found:", categories)
print(f"Total categories: {len(categories)}")

### 3. Create Dataset DataFrame

In [None]:
# Build one record per PNG image: file name, full path, category folder,
# plus the patient id and eye side parsed from the file name.
data = []

for category in categories:
    category_path = os.path.join(dataset_path, category)
    # Sort for a reproducible row order (os.listdir order is arbitrary) and
    # match the extension case-insensitively so '.PNG' files are not skipped.
    images = sorted(
        f for f in os.listdir(category_path) if f.lower().endswith('.png')
    )

    for img_name in images:
        img_path = os.path.join(category_path, img_name)

        # Parse the name without its extension, so a name that contains no
        # underscore does not leak '.png' into the patient id.
        stem = os.path.splitext(img_name)[0]
        patient_id = stem.split('_')[0]

        # Only report an eye side that is actually present in the name;
        # anything else is 'unknown' instead of being counted as a right eye.
        lowered = stem.lower()
        if 'left' in lowered:
            eye_side = 'left'
        elif 'right' in lowered:
            eye_side = 'right'
        else:
            eye_side = 'unknown'

        data.append({
            'filename': img_name,
            'filepath': img_path,
            'category': category,
            'patient_id': patient_id,
            'eye_side': eye_side
        })

# Naming the columns explicitly keeps them present even when no image was
# found, so later column look-ups do not fail with a KeyError.
df = pd.DataFrame(
    data,
    columns=['filename', 'filepath', 'category', 'patient_id', 'eye_side'],
)
print(f"Dataset loaded with {len(df)} images")
print(f"\nDataFrame shape: {df.shape}")

### 4. Basic Data Exploration

In [None]:
# Preview the top of the DataFrame (head() returns the first 5 rows by default).
# The call is left as the cell's final bare expression; in a notebook that is
# what gets rendered as the cell output.
print("First 5 rows of the dataset:")
df.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info()
# (info() prints its report itself, so no print() wrapper is needed).
print("Dataset Information:")
df.info()

In [None]:
# Summary statistics for every column; include='all' makes describe() cover
# the non-numeric columns as well instead of only numeric ones.
print("Dataset Description:")
df.describe(include='all')

### 5. Data Manipulation Tasks

In [None]:
# Tally how many images each category contains
category_counts = df['category'].value_counts()
print("Images per category:", category_counts, sep="\n")

In [None]:
# Tally how many images were recorded for each eye side
eye_counts = df['eye_side'].value_counts()
print("\nImages per eye side:", eye_counts, sep="\n")

In [None]:
# Number of images for every (category, eye side) combination,
# flattened back into a regular table with a 'count' column
grouped = (
    df.groupby(['category', 'eye_side'])
    .size()
    .reset_index(name='count')
)
print("\nImages grouped by category and eye side:", grouped, sep="\n")

In [None]:
# Keep only the rows whose category is 'Mild' (boolean-mask selection)
mild_images = df.loc[df['category'].eq('Mild')]
print(f"\nTotal 'Mild' category images: {len(mild_images)}")
mild_images.head()

### 6. Load and Display Sample Images

In [None]:
# Function to load and get image properties
def get_image_properties(img_path):
    """Return basic properties of the image file at *img_path*.

    Args:
        img_path: Path of the image file; passed straight to ``Image.open``.

    Returns:
        A dict with the keys ``'width'``, ``'height'``, ``'mode'`` and
        ``'format'``, taken from the opened image object.
    """
    # Use the image as a context manager so the underlying file is closed
    # as soon as the properties have been read, instead of leaking the handle.
    with Image.open(img_path) as img:
        return {
            'width': img.width,
            'height': img.height,
            'mode': img.mode,
            'format': img.format
        }

# Inspect the image referenced by the first DataFrame row
sample_img_path = df['filepath'].iloc[0]
props = get_image_properties(sample_img_path)
print("Sample image properties:")
for key in props:
    value = props[key]
    print(f"{key}: {value}")

In [None]:
# Display the first image of every category side by side.
# squeeze=False makes plt.subplots always return a 2-D array of Axes; it is
# flattened so that axes[idx] also works when there is only one category
# (with the default squeeze, a single subplot comes back as a bare Axes object).
fig, axes = plt.subplots(1, len(categories), figsize=(20, 4), squeeze=False)
axes = axes.ravel()

for idx, category in enumerate(categories):
    ax = axes[idx]
    category_rows = df[df['category'] == category]

    # A category without any listed image keeps an empty, titled panel
    # instead of failing on .iloc[0].
    if not category_rows.empty:
        img_path = category_rows.iloc[0]['filepath']
        # Close the file again once the image has been handed to imshow
        with Image.open(img_path) as img:
            ax.imshow(img)

    ax.set_title(f'{category}', fontsize=12, fontweight='bold')
    ax.axis('off')

plt.tight_layout()
plt.suptitle('Sample Images from Each Category', y=1.02, fontsize=14, fontweight='bold')
plt.show()

### 7. Add Image Dimensions to DataFrame

In [None]:
# Add image dimensions (sampling first 100 images for speed).
# Work on a copy so the new columns are not written into df itself.
sample_df = df.head(100).copy()

widths = []
heights = []

for img_path in sample_df['filepath']:
    # Close every file as soon as its size has been read, so the loop does
    # not leave up to 100 image files open.
    with Image.open(img_path) as img:
        widths.append(img.width)
        heights.append(img.height)

sample_df['width'] = widths
sample_df['height'] = heights
sample_df['aspect_ratio'] = sample_df['width'] / sample_df['height']

print("Sample DataFrame with image dimensions:")
sample_df.head()

In [None]:
# Descriptive statistics for the sampled width / height / aspect-ratio columns
print(
    "Image dimension statistics:",
    sample_df[['width', 'height', 'aspect_ratio']].describe(),
    sep="\n",
)

### 8. Save Processed Data

In [None]:
# Write the per-image table to disk, leaving out the row index
df.to_csv(path_or_buf='dataset_info.csv', index=False)
print("Dataset information saved to 'dataset_info.csv'")

# Write the per-category totals; the value column is labelled 'count'
category_counts.to_csv(path_or_buf='category_counts.csv', header=['count'])
print("Category counts saved to 'category_counts.csv'")

### 9. Summary Statistics

In [None]:
# Print a framed summary: overall totals followed by each category's share
print("=" * 50, "DATASET SUMMARY REPORT", "=" * 50, sep="\n")
print(
    f"Total Images: {len(df)}",
    f"Total Categories: {len(categories)}",
    f"Total Patients: {df['patient_id'].nunique()}",
    f"\nCategory Distribution:",
    sep="\n",
)
for cat, count in category_counts.items():
    # Share of this category in the whole dataset, in percent
    percentage = count / len(df) * 100
    print(f"  {cat}: {count} ({percentage:.2f}%)")
print("=" * 50)