# CT7160NI Computer Vision Coursework
## 01 - Data Exploration & Preprocessing

**Project:** Deep Learning-Based Plant Species Classification  
**Dataset:** Oxford 102 Flower Dataset  
**Author:** [Your Name]  
**Date:** [Date]

---

This notebook performs exploratory data analysis (EDA) on the Oxford 102 Flower Dataset.

### Contents
1. Import Libraries
2. Dataset Overview
3. Load Dataset Information
4. Class Distribution Analysis
5. Sample Images Visualization
6. Image Size Analysis
7. Summary Statistics


In [None]:
# Import Libraries
import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# PyTorch
import torch
import torchvision

# SciPy for .mat file loading
import scipy.io as sio

# Set style
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*".
# Older releases only know the unprefixed name and raise OSError for the new
# one, so fall back instead of aborting the whole notebook on the first cell.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
sns.set_palette('husl')

# Add src to path
# Guarded so that re-running this cell does not pile up duplicate entries.
# NOTE: the path is relative, so it resolves against the kernel's working
# directory at import time.
if '../src' not in sys.path:
    sys.path.append('../src')

# Report the environment so saved outputs record what the notebook ran on.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


In [None]:
# Define paths
DATA_DIR = Path('../data/raw/oxford_flowers_102')
JPG_DIR = DATA_DIR / 'jpg'
LABELS_PATH = DATA_DIR / 'imagelabels.mat'
SPLITS_PATH = DATA_DIR / 'setid.mat'

# Every path the notebook depends on, keyed by the label used in the report
# printed below.
required_paths = {
    'Data directory': DATA_DIR,
    'JPG directory': JPG_DIR,
    'Labels file': LABELS_PATH,
    'Splits file': SPLITS_PATH,
}

# Check if dataset exists
print("Checking dataset files...")
for item_label, item_path in required_paths.items():
    print(f"{item_label} exists: {item_path.exists()}")

# Warn on ANY missing piece, not only a missing root directory: with a partial
# download the root exists but the label/split files may not, and the loading
# cell would then fail inside loadmat without any hint about what to fetch.
missing_paths = [p for p in required_paths.values() if not p.exists()]

if missing_paths:
    if not DATA_DIR.exists():
        print("\n⚠️ Dataset not found! Please download from:")
    else:
        print("\n⚠️ Dataset incomplete! Missing:")
        for item_path in missing_paths:
            print(f"  - {item_path}")
        print("\nPlease download from:")
    print("https://www.robots.ox.ac.uk/~vgg/data/flowers/102/")
    print("\nDownload these files:")
    print("1. 102flowers.tgz (images)")
    print("2. imagelabels.mat (labels)")
    print("3. setid.mat (train/val/test splits)")


In [None]:
# Load labels and splits (run only if dataset exists)
# loadmat raises if either file is missing, so the paths checked in the
# previous cell must all be present before running this one.

# Load labels (1-indexed, convert to 0-indexed)
# Widen to int64 BEFORE shifting: loadmat keeps the integer type stored in the
# .mat file (presumably a narrow unsigned type here -- TODO confirm), and
# unsigned arithmetic wraps around silently in later computations. int64 is
# also the index/label dtype PyTorch expects.
labels_mat = sio.loadmat(LABELS_PATH)
all_labels = labels_mat['labels'].flatten().astype(np.int64) - 1

# Load split indices
# The ids get the same 1-to-0-index shift so they can index all_labels directly.
splits_mat = sio.loadmat(SPLITS_PATH)
train_ids = splits_mat['trnid'].flatten().astype(np.int64) - 1
val_ids = splits_mat['valid'].flatten().astype(np.int64) - 1
test_ids = splits_mat['tstid'].flatten().astype(np.int64) - 1

# Summarize what was loaded.
print(f"Total images: {len(all_labels)}")
print(f"Number of classes: {len(np.unique(all_labels))}")
print("\nSplit sizes:")
print(f"  Training: {len(train_ids)} images")
print(f"  Validation: {len(val_ids)} images")
print(f"  Test: {len(test_ids)} images")
