# ImageNet Subset Downloader

This notebook helps you download and set up ImageNet subset datasets for experimenting with ResNet50 training.

## Available Datasets:
- **Tiny ImageNet**: 200 classes, 64x64 images, ~240MB
- **Imagenette**: 10 classes, 320x320 images, ~300MB
- **ImageWoof**: 10 dog breeds, 320x320 images, ~300MB

Perfect for quick experiments and testing your training pipeline!

In [None]:
# Import required libraries
import os
import urllib.request
import zipfile
import tarfile
from pathlib import Path
import shutil
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

In [None]:
def download_with_progress(url, filename, progress_widget=None):
    """Download ``url`` to ``filename`` with progress indication.

    The data is written to a sibling ``<filename>.part`` file first and only
    renamed to ``filename`` once the transfer has finished. Callers in this
    notebook treat "the file exists" as "the download is complete", so a
    failed or interrupted transfer must never leave a truncated file under
    the final name.

    Args:
        url: Address to fetch (anything ``urllib.request.urlretrieve`` accepts).
        filename: Destination path, as ``str`` or ``pathlib.Path``.
        progress_widget: Optional object whose ``value`` (0-100) and
            ``description`` attributes are updated while data arrives.
            Without it, a percentage is printed every 100 blocks.

    Returns:
        True when the file was downloaded and moved into place, False
        otherwise (the error is printed rather than raised).
    """
    print(f"📥 Downloading {filename}...")

    target = Path(filename)
    partial = target.with_name(target.name + ".part")

    def progress_hook(count, block_size, total_size):
        if total_size <= 0:
            return  # size unknown: no percentage can be computed
        # The last block usually overshoots the real size, hence the clamp.
        percent = min(100, int(count * block_size * 100 / total_size))
        if progress_widget is not None:
            progress_widget.value = percent
            progress_widget.description = f"{percent}%"
        elif count % 100 == 0:  # Print every 100 blocks if no widget
            print(f"   Progress: {percent}%")

    try:
        urllib.request.urlretrieve(url, str(partial), progress_hook)
        # Atomic on the same filesystem: the final name only ever refers to
        # a fully downloaded file.
        os.replace(partial, target)
        if progress_widget is not None:
            progress_widget.value = 100
            progress_widget.description = "Complete!"
        print(f"✅ Download completed: {filename}")
        return True
    except Exception as e:
        print(f"❌ Download failed: {e}")
        # Best-effort cleanup of the incomplete temporary file.
        try:
            if partial.exists():
                partial.unlink()
        except OSError:
            pass
        return False

In [None]:
def extract_archive(archive_path, extract_to):
    """Extract a ``.zip``, ``.tar.gz`` or ``.tgz`` archive into ``extract_to``.

    Args:
        archive_path: Archive location, as ``str`` or ``pathlib.Path``.
        extract_to: Directory the archive members are written into.

    Returns:
        True when the archive was extracted, False when extraction failed or
        the file extension is not one of the supported formats (the reason is
        printed rather than raised).
    """
    print(f"📦 Extracting {archive_path}...")

    # Path objects have no .endswith(); normalise to a plain string so both
    # str and Path arguments work.
    archive_name = os.fspath(archive_path)

    try:
        if archive_name.endswith('.zip'):
            with zipfile.ZipFile(archive_name, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
        elif archive_name.endswith(('.tar.gz', '.tgz')):
            with tarfile.open(archive_name, 'r:gz') as tar:
                if hasattr(tarfile, 'data_filter'):
                    # Refuse members that would escape extract_to
                    # (absolute paths, "..", unsafe links) where supported.
                    tar.extractall(extract_to, filter='data')
                else:
                    tar.extractall(extract_to)
        else:
            # Previously this fell through and reported success without
            # extracting anything.
            print(f"❌ Extraction failed: unsupported archive format: {archive_name}")
            return False

        print("✅ Extraction completed")
        return True
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        return False

In [None]:
def organize_tiny_imagenet_val(tiny_imagenet_path):
    """Sort Tiny ImageNet validation images into one folder per class.

    Reads ``val/val_annotations.txt`` (tab-separated; first column is the
    image file name, second column is the class name) and moves every
    annotated ``*.JPEG`` from ``val/images`` into ``val/<class name>/``.
    ``val/images`` is removed afterwards if it ended up empty.

    Does nothing beyond printing a warning when the annotation file is
    missing, and nothing after reading it when ``val/images`` is absent.
    """
    val_root = tiny_imagenet_path / "val"
    annotation_file = val_root / "val_annotations.txt"

    if not annotation_file.exists():
        print("⚠️ Validation annotations not found")
        return

    print("🔧 Organizing validation data...")

    # image file name -> class name; rows with fewer than two columns are skipped
    with open(annotation_file, 'r') as handle:
        rows = (raw.strip().split('\t') for raw in handle)
        label_by_image = {row[0]: row[1] for row in rows if len(row) >= 2}

    images_root = val_root / "images"
    if not images_root.exists():
        return

    # One directory per class mentioned in the annotations
    for label in set(label_by_image.values()):
        (val_root / label).mkdir(exist_ok=True)

    # Move each annotated image unless a file of that name is already there
    for source in images_root.glob("*.JPEG"):
        label = label_by_image.get(source.name)
        if label is None:
            continue
        destination = val_root / label / source.name
        if not destination.exists():
            shutil.move(str(source), str(destination))

    # Drop the flat images folder once nothing is left in it
    if images_root.exists() and not any(images_root.iterdir()):
        images_root.rmdir()

    print("✅ Validation data organized")

In [None]:
def download_tiny_imagenet(progress_widget=None):
    """Download, extract and verify Tiny ImageNet under ``../datasets``.

    Steps that are already done (archive present, folder extracted) are
    skipped. After extraction the validation images are sorted into
    per-class folders via ``organize_tiny_imagenet_val``.

    Args:
        progress_widget: Optional progress widget forwarded to
            ``download_with_progress``.

    Returns:
        The ``Path`` of the extracted dataset when both ``train`` and ``val``
        exist, otherwise None.
    """
    print("🎯 Downloading Tiny ImageNet Dataset")
    print("   • 200 classes from ImageNet")
    print("   • 64x64 pixel images")
    print("   • ~240MB download size")
    print("   • Perfect for quick experiments")

    # Setup paths
    datasets_dir = Path("../datasets")
    datasets_dir.mkdir(exist_ok=True)

    zip_file = datasets_dir / "tiny-imagenet-200.zip"
    extract_dir = datasets_dir / "tiny-imagenet-200"

    # Download if not exists
    if not zip_file.exists():
        url = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
        if not download_with_progress(url, zip_file, progress_widget):
            return None
    else:
        print("✅ File already downloaded")

    # Extract if not exists
    if not extract_dir.exists():
        # Pass a plain string: the extraction helper picks the archive format
        # with string methods, which Path objects do not provide.
        if not extract_archive(str(zip_file), datasets_dir):
            return None
    else:
        print("✅ Already extracted")

    # Organize validation data
    organize_tiny_imagenet_val(extract_dir)

    # Verify dataset
    train_dir = extract_dir / "train"
    val_dir = extract_dir / "val"

    if train_dir.exists() and val_dir.exists():
        train_classes = len([d for d in train_dir.iterdir() if d.is_dir()])
        val_classes = len([d for d in val_dir.iterdir() if d.is_dir()])

        print(f"✅ Dataset ready!")
        print(f"   📁 Location: {extract_dir}")
        print(f"   📊 Training classes: {train_classes}")
        print(f"   📊 Validation classes: {val_classes}")

        return extract_dir
    else:
        print("❌ Dataset verification failed")
        return None

In [None]:
def download_imagenette(progress_widget=None):
    """Download, extract and verify Imagenette (320px) under ``../datasets``.

    Steps that are already done (archive present, folder extracted) are
    skipped.

    Args:
        progress_widget: Optional progress widget forwarded to
            ``download_with_progress``.

    Returns:
        The ``Path`` of the extracted dataset when both ``train`` and ``val``
        exist, otherwise None.
    """
    print("🎯 Downloading Imagenette Dataset (320px)")
    print("   • 10 classes from ImageNet")
    print("   • 320x320 pixel images")
    print("   • ~300MB download size")
    print("   • Great for quick training tests")

    # Setup paths
    datasets_dir = Path("../datasets")
    datasets_dir.mkdir(exist_ok=True)

    tgz_file = datasets_dir / "imagenette2-320.tgz"
    extract_dir = datasets_dir / "imagenette2-320"

    # Download if not exists
    if not tgz_file.exists():
        url = "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-320.tgz"
        if not download_with_progress(url, tgz_file, progress_widget):
            return None
    else:
        print("✅ File already downloaded")

    # Extract if not exists
    if not extract_dir.exists():
        # Pass a plain string: the extraction helper picks the archive format
        # with string methods, which Path objects do not provide.
        if not extract_archive(str(tgz_file), datasets_dir):
            return None
    else:
        print("✅ Already extracted")

    # Verify dataset
    train_dir = extract_dir / "train"
    val_dir = extract_dir / "val"

    if train_dir.exists() and val_dir.exists():
        train_classes = len([d for d in train_dir.iterdir() if d.is_dir()])
        val_classes = len([d for d in val_dir.iterdir() if d.is_dir()])

        print(f"✅ Dataset ready!")
        print(f"   📁 Location: {extract_dir}")
        print(f"   📊 Training classes: {train_classes}")
        print(f"   📊 Validation classes: {val_classes}")

        # Show class names
        if train_classes == 10:
            print("   🏷️ Classes: tench, English springer, cassette player, chain saw,")
            print("              church, French horn, garbage truck, gas pump, golf ball, parachute")

        return extract_dir
    else:
        print("❌ Dataset verification failed")
        return None

In [None]:
def download_imagewoof(progress_widget=None):
    """Download, extract and verify ImageWoof (320px) under ``../datasets``.

    Steps that are already done (archive present, folder extracted) are
    skipped.

    Args:
        progress_widget: Optional progress widget forwarded to
            ``download_with_progress``.

    Returns:
        The ``Path`` of the extracted dataset when both ``train`` and ``val``
        exist, otherwise None.
    """
    print("🎯 Downloading ImageWoof Dataset (320px)")
    print("   • 10 dog breeds from ImageNet")
    print("   • 320x320 pixel images")
    print("   • ~300MB download size")
    print("   • Challenging classification task")

    # Setup paths
    datasets_dir = Path("../datasets")
    datasets_dir.mkdir(exist_ok=True)

    tgz_file = datasets_dir / "imagewoof2-320.tgz"
    extract_dir = datasets_dir / "imagewoof2-320"

    # Download if not exists
    if not tgz_file.exists():
        url = "https://s3.amazonaws.com/fast-ai-imageclas/imagewoof2-320.tgz"
        if not download_with_progress(url, tgz_file, progress_widget):
            return None
    else:
        print("✅ File already downloaded")

    # Extract if not exists
    if not extract_dir.exists():
        # Pass a plain string: the extraction helper picks the archive format
        # with string methods, which Path objects do not provide.
        if not extract_archive(str(tgz_file), datasets_dir):
            return None
    else:
        print("✅ Already extracted")

    # Verify dataset
    train_dir = extract_dir / "train"
    val_dir = extract_dir / "val"

    if train_dir.exists() and val_dir.exists():
        train_classes = len([d for d in train_dir.iterdir() if d.is_dir()])
        val_classes = len([d for d in val_dir.iterdir() if d.is_dir()])

        print(f"✅ Dataset ready!")
        print(f"   📁 Location: {extract_dir}")
        print(f"   📊 Training classes: {train_classes}")
        print(f"   📊 Validation classes: {val_classes}")

        # Show class names
        if train_classes == 10:
            print("   🏷️ Dog breeds: Australian terrier, Border terrier, Samoyed, Beagle,")
            print("                  Shih-Tzu, English foxhound, Rhodesian ridgeback, Dingo, Golden retriever, Old English sheepdog")

        return extract_dir
    else:
        print("❌ Dataset verification failed")
        return None

## Interactive Dataset Downloader

Use the widget below to select and download your preferred dataset:

In [None]:
# Controls for the interactive downloader.

# Dropdown: the second element of each option is the key the click handler
# uses to pick a download function.
dataset_selector = widgets.Dropdown(
    description='Dataset:',
    options=[
        ('Tiny ImageNet (200 classes, 64x64, ~240MB)', 'tiny'),
        ('Imagenette (10 classes, 320x320, ~300MB)', 'imagenette'),
        ('ImageWoof (10 dog breeds, 320x320, ~300MB)', 'imagewoof'),
    ],
    value='tiny',
    style=dict(description_width='initial'),
)

# Button that starts the download of the selected dataset.
download_button = widgets.Button(
    icon='download',
    button_style='success',
    description='Download Dataset',
)

# Percentage bar (0-100) updated while a download is running.
progress_bar = widgets.IntProgress(
    min=0,
    max=100,
    value=0,
    orientation='horizontal',
    description='Ready',
    bar_style='info',
    style=dict(bar_color='#2196F3'),
)

# Captures everything the download functions print.
output_area = widgets.Output()

def on_download_click(b):
    """Button callback: download the dataset selected in the dropdown.

    Output printed by the download functions is captured in ``output_area``.
    The progress bar is reset (value, colour and label) at the start of every
    run and switched to a success or failure state at the end.

    Args:
        b: The button instance that was clicked (unused).
    """
    with output_area:
        clear_output(wait=True)

        # Reset progress bar, including the colour left over from a previous
        # successful or failed run.
        progress_bar.value = 0
        progress_bar.bar_style = 'info'
        progress_bar.description = "Starting..."

        downloaders = {
            'tiny': download_tiny_imagenet,
            'imagenette': download_imagenette,
            'imagewoof': download_imagewoof,
        }
        downloader = downloaders.get(dataset_selector.value)
        # An unknown selection is reported as a failure instead of leaving
        # `result` undefined.
        result = downloader(progress_bar) if downloader is not None else None

        if result:
            # When the archive was already on disk no download ran, so the
            # bar would otherwise stay at 0 / "Starting...".
            progress_bar.value = 100
            progress_bar.description = "Complete!"
            progress_bar.bar_style = 'success'
            print("\n🎉 Download completed successfully!")
            print(f"\n📋 Next steps:")
            print(f"   1. Go back to main folder: cd ..")
            print(f"   2. Test setup: python quick_test.py")
            print(f"   3. Run learning rate finder")
            print(f"   4. Start training!")
        else:
            progress_bar.bar_style = 'danger'
            progress_bar.description = "Failed"

# Run the handler whenever the button is pressed
download_button.on_click(on_download_click)

# Show heading, controls and captured output stacked vertically
display(
    widgets.VBox(
        children=[
            widgets.HTML(value="<h3>🚀 Select and Download Dataset</h3>"),
            dataset_selector,
            download_button,
            progress_bar,
            output_area,
        ]
    )
)

## Dataset Information Summary

After downloading, here's what you'll have:

In [None]:
def show_dataset_info():
    """Print a summary of the datasets found under ``../datasets``.

    For each known dataset folder that exists, prints its path, the number of
    class sub-directories in ``train`` and the image size. A dataset folder
    without a ``train`` directory (for example after an interrupted
    extraction) is reported as incomplete instead of raising
    ``FileNotFoundError``.
    """
    datasets_dir = Path("../datasets")

    if not datasets_dir.exists():
        print("📁 No datasets folder found yet. Download a dataset first!")
        return

    print("📊 Available Datasets:")
    print("=" * 50)

    # (folder name, display name, image size)
    known_datasets = [
        ("tiny-imagenet-200", "Tiny ImageNet", "64x64"),
        ("imagenette2-320", "Imagenette", "320x320"),
        ("imagewoof2-320", "ImageWoof", "320x320"),
    ]

    found_any = False
    for folder, label, image_size in known_datasets:
        dataset_path = datasets_dir / folder
        if not dataset_path.exists():
            continue
        found_any = True

        train_dir = dataset_path / "train"
        if not train_dir.is_dir():
            # Folder exists but holds no training split; iterating it would raise.
            print(f"⚠️ {label}")
            print(f"   📁 Path: {dataset_path}")
            print("   ❗ Incomplete: no train/ folder found")
            print()
            continue

        train_classes = sum(1 for d in train_dir.iterdir() if d.is_dir())
        print(f"✅ {label}")
        print(f"   📁 Path: {dataset_path}")
        print(f"   📊 Classes: {train_classes}")
        print(f"   🖼️ Image size: {image_size}")
        print()

    if not found_any:
        print("📁 No datasets found. Use the downloader above to get started!")

# Run the function
show_dataset_info()

## Training Commands Generator

Once you've downloaded a dataset, use this to generate the appropriate training commands:

In [None]:
def generate_training_commands():
    """Print ready-to-paste training commands for each downloaded dataset.

    Looks for the known dataset folders under ``../datasets`` and, for every
    one that exists, prints a quick-test command and a full-training command.
    Datasets with 10 classes additionally get a ``--pretrained`` variant that
    uses half the epochs.
    """
    datasets_root = Path("../datasets")

    # (folder, display name, image size used in the output, class count)
    known = (
        ("tiny-imagenet-200", "Tiny ImageNet", 64, 200),
        ("imagenette2-320", "Imagenette", 224, 10),
        ("imagewoof2-320", "ImageWoof", 224, 10),
    )
    present = [entry for entry in known if (datasets_root / entry[0]).exists()]

    if len(present) == 0:
        print("❌ No datasets found. Download one first!")
        return

    print("🚀 Training Commands for Your Datasets")
    print("=" * 50)

    for folder, name, img_size, num_classes in present:
        # Hyper-parameters: 64px images get their own settings, everything
        # else shares one set
        if img_size == 64:
            batch_size, epochs, lr = 256, 50, "1e-2"
        else:
            batch_size, epochs, lr = 128, 20, "3e-3"

        print(f"\n📊 {name} ({img_size}x{img_size}, {num_classes} classes)")
        print(f"📁 Dataset: datasets/{folder}")

        print(f"\n💻 Commands (from main project folder):")
        print(f"   # Quick test (2 epochs)")
        print(f"   python train_imagenet.py --data-dir datasets/{folder} --epochs 2 --batch-size 32")

        print(f"\n   # Full training")
        print(f"   python train_imagenet.py --data-dir datasets/{folder} --epochs {epochs} --batch-size {batch_size} --lr {lr}")

        # Only the 10-class datasets get the pretrained variant
        if num_classes == 10:
            print(f"\n   # With pretrained weights (recommended)")
            print(f"   python train_imagenet.py --data-dir datasets/{folder} --epochs {epochs//2} --batch-size {batch_size} --lr {lr} --pretrained")

        print("-" * 40)

# Print the commands for whatever is currently downloaded
generate_training_commands()

## 🎯 Quick Start Summary

1. **Use the downloader above** to get your preferred dataset
2. **Go back to main folder**: `cd ..`
3. **Test your setup**: `python quick_test.py`
4. **Find optimal learning rate**: `cd ../lr_optimization && jupyter notebook learning_rate_finder.ipynb`
5. **Start training** with the generated commands above

Happy training! 🚀