In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import seaborn as sns

# Dataset locations
train_dir = '/home/data/train'
test_dir = '/home/data/test'

# List the training images. os.listdir returns entries in arbitrary,
# filesystem-dependent order, so sort the names to keep the listing (and any
# random sampling later performed on it) stable across runs and machines.
train_files = sorted(os.listdir(train_dir))
print(f"Total training images: {len(train_files)}")

# Split the training files by class using the file-name prefix
cat_files = [f for f in train_files if f.startswith('cat')]
dog_files = [f for f in train_files if f.startswith('dog')]

print(f"Cat images: {len(cat_files)}")
print(f"Dog images: {len(dog_files)}")

# List the test images (sorted for the same reason as the training list)
test_files = sorted(os.listdir(test_dir))
print(f"Test images: {len(test_files)}")

Total training images: 22500
Cat images: 11242
Dog images: 11258
Test images: 2500


In [None]:
# Analyze image dimensions and characteristics
def analyze_images(image_files, directory, sample_size=1000, seed=None):
    """Collect basic statistics for a random sample of image files.

    Args:
        image_files: File names, relative to ``directory``, to sample from.
        directory: Directory that contains the files.
        sample_size: Maximum number of files to inspect. When the list is
            shorter than this, every file is inspected.
        seed: Optional seed for the sampling. With ``None`` (the default) the
            sample is drawn from NumPy's global random state, so an earlier
            ``np.random.seed(...)`` call is still honoured.

    Returns:
        A tuple ``(widths, heights, channels, file_sizes)`` of lists that all
        have the same length, with one entry per successfully read image:
        width and height as reported by ``img.size``, the number of bands, and
        the file size in KB. Files that cannot be read are reported with
        ``print`` and contribute to none of the lists.
    """
    widths = []
    heights = []
    channels = []
    file_sizes = []

    n_samples = min(sample_size, len(image_files))
    if n_samples == 0:
        # Nothing to sample; return the (empty) lists without touching the RNG.
        return widths, heights, channels, file_sizes

    # Sample random images for analysis (without replacement)
    if seed is None:
        sample_files = np.random.choice(image_files, n_samples, replace=False)
    else:
        sample_files = np.random.default_rng(seed).choice(
            image_files, n_samples, replace=False
        )

    for img_file in sample_files:
        try:
            img_path = os.path.join(directory, img_file)

            # File size in KB
            size_kb = os.path.getsize(img_path) / 1024

            with Image.open(img_path) as img:
                width, height = img.size
                # One band per channel: 3 for 'RGB', 1 for 'L', etc.
                n_channels = len(img.getbands())
        except Exception as e:
            # Best-effort scan: report the unreadable file and move on.
            print(f"Error processing {img_file}: {e}")
            continue

        # Record results only after every measurement succeeded, so that the
        # four lists always stay index-aligned.
        widths.append(width)
        heights.append(height)
        channels.append(n_channels)
        file_sizes.append(size_kb)

    return widths, heights, channels, file_sizes

# Analyze training images
# analyze_images draws a random sample, so seed NumPy's global RNG first to
# make the sample (and every statistic printed below) repeatable.
SAMPLE_SEED = 42
np.random.seed(SAMPLE_SEED)

print("Analyzing training images...")
train_widths, train_heights, train_channels, train_sizes = analyze_images(train_files, train_dir)

print(f"Sample size: {len(train_widths)} images")

# Width / height per sampled image; computed up front so the name is defined
# even when the sample turns out to be empty.
aspect_ratios = np.array(train_widths) / np.array(train_heights)

if len(train_widths) == 0:
    # np.mean / np.std on empty input would only yield nan plus RuntimeWarnings.
    print("No images could be read; skipping summary statistics.")
else:
    print(f"Width - Mean: {np.mean(train_widths):.1f}, Std: {np.std(train_widths):.1f}")
    print(f"Height - Mean: {np.mean(train_heights):.1f}, Std: {np.std(train_heights):.1f}")
    print(f"File size (KB) - Mean: {np.mean(train_sizes):.1f}, Std: {np.std(train_sizes):.1f}")
    print(f"Channel distribution: {np.unique(train_channels, return_counts=True)}")

    # Check aspect ratios
    print(f"Aspect ratio - Mean: {np.mean(aspect_ratios):.2f}, Std: {np.std(aspect_ratios):.2f}")