In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from collections import Counter

# Dataset locations
train_path = '/home/data/train/'
test_path = '/home/data/test/'

# Enumerate every entry in the training directory
train_images = os.listdir(train_path)
print(f"Total training images: {len(train_images)}")

# Partition the file names by their class prefix ('cat...' / 'dog...')
cat_images = list(filter(lambda name: name.startswith('cat'), train_images))
dog_images = list(filter(lambda name: name.startswith('dog'), train_images))
print(f"Cat images: {len(cat_images)}")
print(f"Dog images: {len(dog_images)}")

Total training images: 22500
Cat images: 11242
Dog images: 11258


In [2]:
# Analyze image dimensions and sizes
def analyze_images(image_list, path, sample_size=1000, seed=None):
    """Collect width, height and file size for a random sample of images.

    Parameters
    ----------
    image_list : sequence of str
        File names (relative to ``path``) to sample from.
    path : str
        Directory that contains the files.
    sample_size : int, default 1000
        Maximum number of files to inspect; clamped to ``len(image_list)``.
    seed : int or None, default None
        When given, the sample is drawn from ``np.random.default_rng(seed)``
        so repeated calls pick the same files.  When ``None`` the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    tuple of (list, list, list)
        ``(widths, heights, sizes)``: image widths, image heights and file
        sizes in bytes.  The three lists always have the same length and the
        same index order; files that could not be read are reported on stdout
        and left out of all three.
    """
    widths, heights, sizes = [], [], []

    # Nothing to sample from: return early instead of relying on how
    # numpy treats an empty population.
    if len(image_list) == 0:
        return widths, heights, sizes

    # Sample random images for analysis (without replacement).
    rng = np.random if seed is None else np.random.default_rng(seed)
    sample = rng.choice(image_list, min(sample_size, len(image_list)), replace=False)

    for img_name in sample:
        img_path = os.path.join(path, img_name)
        try:
            with Image.open(img_path) as img:
                width, height = img.size
            size_bytes = os.path.getsize(img_path)
        except Exception as e:
            # Best effort: report the unreadable file and move on.
            print(f"Error with {img_name}: {e}")
            continue

        # Append only after every read succeeded so the lists stay aligned.
        widths.append(width)
        heights.append(height)
        sizes.append(size_bytes)

    return widths, heights, sizes

def _print_dimension_stats(header, widths, heights, sizes):
    """Print mean/std of widths, heights and file sizes (bytes shown as KB)."""
    kb_mean = np.mean(sizes) / 1024
    kb_std = np.std(sizes) / 1024
    print(header)
    print(f"  Width - Mean: {np.mean(widths):.1f}, Std: {np.std(widths):.1f}")
    print(f"  Height - Mean: {np.mean(heights):.1f}, Std: {np.std(heights):.1f}")
    print(f"  File size (KB) - Mean: {kb_mean:.1f}, Std: {kb_std:.1f}")


# Cats first: sample, then report
cat_widths, cat_heights, cat_sizes = analyze_images(cat_images, train_path)
_print_dimension_stats("Cat images analysis:", cat_widths, cat_heights, cat_sizes)

# Dogs second: sample, then report (leading blank line separates the sections)
dog_widths, dog_heights, dog_sizes = analyze_images(dog_images, train_path)
_print_dimension_stats("\nDog images analysis:", dog_widths, dog_heights, dog_sizes)

Cat images analysis:
  Width - Mean: 411.8, Std: 107.4
  Height - Mean: 355.3, Std: 94.9
  File size (KB) - Mean: 20.7, Std: 8.8

Dog images analysis:
  Width - Mean: 394.2, Std: 113.3
  Height - Mean: 362.7, Std: 98.0
  File size (KB) - Mean: 23.8, Std: 11.5


In [None]:
# Check image modes and formats on a small random sample.
# The sample size is clamped to the number of available files so that
# np.random.choice(..., replace=False) cannot be asked for more items than exist.
sample_images = np.random.choice(train_images, min(20, len(train_images)), replace=False)

modes = []
formats = []
for img_name in sample_images:
    img_path = os.path.join(train_path, img_name)
    with Image.open(img_path) as img:
        modes.append(img.mode)
        formats.append(img.format)

print("Image modes:", Counter(modes))
print("Image formats:", Counter(formats))

# Check for any grayscale images (only mode 'L' is counted)
grayscale_count = sum(1 for mode in modes if mode == 'L')
print(f"\nGrayscale images in sample: {grayscale_count}")

# Check dimension ranges over the cat and dog samples gathered earlier
all_widths = cat_widths + dog_widths
all_heights = cat_heights + dog_heights

# Width/height ratio per sampled image, computed once and reused for min and max
aspect_ratios = [w / h for w, h in zip(all_widths, all_heights)]

print("\nDimension ranges:")
print(f"  Width: {min(all_widths)} - {max(all_widths)} pixels")
print(f"  Height: {min(all_heights)} - {max(all_heights)} pixels")
print(f"  Aspect ratio range: {min(aspect_ratios):.2f} - {max(aspect_ratios):.2f}")