In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import seaborn as sns

# Dataset locations
train_dir = '/home/data/train/'
test_dir = '/home/data/test/'

# List every entry in each directory; each entry is counted as one image
train_files, test_files = os.listdir(train_dir), os.listdir(test_dir)

# Partition the training entries by filename prefix
cat_files = list(filter(lambda name: name.startswith('cat'), train_files))
dog_files = list(filter(lambda name: name.startswith('dog'), train_files))

# Report overall dataset size
print(f"Training images: {len(train_files)}")
print(f"Test images: {len(test_files)}")

# Report per-class counts and each class's share of the training listing
print(f"Cat images: {len(cat_files)}")
print(f"Dog images: {len(dog_files)}")
print(f"Class balance: {len(cat_files)/len(train_files):.2%} cats, {len(dog_files)/len(train_files):.2%} dogs")

Training images: 22500
Test images: 2500
Cat images: 11242
Dog images: 11258
Class balance: 49.96% cats, 50.04% dogs


In [None]:
# Check image sizes and dimensions
import random

def get_image_info(image_path):
    """Read basic metadata for one image file.

    Args:
        image_path: path of the file to open with PIL.

    Returns:
        A ``(size, mode, filesize)`` tuple: the image's ``size`` and ``mode``
        attributes as reported by PIL, and the on-disk size in bytes from
        ``os.path.getsize``. If anything goes wrong while opening or
        inspecting the file, ``(None, None, None)`` is returned instead of
        raising.
    """
    try:
        with Image.open(image_path) as img:
            dimensions, color_mode = img.size, img.mode
            byte_count = os.path.getsize(image_path)
    except Exception:
        # Best-effort probe: any failure is reported as an all-None tuple.
        return None, None, None
    return dimensions, color_mode, byte_count

# Sample random images from each class
SAMPLE_SEED = 42       # fixed so Restart & Run All inspects the same files
N_SIZE_SAMPLES = 10    # how many training files to probe for size variation

# Private generator: reproducible draws without reseeding the global `random`
# state. Candidates are sorted before drawing because os.listdir() returns
# entries in arbitrary order, which would otherwise defeat the seed.
_sample_rng = random.Random(SAMPLE_SEED)


def _format_image_info(info):
    """Format a get_image_info() result as the indented one-line summary.

    Args:
        info: the ``(size, mode, filesize)`` tuple returned by get_image_info().

    Returns:
        The summary string, or a short notice when the file could not be read.
    """
    size, mode, filesize = info
    if size is None:
        # get_image_info() reports failure as (None, None, None); formatting
        # None/1024 would raise TypeError, so report the failure instead.
        return "  Could not read image"
    return f"  Dimensions: {size}, Mode: {mode}, Size: {filesize/1024:.1f}KB"


random_cat = _sample_rng.choice(sorted(cat_files))
random_dog = _sample_rng.choice(sorted(dog_files))

cat_path = os.path.join(train_dir, random_cat)
dog_path = os.path.join(train_dir, random_dog)

print("Sample Cat Image:", random_cat)
cat_info = get_image_info(cat_path)
print(_format_image_info(cat_info))

print("\nSample Dog Image:", random_dog)
dog_info = get_image_info(dog_path)
print(_format_image_info(dog_info))

# Check a few more images to understand variation.
# Clamp the sample size: sample() raises ValueError when asked for more items
# than the population holds.
n_samples = min(N_SIZE_SAMPLES, len(train_files))
print(f"\nChecking {n_samples} random images for size variation:")
sample_files = _sample_rng.sample(sorted(train_files), n_samples)
widths, heights, sizes = [], [], []

for f in sample_files:
    path = os.path.join(train_dir, f)
    size, mode, filesize = get_image_info(path)
    if size is not None:  # skip files get_image_info() could not read
        widths.append(size[0])
        heights.append(size[1])
        sizes.append(filesize/1024)

if widths:
    print(f"Width range: {min(widths)} - {max(widths)}px")
    print(f"Height range: {min(heights)} - {max(heights)}px")
    print(f"File size range: {min(sizes):.1f} - {max(sizes):.1f}KB")
else:
    # min()/max() raise ValueError on an empty list; say why there is no range.
    print("No readable images in the sample; cannot report size ranges.")