In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt

# Locations of the training images and the test images
train_dir = '/home/data/train'
test_dir = '/home/data/test'

# Report how many directory entries each split contains
for label, folder in (("Training images:", train_dir), ("Test images:", test_dir)):
    print(label, len(os.listdir(folder)))

# Take the first ten entries as an informal sample of the training set.
# Note: os.listdir returns entries in arbitrary order, so this is neither
# a sorted nor a random sample.
sample_train = os.listdir(train_dir)[:10]
print("\nSample training filenames:", sample_train)

# Open the first five sampled files and record each one's (width, height)
# along with its colour mode
sizes = []
for name in sample_train[:5]:
    with Image.open(os.path.join(train_dir, name)) as picture:
        dims = picture.size
        sizes.append(dims)
        print(f"{name}: size={dims}, mode={picture.mode}")

print(f"\nSample image sizes: {sizes}")

Training images: 22500
Test images: 2500

Sample training filenames: ['dog.5.jpg', 'cat.8112.jpg', 'cat.1197.jpg', 'dog.8491.jpg', 'dog.9129.jpg', 'cat.116.jpg', 'cat.5347.jpg', 'dog.5627.jpg', 'cat.1316.jpg', 'dog.9329.jpg']
dog.5.jpg: size=(300, 315), mode=RGB
cat.8112.jpg: size=(412, 230), mode=RGB
cat.1197.jpg: size=(500, 374), mode=RGB
dog.8491.jpg: size=(415, 480), mode=RGB
dog.9129.jpg: size=(319, 240), mode=RGB

Sample image sizes: [(300, 315), (412, 230), (500, 374), (415, 480), (319, 240)]


In [2]:
# Analyze training data distribution
from collections import Counter

train_files = os.listdir(train_dir)

# Count dogs and cats (the class is taken from the filename prefix)
dog_count = sum(1 for f in train_files if f.startswith('dog'))
cat_count = sum(1 for f in train_files if f.startswith('cat'))
total_train = len(train_files)

print("Training set:")
print(f"  Dogs: {dog_count}")
print(f"  Cats: {cat_count}")
print(f"  Total: {total_train}")
if total_train:
    print(f"  Class balance: {dog_count/total_train:.2%} dogs, {cat_count/total_train:.2%} cats")
else:
    # Avoid ZeroDivisionError when the directory is empty
    print("  Class balance: n/a (no training files found)")

# Check a few more image characteristics
widths = []
heights = []
aspect_ratios = []
skipped = []  # (filename, reason) for every entry that could not be measured

sample_size = 1000  # Check a subset for efficiency
for img_file in train_files[:sample_size]:
    img_path = os.path.join(train_dir, img_file)
    try:
        with Image.open(img_path) as img:
            w, h = img.size
    except (OSError, ValueError) as exc:
        # Unreadable / non-image entries are recorded and reported below
        # instead of being silently dropped; anything unexpected (including
        # KeyboardInterrupt) is allowed to propagate.
        skipped.append((img_file, exc))
        continue
    if h == 0:
        # Guard the aspect-ratio division so the three lists stay aligned
        skipped.append((img_file, "zero height"))
        continue
    widths.append(w)
    heights.append(h)
    aspect_ratios.append(round(w/h, 1))

# Report the number of images actually measured, which can be smaller than
# sample_size if the directory is small or some files were skipped
measured = len(widths)
print(f"\nImage characteristics (sample of {measured}):")
if measured:
    print(f"  Width range: {min(widths)} - {max(widths)} px")
    print(f"  Height range: {min(heights)} - {max(heights)} px")
    print(f"  Most common aspect ratios: {Counter(aspect_ratios).most_common(5)}")
else:
    print("  No readable images in the sample")
if skipped:
    first_name, first_reason = skipped[0]
    print(f"  Skipped {len(skipped)} file(s); first: {first_name} ({first_reason})")

# Check test files
test_files = os.listdir(test_dir)
print("\nTest set:")
print(f"  Total images: {len(test_files)}")
print(f"  Sample test filenames: {test_files[:5]}")

Training set:
  Dogs: 11258
  Cats: 11242
  Total: 22500
  Class balance: 50.04% dogs, 49.96% cats

Image characteristics (sample of 1000):
  Width range: 50 - 500 px
  Height range: 38 - 500 px
  Most common aspect ratios: [(1.3, 397), (1.0, 82), (1.5, 82), (0.8, 80), (0.9, 73)]

Test set:
  Total images: 2500
  Sample test filenames: ['831.jpg', '1885.jpg', '1369.jpg', '550.jpg', '614.jpg']
