In [2]:
import os
import zipfile
import kaggle
from pathlib import Path


# Root folder that holds every downloaded and extracted dataset file.
# Created up front (including missing parents) so later cells can write into it.
out_dir = Path("resources")
os.makedirs(out_dir, exist_ok=True)

In [3]:
# Download the "dogs-vs-cats" competition archive into out_dir and unpack it:
#   dogs-vs-cats.zip -> out_dir        (expected to yield train.zip and test1.zip)
#   train.zip        -> out_dir/train
#   test1.zip        -> out_dir/test
# `error` ends up True if any stage failed. The archives are deleted only after
# a fully successful run, so they stay on disk for inspection after a failure.
zip_path = out_dir / "dogs-vs-cats.zip"
nested_archives = {
	out_dir / "train.zip": out_dir / "train",
	out_dir / "test1.zip": out_dir / "test",
}


def _contains_files(directory):
	"""Return True if `directory` exists and holds at least one regular file at any depth."""
	return directory.is_dir() and any(p.is_file() for p in directory.rglob("*"))


error = False

# Make the cell safe to re-run: once the archives have been cleaned up and both
# target folders hold files, there is nothing left to do. Checking for actual
# files (not just folders) avoids mistaking empty, pre-created folders for data.
already_extracted = (
	not zip_path.exists()
	and all(_contains_files(target) for target in nested_archives.values())
)

if already_extracted:
	print("Data already downloaded and extracted; skipping.")
else:
	try:
		kaggle.api.competition_download_files(competition="dogs-vs-cats", path=str(out_dir), force=False)
	except Exception as e:
		print(f"Error downloading data: {e}")
		error = True

	# Extraction is attempted even after a failed download, because the archive
	# may already be on disk (e.g. placed there manually).
	main_extracted = False
	try:
		with zipfile.ZipFile(zip_path, 'r') as zip_ref:
			zip_ref.extractall(out_dir)
		main_extracted = True
	except FileNotFoundError:
		print(f"{zip_path} not found. Make sure the file exists.")
		error = True
	except (zipfile.BadZipFile, OSError) as e:
		# A truncated or corrupt download must be reported, not crash the cell.
		print(f"An error occurred while extracting {zip_path}: {e}")
		error = True

	# The nested archives come out of the main archive, so trying them after a
	# failed main extraction would only produce misleading follow-up errors.
	if main_extracted:
		try:
			for archive, target in nested_archives.items():
				with zipfile.ZipFile(archive, "r") as z:
					z.extractall(target)
		except FileNotFoundError as e:
			print(f"Error: {e}")
			error = True
		except Exception as e:
			print(f"An error occurred while extracting: {e}")
			error = True

	if not error:
		print("Data downloaded and extracted successfully.")
		# Everything is unpacked; drop the archives to free disk space.
		for archive in (zip_path, *nested_archives):
			os.remove(archive)

Data downloaded and extracted successfully.


In [4]:
# Build a <split>/<label>/<image> folder structure so that the Hugging Face
# datasets library can load the data directly.
# NOTE(review): test/cats and test/dogs are created but nothing in this cell
# moves any test image into them - confirm whether they are needed.
for split in ("train", "test"):
	for label in ("cats", "dogs"):
		(out_dir / split / label).mkdir(parents=True, exist_ok=True)

# train.zip was extracted into out_dir/train, which leaves the images one level
# down in out_dir/train/train. Sort them into the label folders by filename prefix.
extracted_train_dir = out_dir / "train" / "train"

if not extracted_train_dir.is_dir():
	# Happens on a re-run (folder already removed) or when extraction never ran;
	# report it instead of failing with FileNotFoundError.
	print(f"{extracted_train_dir} not found; nothing to move.")
else:
	# Snapshot the listing first so the directory is not modified while iterating.
	for file_path in list(extracted_train_dir.iterdir()):
		if file_path.name.startswith("cat"):
			file_path.rename(out_dir / "train" / "cats" / file_path.name)
		elif file_path.name.startswith("dog"):
			file_path.rename(out_dir / "train" / "dogs" / file_path.name)

	# Only an empty folder can be removed; entries that matched neither prefix
	# are left in place and reported instead of raising from rmdir.
	leftovers = [entry.name for entry in extracted_train_dir.iterdir()]
	if leftovers:
		print(f"{len(leftovers)} unrecognised entries left in {extracted_train_dir}, e.g. {leftovers[0]}; folder not removed.")
	else:
		extracted_train_dir.rmdir()
		print("Folder structure created and files moved successfully.")

Folder structure created and files moved successfully.


In [13]:
# Gather the sorted training images per class and report how many each class has.
train_dir = out_dir / "train"
dog_pictures = [*(train_dir / "dogs").glob("dog.*.jpg")]
cat_pictures = [*(train_dir / "cats").glob("cat.*.jpg")]

n_dogs, n_cats = len(dog_pictures), len(cat_pictures)
print(f"Number of dog pictures(training): {n_dogs}")
print(f"Number of cat pictures(training): {n_cats}")

Number of dog pictures(training): 12500
Number of cat pictures(training): 12500


In [8]:
# Summarise the image dimensions of the dog and cat training pictures.
import cv2
import pandas as pd


def read_image_shapes(image_paths):
	"""Load every image and collect its array shape.

	cv2.imread reports an unreadable or corrupt file by returning None instead
	of raising, so each result is checked before `.shape` is accessed.

	Args:
		image_paths: iterable of paths to image files.

	Returns:
		A tuple (shapes, unreadable): `shapes` is a list with the `.shape` of
		each image that loaded, `unreadable` is a list of the paths that did not.
	"""
	shapes, unreadable = [], []
	for image_path in image_paths:
		image = cv2.imread(str(image_path))
		if image is None:
			unreadable.append(image_path)
		else:
			shapes.append(image.shape)
	return shapes, unreadable


dog_sizes, unreadable_dogs = read_image_shapes(dog_pictures)
cat_sizes, unreadable_cats = read_image_shapes(cat_pictures)

# Surface skipped files so the statistics below are not silently incomplete.
unreadable_pictures = unreadable_dogs + unreadable_cats
if unreadable_pictures:
	print(f"Skipped {len(unreadable_pictures)} unreadable image(s), e.g. {unreadable_pictures[0]}")

# Each shape is unpacked into the three columns below.
# NOTE(review): cv2.imread is called with its default flag, which presumably
# decodes every file as 3-channel colour - so `channels` would not reveal
# grayscale source files; confirm if that matters.
df = pd.DataFrame(dog_sizes + cat_sizes, columns=['height', 'width', 'channels'])
print("Image size statistics:")
print(df.describe())

Image size statistics:
             height        width  channels
count  25000.000000  25000.00000   25000.0
mean     360.478080    404.09904       3.0
std       97.019959    109.03793       0.0
min       32.000000     42.00000       3.0
25%      301.000000    323.00000       3.0
50%      374.000000    447.00000       3.0
75%      421.000000    499.00000       3.0
max      768.000000   1050.00000       3.0
