In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch

from sklearn.model_selection import train_test_split

# 1. Loading the data and preprocessing

## 1.1 Split images and captions into training and validation sets

### 1.1.1 Splitting images

In [6]:
import os
import shutil

image_folder = "data/raw/Images"
train_folder = "data/train"
val_folder = "data/val"


# Create folders
# Any output left over from a previous run is deleted first, so re-running this
# cell always starts from two empty split directories.
for stale_folder in (train_folder, val_folder):
    if os.path.exists(stale_folder):
        shutil.rmtree(stale_folder)

for fresh_folder in (train_folder, val_folder):
    os.makedirs(fresh_folder)

# Split images
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# file names are sorted before splitting. Without this, the same random_state
# could produce a different train/validation assignment on another machine.
images = sorted(f for f in os.listdir(image_folder) if f.endswith((".jpg", ".png")))
# 80/20 train/validation split with a fixed seed.
train_images, val_images = train_test_split(images, test_size=0.2, random_state=42)


def move_images(images, source_folder, target_folder):
    """Copy the named image files from ``source_folder`` into ``target_folder``.

    Despite the name, files are copied with ``shutil.copy``; the originals stay
    in ``source_folder``.

    Args:
        images: Iterable of file names (not paths) to copy.
        source_folder: Directory the files are read from.
        target_folder: Directory the files are written to. It is not created
            here, so it has to exist already.

    Returns:
        None.
    """
    for file_name in images:
        src_path = os.path.join(source_folder, file_name)
        dst_path = os.path.join(target_folder, file_name)
        shutil.copy(src_path, dst_path)


# Fill both split folders from the raw image directory.
move_images(train_images, image_folder, train_folder)
move_images(val_images, image_folder, val_folder)

# Report what is actually on disk rather than the lengths of the name lists.
train_image_count = len(os.listdir(train_folder))
val_image_count = len(os.listdir(val_folder))
print(f"Training set: {train_image_count} images")
print(f"Validation set: {val_image_count} images")

Training set: 25426 images
Validation set: 6357 images


### 1.1.2 Splitting captions

In [7]:
# Split captions
captions_file = "data/captions.txt"
train_captions_file = "data/train/captions_train.txt"
val_captions_file = "data/val/captions_val.txt"

# Read captions
# The file's own header row is consumed (header=0) and replaced by the two
# column names given here.
df = pd.read_csv(captions_file, sep=",", header=0, names=["image", "caption"])

# A caption row follows its image: keep the rows whose "image" value is one of
# the file names assigned to the corresponding split.
is_train_row = df["image"].isin(train_images)
is_val_row = df["image"].isin(val_images)
train_captions = df[is_train_row]
val_captions = df[is_val_row]

# Save captions
# Both files are written without a header row and without the index column.
csv_options = dict(sep=",", header=False, index=False)
train_captions.to_csv(train_captions_file, **csv_options)
val_captions.to_csv(val_captions_file, **csv_options)

train_caption_count = len(train_captions)
val_caption_count = len(val_captions)
print(f"Training set: {train_caption_count} captions")
print(f"Validation set: {val_caption_count} captions")

Training set: 127130 captions
Validation set: 31785 captions


## 1.2 Small data


In [9]:
# split images
# Build a small subset of the training images in two steps.
# Step 1: hold out 2% of the training images as the small validation set.
remaining_images, small_val_images = train_test_split(
    train_images, test_size=0.02, random_state=42
)
# Step 2: from the remaining 98%, take the fraction 0.08 / 0.98, i.e. 8% of the
# original training images, as the small training set. Only the second element
# of the split is kept; the result is indexed instead of unpacked into `_`, so
# IPython's last-output variable `_` is not shadowed.
small_train_images = train_test_split(
    remaining_images, test_size=0.08 / 0.98, random_state=42
)[1]

small_train_folder = "data/train_small"
small_val_folder = "data/val_small"

# Create folders
# Leftovers from a previous run are deleted first, so both small-split
# directories start out empty.
for stale_folder in (small_train_folder, small_val_folder):
    if os.path.exists(stale_folder):
        shutil.rmtree(stale_folder)
for fresh_folder in (small_train_folder, small_val_folder):
    os.makedirs(fresh_folder)

# Fill the small-split folders from the raw image directory.
move_images(small_train_images, image_folder, small_train_folder)
move_images(small_val_images, image_folder, small_val_folder)

# Report what is actually on disk.
small_train_image_count = len(os.listdir(small_train_folder))
small_val_image_count = len(os.listdir(small_val_folder))
print(f"Small Training set: {small_train_image_count} images")
print(f"Small Validation set: {small_val_image_count} images")

# Split captions
small_train_captions_file = "data/train_small/small_captions_train.txt"
small_val_captions_file = "data/val_small/small_captions_val.txt"

# Read captions
# The full captions file is read again; its header row is consumed (header=0)
# and replaced by the two column names given here.
df = pd.read_csv(captions_file, sep=",", header=0, names=["image", "caption"])

# Keep the caption rows whose "image" value belongs to each small split.
is_small_train_row = df["image"].isin(small_train_images)
is_small_val_row = df["image"].isin(small_val_images)
small_train_captions = df[is_small_train_row]
small_val_captions = df[is_small_val_row]

# Save captions
# Both files are written without a header row and without the index column.
small_csv_options = dict(sep=",", header=False, index=False)
small_train_captions.to_csv(small_train_captions_file, **small_csv_options)
small_val_captions.to_csv(small_val_captions_file, **small_csv_options)

small_train_caption_count = len(small_train_captions)
small_val_caption_count = len(small_val_captions)
print(f"Small Training set: {small_train_caption_count} captions")
print(f"Small Validation set: {small_val_caption_count} captions")

Small Training set: 2035 images
Small Validation set: 509 images
Small Training set: 10175 captions
Small Validation set: 2545 captions
