In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch 

from sklearn.model_selection import train_test_split

## Split images into training and validation sets

### Splitting images

In [3]:
import os
import shutil

# Locations of the raw images and of the two split folders.
image_folder = "data/raw/Images"
train_folder = "data/train"
val_folder = "data/val"

# Create folders
os.makedirs(train_folder, exist_ok=True)
os.makedirs(val_folder, exist_ok=True)

# Split images
# os.listdir returns entries in arbitrary order, so the listing is sorted
# before splitting; combined with a fixed random_state this makes the 80/20
# train/validation membership reproducible from one run to the next.
images = sorted(f for f in os.listdir(image_folder) if f.endswith((".jpg", ".png")))
train_images, val_images = train_test_split(images, test_size=0.2, random_state=42)


def move_images(images, source_folder, target_folder):
    """Move each named file from ``source_folder`` into ``target_folder``.

    Args:
        images: Iterable of file names (not full paths) located in
            ``source_folder``.
        source_folder: Directory that currently holds the files.
        target_folder: Directory the files are moved into. It is created
            (including missing parents) if it does not exist yet.

    Raises:
        FileNotFoundError: Propagated from ``shutil.move`` when a listed file
            is not present in ``source_folder``.
    """
    # Create the destination here so the helper does not depend on some
    # earlier statement having prepared the folder.
    os.makedirs(target_folder, exist_ok=True)
    for img in images:
        shutil.move(os.path.join(source_folder, img), os.path.join(target_folder, img))


# Relocate every selected file out of the raw folder into its split folder.
move_images(train_images, image_folder, train_folder)
move_images(val_images, image_folder, val_folder)

# Count only image files. A plain len(os.listdir(...)) would also count any
# other entry in the folder, e.g. the captions_*.txt files that the captions
# cell writes into these same directories.
image_extensions = (".jpg", ".png")
n_train_images = sum(f.endswith(image_extensions) for f in os.listdir(train_folder))
n_val_images = sum(f.endswith(image_extensions) for f in os.listdir(val_folder))

print(f"Training set: {n_train_images} images")
print(f"Validation set: {n_val_images} images")

Training set: 25426 images
Validation set: 6357 images


### Splitting captions

In [4]:
# Paths: the combined captions file and the two per-split outputs.
captions_file = "data/captions.txt"
train_captions_file = "data/train/captions_train.txt"
val_captions_file = "data/val/captions_val.txt"

# Load every caption; header=0 together with names= swaps the file's own
# header row for the explicit column names below.
df = pd.read_csv(captions_file, sep=",", header=0, names=["image", "caption"])

# For each split, keep the rows whose image file name was assigned to it.
is_train_row = df["image"].isin(train_images)
is_val_row = df["image"].isin(val_images)
train_captions = df.loc[is_train_row]
val_captions = df.loc[is_val_row]

# Write each split as a comma-separated file without header or index column.
for split_df, split_path in (
    (train_captions, train_captions_file),
    (val_captions, val_captions_file),
):
    split_df.to_csv(split_path, sep=",", header=False, index=False)

print(f"Training set: {len(train_captions)} captions")
print(f"Validation set: {len(val_captions)} captions")

Training set: 127130 captions
Validation set: 31785 captions
