In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive. A later cell copies the Kaggle API key
# from '/content/drive/My Drive/kaggle.json', so this must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%shell
# NOTE: a cell magic is only recognised on the very first line of a cell, so
# `%%shell` must stay at the top. To silence this cell's output, insert
# `%%capture` as a new first line above `%%shell`.

# Ensure the Kaggle API directory exists
mkdir -p ~/.kaggle

# Copy the Kaggle API key to the correct location
# Note: You need to upload your kaggle.json file to '/content/drive/My Drive/kaggle.json' beforehand
cp '/content/drive/My Drive/kaggle.json' ~/.kaggle/

# Make the Kaggle API key file readable for the owner only (required for Kaggle API)
chmod 600 ~/.kaggle/kaggle.json

# Create a directory for the dataset if it doesn't already exist
mkdir -p /content/data

# Use Kaggle API to download the dataset to the specified directory
kaggle datasets download -d hsankesara/flickr-image-dataset -p /content/data

# Unzip the downloaded dataset (-o: overwrite without prompting, so re-running
# the cell does not stall on a "replace?" question), then remove the zip file
# to save space -- but only if extraction succeeded, so a failed unzip does not
# also throw away the download.
unzip -o -q "/content/data/flickr-image-dataset.zip" -d /content/data \
  && rm "/content/data/flickr-image-dataset.zip"


In [None]:
# Load the captions file. Fields are '|'-separated; header=None keeps the
# integer column labels (0 = image filename, 1 = caption number, 2 = caption)
# that this cell and the split cell below index by.
# Make sure to set the correct path to your CSV file
df = pd.read_csv('/content/data/flickr30k_images/results.csv', delimiter='|', header=None)

# Remove surrounding whitespace from every string column of the DataFrame
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# NOTE(review): results.csv presumably starts with a column-title row
# ("image_name| comment_number| comment") -- confirm against the file. With
# header=None that row is read as data and would end up both in the token file
# and in the train/test/dev split, so drop it if present (no-op otherwise).
df = df[df[0] != 'image_name'].reset_index(drop=True)

# Write one "<filename>#<caption_number>\t<caption>" line per caption.
# The path must not carry trailing whitespace: the previous literal ended in a
# space, which created a file named "Flickr30k.token.txt " that consumers
# looking for "Flickr30k.token.txt" would never find.
token_path = '/content/data/flickr30k_images/Flickr30k.token.txt'
with open(token_path, 'w', encoding='utf-8') as f:
    # Iterate the three columns in lockstep instead of DataFrame.iterrows(),
    # which builds a Series object for every row.
    for raw_filename, raw_number, raw_caption in zip(df[0], df[1], df[2]):
        # str() guards against non-string cells (e.g. NaN for a missing field),
        # which would otherwise raise AttributeError on .strip().
        filename = str(raw_filename).strip()
        caption_number = str(raw_number).strip()
        caption = str(raw_caption).strip()

        # Write the formatted line to the text file
        f.write(f'{filename}#{caption_number}\t{caption}\n')


In [None]:
# Split on distinct image filenames (column 0), so each filename is assigned
# to exactly one of the three subsets.
unique_filenames = df[0].unique()

# Stage 1: hold out 20% of the filenames; the remaining 80% is the training set.
train_filenames, temp_filenames = train_test_split(
    unique_filenames,
    test_size=0.2,
    random_state=42,
)

# Stage 2: cut the held-out 20% in half -> test and dev, each ~10% of the total.
test_filenames, dev_filenames = train_test_split(
    temp_filenames,
    test_size=0.5,
    random_state=42,
)

# Write the filenames to their respective files
def write_filenames_to_file(filenames, file_path):
    """Write every entry of ``filenames`` to ``file_path``, one per line.

    The target file is opened in write mode, so any existing content is
    replaced. Each entry is followed by a newline character.

    Args:
        filenames: Iterable of filename strings.
        file_path: Path of the text file to create or overwrite.
    """
    with open(file_path, 'w') as out:
        out.writelines(name + '\n' for name in filenames)

# Save the three splits into the flickr30k_images directory, in
# train -> test -> dev order.
for split_filenames, split_path in (
    (train_filenames, '/content/data/flickr30k_images/Flickr_30k.trainImages.txt'),
    (test_filenames, '/content/data/flickr30k_images/Flickr_30k.testImages.txt'),
    (dev_filenames, '/content/data/flickr30k_images/Flickr_30k.devImages.txt'),
):
    write_filenames_to_file(split_filenames, split_path)
