## **Install Dependencies**

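Run the following commands once, in a notebook cell, before executing the rest of the notebook:

- `!pip install boto3 datasets`
- `!pip install --upgrade pyarrow datasets`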

In [1]:
import os
import boto3
import shutil
import zipfile
import botocore
import warnings
import pandas as pd
from tqdm import tqdm
from concurrent import futures
from datasets import load_dataset

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings from the
# libraries used below; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load the "train" split of the Hugging Face dataset and wrap it in a pandas
# DataFrame. Later cells read its 'image_id' and 'caption' columns.
hugging_df = pd.DataFrame(load_dataset("Andyrasika/image_captioning", split="train"))

Downloading data:   0%|          | 0.00/139M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/507444 [00:00<?, ? examples/s]

In [2]:
# Upper bound on the number of distinct images kept from the dataset.
max_rows = 200000
# Rows are collected in a list (not a set) so they keep dataset order; iterating
# a set of string tuples gives a different order on every kernel restart, which
# made unique_images.csv non-reproducible.
resultant_data = []
seen_ids = set()


def select_data(data_row):
    """Return the ``(image_id, caption)`` pair of one dataset row.

    ``data_row`` is anything indexable by the keys ``'image_id'`` and
    ``'caption'`` (here: a row Series yielded by ``DataFrame.iterrows``).
    """
    return (data_row['image_id'], data_row['caption'])


# The context manager closes the bar when the loop ends, which forces a final
# refresh; without it the last displayed count can lag behind the real one.
with tqdm(total=max_rows, desc="Processing Rows") as progress_bar:
    for _, data_row in hugging_df.iterrows():
        if len(resultant_data) >= max_rows:
            break
        data_tuple = select_data(data_row)
        image_id = data_tuple[0]
        # Keep only the first caption encountered for each image id.
        if image_id not in seen_ids:
            seen_ids.add(image_id)
            resultant_data.append(data_tuple)
            progress_bar.update(1)

data = pd.DataFrame(resultant_data, columns=['image_id', 'caption'])

# Persist the selection so it can be inspected or reused without re-scanning.
data.to_csv('unique_images.csv', index=False)

Processing Rows: 100%|█████████▉| 199933/200000 [00:14<00:00, 14859.39it/s]

In [3]:
# S3 bucket the images are fetched from. Requests are sent unsigned
# (botocore.UNSIGNED), i.e. without AWS credentials.
BUCKET_NAME = 'open-images-dataset'
bucket = boto3.resource(
    's3', config=botocore.config.Config(
        signature_version=botocore.UNSIGNED)).Bucket(BUCKET_NAME)

# Key prefixes tried, in this order, when looking up an image.
splits = ['train', 'validation', 'test']
# Local target folder, named after the row limit (e.g. 200000 -> 'images_200k').
download_folder = f'images_{max_rows//1000}k'

# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(download_folder, exist_ok=True)

In [4]:
from tqdm.notebook import tqdm

def download_one_image(image_id):
    """Download ``<image_id>.jpg`` into ``download_folder``.

    Each prefix in ``splits`` is tried in order. A
    ``botocore.exceptions.ClientError`` moves on to the next prefix; any other
    exception is printed and aborts the lookup.

    Returns:
        1 once a download succeeds, otherwise None.
    """
    filename = f'{image_id}.jpg'
    for split in splits:
        try:
            bucket.download_file(
                f'{split}/{filename}',
                os.path.join(download_folder, filename),
            )
        except botocore.exceptions.ClientError:
            # Treated as "not under this prefix" - try the next one.
            continue
        except Exception as e:
            print(f'Unexpected error: {e}')
            return None
        else:
            return 1
    # No prefix produced the file.
    return None

def process_image(row):
    """Download the image referenced by ``row`` and return its record.

    ``row`` is a mapping with an ``'image_id'`` key and, when the download
    succeeds, a ``'caption'`` key.

    Returns:
        ``{'image_id': ..., 'caption': ...}`` when the download succeeded,
        otherwise None (after printing an error line).
    """
    image_id = row['image_id']
    if download_one_image(image_id) is None:
        print(f'ERROR: Image `{image_id}` not found in any of the specified splits.')
        return None
    # The caption is only looked up for images that were actually fetched.
    return {'image_id': image_id, 'caption': row['caption']}

# Fan the downloads out over 10 worker threads. executor.map yields results in
# input order, and list(...) drains it so the progress bar advances as each
# record completes.
with futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = list(tqdm(executor.map(process_image, data.to_dict('records')), total=data.shape[0]))

# process_image returns None for images that could not be downloaded.
info_list = [result for result in results if result is not None]
# Explicit columns keep the CSV header present even if no download succeeded.
info_df = pd.DataFrame(info_list, columns=['image_id', 'caption'])
info_df.to_csv('image_info.csv', index=False)

  0%|          | 0/200000 [00:00<?, ?it/s]



In [5]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install kaggle



In [6]:
# Colab-specific helper: opens a file-upload prompt in the notebook.
from google.colab import files
# Upload the Kaggle API token file (the recorded run saved it as kaggle.json).
# The trailing ';' suppresses display of the call's return value.
files.upload();

Saving kaggle.json to kaggle.json


In [7]:
# Move the uploaded token into ~/.kaggle/ (presumably where the kaggle CLI
# looks for it - confirm against the CLI docs) and restrict it to
# owner read/write, since it is a credential.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [8]:
import json

# Dataset descriptor: display title, "<owner>/<slug>" id and license.
metadata = {
    "title": "Image Captioning Dataset 200k",
    "id": "phanichaitanya349/captioning-dataset-200k",
    "licenses": [{"name": "CC0-1.0"}],
}

# Serialise first, then write the resulting JSON text in one go.
metadata_json = json.dumps(metadata)
with open('dataset-metadata.json', 'w') as metadata_file:
    metadata_file.write(metadata_json)

In [9]:
# Staging folder for everything that gets uploaded. -p keeps mkdir from
# failing when the folder already exists (e.g. when the cell is re-run).
!mkdir -p img-caption-dataset
!mv dataset-metadata.json img-caption-dataset/
!mv image_info.csv img-caption-dataset/

In [10]:
# NOTE(review): `!` runs the command in a temporary subshell, so this `cd` does
# not change the notebook's working directory. The cells below use paths such
# as 'images_200k' relative to the original directory and rely on that; this
# line can probably be dropped.
!cd images_200k

In [11]:
def zip_and_remove(folder_path, zip_file_name):
    """Move every file under ``folder_path`` into a new zip archive.

    Each file is written to the archive under its path relative to
    ``folder_path`` and deleted from disk right after it has been added, so the
    folder and the archive never both hold the full data set. Directories
    themselves are left in place.

    Args:
        folder_path: Root directory whose files are archived (walked recursively).
        zip_file_name: Path of the archive to create; an existing file is
            overwritten (mode ``'w'``).
    """
    # The archive may live inside folder_path; it must then be skipped, or it
    # would be added to itself and deleted afterwards.
    archive_abspath = os.path.abspath(zip_file_name)
    with zipfile.ZipFile(zip_file_name, 'w') as zipf:
        for root, _dirs, filenames in os.walk(folder_path):
            # One progress bar per directory visited by os.walk.
            for filename in tqdm(filenames, desc="Zipping and removing files", unit="file"):
                file_path = os.path.join(root, filename)
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                zipf.write(file_path, os.path.relpath(file_path, folder_path))
                os.remove(file_path)

folder_path = 'images_200k'
zip_file_name = 'images_200k.zip'
# zip_and_remove both builds images_200k.zip and deletes the files it archived,
# so no separate `zip -r` shell command is needed afterwards: the folder holds
# no files at that point, and re-processing the finished archive would only
# cost time and disk space.
zip_and_remove(folder_path, zip_file_name)

In [12]:
# Put the image archive into the staging folder that is uploaded to Kaggle.
!mv images_200k.zip img-caption-dataset/

In [13]:
# Create the Kaggle dataset from the staging folder given with -p.
# NOTE(review): `create` is presumably for a first upload only; re-running
# against an existing dataset likely needs a different subcommand - confirm.
!kaggle datasets create -p img-caption-dataset/

Starting upload for file image_info.csv
100% 37.2M/37.2M [00:01<00:00, 37.5MB/s]
Upload successful: image_info.csv (37MB)
Starting upload for file images_200k.zip
100% 58.5G/58.5G [09:09<00:00, 114MB/s]
Upload successful: images_200k.zip (59GB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/phanichaitanya349/captioning-dataset-200k
