<a href="https://colab.research.google.com/github/Phani943/Image-Captioning/blob/main/data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Install Dependencies**

- !pip install boto3 datasets
- !pip install --upgrade pyarrow datasets

In [2]:
import os
import boto3
import shutil
import botocore
import warnings
import pandas as pd
from concurrent import futures
from tqdm.notebook import tqdm
from datasets import load_dataset

warnings.filterwarnings('ignore')

hugging_df = pd.DataFrame(load_dataset("Andyrasika/image_captioning", split="train"))

Downloading data:   0%|          | 0.00/139M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/507444 [00:00<?, ? examples/s]

In [3]:
max_rows = 100000
resultant_data = set()
seen_ids = set()

progress_bar = tqdm(total=max_rows, desc="Processing Rows")

def select_data(data_row):
    return (data_row['image_id'], data_row['caption'])

for _, data_row in hugging_df.iterrows():
    if len(resultant_data) >= max_rows:
        break
    data_tuple = select_data(data_row)
    image_id = data_tuple[0]
    if image_id not in seen_ids:
        seen_ids.add(image_id)
        resultant_data.add(data_tuple)
        progress_bar.update(1)

data = pd.DataFrame(list(resultant_data), columns=['image_id', 'caption'])

data.to_csv('unique_images.csv', index=False)

Processing Rows:   0%|          | 0/100000 [00:00<?, ?it/s]

In [4]:
BUCKET_NAME = 'open-images-dataset'
bucket = boto3.resource(
    's3', config=botocore.config.Config(
        signature_version=botocore.UNSIGNED)).Bucket(BUCKET_NAME)

splits = ['train', 'validation', 'test']
download_folder = f'images_{max_rows}'

if not os.path.exists(download_folder):
    os.makedirs(download_folder)

In [5]:
def download_one_image(image_id):
    for split in splits:
        try:
            file_path = os.path.join(download_folder, f'{image_id}.jpg')
            bucket.download_file(f'{split}/{image_id}.jpg', file_path)
            return 1
        except botocore.exceptions.ClientError:
            continue
        except Exception as e:
            print(f'Unexpected error: {e}')
            return None
    return None

def process_image(row):
    image_id = row['image_id']
    result = download_one_image(image_id)
    if result is None:
        print(f'ERROR: Image `{image_id}` not found in any of the specified splits.')
        return None
    else:
        caption = row['caption']
        return {'image_id': image_id, 'caption': caption}

info_list = []
with futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = list(tqdm(executor.map(process_image, data.to_dict('records')), total=data.shape[0]))

info_list = [result for result in results if result is not None]
info_df = pd.DataFrame(info_list)
info_df.to_csv('image_info.csv', index=False)

  0%|          | 0/100000 [00:00<?, ?it/s]



In [6]:
!pip install kaggle



In [7]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"phanichaitanya349","key":"bf75a9ff734cd759ca5b66c7808f0dfb"}'}

In [8]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [9]:
metadata = {
    "title": "Captioning Dataset",
    "id": "phanichaitanya349/captioning-dataset",
    "licenses": [
        {
            "name": "CC0-1.0"
        }
    ]
}

import json
with open('dataset-metadata.json', 'w') as f:
    json.dump(metadata, f)

In [10]:
!mkdir caption-dataset
!mv dataset-metadata.json caption-dataset/
!mv image_info.csv caption-dataset/

In [None]:
!zip -r images_100000.zip /content/images_100000

In [12]:
!mv images_100000.zip caption-dataset/

In [15]:
!kaggle datasets create -p caption-dataset/

Starting upload for file image_info.csv
100% 18.6M/18.6M [00:01<00:00, 18.4MB/s]
Upload successful: image_info.csv (19MB)
Starting upload for file images_100000.zip
100% 29.2G/29.2G [05:11<00:00, 101MB/s]
Upload successful: images_100000.zip (29GB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/phanichaitanya349/captioning-dataset
