<a href="https://colab.research.google.com/github/Phani943/Image-Captioning/blob/main/download_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import requests
import pandas as pd
from PIL import Image
from tqdm import tqdm
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Only the first `num_rows` records of the metadata CSV are loaded
num_rows = 10_000
df = pd.read_csv(filepath_or_buffer='data_info.csv', nrows=num_rows)

In [3]:
# Folder that receives the downloaded images.
# exist_ok=True makes this cell safe to re-run when the folder is already there.
output_dir = 'downloaded_images'
os.makedirs(name=output_dir, exist_ok=True)

In [4]:
def download_image(img_info, timeout=30):
    """Download one image and save it as a JPEG file inside ``output_dir``.

    Args:
        img_info: ``(img_id, url)`` pair; ``img_id`` becomes the file stem.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a single stalled connection would block its worker
            thread indefinitely.

    Returns:
        A status string: a success message containing the saved path, or an
        error message containing the URL and the exception text. Exceptions
        are reported this way rather than raised, so one bad URL cannot abort
        a batch run.
    """
    img_id, url = img_info
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        img = Image.open(BytesIO(response.content))
        # The JPEG writer cannot encode palette/alpha modes (e.g. PNG or GIF
        # sources), so convert anything it does not support to RGB first.
        if img.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
            img = img.convert('RGB')

        img_file_path = os.path.join(output_dir, f'{img_id}.jpg')
        img.save(img_file_path)
        return f"Image {img_id} saved as {img_file_path}"
    except Exception as e:
        return f"Error downloading {url}: {e}"

In [5]:
# Pair each image id with its URL straight from the two columns; this avoids
# building a Series per row the way iterrows() does.
img_info_list = list(zip(df['img_id'], df['coco_url']))

# Downloads are network-bound, so a small thread pool overlaps the waiting time
with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(tqdm(executor.map(download_image, img_info_list), total=len(img_info_list)))

# download_image reports failures as strings instead of raising, so surface
# them here rather than leaving them unread in `results`.
failures = [msg for msg in results if msg.startswith('Error downloading')]
print(f"{len(results) - len(failures)} images saved, {len(failures)} failed")
for msg in failures[:10]:
    print(msg)

100%|██████████| 10000/10000 [03:30<00:00, 47.56it/s]


In [6]:
!pip install kaggle



In [7]:
from google.colab import files
# Opens Colab's file-upload dialog; the recorded run uploaded kaggle.json.
# The trailing ';' keeps the call's return value from being echoed as cell output.
files.upload();

Saving kaggle.json to kaggle.json


In [8]:
# Move the uploaded kaggle.json into ~/.kaggle/ (presumably where the Kaggle CLI reads credentials from — confirm)
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json

In [9]:

import json

# Dataset descriptor: title, Kaggle slug ("owner/name"), description and license.
metadata = dict(
    title="Image Captioning Dataset",
    id="phanichaitanya349/img-caption-dataset",
    description="A dataset of images and their corresponding captions.",
    licenses=[dict(name="CC0-1.0")],
)

# Serialise the descriptor to dataset-metadata.json in the working directory
with open('dataset-metadata.json', 'w') as f:
    f.write(json.dumps(metadata))

In [None]:
# Bundle the downloaded images into a single archive for upload.
# NOTE(review): the absolute /content/... path assumes the Colab working directory and
# presumably ends up in the archive's entry names — confirm before switching to a relative path.
!zip -r images_10k.zip /content/downloaded_images

In [11]:
!mkdir img-caption-dataset
!mv dataset-metadata.json img-caption-dataset/
!mv data_info.csv img-caption-dataset/
!mv images_10k.zip img-caption-dataset/

In [12]:
# Create the Kaggle dataset from the staged folder; the recorded run reports it as private.
# NOTE(review): presumably this fails once the dataset already exists, and later uploads
# would need `kaggle datasets version` instead — confirm.
!kaggle datasets create -p img-caption-dataset/

Starting upload for file data_info.csv
100% 37.7M/37.7M [00:00<00:00, 54.9MB/s]
Upload successful: data_info.csv (38MB)
Starting upload for file images_10k.zip
100% 488M/488M [00:05<00:00, 87.9MB/s]
Upload successful: images_10k.zip (488MB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/phanichaitanya349/img-caption-dataset
