In [None]:
import requests
import zipfile
import os
from tqdm import tqdm
# %env JOBLIB_TEMP_FOLDER=/tmp

# Mount Google Drive at /content/drive (requires the google.colab package);
# the dataset folder created further down lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Define the URLs for the VQA v2 dataset
_URLS = {
    "questions": {
        "train": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip",
        # "val": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip",
        # "test-dev": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip",
        # "test": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip",
    },
    "annotations": {
        "train": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip",
        # "val": "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip",
    },
    "images": {
        "train": "http://images.cocodataset.org/zips/train2014.zip",
        # "val": "http://images.cocodataset.org/zips/val2014.zip",
        # "test-dev": "http://images.cocodataset.org/zips/test2015.zip",
        # "test": "http://images.cocodataset.org/zips/test2015.zip",
    },
}

# Create a folder to store the dataset files.
# The path sits under the Drive mount point (/content/drive); both the downloaded
# zip archives and their extracted contents are written into this folder.
output_dir = "/content/drive/MyDrive/vqa_v2_data"
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists

# Function to download and extract zip files
def download_and_extract(url, output_path, extract_dir=None):
    """Download the zip archive at ``url`` to ``output_path`` and unpack it.

    The body is streamed to disk in chunks while a tqdm progress bar tracks
    the number of bytes received; the finished archive is then extracted.

    Args:
        url: HTTP(S) address of the zip archive.
        output_path: Local file path the downloaded archive is written to.
        extract_dir: Directory the archive is extracted into. Defaults to the
            module-level ``output_dir``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    if extract_dir is None:
        extract_dir = output_dir

    # 1 MiB chunks: 1 KB chunks mean millions of Python-level iterations and
    # tiny writes for the multi-gigabyte image archive.
    chunk_size = 1024 * 1024
    # (connect, read) timeouts in seconds, so a stalled server cannot hang forever.
    timeout = (10, 60)

    # The context manager releases the connection even if the download fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail here on an HTTP error instead of saving an error page as a
        # ".zip" and hitting a confusing BadZipFile later.
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))

        with open(output_path, "wb") as file, tqdm(
            total=total_size or None,  # unknown length -> plain byte counter
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,  # binary prefixes, so the "iB" label is accurate
        ) as progress:
            for data in response.iter_content(chunk_size):
                file.write(data)
                progress.update(len(data))

    with zipfile.ZipFile(output_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)

# Collect every enabled (kind, split, url) triple first, then fetch them in order
download_jobs = [
    (key, name, url)
    for key, value in _URLS.items()
    for name, url in value.items()
]

for key, name, url in download_jobs:
    # Archives are saved as "<kind>_<split>.zip" inside the dataset folder
    zip_file_path = os.path.join(output_dir, f"{key}_{name}.zip")
    print(f"Downloading {name} {key} dataset from {url}...")
    download_and_extract(url, zip_file_path)
    print(f"Finished downloading and extracting {name} {key} dataset.")

Mounted at /content/drive
Downloading train questions dataset from https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip...


100%|██████████| 7.24M/7.24M [00:00<00:00, 27.7MiB/s]


Finished downloading and extracting train questions dataset.
Downloading train annotations dataset from https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip...


100%|██████████| 21.7M/21.7M [00:00<00:00, 47.2MiB/s]


Finished downloading and extracting train annotations dataset.
Downloading train images dataset from http://images.cocodataset.org/zips/train2014.zip...


100%|██████████| 13.5G/13.5G [03:20<00:00, 67.5MiB/s]


Finished downloading and extracting train images dataset.
