Download the images listed in `unsplashImages.tsv` (stored in the Data folder on Drive) to this Colab's local disk, zip them, and copy the archive back to Drive so that the other Colab can download the zip and use the pictures.

In [None]:
import pandas as pd
import numpy as np
import glob
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import shutil
import requests

from google.colab import drive
# Mount Google Drive at /content/drive; the TSV input and the zip destination
# used later in this notebook are both paths under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the Unsplash metadata TSV from Drive and draw the 2000-row sample whose
# images are downloaded and backed up to Drive further down.

SAMPLE_SIZE = 2000
SAMPLE_SEED = 0  # fixed seed so re-running the notebook selects the same rows

unsplash_path = '/content/drive/MyDrive/Amortized Optimal Transport/Data/unsplashImages.tsv'
unsplash_matches = glob.glob(unsplash_path)
if not unsplash_matches:
    # Fail with a readable message instead of an IndexError when Drive is not
    # mounted or the file has been moved.
    raise FileNotFoundError(f"No file found at {unsplash_path}; is Drive mounted?")
unsplash_tsv = unsplash_matches[0]
unsplash_df = pd.read_csv(unsplash_tsv, sep='\t', header=0)
print(unsplash_df.columns)
unsplash_df = unsplash_df[['photo_id', 'photo_image_url', 'photo_width', 'photo_height']]
# Cap n at the table size: DataFrame.sample raises if n exceeds the row count
# when sampling without replacement.
unsplash_df_sample = unsplash_df.sample(
    n=min(SAMPLE_SIZE, len(unsplash_df)),
    random_state=SAMPLE_SEED,
)

Index(['photo_id', 'photo_url', 'photo_image_url', 'photo_submitted_at',
       'photo_featured', 'photo_width', 'photo_height', 'photo_aspect_ratio',
       'photo_description', 'photographer_username', 'photographer_first_name',
       'photographer_last_name', 'exif_camera_make', 'exif_camera_model',
       'exif_iso', 'exif_aperture_value', 'exif_focal_length',
       'exif_exposure_time', 'photo_location_name', 'photo_location_latitude',
       'photo_location_longitude', 'photo_location_country',
       'photo_location_city', 'stats_views', 'stats_downloads',
       'ai_description', 'ai_primary_landmark_name',
       'ai_primary_landmark_latitude', 'ai_primary_landmark_longitude',
       'ai_primary_landmark_confidence', 'blur_hash'],
      dtype='object')


In [None]:
def download_image(args):
    """
    Download a single image and write it to disk.

    args: tuple of (url, save_path)
        url: image URL; a ``w=512`` query parameter is appended to it.
        save_path: path of the file the response body is written to.

    Returns True when the server answered with status 200 and the body was
    written to save_path, False otherwise. A bad URL or a failed request is
    reported with a printed message and a False return instead of an
    exception, so one bad row cannot abort a threaded batch.
    """
    url, save_path = args

    # Guard against non-string URLs (e.g. NaN from an empty TSV cell): the
    # `"?" in url` check below would otherwise raise TypeError.
    if not isinstance(url, str) or not url:
        print(f"Skipping invalid url: {url!r}")
        return False

    # append width parameter to url to save bandwidth/storage
    separator = "&" if "?" in url else "?"
    dl_url = f"{url}{separator}w=512"

    try:
        # time out in case a link is bad
        response = requests.get(dl_url, timeout=10)
        if response.status_code != 200:
            print(f"Got status code {response.status_code}")
            return False
        with open(save_path, 'wb') as f:
            f.write(response.content)
        return True
    except Exception as e:
        # Deliberately best-effort, but report the failure instead of
        # dropping it silently so bad links can be diagnosed.
        print(f"Failed to download {dl_url}: {e}")
        return False

def hydrate_dataset(df, output_dir):
    """
    Fetch every image referenced by df into output_dir using a thread pool.

    df: DataFrame with a 'photo_image_url' column.
    output_dir: destination folder; created if it does not exist yet.

    Files are named by their position in the column ("00000.jpg",
    "00001.jpg", ...), not by photo id. Prints the number of successful
    downloads; returns nothing.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # One (url, save_path) pair per row, in column order.
    jobs = [
        (link, os.path.join(output_dir, f"{position:05d}.jpg"))
        for position, link in enumerate(df['photo_image_url'])
    ]

    print("Starting threaded download...")
    with ThreadPoolExecutor(max_workers=16) as pool:
        # tqdm wraps the lazy map iterator, so the bar advances as results arrive.
        progress = tqdm(pool.map(download_image, jobs), total=len(jobs))
        outcomes = list(progress)

    print(f"Successfully downloaded {sum(outcomes)} images to {output_dir}")

def backup_to_drive(local_folder, drive_folder, zip_name='images'):
    """
    Zip the contents of local_folder and copy the archive into drive_folder.

    local_folder: directory whose contents become the root of the zip.
    drive_folder: destination directory; created if it does not exist yet.
    zip_name: base name of the archive, which is built under /content.

    A failure while copying is printed rather than raised; returns nothing.
    """
    print(f"Zipping {local_folder}...")
    zip_file = shutil.make_archive(
        base_name=f'/content/{zip_name}',
        format='zip',
        root_dir=local_folder,
    )

    destination_missing = not os.path.exists(drive_folder)
    if destination_missing:
        os.makedirs(drive_folder)

    print(f"Copying to {drive_folder}...")
    try:
        shutil.copy(zip_file, drive_folder)
        print("complete")
    except Exception as err:
        print(f"error: {err}")

In [None]:
# Download the sampled images to local Colab storage, then zip that folder
# and copy the archive into the Drive data folder.
output_dir = "/content/dataset/raw_images"
drive_data_dir = '/content/drive/MyDrive/Amortized Optimal Transport/Data'

hydrate_dataset(df=unsplash_df_sample, output_dir=output_dir)
backup_to_drive(local_folder=output_dir, drive_folder=drive_data_dir)

Starting threaded download...


100%|██████████| 2000/2000 [01:13<00:00, 27.27it/s]


Successfully downloaded 2000 images to /content/dataset/raw_images
Zipping /content/dataset/raw_images...
Copying to /content/drive/MyDrive/Amortized Optimal Transport/Data...
complete
