In [1]:
import os
import requests
import pandas as pd
import concurrent.futures


## Download the images and recreate their original S3 directory structure (run the S3-to-HTTPS URL conversion first)
- Uses threading to speed up the process (the work is I/O-bound, so multiprocessing is not necessary)
- Change the variable `output_directory` to the destination directory
- The default is `test_download`; the program creates it if it does not exist

In [2]:
# Load the CSV that lists the images; its "url" column is read in the next cell.
df = pd.read_csv("results_csv/html_fruits.csv")


In [3]:
# Every image URL to download, taken from the "url" column of the CSV.
urls = df["url"].tolist()

# Root folder that receives the downloads; change it to save somewhere else.
output_directory = "test_download"

# exist_ok=True creates the folder only when it is missing, with no separate
# exists() check that could go stale between the check and the creation.
os.makedirs(output_directory, exist_ok=True)


In [4]:
def download_and_categorize(image_url, timeout=30):
    """
    Downloads and saves an image based on its url, assuming :
    url.split("/")[-2] is category and url.split("/")[-1] is name
    Creates directory with category name if not exists, saves the image
    as {output_directory}/{category}/{name}.

    Args:
    - image_url : url of an image, with : url.split("/")[-2] = category && url.split("/")[-1] = name
    - timeout : seconds passed to requests.get as its timeout, so a stalled
      connection cannot block a worker thread forever (default: 30)

    Raises:
    - requests.exceptions.HTTPError : the server answered with a 4xx/5xx status
    - requests.exceptions.RequestException : any other request failure (timeout, connection error, ...)
    """

    url_split = image_url.split("/")
    category = url_split[-2]
    name = url_split[-1]

    response = requests.get(image_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving the error page as if it were an image.
    response.raise_for_status()

    category_directory = f"{output_directory}/{category}/"
    # exist_ok=True makes the creation safe when several threads reach the same
    # new category at once; a separate exists() check followed by makedirs()
    # could raise FileExistsError in the thread that loses the race.
    os.makedirs(category_directory, exist_ok=True)
    with open(f"{output_directory}/{category}/{name}", "wb") as image_file:
        image_file.write(response.content)



In [5]:
# Downloads with concurrency
# Each URL is submitted as its own task and every future is inspected, so an
# exception raised inside a worker is reported here. Calling executor.map
# without consuming its result iterator would silently drop those exceptions.
failed_downloads = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_url = {
        executor.submit(download_and_categorize, url): url for url in urls
    }
    for future in concurrent.futures.as_completed(future_to_url):
        error = future.exception()
        if error is not None:
            failed_downloads.append((future_to_url[future], error))

print(f"{len(urls) - len(failed_downloads)}/{len(urls)} downloads succeeded")
for failed_url, error in failed_downloads:
    print(f"FAILED {failed_url}: {error!r}")
