In [None]:
import ast
import os
import shutil

import pandas as pd
import requests

In [None]:
class ImageScrape:
    """
    Download the images referenced by a dataset into a local directory tree.

    Files are written to
    ``<destination_dir>/<dataset_name>/<dataset_type>/<ecomm_shop>/<row>/<n><img_extension>``
    where ``<row>`` is the position of a URL list inside ``img_urls`` and
    ``<n>`` is the position of the URL inside that list.
    """

    # Number of bytes requested per read while streaming an image body to disk.
    CHUNK_SIZE = 8192

    def __init__(self, img_urls, dataset_name, dataset_type, ecomm_shop, destination_dir, img_extension=".jpeg",
                 timeout=30):
        """
        Initialize Scraper Attributes

        Args:
            img_urls (list): URL columns from the dataset; one iterable of URLs per row
            dataset_name (str): name of the dataset
            dataset_type (str): train or test
            ecomm_shop (str): ecommerce shop name, i.e., amazon, walmart etc
            destination_dir (str): local_dir path
            img_extension (str): default image extension
            timeout (float | tuple | None): timeout in seconds handed to ``requests.get``
                so that a stalled server cannot block the scrape forever
        """
        self.img_urls = img_urls
        self.dataset_name = dataset_name
        self.dataset_type = dataset_type
        self.ecomm_shop = ecomm_shop
        self.img_extension = img_extension
        self.destination_dir = destination_dir
        self.timeout = timeout

    @property
    def absolute_path(self):
        """Directory (with trailing slash) below which the per-row folders are created."""
        return f"{self.destination_dir}/{self.dataset_name}/{self.dataset_type}/{self.ecomm_shop}/"

    def download_images(self, img_urls, row_number):
        """
        Download the images from list of urls of each row

        A failing URL (non-200 status, connection error, timeout, invalid URL,
        broken transfer) is reported on stdout and skipped; the remaining URLs
        of the row are still attempted.

        Args:
            img_urls: (<img_url1>, <img_url2> . .)
            row_number: row number
        """

        print(f"------ Download images from row: {row_number} ------")

        # os.path.join copes with the trailing slash of absolute_path, so no
        # "//" ends up in the row directory.
        row_dir = os.path.join(self.absolute_path, str(row_number))
        os.makedirs(row_dir, exist_ok=True)

        for img_ind, image_url in enumerate(img_urls):
            target = os.path.join(row_dir, f"{img_ind}{self.img_extension}")
            # Stream into a temporary name first so that an interrupted
            # transfer never leaves a truncated image under the final name.
            partial = f"{target}.part"

            try:
                # The context manager releases the connection even on errors.
                with requests.get(image_url, stream=True, timeout=self.timeout) as response:
                    if response.status_code != 200:
                        print(f"Image download failed: {row_number}: {img_ind}")
                        continue

                    with open(partial, "wb") as fp:
                        # iter_content undoes gzip/deflate transfer encodings,
                        # which copying ``response.raw`` directly does not.
                        for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                            if chunk:
                                fp.write(chunk)

                os.replace(partial, target)
            except requests.RequestException as error:
                if os.path.exists(partial):
                    os.remove(partial)
                print(f"Image download failed: {row_number}: {img_ind} ({error})")

    def process(self):
        """
        Process the URLs for every dataset, type and ecomm_site
        """
        for index, url_list in enumerate(self.img_urls):
            self.download_images(url_list, index)



In [None]:
def scrape_images(file_names, dataset_path, dataset_name, shop_names):
    """
    Download images with the arguments passed

    Each CSV is read from ``<dataset_path>/<dataset_name>/<file>``. The
    ``image_url1`` column is attributed to ``shop_names[0]`` and the
    ``image_url2`` column to ``shop_names[1]``. Every cell of these columns is
    parsed as a Python literal holding the image URLs of that row. The images
    are stored below ``images/`` by ``ImageScrape``.

    Args:
        file_names (list): train/test filenames; a name containing "train" is
            stored as "train", every other name as "test"
        dataset_path (str): dataset location
        dataset_name (str): dataset name
        shop_names (list): e-commerce shop names, one per URL column

    Raises:
        IndexError: if ``shop_names`` has fewer entries than there are URL columns.
        ValueError, SyntaxError: if a URL cell is not a plain Python literal.
    """

    col_names = ["image_url1", "image_url2"]

    for file in file_names:
        df = pd.read_csv(f"{dataset_path}/{dataset_name}/{file}")
        dataset_type = "train" if "train" in file else "test"

        for col_index, url_col in enumerate(col_names):
            shop = shop_names[col_index]

            # SECURITY: the cells come from an external CSV. They used to be
            # passed to eval(), which executes arbitrary code embedded in the
            # data. literal_eval only accepts literals (lists, tuples,
            # strings, ...) and rejects everything else.
            urls = [ast.literal_eval(url_list) for url_list in df[url_col]]

            scraper = ImageScrape(img_urls=urls,
                                  dataset_name=dataset_name,
                                  dataset_type=dataset_type,
                                  ecomm_shop=shop,
                                  destination_dir="images/")

            scraper.process()

In [None]:
# Fetch every image of the ProMapEn dataset (train and test split):
# "image_url1" is stored under the first shop name, "image_url2" under the second.
scrape_images(
    file_names=["promapen-train_data.csv", "promapen-test_data.csv"],
    dataset_path="datasets",
    dataset_name="ProMapEn",
    shop_names=["walmart", "amazon"],
)
