In [3]:
import pandas as pd
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
from tqdm import tqdm

In [4]:
# Function to download an image from a link and save it
def download_image(row, folder_path, timeout=30):
    """Download one image and save it as ``<image stem>_<entity_name><ext>``.

    Parameters
    ----------
    row : mapping
        Must support ``row['image_link']`` (the URL to fetch) and
        ``row['entity_name']`` (appended to the saved file's stem).
    folder_path : str
        Existing directory the image is written into.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block a
        worker thread forever. Defaults to 30 seconds.

    Returns
    -------
    None
        Failures are reported with ``print`` rather than raised, so one bad
        row never aborts a batch run.
    """
    # Local import keeps this block self-contained; used for a per-thread temp name.
    import threading

    image_url = row['image_link']
    entity_name = row['entity_name']

    temp_path = None
    try:
        # Parsing happens inside the try so a malformed or missing (NaN) link
        # is reported like any other per-row failure instead of propagating.
        image_name = os.path.basename(urlparse(image_url).path)
        stem, extension = os.path.splitext(image_name)
        save_path = os.path.join(folder_path, f"{stem}_{entity_name}{extension}")

        # The context manager releases the streamed connection on every path.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download {image_url}")
                return

            # Write to a thread-unique temporary file first so an interrupted
            # transfer never leaves a truncated image under the final name.
            temp_path = f"{save_path}.{threading.get_ident()}.part"
            with open(temp_path, 'wb') as img_file:
                for chunk in response.iter_content(1024):
                    img_file.write(chunk)

        # Only a fully written file is moved to its final name.
        os.replace(temp_path, save_path)
        temp_path = None
    except Exception as e:
        print(f"Error downloading {image_url}: {e}")
    finally:
        # Remove the partial file left behind by a failed transfer.
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)

In [5]:
# Main function to read the CSV and start downloading images
def download_images_from_csv(csv_file, folder_path, num_threads=10):
    """Read a CSV of image links and download every image concurrently.

    Parameters
    ----------
    csv_file : str
        Path to a CSV file. Each row is handed to ``download_image``, which
        reads its ``image_link`` and ``entity_name`` fields.
    folder_path : str
        Directory to save images into; created if it does not exist.
    num_threads : int, optional
        Maximum number of worker threads. Defaults to 10.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_path, exist_ok=True)

    # Plain dicts support the row['column'] lookups download_image performs
    # and avoid constructing one pandas Series per row via iterrows().
    rows = df.to_dict("records")

    # Use ThreadPoolExecutor to download images concurrently
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = executor.map(lambda row: download_image(row, folder_path), rows)
        # Draining the iterator drives the tqdm progress bar and surfaces any
        # exception raised inside a worker.
        list(tqdm(results, total=len(rows)))

In [None]:
if __name__ == "__main__":
    # Input: CSV listing the image links to fetch.
    csv_file = r"E:/Random Python Scripts/Amazon ML Challenge/Dataset/train.csv"

    # Output: directory that receives the downloaded images.
    folder_path = r'F:/Amazon ML Challlenge'

    # Kick off the multithreaded download; tune num_threads to the machine/network.
    download_images_from_csv(
        csv_file=csv_file,
        folder_path=folder_path,
        num_threads=16,
    )

  0%|                                                                          | 360/263859 [01:06<11:45:51,  6.22it/s]

Failed to download https://m.media-amazon.com/images/I/1yw53vfQtS.jpg


  1%|▍                                                                       | 1440/263859 [05:00<292:01:53,  4.01s/it]

Failed to download https://m.media-amazon.com/images/I/DzP2RMRQO0.jpg
Error downloading https://m.media-amazon.com/images/I/71t2sNVC+1L.jpg: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed to download https://m.media-amazon.com/images/I/lwd2cSmT2ux.jpg
Failed to download https://m.media-amazon.com/images/I/VCEdbX8DT28.jpg
Failed to download https://m.media-amazon.com/images/I/J2DXsUjR8ay.jpg
Error downloading https://m.media-amazon.com/images/I/81Tu6FYslZL.jpg: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
Error downloading https://m.media-amazon.com/images/I/81cFH-eOP4L.jpg: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or establis