In [1]:
import pandas as pd
import requests
import os
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- Configuration ---------------------------------------------------------
# Input CSV and the two columns this script reads from it.
csv_file_path = '/home/student/Downloads/yt_trending_data_cleaned.csv'
url_column_name = 'thumbnail_link'  # column containing the image URLs
region_column_name = 'region'  # column containing the region information

# Output directory for the downloaded images.
images_directory = '/home/student/Downloads/mongoimages/US_img'

# --- Load ------------------------------------------------------------------
# Create the output directory up front (no error if it already exists),
# then read the CSV and keep only the rows whose region is "US".
os.makedirs(images_directory, exist_ok=True)
df = pd.read_csv(csv_file_path)
df = df.loc[df[region_column_name] == 'US']

# --- Tallies ---------------------------------------------------------------
# Updated by handle_result() as each download finishes.
downloaded_count = failed_count = 0

# Function to download an image
def download_image(url, directory, index):
    """Download one image and save it as ``image_<index>.jpg``.

    Args:
        url: Address of the image to fetch.
        directory: Existing directory the file is written into.
        index: Value embedded in the output file name.

    Returns:
        A ``(success, url)`` tuple. ``success`` is ``True`` only when the
        server answered with HTTP 200 and the body was written to disk;
        any network error, non-200 status or filesystem error yields
        ``False``.
    """
    # Network phase: fetch the whole body (10-second timeout).
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return False, url

    if response.status_code != 200:
        return False, url

    # Disk phase: a write failure (disk full, permission denied, ...) is
    # reported as a failed download instead of escaping the worker thread,
    # where it would be re-raised by future.result() and abort the batch.
    file_path = os.path.join(directory, f"image_{index}.jpg")
    try:
        with open(file_path, 'wb') as f:
            f.write(response.content)
    except OSError:
        return False, url

    return True, url

# Function to handle the results of the downloads
def handle_result(future):
    """Record the outcome of one finished download in the global tallies.

    Args:
        future: A completed future whose result is the ``(success, url)``
            pair returned by ``download_image``.

    Increments ``downloaded_count`` on success and ``failed_count``
    otherwise. ``future.result()`` re-raises any exception the worker
    raised.
    """
    global downloaded_count, failed_count
    ok, _url = future.result()
    if not ok:
        failed_count += 1
        return
    downloaded_count += 1

# Download the images concurrently on a pool of 10 worker threads and tally
# each result as soon as its download finishes.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Series.items() yields (index label, URL) pairs for the URL column
    # directly; iterrows() would build a full Series for every row just to
    # read this one field.
    futures = [
        executor.submit(download_image, url, images_directory, index)
        for index, url in df[url_column_name].items()
    ]
    for future in as_completed(futures):
        handle_result(future)

# Print summary
print(f"Total downloaded: {downloaded_count}")
print(f"Total failed: {failed_count}")


Total downloaded: 242418
Total failed: 6411
