In [3]:
import pandas as pd
import requests
import os
from tqdm import tqdm
import time
from pathlib import Path

def download_image(url, save_path):
    """Download one image from ``url`` and store it at ``save_path``.

    The request is sent with a browser-like User-Agent and a 15 second
    timeout. The response is accepted only if its Content-Type header
    mentions jpeg, jpg, png or webp. The bytes are first written to a
    sibling ``<name>.part`` file and then moved into place, so ``save_path``
    never holds a truncated image; code that treats "file exists" as
    "already downloaded" would otherwise accept a partial file.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path (``str`` or ``pathlib.Path``).

    Returns:
        True if the image was fetched and saved, False on any error.
        Errors are printed rather than raised.
    """
    # Headers emulating a web browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Check the content-type before touching the disk
        content_type = response.headers.get('content-type', '')
        if not any(img_type in content_type.lower() for img_type in ['jpeg', 'jpg', 'png', 'webp']):
            raise ValueError(f"Invalid content type: {content_type}")

        target = Path(save_path)
        partial = target.with_name(target.name + ".part")
        try:
            with open(partial, 'wb') as f:
                f.write(response.content)
            # Move into place only once the whole payload is on disk.
            partial.replace(target)
        except BaseException:
            # Also covers KeyboardInterrupt: never leave a half-written file.
            if partial.exists():
                partial.unlink()
            raise
        return True
    except Exception as e:
        print(f"Error downloading {url}: {str(e)}")
        return False

def main():
    """Download every image listed in the captions CSV into ``images/``.

    Reads ``./data/traffic_images_100_short_captions.csv`` and, for each row,
    saves the image at ``original_url`` as ``images/img_<index>.<ext>``,
    where ``<index>`` is the zero-padded DataFrame index. The extension is
    taken from the URL path (ignoring any query string or fragment) and
    falls back to ``.jpg`` when it is missing or not a recognised image
    extension. Rows without a URL are skipped. Files that already exist are
    counted as successes without being downloaded again. Each download is
    tried up to three times, sleeping 1s then 2s between attempts.

    Prints a progress bar and a final success summary; returns nothing.
    """
    # Local import: only this function needs URL parsing.
    from urllib.parse import urlparse

    allowed_exts = {'.jpg', '.jpeg', '.png', '.webp'}
    max_retries = 3

    # Read CSV file
    csv_path = "./data/traffic_images_100_short_captions.csv"
    df = pd.read_csv(csv_path)

    # Create image directory if it doesn't exist
    image_dir = Path("images")
    image_dir.mkdir(exist_ok=True)

    # Download images with progress bar
    print("Downloading images...")
    success_count = 0

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        url = row['original_url']

        # An empty CSV cell is read as NaN (a float), which has no string
        # methods; skip such rows instead of crashing the whole run.
        if not isinstance(url, str) or not url.strip():
            print(f"Skipping row {idx}: missing URL")
            continue

        # Create filename with index
        filename = f"img_{idx:06d}"  # Format: img_000000, img_000001, etc.

        # Determine extension from the URL *path* so that a query string or
        # fragment ("photo.png?w=500") does not hide the real extension.
        try:
            url_path = urlparse(url).path
        except ValueError:
            # Malformed URL: fall back to the default extension and let the
            # download attempt report the actual error.
            url_path = ''
        ext = os.path.splitext(url_path)[1]
        if not ext or ext.lower() not in allowed_exts:
            ext = '.jpg'

        save_path = image_dir / f"{filename}{ext}"

        # Skip if file already exists
        if save_path.exists():
            success_count += 1
            continue

        # Try downloading with increasing delays on failure
        delay = 1

        for attempt in range(max_retries):
            if download_image(url, save_path):
                success_count += 1
                break
            elif attempt < max_retries - 1:  # Don't sleep after last attempt
                time.sleep(delay)
                delay *= 2  # Exponential backoff

    print(f"\nDownload completed! Successfully downloaded {success_count}/{len(df)} images")

# Start the download only when this code is run directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Downloading images...


  0%|          | 0/83 [00:00<?, ?it/s]

100%|██████████| 83/83 [00:21<00:00,  3.81it/s]


Download completed! Successfully downloaded 83/83 images



