In [11]:
import html
import os
import random
import re
import time
from io import BytesIO
from urllib.parse import quote_plus, urlparse

import cv2
import numpy as np
import requests

def create_directory(directory):
    """Make sure ``directory`` exists on disk and hand the same path back.

    Args:
        directory: Path of the folder to create; missing parent folders are
            created as well.

    Returns:
        The ``directory`` argument, unchanged.
    """
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return directory

    # Nothing at that path yet: build it, parents included.
    os.makedirs(directory)
    print(f"Directory '{directory}' created.")
    return directory

def get_bing_image_urls(query, count=20, timeout=10):
    """Scrape direct image URLs from one Bing Images results request.

    Args:
        query: Free-text search phrase. It is URL-encoded before being placed
            in the request, so characters such as ``&`` or ``#`` are safe.
        count: Maximum number of distinct URLs to return. Values <= 0 yield an
            empty list without contacting Bing.
        timeout: Seconds to wait for Bing before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        A list of at most ``count`` unique image URLs, in the order they appear
        in the page. An empty list is returned when the request or the parsing
        fails; the error is printed rather than raised.
    """
    print(f"Searching for '{query}' images on Bing...")

    if count <= 0:
        print("Found 0 image URLs.")
        return []

    # quote_plus encodes spaces as '+' (as before) and escapes everything else
    # that would otherwise break the query string.
    search_url = f"https://www.bing.com/images/search?q={quote_plus(query)}&form=HDRSC2&first=1"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    }

    try:
        response = requests.get(search_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # The page embeds HTML-escaped metadata; each image's source URL sits
        # between 'murl&quot;:&quot;' and the next '&quot;'. The lazy match lets
        # URLs that contain '&amp;' through instead of discarding them.
        img_pattern = r'murl&quot;:&quot;(https?://.*?)&quot;'

        image_urls = []
        seen = set()  # constant-time duplicate check; the list keeps page order
        for raw_url in re.findall(img_pattern, response.text):
            # Undo the HTML escaping, then drop escaping backslashes.
            clean_url = html.unescape(raw_url).replace('\\', '')
            if clean_url in seen:
                continue
            seen.add(clean_url)
            image_urls.append(clean_url)
            if len(image_urls) >= count:
                break

        print(f"Found {len(image_urls)} image URLs.")
        return image_urls

    except Exception as e:
        # Best-effort scraper: report the problem and let the caller carry on.
        print(f"Error searching Bing: {e}")
        return []

def is_valid_image(image_data):
    """Report whether OpenCV can turn ``image_data`` into a usable colour image.

    Args:
        image_data: Raw encoded image bytes.

    Returns:
        True when the bytes decode to a non-empty image that also survives a
        BGR-to-grayscale conversion; False otherwise. Any exception raised
        along the way is printed and treated as a failed validation.
    """
    try:
        raw_bytes = np.frombuffer(image_data, np.uint8)
        decoded = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)

        # A None or zero-sized result means the decode produced nothing usable.
        if decoded is not None and decoded.size != 0:
            # Run the conversion purely as a smoke test; its result is discarded.
            cv2.cvtColor(decoded, cv2.COLOR_BGR2GRAY)
            return True
        return False
    except Exception as e:
        print(f"Image validation error: {e}")
        return False

# Leading bytes that identify a container format. Checked first because the
# payload itself is more trustworthy than server-supplied metadata.
_IMAGE_SIGNATURES = (
    (b'\xff\xd8\xff', 'jpg'),
    (b'\x89PNG\r\n\x1a\n', 'png'),
    (b'GIF87a', 'gif'),
    (b'GIF89a', 'gif'),
)

# Substring searched for in the Content-Type / URL path -> extension to use.
# Order is the lookup priority.
_EXTENSION_TOKENS = (
    ('jpeg', 'jpg'),
    ('jpg', 'jpg'),
    ('png', 'png'),
    ('webp', 'webp'),
    ('gif', 'gif'),
)


def _guess_extension(image_data, content_type, url):
    """Choose a file extension from the payload, then the header, then the URL path."""
    for signature, ext in _IMAGE_SIGNATURES:
        if image_data.startswith(signature):
            return ext
    # WebP is a RIFF container: 'RIFF', a 4-byte size, then 'WEBP'.
    if image_data[:4] == b'RIFF' and image_data[8:12] == b'WEBP':
        return 'webp'

    # Header values are matched case-insensitively.
    content_type = (content_type or '').lower()
    for token, ext in _EXTENSION_TOKENS:
        if token in content_type:
            return ext

    # Only the path is inspected so that host names or query parameters such
    # as '?thumb=x.jpg' cannot decide the extension.
    url_path = urlparse(url).path.lower()
    for token, ext in _EXTENSION_TOKENS:
        if f'.{token}' in url_path:
            return ext

    return 'jpg'  # Default to jpg


def download_image(url, directory, file_name):
    """Download one image, validate it with OpenCV, and save it to ``directory``.

    The extension of the saved file is chosen from the image's leading bytes
    when they match a known signature, otherwise from the ``Content-Type``
    header, otherwise from the URL path, and finally defaults to ``jpg``.

    Args:
        url: Address of the image to fetch.
        directory: Existing folder the file is written into.
        file_name: Base name of the file, without extension.

    Returns:
        True when the image was fetched, passed ``is_valid_image``, was written
        to disk and could be read back by OpenCV; False in every other case.
        Failures are printed rather than raised, and no file is left behind.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.bing.com/images',
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Get the image data
        image_data = response.content

        # Validate that OpenCV can read this image before touching the disk
        if not is_valid_image(image_data):
            print(f"Skipping invalid image from {url}")
            return False

        ext = _guess_extension(image_data, response.headers.get('Content-Type', ''), url)

        # Full path with file name
        file_path = os.path.join(directory, f"{file_name}.{ext}")

        # Save the image; a failed write must not leave a truncated file that
        # later cells would pick up as a valid sample.
        try:
            with open(file_path, 'wb') as f:
                f.write(image_data)
        except OSError:
            if os.path.exists(file_path):
                os.remove(file_path)
            raise

        # Double-check the saved file
        try:
            img = cv2.imread(file_path)
            if img is None or img.size == 0:
                print(f"Warning: Saved file {file_path} cannot be read by OpenCV. Removing it.")
                os.remove(file_path)
                return False
            # Try converting to grayscale as a final validation
            cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        except Exception as e:
            print(f"Post-save validation failed for {file_path}: {e}. Removing file.")
            if os.path.exists(file_path):
                os.remove(file_path)
            return False

        print(f"Downloaded and validated: {file_name}.{ext}")
        return True

    except Exception as e:
        # Best-effort: one bad URL should not abort the whole batch.
        print(f"Error downloading {url}: {e}")
        return False

def download_hardik_pandya_images(count=20, output_dir='hardik_pandya_images',
                                 queries=None):
    """Collect Hardik Pandya images that OpenCV can read into ``output_dir``.

    Image URLs are gathered from Bing for every query, de-duplicated, shuffled,
    and then downloaded one by one until ``count`` images have passed
    validation or the URL pool is exhausted.

    Args:
        count: Number of validated images wanted.
        output_dir: Folder that receives the files; created when missing.
        queries: Search phrases to use. ``None`` selects a built-in list of
            portrait-style queries; an empty sequence downloads nothing.

    Returns:
        The output directory path, as returned by ``create_directory``.
    """
    if queries is None:
        queries = [
            'Hardik Pandya cricket portrait',
            'Hardik Pandya face clear',
            'Hardik Pandya close up',
            'Hardik Pandya headshot',
            'Hardik Pandya India team portrait'
        ]

    # Create the output directory
    output_dir = create_directory(output_dir)

    # Without this guard the per-query split below divides by zero.
    if not queries:
        print("No search queries supplied; nothing to download.")
        return output_dir

    # Ask for 2x the needed count since some candidates will fail validation.
    extra_factor = 2
    adjusted_count = count * extra_factor

    # Spread the request evenly; the first `remaining` queries take one extra.
    images_per_query, remaining = divmod(adjusted_count, len(queries))

    all_image_urls = []
    seen_urls = set()  # the same picture is often returned by several queries
    for i, query in enumerate(queries):
        query_count = images_per_query + (1 if i < remaining else 0)
        if query_count > 0:
            for url in get_bing_image_urls(query, query_count):
                if url not in seen_urls:
                    seen_urls.add(url)
                    all_image_urls.append(url)

    # Shuffle to get a mix of different types of images
    random.shuffle(all_image_urls)

    # Download and validate images
    print("\nDownloading and validating images...")

    successful_downloads = 0
    for attempt, url in enumerate(all_image_urls):
        if successful_downloads >= count:
            break

        # Small delay between requests to avoid being blocked; not needed
        # before the first request or after the last one.
        if attempt > 0:
            time.sleep(0.5)

        # Files are numbered by successes so the sequence has no gaps.
        file_name = f"hardik_pandya_{successful_downloads+1}"
        if download_image(url, output_dir, file_name):
            successful_downloads += 1
            print(f"Progress: {successful_downloads}/{count} valid images downloaded")

    if successful_downloads < count:
        print(f"Ran out of image URLs. Only found {successful_downloads} valid images out of {count} requested.")

    print(f"\nDownload complete! {successful_downloads} validated images saved to '{output_dir}'.")
    return output_dir

In [12]:
# Request 200 validated images in ./hardik_images, using three custom search
# phrases instead of the function's built-in query list.
download_hardik_pandya_images(
    queries=['Hardik Pandya batting', 'Hardik Pandya FAMILY', 'Hardik Pandya teammates'],
    output_dir='hardik_images',
    count=200,
)

Directory 'hardik_images' created.
Searching for 'Hardik Pandya batting' images on Bing...
Found 83 image URLs.
Searching for 'Hardik Pandya FAMILY' images on Bing...
Found 90 image URLs.
Searching for 'Hardik Pandya teammates' images on Bing...
Found 30 image URLs.

Downloading and validating images...
Downloaded and validated: hardik_pandya_1.webp
Progress: 1/200 valid images downloaded
Error downloading https://editorial01.shutterstock.com/preview-440/10322624di/24b28cd5/Shutterstock_10322624di.jpg: 404 Client Error: Not Found for url: https://editorial01.shutterstock.com/preview-440/10322624di/24b28cd5/Shutterstock_10322624di.jpg
Downloaded and validated: hardik_pandya_2.jpg
Progress: 2/200 valid images downloaded
Downloaded and validated: hardik_pandya_3.jpg
Progress: 3/200 valid images downloaded
Downloaded and validated: hardik_pandya_4.jpg
Progress: 4/200 valid images downloaded
Downloaded and validated: hardik_pandya_5.webp
Progress: 5/200 valid images downloaded
Downloaded an

'hardik_images'