In [None]:
import requests
import os
import time

In [None]:
def scrape_faces_in_news(query_list, output_dir, max_results=10, delay=5):
    """
    Scrapes Wikimedia Commons for images based on a list of search queries.

    For each query the Commons search API is asked for up to ``max_results``
    file pages. Every hit is resolved to its file URL with a ``prop=imageinfo``
    request and the file is written into ``output_dir``. A failure that affects
    a single query or a single image (network error, HTTP error status,
    non-JSON reply, unwritable file) is printed and skipped so that the
    remaining work still runs.

    Args:
        query_list (list): List of search queries (e.g., ["face AND news", "portrait AND news"]).
        output_dir (str): Directory to save the images (created if it does not exist).
        max_results (int): Maximum number of images to download per query.
        delay (int): Delay (in seconds) after each download attempt to avoid rate limits.

    Returns:
        None. Progress and skipped items are reported with ``print``.
    """
    base_url = "https://commons.wikimedia.org/w/api.php"
    # Wikimedia's User-Agent policy asks clients to identify themselves with a
    # descriptive string; consider appending contact details for real runs.
    headers = {"User-Agent": "faces-in-news-scraper/1.0 (Jupyter notebook; python-requests)"}
    # Without a timeout a stalled connection would block the notebook forever.
    timeout = 30  # seconds
    # requests raises RequestException for network/HTTP problems and a
    # ValueError subclass when a response body is not valid JSON.
    api_errors = (requests.RequestException, ValueError)

    def fetch_json(params):
        """GET the Commons API with the given params and return the decoded JSON."""
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Loop through each search query
    for query in query_list:
        print(f"\nSearching for: '{query}'\n" + "-" * 40)

        # Step 1: Search for images
        search_params = {
            "action": "query",
            "format": "json",
            "list": "search",
            "srsearch": query,
            "srnamespace": 6,  # Media files only
            "srlimit": max_results  # Limit the number of results
        }
        try:
            search_results = fetch_json(search_params).get('query', {}).get('search', [])
        except api_errors as exc:
            print(f"Search failed for query: '{query}' ({exc}). Moving to the next query.\n")
            continue

        if not search_results:
            print(f"No more images found for query: '{query}'. Moving to the next query.\n")
            continue  # Skip to the next query if no results are found

        # Step 2: Resolve and download each image
        for result in search_results:
            title = result['title']  # e.g., "File:Example.jpg"

            # Get image information (URL and metadata)
            image_info_params = {
                "action": "query",
                "format": "json",
                "titles": title,
                "prop": "imageinfo",
                "iiprop": "url|extmetadata|size"  # Fetch image URLs and metadata
            }
            try:
                pages = fetch_json(image_info_params).get('query', {}).get('pages', {})
            except api_errors as exc:
                print(f"Skipping {title}: could not fetch image info ({exc})")
                continue

            for page_data in pages.values():
                image_info = page_data.get("imageinfo")
                if not image_info:
                    continue
                image_url = image_info[0].get("url")
                if not image_url:
                    continue

                # `or {}` guards against a missing or empty extmetadata value.
                ext_metadata = image_info[0].get("extmetadata") or {}
                license_info = ext_metadata.get("LicenseShortName", {}).get("value", "Unknown License")

                # Build a flat local file name: drop the namespace prefix and
                # neutralise spaces and path separators so the file always
                # lands directly inside output_dir.
                local_name = title[len("File:"):] if title.startswith("File:") else title
                for unsafe in (" ", "/", "\\"):
                    local_name = local_name.replace(unsafe, "_")
                file_name = os.path.join(output_dir, local_name)

                # Step 3: Download the image; only write it when the server
                # answered successfully, so error pages are never saved as images.
                try:
                    image_response = requests.get(image_url, headers=headers, timeout=timeout)
                    image_response.raise_for_status()
                    with open(file_name, 'wb') as file:
                        file.write(image_response.content)
                except (requests.RequestException, OSError) as exc:
                    print(f"Skipping {title}: download failed ({exc})")
                else:
                    print(f"Downloaded: {file_name} (License: {license_info})")

                # Delay to avoid rate-limiting
                if delay and delay > 0:
                    time.sleep(delay)

        # Fewer hits than requested means the search had nothing more to offer.
        if len(search_results) < max_results:
            print(f"No more results for query: '{query}'.\n")
        else:
            print(f"Finished processing all available images for query: '{query}'.\n")

    print("Scraping and downloading completed for all queries.")

In [None]:
# Demo run: search terms that pair face/portrait wording with news/journalism.
query_combinations = [
    "face AND news",
    "portrait AND journalist",
    "headshot AND news",
    "reporter AND face",
    "journalist portrait",
]

# Save at most 10 files per search term into ./faces_in_news_images,
# using a 5-second delay to stay clear of rate limits.
scrape_faces_in_news(query_combinations, "faces_in_news_images", max_results=10, delay=5)