In [None]:
### Extract
import requests
import json
import os
import time
from urllib.parse import quote


def get_next_page(response):
    """Return the URL of the following result page, or None if there is none.

    `response` is the decoded JSON body of an API reply (a dict). The link is
    read from response["paging"]["next"]; a reply without a "paging" section
    is treated as having no further page.
    """
    paging = response.get("paging", {})
    return paging.get("next")


def fetch_data(api_url, timeout=30):
    """Fetches paginated data from the API.

    Starting at `api_url`, downloads one page at a time, collects the items of
    each page's "data" list and follows the "paging.next" link until no
    further page is announced.

    Parameters:
        api_url (str): URL of the first page.
        timeout (float): Seconds to wait for each HTTP response (default: 30).

    Returns:
        list: The collected records. If a request fails or a reply is not
        valid JSON, the error is printed and the records gathered up to that
        point are returned.
    """
    all_data = []

    while api_url:
        try:
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(api_url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
            json_data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers invalid JSON on requests versions that do not
            # wrap it in a RequestException subclass.
            print(f"Request failed: {e}")
            break

        data = json_data.get("data", [])
        all_data.extend(data)
        print(f"Extracted {len(data)} records. Total: {len(all_data)}")

        api_url = get_next_page(json_data)
        if api_url:
            time.sleep(2)  # Respect API rate limits; no need to wait after the last page

    return all_data


def _safe_filename_part(text):
    """Replace characters that are not allowed in file names with '_'."""
    forbidden = '<>:"/\\|?*'
    return "".join("_" if ch in forbidden or ord(ch) < 32 else ch for ch in text)


def extract_data(bylines, output_dir, access_token, year, api_version="v21.0", country="CZ", language="cs"):
    """
    Extracts and saves ad data from the Facebook Ads API for a list of bylines.

    For every byline one paginated query is sent through fetch_data() and the
    result is written to "<output_dir>/data_<byline>.json". Bylines that yield
    no records produce no file.

    Parameters:
        bylines (list): List of bylines to query.
        output_dir (str): Directory to save the JSON files (created if missing).
        access_token (str): Facebook API access token.
        year (int): Only ads delivered on or after April 1 of this year are requested.
        api_version (str): API version (default: v21.0).
        country (str): Country code for filtering ads (default: CZ).
        language (str): Language code for filtering ads (default: cs).
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists

    fields = ",".join([
        "id", "ad_snapshot_url", "ad_creation_time", "ad_creative_bodies", "ad_creative_link_captions",
        "ad_creative_link_descriptions", "ad_creative_link_titles", "ad_delivery_start_time",
        "ad_delivery_stop_time", "bylines", "currency", "delivery_by_region", "demographic_distribution",
        "estimated_audience_size", "impressions", "languages", "page_id", "page_name", "publisher_platforms",
        "spend", "target_locations", "target_gender", "target_ages", "eu_total_reach", "beneficiary_payers",
        "age_country_gender_reach_breakdown"
    ])
    # Percent-encode the token so that reserved characters cannot corrupt the query string.
    encoded_token = quote(access_token, safe="")

    for byline in bylines:
        print(f"Extracting Data for: {byline}")
        # The API expects the byline filter as a JSON-style array: ["<byline>"]
        encoded_byline = quote(f'["{byline}"]')

        api_url = (
            f"https://graph.facebook.com/{api_version}/ads_archive?"
            f"bylines={encoded_byline}&ad_type=POLITICAL_AND_ISSUE_ADS"
            f"&ad_reached_countries=['{country}']&access_token={encoded_token}"
            f"&unmask_removed_content=true&fields={fields}&limit=199"
            f"&search_terms=''&languages=['{language}']"
            f"&ad_delivery_date_min={year}-04-01"
            # f"&ad_delivery_date_max={year+1}-01-01"
        )

        extracted_data = fetch_data(api_url)
        time.sleep(5)  # Delay between different bylines

        if extracted_data:
            # Bylines are free text; characters such as '/', ':' or '"' would make the
            # path invalid or point outside output_dir, so they are replaced first.
            filename = os.path.join(output_dir, f"data_{_safe_filename_part(byline)}.json")
            with open(filename, "w", encoding="utf-8") as json_file:
                json.dump(extracted_data, json_file, indent=4, ensure_ascii=False)
            print(f"Saved data to {filename}")
        else:
            print(f"No data extracted for {byline}.")


# Parameters
year = 2025
base_dir = os.getcwd()
# Join the components separately so the path is valid on every operating system.
output_dir = os.path.join(base_dir, "bylines_ads", str(year))
bylines = [
    "Štěpán Slovák",
    "Václav Pláteník",
    "Robert Teleky",
    "Karel Smetana",
    "Marie Pošarová - SPD",
    "Kamal Farhan",
    "Pavla Pivoňka Vaňková STAN",
    "Jana Hanzlíková - poslankyně",
    "Samuel Zabolotný",
    "Česká pirátská strana",
    "EUROPEUM Institute for European Policy",
    "FTV Prima",
    "ANO",
    "Naše zdravotnictví",
    "Svoboda a přímá demokracie (SPD)",
    "ČSOB",
    "ODS",
    "Lékaři bez hranic - Médecins Sans Frontières in Czech Republic, o. p. s.",
    "Aliance pro budoucnost",
    "STAN",
    "Starostové a nezávislí • STAN",
    "PRAHA SOBĚ",
    "Člověk v tísni, o.p.s.",
    "SH media, spol. s r.o.",
    "Komunistická strana Čech a Moravy",
    "Zdeněk Hraba - senátor",
    "Ondrej Prokop",
    "Ministerstvo práce a sociálních věcí",
    "Tomáš Zdechovský",
    "Milion Chvilek, z. s.",
    "EU Social",
    "EU Justice and Consumers",
    "ODS - Občanská demokratická strana",
    "Sociální demokracie",
    "Amnesty International ČR",
    "Občanská demokratická strana",
    "Tipsport",
    "Karel Janeček",
    "Martin Kuba",
    "Starostové a nezávislí",
    "Svoboda zvířat Plzeň, z.s.",
    "CARE Česká republika",
    "Replastuj.cz",
    "ANO 2011",
    "Reportér magazín",
    "Andrej Babiš",
    "Člověk v tísni o.p.s.",
    "DFMG",
    "TOP 09",
    "SEN 21",
    "Kupředu do minulosti s.r.o.",
    "Ministerstvo pro místní rozvoj ČR",
    "Svoboda a přímá demokracie",
    "Greenpeace Česká republika",
    "STAROSTOVÉ A NEZÁVISLÍ",
    "KDU",
    "KDU-ČSL",
    "Česká pirátská strana - Praha",
    "XTV",
    "Hnutí DUHA - Přátelé Země Česká republika",
    "Zelení - Strana zelených",
    "PŘÍSAHA - občanské hnutí Roberta Šlachty",
    "Český rozhlas",
    "CZECH NEWS CENTER a. s.",
    "Oldřich Hájek",
    "Transparency International ČR",
    "Nadační fond pro Ukrajinu",
    "DICK-TATOR",
    "Český rozhlas",
    "Berenika Peštová-poslankyně",
    "Martin Benkovič, 1. místostarosta Prahy 17 - Řepy",
    "Denik.to",
    "Zjisti víc",
    "Jakub Horák",
    "Aliance pro budoucnost",
    # 'GEN' and 'Odkryto.cz' were written without a comma between them, which
    # Python silently merged into the single byline 'GENOdkryto.cz'.
    "GEN",
    "Odkryto.cz",
    "Heroine.cz",
    "OBRAZ - Obránci zvířat",
    "Jana Maláčová",
    "Fairtrade Česko a Slovensko",
    "DFMG",
    "Nikola Bartůšek",
    "Ecoista",
    "Nadační fond PRAVDA O VODĚ",
    "Radomír Nepil - místostarosta Osmičky",
    "Daniel Kůs - Radní Plzně",
    # This literal was split across two lines (a syntax error); rejoined here.
    # NOTE(review): confirm the exact spelling against the Ad Library.
    "Radim F. Holeček",
    "Nadační fond Svědomí národa",
]
# Several names appear twice above; querying them again would only repeat the
# API calls and rewrite the same output file, so keep the first occurrence.
bylines = list(dict.fromkeys(bylines))
# bylines = ["Berenika Peštová-poslankyně","Samuel Zabolotný","Štěpán Slovák","Václav Pláteník","Robert Teleky","Karel Smetana","Marie Pošarová - SPD","Kamal Farhan","Pavla Pivoňka Vaňková STAN","Jana Hanzlíková - poslankyně"]

# Never commit the token: read it from the environment instead. The token that
# used to be hard-coded on this line has been exposed and should be revoked.
access_token = os.environ.get("FB_ACCESS_TOKEN")

if not access_token:
    raise ValueError("Access token is missing. Set the 'FB_ACCESS_TOKEN' environment variable.")

# Execute extraction
extract_data(bylines, output_dir, access_token, year)


In [None]:
### Process
import os
import json
import re


def replace_id_with_underscore_id(json_data):
    """Rename every "id" key to "_id", at any nesting depth, in place.

    Dicts and lists are walked recursively; any other value is left alone.
    The same (mutated) object that was passed in is returned. A renamed key
    is re-inserted, so "_id" ends up as the last key of its dict and replaces
    an "_id" entry that was already there.
    """
    if isinstance(json_data, dict):
        if "id" in json_data:
            json_data["_id"] = json_data.pop("id")
        children = json_data.values()
    elif isinstance(json_data, list):
        children = json_data
    else:
        # Scalars (str, int, None, ...) have nothing to rename.
        return json_data

    for child in children:
        replace_id_with_underscore_id(child)
    return json_data


def process_json_files(folder_path, process_function):
    """Rewrite every ".json" file in `folder_path` with a transformed copy.

    Each file is loaded, passed to `process_function`, and the value that
    function returns is written back to the same path (UTF-8, 4-space indent,
    non-ASCII characters kept as they are). Other files are not touched.
    """
    json_names = [name for name in os.listdir(folder_path) if name.endswith(".json")]

    for name in json_names:
        target = os.path.join(folder_path, name)

        with open(target, "r", encoding="utf-8") as source:
            loaded = json.load(source)

        transformed = process_function(loaded)

        with open(target, "w", encoding="utf-8") as sink:
            json.dump(transformed, sink, indent=4, ensure_ascii=False)


def append_json_files(folder_path, exclude=("data_all.json", "data_all_cleaned.json")):
    """Combines the JSON files of a directory into a single list.

    Every ".json" file is expected to contain a list; the lists are
    concatenated in alphabetical order of the file names so the result is
    reproducible.

    Parameters:
        folder_path (str): Directory to read.
        exclude (iterable of str): File names to skip. The default skips the
            aggregate files this notebook writes into the same directory;
            otherwise a re-run would read its own earlier output and
            duplicate every record. Pass an empty tuple to read everything.

    Returns:
        list: All records from the selected files.
    """
    skipped = set(exclude)
    all_data = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json") or filename in skipped:
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as json_file:
            all_data.extend(json.load(json_file))
    return all_data


def remove_emojis(text):
    """Return `text` with emojis, symbols and line breaks deleted.

    Every character matched by the character class below is removed without
    a replacement. Note that two of the ranges are very wide:
    U+0200-U+2BFF also covers general punctuation and currency signs (for
    example the en dash, typographic quotes and the euro sign), and
    U+24C2-U+1F251 spans everything between those two code points.
    Newlines are removed as well, so adjacent lines are joined directly.
    """
    removed_ranges = (
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF",  # Transport and Map Symbols
        "\U0001F1E0-\U0001F1FF",  # Flags
        "\U00002702-\U000027B0",  # Dingbats
        "\U000024C2-\U0001F251",  # Enclosed characters
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        "\U00002600-\U000026FF",  # Miscellaneous Symbols
        "\U00000200-\U00002BFF",  # Additional symbols
        "\U0001F004",             # Mahjong tiles
        "\U0001F0CF",             # Playing cards
        "\n",                     # Line break
    )
    character_class = "[" + "".join(removed_ranges) + "]"
    matcher = re.compile(character_class, flags=re.UNICODE)
    return matcher.sub("", text)


def _strip_emojis_from_field(data, field):
    """Apply remove_emojis() to `field` of every item in `data`, in place.

    A string value is cleaned directly, a list value element by element;
    items without the field, or with a value of any other type, stay as
    they are. Returns the same `data` object.
    """
    for item in data:
        if field not in item:
            continue
        value = item[field]
        if isinstance(value, str):
            item[field] = remove_emojis(value)
        elif isinstance(value, list):
            item[field] = [remove_emojis(entry) for entry in value]
    return data


def clean_ad_creative_bodies(data):
    """Removes emojis from the 'ad_creative_bodies' field of every record."""
    return _strip_emojis_from_field(data, "ad_creative_bodies")


def clean_ad_creative_link_titles(data):
    """Removes emojis from the 'ad_creative_link_titles' field of every record."""
    return _strip_emojis_from_field(data, "ad_creative_link_titles")


# Process all years
# Resolved once up front: both loops below need it, and the cleanup loop must
# not depend on a variable that only exists if the first loop ran.
base_dir = os.getcwd()

for year in range(2025, 2026):
    # Pass the components separately so the path is valid on every OS.
    output_dir = os.path.join(base_dir, "bylines_ads", str(year))

    if not os.path.exists(output_dir):
        print(f"Skipping {year} - No data found.")
        continue

    print(f"Processing data for year {year}...")

    # Step 1: Replace 'id' with '_id' in all JSON files
    process_json_files(output_dir, replace_id_with_underscore_id)
    print(f"Replaced 'id' with '_id' in {year} data.")

    # Step 2: Append all JSON data into a single file
    appended_data = append_json_files(output_dir)
    all_data_file = os.path.join(output_dir, "data_all.json")
    with open(all_data_file, "w", encoding="utf-8") as json_file:
        json.dump(appended_data, json_file, indent=4, ensure_ascii=False)
    print(f"Appended {len(appended_data)} records into {all_data_file}.")

    # Step 3: Remove emojis from 'ad_creative_bodies' and 'ad_creative_link_titles'.
    # Both helpers modify the records in place, which is fine here because the
    # uncleaned version has already been written to data_all.json above.
    cleaned_data = clean_ad_creative_bodies(appended_data)
    cleaned_data = clean_ad_creative_link_titles(cleaned_data)
    cleaned_data_file = os.path.join(output_dir, "data_all_cleaned.json")
    with open(cleaned_data_file, "w", encoding="utf-8") as json_file:
        json.dump(cleaned_data, json_file, indent=4, ensure_ascii=False)
    print(f"Cleaned 'ad_creative_bodies' and saved to {cleaned_data_file}.\n")

print("Processing complete.")


# Remove the processed files after completion
# (only the intermediate data_all.json; data_all_cleaned.json is kept)
for year in range(2025, 2026):
    # Same path construction as above; the previous "bylines_ads\\{year}" form
    # only resolved to the right directory on Windows.
    output_dir = os.path.join(base_dir, "bylines_ads", str(year))
    file_path = os.path.join(output_dir, "data_all.json")

    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Removed file: {file_path}")
    else:
        print(f"File not found, skipping: {file_path}")


In [None]:
### Push to mongo
from pymongo import MongoClient
import json
import os
from tqdm import tqdm

client = MongoClient('mongodb://localhost:27017')
db = client["spending_db"]
collection = db["spending"]
base_dir = os.getcwd()

# try/finally guarantees the connection is closed even if a file cannot be
# read or an upsert fails half-way through.
try:
    for year in range(2025, 2026):
        # Components are joined separately; the former "bylines_ads\\{year}"
        # string only resolved to the right directory on Windows.
        output_dir = os.path.join(base_dir, "bylines_ads", str(year))
        file_path = os.path.join(output_dir, "data_all_cleaned.json")

        if not os.path.exists(file_path):
            print(f"Skipping {year} - {file_path} not found.")
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)

        new_count = 0
        updated_count = 0

        print(f"\nProcessing year {year}...")
        for document in tqdm(json_data, desc="Inserting documents", unit="doc"):
            # Upsert keyed on _id: inserts the ad if it is new, otherwise
            # replaces the stored document with the fresh one.
            result = collection.replace_one(
                {"_id": document["_id"]},
                document,
                upsert=True
            )
            if result.upserted_id is not None:
                new_count += 1
            elif result.modified_count == 1:
                # Documents that matched but were already identical are not counted.
                updated_count += 1

        print(f"Year {year} Results:")
        print(f"New documents inserted: {new_count}")
        print(f"Existing documents updated: {updated_count}\n")
finally:
    client.close()

# Remove the processed files after completion
# (reached only when the import above finished without raising)
for year in range(2025, 2026):
    output_dir = os.path.join(base_dir, "bylines_ads", str(year))
    file_path = os.path.join(output_dir, "data_all_cleaned.json")

    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Removed file: {file_path}")
    else:
        print(f"File not found, skipping: {file_path}")

In [None]:
### Check distinct values in MongoDB
from pymongo import MongoClient
import pandas as pd

client = MongoClient('mongodb://localhost:27017')
db = client["spending_db"]
collection = db["spending"]

# NOTE(review): the extract step requests a "beneficiary_payers" field, not
# top-level "payer"/"beneficiary" fields - confirm these two names match the
# documents actually stored in the collection.
distinct_payer = collection.distinct("payer")
distinct_beneficiary = collection.distinct("beneficiary")
distinct_page_id = collection.distinct("page_id")

# Each line is labelled with the field it counts (all three used to say "page_ids").
print(f"Total distinct payers: {len(distinct_payer)}")
print(f"Total distinct beneficiaries: {len(distinct_beneficiary)}")
print(f"Total distinct page_ids: {len(distinct_page_id)}")



# Load the CSV file into a DataFrame
csv_path = "C:\\Users\\jirip\\Documents\\Developer\\python\\political_ads\\ad_data\\image_urls.csv"
df = pd.read_csv(csv_path)

# Create a list of page_id
page_id_list = df["page_id"].tolist()

print(f"Loaded {len(page_id_list)} page IDs from the CSV.")

# Ensure both sets contain strings, so that IDs parsed as numbers from the CSV
# compare equal to IDs stored as text in MongoDB
page_id_list = set(map(str, page_id_list))
distinct_page_id = set(map(str, distinct_page_id))
# Find differences between page_id_list and distinct_page_id
page_ids_in_csv_not_in_db = page_id_list - distinct_page_id
page_ids_in_db_not_in_csv = distinct_page_id - page_id_list

print(f"Page IDs in CSV but not in MongoDB: {len(page_ids_in_csv_not_in_db)}")
print(f"Page IDs in MongoDB but not in CSV: {len(page_ids_in_db_not_in_csv)}")

# Show at most 100 examples of each difference
if page_ids_in_csv_not_in_db:
    print("Sample Page IDs in CSV but not in MongoDB:", list(page_ids_in_csv_not_in_db)[:100])

if page_ids_in_db_not_in_csv:
    print("Sample Page IDs in MongoDB but not in CSV:", list(page_ids_in_db_not_in_csv)[:100])
# pipeline = [
#     {"$group": {"_id": "$page_id", "page_names": {"$addToSet": "$page_name"}}},
#     {"$project": {"_id": 1, "page_names": 1, "count": {"$size": "$page_names"}}},
#     {"$match": {"count": {"$gt": 1}}}
# ]
#  
# results = collection.aggregate(pipeline)
# 
# for result in results:
#     print(f"Page ID: {result['_id']}, Page Names: {result['page_names']}")

In [None]:
### Download logos from Facebook Ads Library
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import requests
import os
from pymongo import MongoClient

def setup_mongo_connection(uri="mongodb://localhost:27017", db_name="spending_db", collection_name="spending"):
    """Open a MongoDB client for `uri` and return the requested collection.

    Only the collection object is returned; the client created here is not
    handed back to the caller and is not closed by this function.
    """
    database = MongoClient(uri)[db_name]
    return database[collection_name]

def setup_chrome_driver():
    """Create a headless Chrome WebDriver.

    The driver binary is obtained through ChromeDriverManager().install();
    the caller is responsible for calling quit() on the returned driver.
    """
    chrome_options = webdriver.ChromeOptions()
    # Headless mode: no browser window is opened.
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        chrome_options.add_argument(flag)

    driver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=driver_service, options=chrome_options)

def navigate_to_page(driver, page_id, delay=2):
    """Open the Ads Library listing of `page_id` and wait `delay` seconds.

    The URL selects political and issue ads for country CZ with any active
    status. The fixed pause after loading gives the page time to render.
    """
    library_url = (
        "https://www.facebook.com/ads/library/"
        "?active_status=all&ad_type=political_and_issue_ads&country=CZ"
        "&is_targeted_country=false&media_type=all&search_type=page"
        f"&source=ad-report&view_all_page_id={page_id}"
    )
    driver.get(library_url)
    time.sleep(delay)

def download_image(driver, page_id, output_dir="ad_data"):
    """Downloads the logo image for a given page ID and saves it locally.

    Looks up the first <img> inside the container matched by the CSS selector
    below on the page currently loaded in `driver`, downloads its source and
    writes it to "<output_dir>/<page_id>_logo.png".

    This is best effort: any failure (element not found, missing image URL,
    HTTP error, write error) is printed and the function returns normally, so
    one bad page does not stop a batch.

    Parameters:
        driver: Selenium WebDriver with the page already opened.
        page_id: Page ID, used for the file name and in messages.
        output_dir (str): Target directory, created if missing (default: ad_data).
    """
    os.makedirs(output_dir, exist_ok=True)
    try:
        div_element = driver.find_element(By.CSS_SELECTOR, "div.x9f619.x1n2onr6.x1ja2u2z")
        image_element = div_element.find_element(By.TAG_NAME, "img")
        image_src = image_element.get_attribute("src")
        if not image_src:
            raise ValueError("logo <img> element has no 'src' attribute")

        response = requests.get(image_src, timeout=5)
        # Without this check the body of a 4xx/5xx error page would be saved as the "logo".
        response.raise_for_status()

        image_path = os.path.join(output_dir, f"{page_id}_logo.png")
        with open(image_path, 'wb') as handler:
            handler.write(response.content)
        print(f"Downloaded image for Page ID {page_id}")
    except Exception as e:
        print(f"Error downloading image for {page_id}: {e}")

def main():
    """Download the Ads Library logo for a list of page IDs.

    Opens one headless Chrome session, visits the Ads Library page of every
    ID in `page_ids` and saves its logo through download_image(). The browser
    is always closed at the end.
    """
    collection = setup_mongo_connection()
    # Renamed from `all`, which shadowed the builtin of the same name.
    all_page_ids = collection.distinct("page_id")

    # Construct relative path and read in one step
    df = pd.read_csv(os.path.join("ad_data", "image_urls.csv"))
    done = df["page_id"].unique().tolist()

    # The automatic selection below is disabled in favour of a fixed list, so
    # `all_page_ids` and `done` are currently unused.
    # NOTE(review): if it is re-enabled, convert both sides to str first - IDs
    # read from the CSV may be parsed as numbers - TODO confirm.
    # page_ids = list(set(all_page_ids) - set(done))
    page_ids = ['116936778126005', '1591945024361681', '1812508368973199', '192926034666717', '102500511866371', '163914317133681']
    print(f"Remaining Page IDs to process: {len(page_ids)}")
    driver = setup_chrome_driver()

    try:
        for page_id in page_ids:
            print(f"Processing Page ID: {page_id}")
            navigate_to_page(driver, page_id)
            download_image(driver, page_id)
    except Exception as e:
        print(f"Unexpected error: {e}")
    finally:
        driver.quit()
        print("Driver closed.")

if __name__ == "__main__":
    main()


In [None]:
#### Create CSV with image URLs
import os
import pandas as pd

# Define output directory
output_dir = os.path.join(os.getcwd(), "ad_data")

# Ensure the directory exists before listing files
if not os.path.exists(output_dir):
    raise FileNotFoundError(f"Directory '{output_dir}' does not exist.")

# Suffix of the circular-cropped logos, which are named "<page_id>_logo_circular.png".
# The same constant is used for filtering and for recovering the page ID below.
logo_suffix = "_logo_circular.png"

# List images with the '_logo_circular.png' suffix
image_names = [file for file in os.listdir(output_dir) if file.endswith(logo_suffix)]

# Function to generate raw GitHub URLs
def github_to_raw_url(image_name: str) -> str:
    """Return the raw.githubusercontent.com URL of `image_name` in the repo's ad_data folder."""
    return f"https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/ad_data/{image_name}"

# Prepare data for the DataFrame.
# The page ID is the file name without the suffix. Stripping "_logo.png", as
# was done before, never matched these names and left the whole file name in
# the page_id column.
# Naming the columns explicitly keeps the frame well-formed (and the astype
# call below working) when no image is found.
df = pd.DataFrame(
    [
        {"page_id": image[: -len(logo_suffix)], "url_key": github_to_raw_url(image)}
        for image in image_names
    ],
    columns=["page_id", "url_key"],
)

# Ensure 'page_id' is stored explicitly as a string
df["page_id"] = df["page_id"].astype(str)

# Save to CSV
csv_path = os.path.join(output_dir, "image_urls.csv")
df.to_csv(csv_path, index=False)
print(f"CSV file saved at: {csv_path}")

# Print raw URLs
for _, row in df.iterrows():
    print(f"Raw URL: {row['url_key']}")


In [None]:
### Circular crop images

from PIL import Image, ImageDraw
import numpy as np
import os

# Define output directory
output_dir = os.path.join(os.getcwd(), "ad_data")

# Stop right away if the folder is missing, before trying to list it
if not os.path.exists(output_dir):
    raise FileNotFoundError(f"Directory '{output_dir}' does not exist.")

# Collect every file whose name ends in '_logo.png' ...
image_names = list(filter(lambda name: name.endswith("_logo.png"), os.listdir(output_dir)))
# ... and then replace that result with a fixed selection: only these six
# files are processed, whatever the directory listing returned.
image_names = ['116936778126005_logo.png', '1591945024361681_logo.png', '1812508368973199_logo.png', '192926034666717_logo.png', '102500511866371_logo.png', '163914317133681_logo.png']


def circular_crop(image_path, output_path):
    """Save a centred circular cut-out of an image as a PNG.

    The largest centred square of the image is taken, everything outside the
    circle inscribed in that square is made fully transparent, and the square
    is written to `output_path`.
    """
    source = Image.open(image_path).convert("RGBA")
    width, height = source.size

    # Bounding box of the centred square whose side is the shorter image edge
    diameter = min(width, height)
    x0 = (width - diameter) // 2
    y0 = (height - diameter) // 2
    box = (x0, y0, x0 + diameter, y0 + diameter)

    # Greyscale stencil: 255 inside the circle, 0 everywhere else
    stencil = Image.new("L", source.size, 0)
    ImageDraw.Draw(stencil).ellipse(box, fill=255)

    # Copy only the stencilled pixels onto a fully transparent canvas
    canvas = Image.new("RGBA", source.size, (0, 0, 0, 0))
    canvas.paste(source, (0, 0), mask=stencil)

    # Trim to the square around the circle and write the file
    canvas.crop(box).save(output_path, format="PNG")


# Loop through all images and apply circular cropping
for image_name in image_names:
    input_path = os.path.join(output_dir, image_name)
    # The result is written next to the source file, with "_logo.png" in the
    # name replaced by "_logo_circular.png"; the source file is kept.
    output_path = os.path.join(output_dir, image_name.replace("_logo.png", "_logo_circular.png"))
    circular_crop(input_path, output_path)
    print(f"Circular cropped image saved at: {output_path}")


In [None]:
### Download latest data from Facebook Ads Library
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Initialize the WebDriver (using Chrome in this example)
driver = webdriver.Chrome()

try:
    # Open the Ads Library report page. This is done inside the try block so
    # that the browser is closed even when navigation itself fails.
    driver.get('https://www.facebook.com/ads/library/report/?source=nav-header')

    # Wait up to 20 seconds for the element to be clickable.
    # NOTE(review): presumably the download link of the report - the absolute
    # XPath breaks whenever the page layout changes; confirm it still matches.
    element = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div/div/div/div/div/div[1]/div/div[2]/div[3]/div/div[11]/div/div/div/div/div/div[1]/div[2]/div/div[2]/div[4]/a/div'))
    )
    element.click()
    print("Element clicked successfully!")
    time.sleep(5)  # Wait for a few seconds to see the result
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Always close the browser
    driver.quit()
    


In [None]:
# single image compression
from PIL import Image
import os

# Define the image path directly
image_path = 'C://Users//Jiri.Pillar//Developer//political_ads//screenshot_full_element_1.png'

# The compressed copy is written into the folder of the source image
output_dir = os.path.dirname(image_path)
image_name = os.path.basename(image_path)

# Size on disk before compression
size_before = os.path.getsize(image_path)

# Output file name: "screenshot_full_element" becomes "compressed" and the
# extension changes from .png to .jpg
output_path = os.path.join(
    output_dir,
    image_name.replace("screenshot_full_element", "compressed").replace(".png", ".jpg"),
)

# Re-encode the picture as JPEG
with Image.open(image_path) as img:
    # JPEG has no alpha channel, so convert to RGB first
    rgb_img = img.convert('RGB')
    # quality=80: lower values mean stronger compression
    rgb_img.save(output_path, "JPEG", quality=80)

# Size on disk after compression
size_after = os.path.getsize(output_path)

print(f"Image: {image_name}")
print(f"Size before compression: {size_before / 1024:.2f} KB")
print(f"Size after compression: {size_after / 1024:.2f} KB")
print(f"Compression ratio: {size_before / size_after:.2f}x")


In [27]:
import os
import json
import time
from pathlib import Path
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Constants
# Folder that is scanned for "*.json" files containing the ad records
# (machine-specific absolute path).
JSON_DIR = Path(r"C:/Users/Jiri.Pillar/Developer/political_ads/bylines_ads/2025")
# Maximum number of ads capture_screenshots() handles in one run.
SCREENSHOT_LIMIT = 10
# Page that main() opens first, before any ad snapshot is visited.
FACEBOOK_URL = "https://www.facebook.com/"
# Output folders, relative to the current working directory:
# PNG screenshots, and JPEG copies in the "compressed" sub-folder.
SCREENSHOT_DIR = Path("screenshots")
COMPRESSED_DIR = SCREENSHOT_DIR / "compressed"
# Both folders are created as soon as this cell runs (parent first).
SCREENSHOT_DIR.mkdir(exist_ok=True)
COMPRESSED_DIR.mkdir(exist_ok=True)


def extract_ad_data(directory: Path):
    """Collect snapshot URL and id of every ad found in a folder's JSON files.

    Each "*.json" file in `directory` is expected to hold a list of ad
    records. For every record with both a non-empty "ad_snapshot_url" and a
    non-empty "_id", a dict with exactly those two keys is added to the
    result. Files that are not valid JSON are reported and skipped.
    """
    collected = []
    for json_path in directory.glob("*.json"):
        with json_path.open("r", encoding="utf-8") as handle:
            try:
                records = json.load(handle)
            except json.JSONDecodeError as e:
                print(f"❌ Failed to parse JSON file: {json_path.name} – {e}")
                continue

        for record in records:
            snapshot_url = record.get("ad_snapshot_url")
            identifier = record.get("_id")
            if not (snapshot_url and identifier):
                continue
            collected.append({"ad_snapshot_url": snapshot_url, "_id": identifier})
    return collected


def setup_chrome_driver():
    """Create a headless Chrome WebDriver with a 1920x3000 window.

    The caller is responsible for calling quit() on the returned driver.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox", "--window-size=1920,3000"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)


def handle_facebook_popup(driver):
    """Click the popup button on the page currently open in `driver`, if any.

    Waits up to 10 seconds for the element at the XPath below to become
    clickable, clicks it and pauses 2 seconds. Any failure is reported and
    ignored, so the caller can continue either way.
    NOTE(review): presumably the button of the cookie/consent dialog - the
    absolute XPath depends on the current page layout; confirm.
    """
    try:
        popup_button = (
            By.XPATH,
            '/html/body/div[3]/div[2]/div/div/div/div/div[3]/div[2]/div/div[1]/div[2]/div/div[1]',
        )
        clickable = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(popup_button))
        clickable.click()
        time.sleep(2)
    except Exception:
        print("⚠️ No popup or click failed – continuing.")


def compress_image(input_path: Path, output_path: Path, quality: int = 80):
    """Re-encode the image at `input_path` as a JPEG at `output_path`.

    The picture is converted to RGB first. Any error is printed instead of
    raised, so the function always returns normally.
    """
    try:
        with Image.open(input_path) as source_image:
            source_image.convert("RGB").save(output_path, "JPEG", quality=quality)
        print(f"🗜️  Compressed: {output_path.name}")
    except Exception as e:
        print(f"❌ Compression failed for {input_path.name}: {e}")


def capture_screenshots(driver, ads):
    """Screenshot the first SCREENSHOT_LIMIT ads and compress each picture.

    For every record the snapshot URL is opened, the element at the XPath
    below is awaited (up to 15 seconds), scrolled into view and saved as
    "<_id>.png" in SCREENSHOT_DIR; a JPEG copy named "compressed_<_id>.jpg"
    is then written to COMPRESSED_DIR. A failure for one ad is printed and
    the loop moves on to the next one.
    """
    for record in ads[:SCREENSHOT_LIMIT]:
        url, ad_id = record["ad_snapshot_url"], record["_id"]
        png_path = SCREENSHOT_DIR / f"{ad_id}.png"
        jpg_path = COMPRESSED_DIR / f"compressed_{ad_id}.jpg"
        try:
            driver.get(url)
            waiter = WebDriverWait(driver, 15)
            ad_container = waiter.until(
                EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div/div/div/div'))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", ad_container)
            ad_container.screenshot(str(png_path))
            print(f"✅ Screenshot saved: {png_path.name}")
            compress_image(png_path, jpg_path)
        except Exception as e:
            print(f"❌ Failed to process {ad_id}: {e}")


def main():
    """Screenshot the ads listed in the JSON files under JSON_DIR.

    Loads the ad list, opens FACEBOOK_URL once to dismiss the popup, then
    captures and compresses the screenshots. Returns early when no ads are
    found.
    """
    ads = extract_ad_data(JSON_DIR)
    if not ads:
        print("⚠️ No ads found to process.")
        return

    driver = setup_chrome_driver()
    try:
        driver.get(FACEBOOK_URL)
        handle_facebook_popup(driver)

        capture_screenshots(driver, ads)
    finally:
        # Close the browser even if navigation raises; otherwise the headless
        # Chrome process would be left running.
        driver.quit()
    print("✅ All done.")


if __name__ == "__main__":
    main()


✅ Screenshot saved: 1283226729871042.png
🗜️  Compressed: compressed_1283226729871042.jpg
✅ Screenshot saved: 10064172176950533.png
🗜️  Compressed: compressed_10064172176950533.jpg
✅ Screenshot saved: 1638016523576583.png
🗜️  Compressed: compressed_1638016523576583.jpg
✅ Screenshot saved: 651712204500788.png
🗜️  Compressed: compressed_651712204500788.jpg
✅ Screenshot saved: 1339941867119237.png
🗜️  Compressed: compressed_1339941867119237.jpg
✅ Screenshot saved: 579743854534708.png
🗜️  Compressed: compressed_579743854534708.jpg
✅ Screenshot saved: 1643921919822886.png
🗜️  Compressed: compressed_1643921919822886.jpg
✅ Screenshot saved: 1208872917255806.png
🗜️  Compressed: compressed_1208872917255806.jpg
✅ Screenshot saved: 1740493313180171.png
🗜️  Compressed: compressed_1740493313180171.jpg
✅ Screenshot saved: 551357280991839.png
🗜️  Compressed: compressed_551357280991839.jpg
✅ All done.


In [None]:
import os
import json
import time
from pathlib import Path
from PIL import Image
from multiprocessing import Pool, cpu_count
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Constants
# Folder that is scanned for "*.json" files containing the ad records
# (machine-specific absolute path).
JSON_DIR = Path(r"C:/Users/Jiri.Pillar/Developer/political_ads/bylines_ads/2025")
# Output folders, relative to the current working directory:
# temporary PNG screenshots, and the JPEG results in "compressed".
SCREENSHOT_DIR = Path("screenshots")
COMPRESSED_DIR = SCREENSHOT_DIR / "compressed"
# Both folders are created as soon as this cell runs (parent first).
SCREENSHOT_DIR.mkdir(exist_ok=True)
COMPRESSED_DIR.mkdir(exist_ok=True)
MAX_ITEMS = None  # Set to an integer if you want to limit the number of ads

# Extract ads from JSON files
def extract_ad_data(directory: Path):
    """Return (snapshot_url, ad_id) tuples for the ads in a folder's JSON files.

    Each "*.json" file in `directory` is expected to hold a list of ad
    records; records lacking a non-empty "ad_snapshot_url" or "_id" are left
    out. Files that are not valid JSON are skipped without a message.
    """
    pairs = []
    for json_path in directory.glob("*.json"):
        with json_path.open("r", encoding="utf-8") as handle:
            try:
                records = json.load(handle)
            except json.JSONDecodeError:
                continue

        for record in records:
            snapshot_url = record.get("ad_snapshot_url")
            identifier = record.get("_id")
            if snapshot_url and identifier:
                pairs.append((snapshot_url, identifier))
    return pairs


# Screenshot + compress job per ad
def process_ad(ad_tuple):
    """Screenshot one ad in its own headless Chrome and store it as a JPEG.

    Intended as the per-item job of a process pool: each call starts a
    browser, opens the snapshot URL, screenshots the element at the XPath
    below into SCREENSHOT_DIR, converts the picture to
    "compressed_<ad_id>.jpg" in COMPRESSED_DIR and deletes the PNG.

    Parameters:
        ad_tuple (tuple): (snapshot_url, ad_id).

    Returns:
        None. Failures are reported with a printed message and never raised,
        so one broken ad does not abort the whole pool.
    """
    url, ad_id = ad_tuple
    png_path = SCREENSHOT_DIR / f"{ad_id}.png"
    jpg_path = COMPRESSED_DIR / f"compressed_{ad_id}.jpg"

    # Setup headless driver per process
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,3000")
    try:
        driver = webdriver.Chrome(options=options)
    except Exception as exc:
        print(f"[!] Could not start Chrome for {ad_id}: {exc}")
        return

    try:
        driver.set_page_load_timeout(30)
        driver.get(url)
        element = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div/div/div/div'))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", element)
        element.screenshot(str(png_path))
    except Exception as exc:
        print(f"[!] Screenshot failed for {ad_id}: {exc}")
        return
    finally:
        # Runs on every path, including a page-load timeout; otherwise each
        # failed ad would leave a Chrome process behind.
        driver.quit()

    try:
        with Image.open(png_path) as img:
            img.convert("RGB").save(jpg_path, "JPEG", quality=80)
    except Exception as exc:
        print(f"[!] Compression failed for {ad_id}: {exc}")
    finally:
        # The PNG is only an intermediate file; remove it in any case.
        if png_path.exists():
            png_path.unlink()


def main():
    """Screenshot all ads found under JSON_DIR using a pool of worker processes.

    The ad list is cut to the first MAX_ITEMS entries when MAX_ITEMS is set;
    one worker per CPU core then runs process_ad() on each ad.
    """
    ads = extract_ad_data(JSON_DIR)
    selected = ads[:MAX_ITEMS] if MAX_ITEMS else ads

    worker_count = cpu_count()
    with Pool(processes=worker_count) as pool:
        pool.map(process_ad, selected)


if __name__ == "__main__":
    main()


In [5]:
import os
import json
import time
from pathlib import Path
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Constants
# Folder that is scanned for "*.json" files containing the ad records
# (machine-specific absolute path).
JSON_DIR = Path(r"C:/Users/Jiri.Pillar/Developer/political_ads/bylines_ads/2025")
# Output folders, relative to the current working directory:
# temporary PNG screenshots, and the JPEG results in "compressed".
SCREENSHOT_DIR = Path("screenshots")
COMPRESSED_DIR = SCREENSHOT_DIR / "compressed"
# Both folders are created as soon as this cell runs (parent first).
SCREENSHOT_DIR.mkdir(exist_ok=True)
COMPRESSED_DIR.mkdir(exist_ok=True)
MAX_ITEMS = None  # Set to e.g. 100 to limit

# Extract ads from JSON files
def extract_ad_data(directory: Path):
    """Return (snapshot_url, ad_id) tuples for the ads in a folder's JSON files.

    Each "*.json" file in `directory` is expected to hold a list of ad
    records; records lacking a non-empty "ad_snapshot_url" or "_id" are left
    out. Files that are not valid JSON are skipped without a message.
    """
    pairs = []
    for json_path in directory.glob("*.json"):
        try:
            with json_path.open("r", encoding="utf-8") as handle:
                records = json.load(handle)
        except json.JSONDecodeError:
            continue

        pairs.extend(
            (record.get("ad_snapshot_url"), record.get("_id"))
            for record in records
            if record.get("ad_snapshot_url") and record.get("_id")
        )
    return pairs

# Setup Chrome driver
def setup_driver():
    """Create a headless Chrome WebDriver with a 1920x3000 window.

    The caller is responsible for calling quit() on the returned driver.
    """
    chrome_options = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox", "--window-size=1920,3000"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

# Handle Facebook popup
def handle_facebook_popup(driver):
    """Open facebook.com and click the popup button if it shows up.

    Waits up to 15 seconds for the element at the XPath below to become
    clickable and clicks it. If that fails for any reason a notice is printed
    and the function returns normally. Loading the page itself is not
    guarded.
    NOTE(review): presumably the button of the cookie/consent dialog - the
    absolute XPath depends on the current page layout; confirm.
    """
    driver.get("https://www.facebook.com/")
    try:
        popup_button = (
            By.XPATH,
            '/html/body/div[3]/div[2]/div/div/div/div/div[3]/div[2]/div/div[1]/div[2]/div/div[1]',
        )
        clickable = WebDriverWait(driver, 15).until(EC.element_to_be_clickable(popup_button))
        clickable.click()
        print("[✓] Facebook popup handled.")
    except Exception:
        print("[!] No popup or already accepted.")

# Process a single ad
def process_ad(driver, url, ad_id):
    """Screenshot one ad with the shared driver and store it as a JPEG.

    Step 1 opens `url`, waits up to 15 seconds for the element at the XPath
    below and saves it as "<ad_id>.png" in SCREENSHOT_DIR. Step 2 converts
    that file to "compressed_<ad_id>.jpg" (quality 60) in COMPRESSED_DIR and
    deletes the PNG. A failure in either step is printed; if step 1 fails,
    step 2 is skipped.
    """
    png_path = SCREENSHOT_DIR / f"{ad_id}.png"
    jpg_path = COMPRESSED_DIR / f"compressed_{ad_id}.jpg"

    # Step 1: screenshot
    try:
        driver.get(url)
        waiter = WebDriverWait(driver, 15)
        ad_container = waiter.until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div/div/div/div'))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", ad_container)
        ad_container.screenshot(str(png_path))
        print(f"[✓] Screenshot: {ad_id}")
    except Exception:
        print(f"[!] Screenshot failed: {ad_id}")
        return

    # Step 2: compress, then drop the intermediate PNG
    try:
        with Image.open(png_path) as screenshot:
            as_rgb = screenshot.convert("RGB")
            as_rgb.save(jpg_path, "JPEG", quality=60)
        png_path.unlink()
        print(f"    └─ Compressed: {ad_id}")
    except Exception:
        print(f"[!] Compression failed: {ad_id}")

def main():
    """Screenshot all ads found under JSON_DIR, one after another.

    A single headless Chrome session is reused for every ad. The ad list is
    cut to the first MAX_ITEMS entries when MAX_ITEMS is set.
    """
    ads = extract_ad_data(JSON_DIR)
    if MAX_ITEMS:
        ads = ads[:MAX_ITEMS]

    driver = setup_driver()
    try:
        handle_facebook_popup(driver)
        time.sleep(2)

        for idx, (url, ad_id) in enumerate(ads, 1):
            print(f"\n[{idx}/{len(ads)}] Processing ad: {ad_id}")
            process_ad(driver, url, ad_id)
    finally:
        # Close the browser even if the run is interrupted or the start page
        # fails to load; otherwise the headless Chrome process keeps running.
        driver.quit()

if __name__ == "__main__":
    main()


[✓] Facebook popup handled.

[1/985] Processing ad: 1283226729871042
[✓] Screenshot: 1283226729871042
    └─ Compressed: 1283226729871042

[2/985] Processing ad: 10064172176950533
[✓] Screenshot: 10064172176950533
    └─ Compressed: 10064172176950533

[3/985] Processing ad: 1638016523576583
[✓] Screenshot: 1638016523576583
    └─ Compressed: 1638016523576583

[4/985] Processing ad: 651712204500788
[✓] Screenshot: 651712204500788
    └─ Compressed: 651712204500788

[5/985] Processing ad: 1339941867119237
[✓] Screenshot: 1339941867119237
    └─ Compressed: 1339941867119237

[6/985] Processing ad: 579743854534708
[✓] Screenshot: 579743854534708
    └─ Compressed: 579743854534708

[7/985] Processing ad: 1643921919822886
[✓] Screenshot: 1643921919822886
    └─ Compressed: 1643921919822886

[8/985] Processing ad: 1208872917255806
[✓] Screenshot: 1208872917255806
    └─ Compressed: 1208872917255806

[9/985] Processing ad: 1740493313180171
[✓] Screenshot: 1740493313180171
    └─ Compressed: 17