In [1]:
### Extract
import requests
import json
import os
import time
from urllib.parse import quote
from tqdm import tqdm


def get_next_page(response):
    """Extracts the next page URL from the API response."""
    return response.get("paging", {}).get("next")


def fetch_data(api_url):
    """Fetches paginated data from the API."""
    all_data = []
    
    while api_url:
        try:
            response = requests.get(api_url)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
            
            json_data = response.json()
            data = json_data.get("data", [])
            all_data.extend(data)
            
            print(f"Extracted {len(data)} records. Total: {len(all_data)}")

            api_url = get_next_page(json_data)
            time.sleep(2)  # Respect API rate limits

        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            break

    return all_data


def extract_data(bylines, output_dir, access_token, year, api_version="v23.0", country="CZ", language="cs"):
    """
    Extracts and saves ad data from the Facebook Ads API for a list of bylines.
    
    Parameters:
        bylines (list): List of bylines to query.
        output_dir (str): Directory to save the JSON files.
        access_token (str): Facebook API access token.
        year (int): Year for filtering ads.
        api_version (str): API version (default: v21.0).
        country (str): Country code for filtering ads (default: CZ).
        language (str): Language code for filtering ads (default: cs).
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists

    fields = ",".join([
        "id", "ad_snapshot_url", "ad_creation_time", "ad_creative_bodies", "ad_creative_link_captions",
        "ad_creative_link_descriptions", "ad_creative_link_titles", "ad_delivery_start_time", 
        "ad_delivery_stop_time", "bylines", "currency", "delivery_by_region", "demographic_distribution", 
        "estimated_audience_size", "impressions", "languages", "page_id", "page_name", "publisher_platforms", 
        "spend", "target_locations", "target_gender", "target_ages", "eu_total_reach", "beneficiary_payers", 
        "age_country_gender_reach_breakdown"
    ])
    
    # for byline in bylines:
    bylines_iter = tqdm(bylines, desc="Bylines", unit="byline")
    for byline in bylines_iter:
        print(f" Extracting Data for: {byline}")
        encoded_byline = quote(f'["{byline}"]')

        api_url = (
            f"https://graph.facebook.com/{api_version}/ads_archive?"
            f"bylines={encoded_byline}&ad_type=POLITICAL_AND_ISSUE_ADS"
            f"&ad_reached_countries=['{country}']&access_token={access_token}"
            f"&unmask_removed_content=true&fields={fields}&limit=199"
            f"&search_terms=''&languages=['{language}']"
            f"&ad_delivery_date_min={year}-08-01"
            # f"&ad_delivery_date_max={year+1}-01-01"
        )
        
        extracted_data = fetch_data(api_url)
        time.sleep(5)  # Delay between different bylines

        if extracted_data:
            filename = os.path.join(output_dir, f"data_{byline}.json")
            with open(filename, "w", encoding="utf-8") as json_file:
                json.dump(extracted_data, json_file, indent=4, ensure_ascii=False)
            print(f"Saved data to {filename}")
        else:
            print(f"No data extracted for {byline}.")

 
# Parameters
year = 2025
base_dir = os.getcwd()
output_dir = os.path.join(base_dir, f"volby_2025_ads/{year}")
bylines = ['Daniel Kůs - Radní Plzně','Daniel Kůs','Pavel Staněk','Matěj Ondřej Havel - východočeský poslanec','Motoristé sobě','Pavla Pivoňka Vaňková STAN','Pavla Pivoňka Vaňková','Štěpán Slovák','Václav Pláteník','Robert Teleky','Karel Smetana','Marie Pošarová - SPD','Kamal Farhan','Jana Hanzlíková - poslankyně','Samuel Zabolotný','Česká pirátská strana','ANO','Svoboda a přímá demokracie (SPD)','ODS','STAN','Starostové a nezávislí • STAN','Komunistická strana Čech a Moravy','Ondrej Prokop','ODS - Občanská demokratická strana','Sociální demokracie','Občanská demokratická strana','Martin Kuba','Starostové a nezávislí','ANO 2011','Andrej Babiš','TOP 09','Svoboda a přímá demokracie','STAROSTOVÉ A NEZÁVISLÍ','KDU','KDU-ČSL','Česká pirátská strana - Praha','Oldřich Hájek','Berenika Peštová-poslankyně','Martin Benkovič, 1. místostarosta Prahy 17 - Řepy','Jakub Horák','Jana Maláčová','Radomír Nepil - místostarosta Osmičky','Radim F. Holeček','Bold News','Oldřich Hájek - senátor','Jana Bobošíková','Zdeněk Kettner - SPD','Pavel Tůma','Stanislav Blaha','Pavla Pivoňka Vaňková STAN','Martin Sedeke','SPOLU pro Prahu','Roman Roun','MUDr. Jan Síla • SPD','Pavel Huml','Ondřej Počarovský, Radní Prahy 10','Pavel Dobeš • místostarosta Prahy 3','Lukáš Otys','Benjamin Činčila','Eduard Hulicius - kandidát na poslance','Kužílková Lucie','Lucie Kužílková','Eva Decroix','Martin Baxa']
access_token = "EAALnc8im5MUBPNNdiuRoEp18NscWZAZAKYTT0oNkKAIZB7yu46T6ES3xHTesd8SvPB3NGl5tnHp9atwdIi0sCLStEFZBl41WMi2tiw7PwimuPRZBCbSOlReRHDqvnQZA1nE1R0gWCsWsNLj4zU3DPlhu6Iyr3zv5nkPlyiZCixrSyPZAv9bdgUYZCCqAFGHZAuYZB5veLzm8e34w109C7JV"  # Use environment variable

if not access_token:
    raise ValueError("Access token is missing. Set the 'FB_ACCESS_TOKEN' environment variable.")

# Execute extraction
extract_data(bylines, output_dir, access_token, year)


Bylines:   0%|          | 0/64 [00:00<?, ?byline/s]

 Extracting Data for: Daniel Kůs - Radní Plzně
Extracted 21 records. Total: 21
Extracted 0 records. Total: 21


Bylines:   2%|▏         | 1/64 [00:11<11:40, 11.13s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Daniel Kůs - Radní Plzně.json
 Extracting Data for: Daniel Kůs
Extracted 0 records. Total: 0


Bylines:   3%|▎         | 2/64 [00:18<09:21,  9.05s/byline]

No data extracted for Daniel Kůs.
 Extracting Data for: Pavel Staněk
Extracted 28 records. Total: 28
Extracted 0 records. Total: 28


Bylines:   5%|▍         | 3/64 [00:29<10:02,  9.88s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Pavel Staněk.json
 Extracting Data for: Matěj Ondřej Havel - východočeský poslanec
Extracted 0 records. Total: 0


Bylines:   6%|▋         | 4/64 [00:37<08:59,  8.99s/byline]

No data extracted for Matěj Ondřej Havel - východočeský poslanec.
 Extracting Data for: Motoristé sobě
Extracted 199 records. Total: 199
Extracted 120 records. Total: 319
Extracted 0 records. Total: 319


Bylines:   8%|▊         | 5/64 [00:56<12:37, 12.83s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Motoristé sobě.json
 Extracting Data for: Pavla Pivoňka Vaňková STAN
Extracted 0 records. Total: 0


Bylines:   9%|▉         | 6/64 [01:04<10:45, 11.14s/byline]

No data extracted for Pavla Pivoňka Vaňková STAN.
 Extracting Data for: Pavla Pivoňka Vaňková
Extracted 0 records. Total: 0


Bylines:  11%|█         | 7/64 [01:12<09:35, 10.09s/byline]

No data extracted for Pavla Pivoňka Vaňková.
 Extracting Data for: Štěpán Slovák
Extracted 0 records. Total: 0


Bylines:  12%|█▎        | 8/64 [01:20<08:42,  9.32s/byline]

No data extracted for Štěpán Slovák.
 Extracting Data for: Václav Pláteník
Extracted 0 records. Total: 0


Bylines:  14%|█▍        | 9/64 [01:27<08:02,  8.77s/byline]

No data extracted for Václav Pláteník.
 Extracting Data for: Robert Teleky
Extracted 72 records. Total: 72
Extracted 0 records. Total: 72


Bylines:  16%|█▌        | 10/64 [01:40<09:01, 10.03s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Robert Teleky.json
 Extracting Data for: Karel Smetana
Extracted 1 records. Total: 1
Extracted 0 records. Total: 1


Bylines:  17%|█▋        | 11/64 [01:51<09:06, 10.32s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Karel Smetana.json
 Extracting Data for: Marie Pošarová - SPD
Extracted 2 records. Total: 2
Extracted 0 records. Total: 2


Bylines:  19%|█▉        | 12/64 [02:02<08:57, 10.34s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Marie Pošarová - SPD.json
 Extracting Data for: Kamal Farhan
Extracted 0 records. Total: 0


Bylines:  20%|██        | 13/64 [02:09<08:06,  9.53s/byline]

No data extracted for Kamal Farhan.
 Extracting Data for: Jana Hanzlíková - poslankyně
Extracted 1 records. Total: 1
Extracted 0 records. Total: 1


Bylines:  22%|██▏       | 14/64 [02:20<08:19, 10.00s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Jana Hanzlíková - poslankyně.json
 Extracting Data for: Samuel Zabolotný
Extracted 0 records. Total: 0


Bylines:  23%|██▎       | 15/64 [02:28<07:37,  9.34s/byline]

No data extracted for Samuel Zabolotný.
 Extracting Data for: Česká pirátská strana
Extracted 188 records. Total: 188
Extracted 0 records. Total: 188


Bylines:  25%|██▌       | 16/64 [02:44<08:58, 11.23s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Česká pirátská strana.json
 Extracting Data for: ANO
Extracted 24 records. Total: 24
Extracted 0 records. Total: 24


Bylines:  27%|██▋       | 17/64 [02:55<08:49, 11.26s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_ANO.json
 Extracting Data for: Svoboda a přímá demokracie (SPD)
Extracted 7 records. Total: 7
Extracted 0 records. Total: 7


Bylines:  28%|██▊       | 18/64 [03:08<08:54, 11.63s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Svoboda a přímá demokracie (SPD).json
 Extracting Data for: ODS
Extracted 14 records. Total: 14
Extracted 0 records. Total: 14


Bylines:  30%|██▉       | 19/64 [03:19<08:35, 11.46s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_ODS.json
 Extracting Data for: STAN
Extracted 1 records. Total: 1
Extracted 0 records. Total: 1


Bylines:  31%|███▏      | 20/64 [03:29<08:09, 11.13s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_STAN.json
 Extracting Data for: Starostové a nezávislí • STAN
Extracted 199 records. Total: 199
Extracted 199 records. Total: 398
Extracted 49 records. Total: 447
Extracted 0 records. Total: 447


Bylines:  33%|███▎      | 21/64 [03:55<11:15, 15.71s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Starostové a nezávislí • STAN.json
 Extracting Data for: Komunistická strana Čech a Moravy
Extracted 0 records. Total: 0


Bylines:  34%|███▍      | 22/64 [04:03<09:17, 13.26s/byline]

No data extracted for Komunistická strana Čech a Moravy.
 Extracting Data for: Ondrej Prokop
Extracted 8 records. Total: 8
Extracted 0 records. Total: 8


Bylines:  36%|███▌      | 23/64 [04:13<08:28, 12.39s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Ondrej Prokop.json
 Extracting Data for: ODS - Občanská demokratická strana
Extracted 199 records. Total: 199
Extracted 199 records. Total: 398
Extracted 199 records. Total: 597
Extracted 199 records. Total: 796
Extracted 199 records. Total: 995
Extracted 199 records. Total: 1194
Extracted 199 records. Total: 1393
Extracted 156 records. Total: 1549
Extracted 0 records. Total: 1549


Bylines:  38%|███▊      | 24/64 [05:12<17:30, 26.25s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_ODS - Občanská demokratická strana.json
 Extracting Data for: Sociální demokracie
Extracted 4 records. Total: 4
Extracted 0 records. Total: 4


Bylines:  39%|███▉      | 25/64 [05:24<14:17, 22.00s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Sociální demokracie.json
 Extracting Data for: Občanská demokratická strana
Extracted 199 records. Total: 199
Extracted 199 records. Total: 398
Extracted 78 records. Total: 476
Extracted 0 records. Total: 476


Bylines:  41%|████      | 26/64 [05:49<14:33, 22.97s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Občanská demokratická strana.json
 Extracting Data for: Martin Kuba
Extracted 1 records. Total: 1
Extracted 0 records. Total: 1


Bylines:  42%|████▏     | 27/64 [06:00<11:59, 19.46s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Martin Kuba.json
 Extracting Data for: Starostové a nezávislí
Extracted 23 records. Total: 23
Extracted 0 records. Total: 23


Bylines:  44%|████▍     | 28/64 [06:12<10:09, 16.93s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Starostové a nezávislí.json
 Extracting Data for: ANO 2011
Extracted 0 records. Total: 0


Bylines:  45%|████▌     | 29/64 [06:19<08:15, 14.17s/byline]

No data extracted for ANO 2011.
 Extracting Data for: Andrej Babiš
Extracted 54 records. Total: 54
Extracted 0 records. Total: 54


Bylines:  47%|████▋     | 30/64 [06:31<07:42, 13.59s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Andrej Babiš.json
 Extracting Data for: TOP 09
Extracted 97 records. Total: 97
Extracted 0 records. Total: 97


Bylines:  48%|████▊     | 31/64 [06:45<07:30, 13.66s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_TOP 09.json
 Extracting Data for: Svoboda a přímá demokracie
Extracted 144 records. Total: 144
Extracted 0 records. Total: 144


Bylines:  50%|█████     | 32/64 [06:59<07:13, 13.55s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Svoboda a přímá demokracie.json
 Extracting Data for: STAROSTOVÉ A NEZÁVISLÍ
Extracted 1 records. Total: 1
Extracted 0 records. Total: 1


Bylines:  52%|█████▏    | 33/64 [07:10<06:37, 12.81s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_STAROSTOVÉ A NEZÁVISLÍ.json
 Extracting Data for: KDU
Extracted 0 records. Total: 0


Bylines:  53%|█████▎    | 34/64 [07:17<05:36, 11.22s/byline]

No data extracted for KDU.
 Extracting Data for: KDU-ČSL
Extracted 151 records. Total: 151
Extracted 0 records. Total: 151


Bylines:  55%|█████▍    | 35/64 [07:31<05:50, 12.08s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_KDU-ČSL.json
 Extracting Data for: Česká pirátská strana - Praha
Extracted 22 records. Total: 22
Extracted 0 records. Total: 22


Bylines:  56%|█████▋    | 36/64 [07:43<05:36, 12.02s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Česká pirátská strana - Praha.json
 Extracting Data for: Oldřich Hájek
Extracted 0 records. Total: 0


Bylines:  58%|█████▊    | 37/64 [07:51<04:49, 10.71s/byline]

No data extracted for Oldřich Hájek.
 Extracting Data for: Berenika Peštová-poslankyně
Extracted 0 records. Total: 0


Bylines:  59%|█████▉    | 38/64 [07:59<04:21, 10.04s/byline]

No data extracted for Berenika Peštová-poslankyně.
 Extracting Data for: Martin Benkovič, 1. místostarosta Prahy 17 - Řepy
Extracted 10 records. Total: 10
Extracted 0 records. Total: 10


Bylines:  61%|██████    | 39/64 [08:10<04:13, 10.14s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Martin Benkovič, 1. místostarosta Prahy 17 - Řepy.json
 Extracting Data for: Jakub Horák
Extracted 0 records. Total: 0


Bylines:  62%|██████▎   | 40/64 [08:18<03:47,  9.48s/byline]

No data extracted for Jakub Horák.
 Extracting Data for: Jana Maláčová
Extracted 4 records. Total: 4
Extracted 0 records. Total: 4


Bylines:  64%|██████▍   | 41/64 [08:29<03:50, 10.04s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Jana Maláčová.json
 Extracting Data for: Radomír Nepil - místostarosta Osmičky
Extracted 3 records. Total: 3
Extracted 0 records. Total: 3


Bylines:  66%|██████▌   | 42/64 [08:40<03:48, 10.41s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Radomír Nepil - místostarosta Osmičky.json
 Extracting Data for: Radim F. Holeček
Extracted 4 records. Total: 4
Extracted 0 records. Total: 4


Bylines:  67%|██████▋   | 43/64 [08:51<03:39, 10.48s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Radim F. Holeček.json
 Extracting Data for: Bold News
Extracted 41 records. Total: 41
Extracted 0 records. Total: 41


Bylines:  69%|██████▉   | 44/64 [09:05<03:50, 11.55s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Bold News.json
 Extracting Data for: Oldřich Hájek - senátor
Extracted 0 records. Total: 0


Bylines:  70%|███████   | 45/64 [09:14<03:28, 10.95s/byline]

No data extracted for Oldřich Hájek - senátor.
 Extracting Data for: Jana Bobošíková
Extracted 57 records. Total: 57
Extracted 0 records. Total: 57


Bylines:  72%|███████▏  | 46/64 [09:27<03:24, 11.36s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Jana Bobošíková.json
 Extracting Data for: Zdeněk Kettner - SPD
Extracted 8 records. Total: 8
Extracted 0 records. Total: 8


Bylines:  73%|███████▎  | 47/64 [09:37<03:08, 11.07s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Zdeněk Kettner - SPD.json
 Extracting Data for: Pavel Tůma
Extracted 0 records. Total: 0


Bylines:  75%|███████▌  | 48/64 [09:45<02:43, 10.22s/byline]

No data extracted for Pavel Tůma.
 Extracting Data for: Stanislav Blaha
Extracted 24 records. Total: 24
Extracted 0 records. Total: 24


Bylines:  77%|███████▋  | 49/64 [09:56<02:36, 10.47s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Stanislav Blaha.json
 Extracting Data for: Pavla Pivoňka Vaňková STAN
Extracted 0 records. Total: 0


Bylines:  78%|███████▊  | 50/64 [10:04<02:16,  9.73s/byline]

No data extracted for Pavla Pivoňka Vaňková STAN.
 Extracting Data for: Martin Sedeke
Extracted 15 records. Total: 15
Extracted 0 records. Total: 15


Bylines:  80%|███████▉  | 51/64 [10:15<02:09,  9.97s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Martin Sedeke.json
 Extracting Data for: SPOLU pro Prahu
Extracted 5 records. Total: 5
Extracted 0 records. Total: 5


Bylines:  81%|████████▏ | 52/64 [10:27<02:05, 10.45s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_SPOLU pro Prahu.json
 Extracting Data for: Roman Roun
Extracted 12 records. Total: 12
Extracted 0 records. Total: 12


Bylines:  83%|████████▎ | 53/64 [10:37<01:56, 10.58s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Roman Roun.json
 Extracting Data for: MUDr. Jan Síla • SPD
Extracted 0 records. Total: 0


Bylines:  84%|████████▍ | 54/64 [10:46<01:39,  9.92s/byline]

No data extracted for MUDr. Jan Síla • SPD.
 Extracting Data for: Pavel Huml
Extracted 1 records. Total: 1
Extracted 0 records. Total: 1


Bylines:  86%|████████▌ | 55/64 [10:57<01:32, 10.30s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Pavel Huml.json
 Extracting Data for: Ondřej Počarovský, Radní Prahy 10
Extracted 0 records. Total: 0


Bylines:  88%|████████▊ | 56/64 [11:05<01:16,  9.54s/byline]

No data extracted for Ondřej Počarovský, Radní Prahy 10.
 Extracting Data for: Pavel Dobeš • místostarosta Prahy 3
Extracted 44 records. Total: 44
Extracted 0 records. Total: 44


Bylines:  89%|████████▉ | 57/64 [11:17<01:11, 10.24s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Pavel Dobeš • místostarosta Prahy 3.json
 Extracting Data for: Lukáš Otys
Extracted 13 records. Total: 13
Extracted 0 records. Total: 13


Bylines:  91%|█████████ | 58/64 [11:29<01:04, 10.80s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Lukáš Otys.json
 Extracting Data for: Benjamin Činčila
Extracted 6 records. Total: 6
Extracted 0 records. Total: 6


Bylines:  92%|█████████▏| 59/64 [11:41<00:55, 11.10s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Benjamin Činčila.json
 Extracting Data for: Eduard Hulicius - kandidát na poslance
Extracted 19 records. Total: 19
Extracted 0 records. Total: 19


Bylines:  94%|█████████▍| 60/64 [11:52<00:44, 11.12s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Eduard Hulicius - kandidát na poslance.json
 Extracting Data for: Kužílková Lucie
Extracted 6 records. Total: 6
Extracted 0 records. Total: 6


Bylines:  95%|█████████▌| 61/64 [12:04<00:34, 11.38s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Kužílková Lucie.json
 Extracting Data for: Lucie Kužílková
Extracted 1 records. Total: 1
Extracted 0 records. Total: 1


Bylines:  97%|█████████▋| 62/64 [12:15<00:22, 11.40s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Lucie Kužílková.json
 Extracting Data for: Eva Decroix
Extracted 11 records. Total: 11
Extracted 0 records. Total: 11


Bylines:  98%|█████████▊| 63/64 [12:27<00:11, 11.47s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Eva Decroix.json
 Extracting Data for: Martin Baxa
Extracted 12 records. Total: 12
Extracted 0 records. Total: 12


Bylines: 100%|██████████| 64/64 [12:38<00:00, 11.85s/byline]

Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_Martin Baxa.json





In [2]:
### Process
import os
import json
import re


def replace_id_with_underscore_id(json_data):
    """Recursively replaces 'id' with '_id' in JSON data."""
    if isinstance(json_data, dict):
        if "id" in json_data:
            json_data["_id"] = json_data.pop("id")
        for value in json_data.values():
            replace_id_with_underscore_id(value)
    elif isinstance(json_data, list):
        for item in json_data:
            replace_id_with_underscore_id(item)
    return json_data


def process_json_files(folder_path, process_function):
    """Applies a transformation function to all JSON files in a given folder."""
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as json_file:
                data = json.load(json_file)

            processed_data = process_function(data)

            with open(file_path, "w", encoding="utf-8") as json_file:
                json.dump(processed_data, json_file, indent=4, ensure_ascii=False)


def append_json_files(folder_path):
    """Combines all JSON files in a directory into a single list."""
    all_data = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as json_file:
                all_data.extend(json.load(json_file))
    return all_data


def remove_emojis(text):
    """Removes emojis and symbols from a given text."""
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
        "\U0001F1E0-\U0001F1FF"  # Flags
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002600-\U000026FF"  # Miscellaneous Symbols
        "\U00000200-\U00002BFF"  # Additional symbols
        "\U0001F004"             # Mahjong tiles
        "\U0001F0CF"             # Playing cards
        "\n"                     # Line break
        "]",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub("", text)


def clean_ad_creative_bodies(data):
    """Removes emojis from 'ad_creative_bodies' field in JSON data."""
    for item in data:
        if "ad_creative_bodies" in item:
            if isinstance(item["ad_creative_bodies"], str):
                item["ad_creative_bodies"] = remove_emojis(item["ad_creative_bodies"])
            elif isinstance(item["ad_creative_bodies"], list):
                item["ad_creative_bodies"] = [remove_emojis(body) for body in item["ad_creative_bodies"]]
    return data

def clean_ad_creative_link_titles(data):
    """Removes emojis from 'ad_creative_link_titles' field in JSON data."""
    for item in data:
        if "ad_creative_link_titles" in item:
            if isinstance(item["ad_creative_link_titles"], str):
                item["ad_creative_link_titles"] = remove_emojis(item["ad_creative_link_titles"])
            elif isinstance(item["ad_creative_link_titles"], list):
                item["ad_creative_link_titles"] = [remove_emojis(body) for body in item["ad_creative_link_titles"]]
    return data


# Process all years
for year in range(2025, 2026):
    base_dir = os.getcwd()
    output_dir = os.path.join(base_dir, f"volby_2025_ads/{year}")

    if not os.path.exists(output_dir):
        print(f"Skipping {year} - No data found.")
        continue

    print(f"Processing data for year {year}...")

    # Step 1: Replace 'id' with '_id' in all JSON files
    process_json_files(output_dir, replace_id_with_underscore_id)
    print(f"Replaced 'id' with '_id' in {year} data.")

    # Step 2: Append all JSON data into a single file
    appended_data = append_json_files(output_dir)
    all_data_file = os.path.join(output_dir, "data_all.json")
    with open(all_data_file, "w", encoding="utf-8") as json_file:
        json.dump(appended_data, json_file, indent=4, ensure_ascii=False)
    print(f"Appended {len(appended_data)} records into {all_data_file}.")

    # Step 3: Remove emojis from 'ad_creative_bodies'
    cleaned_data = clean_ad_creative_bodies(appended_data)
    cleaned_data = clean_ad_creative_link_titles(cleaned_data)
    cleaned_data_file = os.path.join(output_dir, "data_all_cleaned.json")
    with open(cleaned_data_file, "w", encoding="utf-8") as json_file:
        json.dump(cleaned_data, json_file, indent=4, ensure_ascii=False)
    print(f"Cleaned 'ad_creative_bodies' and saved to {cleaned_data_file}.\n")

print("Processing complete.")


# Remove the processed files after completion
for year in range(2025, 2026):
    output_dir = os.path.join(base_dir, f"volby_2025_ads\\{year}")
    file_path = os.path.join(output_dir, "data_all.json")
    
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Removed file: {file_path}")
    else:
        print(f"File not found, skipping: {file_path}")


Processing data for year 2025...
Replaced 'id' with '_id' in 2025 data.
Appended 4261 records into c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_all.json.
Cleaned 'ad_creative_bodies' and saved to c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads/2025\data_all_cleaned.json.

Processing complete.
Removed file: c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads\2025\data_all.json


In [3]:
### Push to mongo
from pymongo import MongoClient
import json
import os
from tqdm import tqdm

client = MongoClient('mongodb://localhost:27017')
db = client["volby_2025_db"]
collection = db["volby_2025"]

for year in range(2025, 2026):
    base_dir = os.getcwd()
    output_dir = os.path.join(base_dir, f"volby_2025_ads\\{year}")
    file_path = os.path.join(output_dir, "data_all_cleaned.json")

    with open(file_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    new_count = 0
    updated_count = 0

    print(f"\nProcessing year {year}...")
    for document in tqdm(json_data, desc="Inserting documents", unit="doc"):
        result = collection.replace_one(
            {"_id": document["_id"]},
            document,
            upsert=True
        )
        if result.upserted_id is not None:
            new_count += 1
        elif result.modified_count == 1:
            updated_count += 1

    print(f"Year {year} Results:")
    print(f"New documents inserted: {new_count}")
    print(f"Existing documents updated: {updated_count}\n")

client.close()

# Remove the processed files after completion
for year in range(2025, 2026):
    output_dir = os.path.join(base_dir, f"volby_2025_ads\\{year}")
    file_path = os.path.join(output_dir, "data_all_cleaned.json")
    
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Removed file: {file_path}")
    else:
        print(f"File not found, skipping: {file_path}")


Processing year 2025...


Inserting documents: 100%|██████████| 4261/4261 [00:03<00:00, 1216.64doc/s]

Year 2025 Results:
New documents inserted: 1441
Existing documents updated: 2820

Removed file: c:\Users\jirip\Documents\Developer\python\political_ads\volby_2025_ads\2025\data_all_cleaned.json





In [8]:
### Check distinct values in MongoDB
from pymongo import MongoClient
import pandas as pd

client = MongoClient('mongodb://localhost:27017')
db = client["volby_2025_db"]
collection = db["volby_2025"]


distinct_page_id= collection.distinct("page_id")
distinct_id= collection.distinct("_id")

print(f"Total distinct page_ids: {len(distinct_page_id)}")
print(f"Total distinct _id: {len(distinct_id)}")


# Load the CSV file into a DataFrame
csv_path = "C:\\Users\\jirip\\Documents\\Developer\\python\\political_ads\\image_urls.csv"
df = pd.read_csv(csv_path)

# Create a list of page_id
page_id_list = df["page_id"].tolist()

print(f"Loaded {len(page_id_list)} page IDs from the CSV.")

# Ensure both sets contain strings
page_id_list = set(map(str, page_id_list))
distinct_page_id = set(map(str, distinct_page_id))
# Find differences between page_id_list and distinct_page_id
page_ids_in_csv_not_in_db = set(page_id_list) - set(distinct_page_id)
page_ids_in_db_not_in_csv = set(distinct_page_id) - set(page_id_list)

print(f"Page IDs in CSV but not in MongoDB: {len(page_ids_in_csv_not_in_db)}")
print(f"Page IDs in MongoDB but not in CSV: {len(page_ids_in_db_not_in_csv)}")

if page_ids_in_csv_not_in_db:
    print("Sample Page IDs in CSV but not in MongoDB:", list(page_ids_in_csv_not_in_db)[:200])

if page_ids_in_db_not_in_csv:
    print("Sample Page IDs in MongoDB but not in CSV:", list(page_ids_in_db_not_in_csv)[:200])
# pipeline = [
#     {"$group": {"_id": "$page_id", "page_names": {"$addToSet": "$page_name"}}},
#     {"$project": {"_id": 1, "page_names": 1, "count": {"$size": "$page_names"}}},
#     {"$match": {"count": {"$gt": 1}}}
# ]
#  
# results = collection.aggregate(pipeline)
# 
# for result in results:
#     print(f"Page ID: {result['_id']}, Page Names: {result['page_names']}")

Total distinct page_ids: 185
Total distinct _id: 10326
Loaded 892 page IDs from the CSV.
Page IDs in CSV but not in MongoDB: 707
Page IDs in MongoDB but not in CSV: 0
Sample Page IDs in CSV but not in MongoDB: ['104818162305682', '107685452090075', '112020906847068', '102176319150440', '104315995914919', '107996628144164', '111857801623752', '101756458187757', '108624290965848', '1510242952524260', '1755511574766906', '743012349164310', '100761461345420', '1993685137318931', '303091886551823', '1354343051279168', '623900784396150', '105134482009598', '109501160564065', '317802208282505', '107814901486149', '852785408073624', '107942197433640', '656594044727811', '1824133914469306', '400970967000022', '687843237907830', '126192198099', '634887113291677', '111740250703509', '104280985749225', '108342850697974', '250391278786394', '58887737763', '285098274162', '291971547991645', '112928034739894', '101862641420449', '1047063658789197', '170261140345723', '463118950345', '194268297419434'

In [5]:
### Download logos from Facebook Ads Library
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import requests
import os
from pymongo import MongoClient

def setup_mongo_connection(uri="mongodb://localhost:27017", db_name="spending_db", collection_name="spending"):
    """Establishes a connection to MongoDB and returns the collection."""
    client = MongoClient(uri)
    return client[db_name][collection_name]

def setup_chrome_driver():
    """Initializes and returns a Selenium Chrome WebDriver."""
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode for efficiency
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    return webdriver.Chrome(service=service, options=options)

def navigate_to_page(driver, page_id, delay=2):
    """Navigates to the Facebook Ads Library page for a given page ID."""
    url = f"https://www.facebook.com/ads/library/?active_status=all&ad_type=political_and_issue_ads&country=CZ&is_targeted_country=false&media_type=all&search_type=page&source=ad-report&view_all_page_id={page_id}"
    driver.get(url)
    time.sleep(delay)

def download_image(driver, page_id, output_dir="ad_data"):
    """Downloads the logo image for a given page ID and saves it locally."""
    os.makedirs(output_dir, exist_ok=True)
    try:
        div_element = driver.find_element(By.CSS_SELECTOR, "div.x9f619.x1n2onr6.x1ja2u2z")
        image_element = div_element.find_element(By.TAG_NAME, "img")
        image_src = image_element.get_attribute("src")
        img_data = requests.get(image_src, timeout=5).content
        image_path = os.path.join(output_dir, f"{page_id}_logo.png")
        with open(image_path, 'wb') as handler:
            handler.write(img_data)
        print(f"Downloaded image for Page ID {page_id}")
    except Exception as e:
        print(f"Error downloading image for {page_id}: {e}")

def main():
    collection = setup_mongo_connection()
    all = collection.distinct("page_id")
    
    # Construct relative path and read in one step
    df = pd.read_csv(os.path.join("image_urls.csv"))
    done = df["page_id"].unique().tolist()
    
    # page_ids = list(set(all) - set(done))
    page_ids = ['113081303665953', '2258789304193151', '274791392594407', '827811930672498', '316873818412560']
    print(f"Remaining Page IDs to process: {len(page_ids)}")
    driver = setup_chrome_driver()
    
    try:
        for page_id in page_ids:
            print(f"Processing Page ID: {page_id}")
            navigate_to_page(driver, page_id)
            download_image(driver, page_id)
    except Exception as e:
        print(f"Unexpected error: {e}")
    finally:
        driver.quit()
        print("Driver closed.")

if __name__ == "__main__":
    main()

Remaining Page IDs to process: 5
Processing Page ID: 113081303665953
Downloaded image for Page ID 113081303665953
Processing Page ID: 2258789304193151
Downloaded image for Page ID 2258789304193151
Processing Page ID: 274791392594407
Downloaded image for Page ID 274791392594407
Processing Page ID: 827811930672498
Downloaded image for Page ID 827811930672498
Processing Page ID: 316873818412560
Downloaded image for Page ID 316873818412560
Driver closed.


In [6]:
### Circular crop images

from PIL import Image, ImageDraw
import numpy as np
import os

# Define output directory
output_dir = os.path.join(os.getcwd(), "ad_data")

# Ensure the directory exists before listing files
if not os.path.exists(output_dir):
    raise FileNotFoundError(f"Directory '{output_dir}' does not exist.")

# List images with '_logo.png' suffix
image_names = [file for file in os.listdir(output_dir) if file.endswith("_logo.png")]
image_names = ['113081303665953_logo.png', '2258789304193151_logo.png', '274791392594407_logo.png', '827811930672498_logo.png', '316873818412560_logo.png']


def circular_crop(image_path, output_path):
    # Open the image
    img = Image.open(image_path).convert("RGBA")
    
    # Create same size mask with transparent background
    mask = Image.new("L", img.size, 0)
    draw = ImageDraw.Draw(mask)

    # Define the circular region (centered)
    size = min(img.size)
    left = (img.width - size) // 2
    top = (img.height - size) // 2
    right = left + size
    bottom = top + size

    # Draw a white filled circle on the mask
    draw.ellipse((left, top, right, bottom), fill=255)

    # Apply mask to image
    circular_img = Image.new("RGBA", img.size, (0, 0, 0, 0))
    circular_img.paste(img, (0, 0), mask=mask)

    # Crop the circular region and save
    circular_img = circular_img.crop((left, top, right, bottom))
    circular_img.save(output_path, format="PNG")


# Loop through all images and apply circular cropping
for image_name in image_names:
    input_path = os.path.join(output_dir, image_name)
    output_path = os.path.join(output_dir, image_name.replace("_logo.png", "_logo_circular.png"))
    circular_crop(input_path, output_path)
    print(f"Circular cropped image saved at: {output_path}")
    if os.path.exists(input_path):
        os.remove(input_path)
        print(f"Deleted original image: {input_path}")

Circular cropped image saved at: c:\Users\jirip\Documents\Developer\python\political_ads\ad_data\113081303665953_logo_circular.png
Deleted original image: c:\Users\jirip\Documents\Developer\python\political_ads\ad_data\113081303665953_logo.png
Circular cropped image saved at: c:\Users\jirip\Documents\Developer\python\political_ads\ad_data\2258789304193151_logo_circular.png
Deleted original image: c:\Users\jirip\Documents\Developer\python\political_ads\ad_data\2258789304193151_logo.png
Circular cropped image saved at: c:\Users\jirip\Documents\Developer\python\political_ads\ad_data\274791392594407_logo_circular.png
Deleted original image: c:\Users\jirip\Documents\Developer\python\political_ads\ad_data\274791392594407_logo.png
Circular cropped image saved at: c:\Users\jirip\Documents\Developer\python\political_ads\ad_data\827811930672498_logo_circular.png
Deleted original image: c:\Users\jirip\Documents\Developer\python\political_ads\ad_data\827811930672498_logo.png
Circular cropped image

In [7]:
#### Create CSV with image URLs
import os
import pandas as pd

# Define output directory
input_dir = os.path.join(os.getcwd(), "ad_data")

# Ensure the directory exists before listing files
if not os.path.exists(input_dir):
    raise FileNotFoundError(f"Directory '{input_dir}' does not exist.")

# List images with '_logo.png' suffix
image_names = [file for file in os.listdir(input_dir) if file.endswith("_logo_circular.png")]

# Function to generate raw GitHub URLs
def github_to_raw_url(image_name: str) -> str:
    return f"https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/ad_data/{image_name}"

# Prepare data for the DataFrame
df = pd.DataFrame([
    {"page_id": str(image.replace("_logo_circular.png", "")), "url_key": github_to_raw_url(image)}
    for image in image_names
])

# Ensure 'page_id' is stored explicitly as a string
df["page_id"] = df["page_id"].astype(str)

# Save to CSV
csv_path = os.path.join(os.getcwd(), "image_urls.csv")
df.to_csv(csv_path, index=False)
print(f"CSV file saved at: {csv_path}")

# Print raw URLs
for _, row in df.iterrows():
    print(f"Raw URL: {row['url_key']}")


CSV file saved at: c:\Users\jirip\Documents\Developer\python\political_ads\image_urls.csv
Raw URL: https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/ad_data/100117115202167_logo_circular.png
Raw URL: https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/ad_data/100136888779860_logo_circular.png
Raw URL: https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/ad_data/100174253112291_logo_circular.png
Raw URL: https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/ad_data/100187828478849_logo_circular.png
Raw URL: https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/ad_data/100191411972962_logo_circular.png
Raw URL: https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/ad_data/100197192559708_logo_circular.png
Raw URL: https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/ad_data/100201752249169_logo_circular.png
Raw URL: https://raw.githubusercontent.com

In [None]:
import os
from pymongo import MongoClient
import pandas as pd

client = MongoClient('mongodb://localhost:27017')
db = client["volby_2025_db"]
collection = db["v_volby_2025_urls"]

distinct_id= collection.distinct("_id")

print(f"Total distinct _id: {len(distinct_id)}")


In [9]:
from pymongo import MongoClient


client = MongoClient('mongodb://localhost:27017')
db = client["volby_2025_db"]
collection = db["v_volby_2025_urls"]

ad_pairs = [
    (str(doc["_id"]), doc.get("ad_snapshot_url", ""))
    for doc in collection.find(
        {}, 
        {"_id": 1, "ad_snapshot_url": 1}  # Projection: no duplicate "_id"
    )
]
print(ad_pairs[:5])  # Show a sample of the result

[('1167966842045760', 'https://www.facebook.com/ads/archive/render_ad/?id=1167966842045760&access_token=EAALnc8im5MUBPNNdiuRoEp18NscWZAZAKYTT0oNkKAIZB7yu46T6ES3xHTesd8SvPB3NGl5tnHp9atwdIi0sCLStEFZBl41WMi2tiw7PwimuPRZBCbSOlReRHDqvnQZA1nE1R0gWCsWsNLj4zU3DPlhu6Iyr3zv5nkPlyiZCixrSyPZAv9bdgUYZCCqAFGHZAuYZB5veLzm8e34w109C7JV'), ('1935222390575014', 'https://www.facebook.com/ads/archive/render_ad/?id=1935222390575014&access_token=EAALnc8im5MUBPNNdiuRoEp18NscWZAZAKYTT0oNkKAIZB7yu46T6ES3xHTesd8SvPB3NGl5tnHp9atwdIi0sCLStEFZBl41WMi2tiw7PwimuPRZBCbSOlReRHDqvnQZA1nE1R0gWCsWsNLj4zU3DPlhu6Iyr3zv5nkPlyiZCixrSyPZAv9bdgUYZCCqAFGHZAuYZB5veLzm8e34w109C7JV'), ('1826728784721903', 'https://www.facebook.com/ads/archive/render_ad/?id=1826728784721903&access_token=EAALnc8im5MUBPNNdiuRoEp18NscWZAZAKYTT0oNkKAIZB7yu46T6ES3xHTesd8SvPB3NGl5tnHp9atwdIi0sCLStEFZBl41WMi2tiw7PwimuPRZBCbSOlReRHDqvnQZA1nE1R0gWCsWsNLj4zU3DPlhu6Iyr3zv5nkPlyiZCixrSyPZAv9bdgUYZCCqAFGHZAuYZB5veLzm8e34w109C7JV'), ('1101833192055015', 'https://

In [10]:
print(f"Total ad pairs retrieved: {len(ad_pairs)}")

Total ad pairs retrieved: 10326


In [14]:
from pathlib import Path
SCREENSHOT_DIR = Path("volby_2025_ads/screenshots")
# create list of ids from files in SCREENSHOT_DIR (file stems without extension)
if not SCREENSHOT_DIR.exists():
    raise FileNotFoundError(f"Directory '{SCREENSHOT_DIR}' does not exist.")

screenshot_ids = [
    p.stem
    for p in SCREENSHOT_DIR.iterdir()
    if p.is_file() and p.suffix.lower() in {".jpg", ".jpeg", ".png"}
]

print(f"Found {len(screenshot_ids)} ids")



Found 8431 ids


In [13]:
# Exclude screenshot_ids from ad_pairs
# Requires variables `ad_pairs` (list of (ad_id, url)) and `screenshot_ids` (list of ids) to be defined in other cells.

screenshot_set = set(map(str, screenshot_ids))
filtered_ad_pairs = [(ad_id, url) for ad_id, url in ad_pairs if str(ad_id) not in screenshot_set]

print(f"Original ad_pairs: {len(ad_pairs)}")
print(f"Filtered ad_pairs (excluding screenshots): {len(filtered_ad_pairs)}")
print("Sample remaining ad_pairs:", filtered_ad_pairs[:10])

Original ad_pairs: 10326
Filtered ad_pairs (excluding screenshots): 1895
Sample remaining ad_pairs: [('24015784968093483', 'https://www.facebook.com/ads/archive/render_ad/?id=24015784968093483&access_token=EAALnc8im5MUBPNNdiuRoEp18NscWZAZAKYTT0oNkKAIZB7yu46T6ES3xHTesd8SvPB3NGl5tnHp9atwdIi0sCLStEFZBl41WMi2tiw7PwimuPRZBCbSOlReRHDqvnQZA1nE1R0gWCsWsNLj4zU3DPlhu6Iyr3zv5nkPlyiZCixrSyPZAv9bdgUYZCCqAFGHZAuYZB5veLzm8e34w109C7JV'), ('1551144382538287', 'https://www.facebook.com/ads/archive/render_ad/?id=1551144382538287&access_token=EAALnc8im5MUBPNNdiuRoEp18NscWZAZAKYTT0oNkKAIZB7yu46T6ES3xHTesd8SvPB3NGl5tnHp9atwdIi0sCLStEFZBl41WMi2tiw7PwimuPRZBCbSOlReRHDqvnQZA1nE1R0gWCsWsNLj4zU3DPlhu6Iyr3zv5nkPlyiZCixrSyPZAv9bdgUYZCCqAFGHZAuYZB5veLzm8e34w109C7JV'), ('3279681472189268', 'https://www.facebook.com/ads/archive/render_ad/?id=3279681472189268&access_token=EAALnc8im5MUBPNNdiuRoEp18NscWZAZAKYTT0oNkKAIZB7yu46T6ES3xHTesd8SvPB3NGl5tnHp9atwdIi0sCLStEFZBl41WMi2tiw7PwimuPRZBCbSOlReRHDqvnQZA1nE1R0gWCsWsNLj4zU3

In [15]:
# DOWNLOAD AND COMPRESS IMAGES OF AD IDs FROM A FOLDER

import os
import json
import time
from pathlib import Path
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pymongo import MongoClient



# Constants
# JSON_DIR = Path(r"C:/Users/jirip/Documents/Developer/python/political_ads/volby_2025_ads/2025")
SCREENSHOT_DIR = Path("volby_2025_ads/screenshots")
SCREENSHOT_DIR.mkdir(exist_ok=True)
# MAX_ITEMS = None  # Set to e.g. 100 to limit
MAX_ITEMS = 10  # Set to e.g. 100 to limit
access_token = "EAALnc8im5MUBPNNdiuRoEp18NscWZAZAKYTT0oNkKAIZB7yu46T6ES3xHTesd8SvPB3NGl5tnHp9atwdIi0sCLStEFZBl41WMi2tiw7PwimuPRZBCbSOlReRHDqvnQZA1nE1R0gWCsWsNLj4zU3DPlhu6Iyr3zv5nkPlyiZCixrSyPZAv9bdgUYZCCqAFGHZAuYZB5veLzm8e34w109C7JV"  # Use environment variable

# Setup Chrome driver
def setup_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,3000")
    return webdriver.Chrome(options=options)

# Handle Facebook popup
def handle_facebook_popup(driver):
    driver.get("https://www.facebook.com/")
    try:
        WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((
                By.XPATH,
                '/html/body/div[3]/div[2]/div/div/div/div/div[3]/div[2]/div/div[1]/div[2]/div/div[1]'
            ))
        ).click()
        print("[✓] Facebook popup handled.")
    except Exception:
        print("[!] No popup or already accepted.")

# Process a single ad
def process_ad(driver, url, ad_id):
    png_path = SCREENSHOT_DIR / f"{ad_id}.png"
    jpg_path = SCREENSHOT_DIR / f"{ad_id}.jpg"

    try:
        driver.get(url)
        element = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div/div/div/div'))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", element)
        element.screenshot(str(png_path))
        print(f"[✓] Screenshot: {ad_id}")
    except Exception as e:
        print(f"[!] Screenshot failed: {ad_id}")
        print(f"Error: {e}")
        return

    try:
        with Image.open(png_path) as img:
            img.convert("RGB").save(jpg_path, "JPEG", quality=60)
        png_path.unlink()
        print(f"    └─ Compressed: {ad_id}")
    except Exception:
        print(f"[!] Compression failed: {ad_id}")

def main():
    client = MongoClient('mongodb://localhost:27017')
    db = client["volby_2025_db"]
    collection = db["v_volby_2025_urls"]
    ads = collection.distinct("_id")
    print(f"Found {len(ads)} ads.")
        
    processed_ads = [
    os.path.splitext(f)[0]
    for f in os.listdir(SCREENSHOT_DIR)
    if os.path.isfile(os.path.join(SCREENSHOT_DIR, f))
    ]
    print(f"Already processed {len(processed_ads)} ads.")
    remaining_ads = set(ads) - set(processed_ads)
    print(f"Remaining ads to process: {len(remaining_ads)}")
    if not remaining_ads:
        print("No ads to process.")
        return

    if MAX_ITEMS:
        ads = ads[:MAX_ITEMS]

    driver = setup_driver()
    handle_facebook_popup(driver)
    time.sleep(2)

    for idx, ad_id in enumerate(remaining_ads, 1):
        url = f'https://www.facebook.com/ads/archive/render_ad/?id={ad_id}&access_token={access_token}'
        print(f"\n[{idx}/{len(remaining_ads)}] Processing ad: {ad_id}")
        process_ad(driver, url, ad_id)

    driver.quit()

if __name__ == "__main__":
    main()
    

Found 10326 ads.
Already processed 8431 ads.
Remaining ads to process: 1895
[✓] Facebook popup handled.

[1/1895] Processing ad: 1830303304552165
[✓] Screenshot: 1830303304552165
    └─ Compressed: 1830303304552165

[2/1895] Processing ad: 1306825314569274
[✓] Screenshot: 1306825314569274
    └─ Compressed: 1306825314569274

[3/1895] Processing ad: 1324157475900456
[✓] Screenshot: 1324157475900456
    └─ Compressed: 1324157475900456

[4/1895] Processing ad: 1307179827812165
[✓] Screenshot: 1307179827812165
    └─ Compressed: 1307179827812165

[5/1895] Processing ad: 1044626220854366
[✓] Screenshot: 1044626220854366
    └─ Compressed: 1044626220854366

[6/1895] Processing ad: 2852857991568605
[✓] Screenshot: 2852857991568605
    └─ Compressed: 2852857991568605

[7/1895] Processing ad: 1493593338320808
[✓] Screenshot: 1493593338320808
    └─ Compressed: 1493593338320808

[8/1895] Processing ad: 2299174610539739
[✓] Screenshot: 2299174610539739
    └─ Compressed: 2299174610539739

[9/1895

In [None]:
import os
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
db = client["spending_db"]
collection = db["v_spending_urls"]
ads = collection.distinct("_id")

# Use the SCREENSHOT_DIR variable which points to 'screenshots'
file_names = [
    os.path.splitext(f)[0]
    for f in os.listdir(SCREENSHOT_DIR)
    if os.path.isfile(os.path.join(SCREENSHOT_DIR, f))
]

print(len(file_names))
print(len(ads))

remaining_ads = set(ads) - set(file_names)
print(f"Remaining ads to process: {len(remaining_ads)}")

In [None]:
# Create CSV with ad image URLs
import os
import pandas as pd
import re
# Path to the screenshots folder
input_dir = os.path.join(os.getcwd(), "volby_2025_ads/screenshots")

# Get all jpg files in the folder
files = [f for f in os.listdir(input_dir) if f.endswith('.jpg')]

# Create empty lists to store data
ad_ids = []
ad_urls = []

# Process each file
for file in files:
    # Extract the ID from the filename using regex
    match = re.match(r'(\d+)\.jpg', file)
    if match:
        ad_id = match.group(1)
    else:
        continue

    # Create the URL for this file
    url = f"https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/volby_2025_ads/screenshots/{ad_id}.jpg"

    # Append to our lists
    ad_ids.append(ad_id)
    ad_urls.append(url)

# Create the DataFrame
df = pd.DataFrame({
    'ad_id': ad_ids,
    'ad_url': ad_urls
})

# Display the first few rows of the DataFrame
print(df.head())

# Optionally save to CSV
df.to_csv('ad_urls.csv', index=False)


In [None]:
### GET URLs from MongoDB
from pymongo import MongoClient
import pandas as pd

client = MongoClient('mongodb://localhost:27017')
db = client["spending_db"]
collection = db["v_spending_urls"]

# Fetch all documents from the collection and load into a DataFrame
data = list(collection.find())
df = pd.DataFrame(data)
print(df.head())
