In [None]:
### Extract
import requests
import json
import os
import time
from urllib.parse import quote


def get_next_page(response):
    """Return the URL of the following result page, or None when there is none.

    Parameters:
        response (dict): Decoded JSON body of one API response.

    Returns:
        The value stored under ``response["paging"]["next"]``; ``None`` if
        either key is absent.
    """
    paging_info = response.get("paging", {})
    next_url = paging_info.get("next")
    return next_url


def fetch_data(api_url, timeout=30, page_delay=2):
    """Fetch every page of a paginated API result and return the combined records.

    Parameters:
        api_url (str): URL of the first page. A falsy value returns ``[]``.
        timeout (float): Seconds to wait for each HTTP request before giving up.
        page_delay (float): Seconds to pause between consecutive page requests.

    Returns:
        list: The concatenated ``"data"`` entries of all pages fetched. When a
        request fails, the records collected so far are returned (best effort).
    """
    import re  # local import: only needed to redact the token from error output

    all_data = []

    while api_url:
        try:
            # Without a timeout a stalled connection would block the notebook forever
            response = requests.get(api_url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
            json_data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # The exception text contains the request URL, which carries the
            # access token as a query parameter; never print it verbatim.
            safe_message = re.sub(r"(access_token=)[^&\s]+", r"\1<redacted>", str(e))
            print(f"Request failed: {safe_message}")
            break

        data = json_data.get("data", [])
        all_data.extend(data)

        print(f"Extracted {len(data)} records. Total: {len(all_data)}")

        api_url = get_next_page(json_data)
        if api_url:
            time.sleep(page_delay)  # Respect API rate limits (no need to wait after the last page)

    return all_data


def _safe_filename_part(text):
    """Replace characters that are not allowed in file names on common filesystems with '_'."""
    forbidden = '\\/:*?"<>|'
    return "".join("_" if ch in forbidden or ord(ch) < 32 else ch for ch in text)


def extract_data(bylines, output_dir, access_token, year, api_version="v21.0", country="CZ", language="cs"):
    """
    Extracts and saves ad data from the Facebook Ads API for a list of bylines.

    One request chain is issued per byline (via ``fetch_data``) and the result,
    if non-empty, is written to ``<output_dir>/data_<byline>.json``. Characters
    in the byline that are invalid in file names are replaced with ``_``.

    Parameters:
        bylines (list): List of bylines to query.
        output_dir (str): Directory to save the JSON files.
        access_token (str): Facebook API access token.
        year (int): Year for filtering ads.
        api_version (str): API version (default: v21.0).
        country (str): Country code for filtering ads (default: CZ).
        language (str): Language code for filtering ads (default: cs).
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists

    fields = ",".join([
        "id", "ad_snapshot_url", "ad_creation_time", "ad_creative_bodies", "ad_creative_link_captions",
        "ad_creative_link_descriptions", "ad_creative_link_titles", "ad_delivery_start_time",
        "ad_delivery_stop_time", "bylines", "currency", "delivery_by_region", "demographic_distribution",
        "estimated_audience_size", "impressions", "languages", "page_id", "page_name", "publisher_platforms",
        "spend", "target_locations", "target_gender", "target_ages", "eu_total_reach", "beneficiary_payers",
        "age_country_gender_reach_breakdown"
    ])

    for byline in bylines:
        print(f"Extracting Data for: {byline}")
        # json.dumps escapes quotes/backslashes inside the byline, which a
        # hand-built '["..."]' string would not; the output is identical for
        # bylines without such characters.
        encoded_byline = quote(json.dumps([byline], ensure_ascii=False), safe="")

        api_url = (
            f"https://graph.facebook.com/{api_version}/ads_archive?"
            f"bylines={encoded_byline}&ad_type=POLITICAL_AND_ISSUE_ADS"
            f"&ad_reached_countries=['{country}']&access_token={access_token}"
            f"&unmask_removed_content=true&fields={fields}&limit=199"
            f"&search_terms=''&languages=['{language}']"
            f"&ad_delivery_date_min={year}-01-01"
            f"&ad_delivery_date_max={year+1}-01-01"
        )

        extracted_data = fetch_data(api_url)
        time.sleep(5)  # Delay between different bylines

        if extracted_data:
            # A raw byline containing e.g. '/' or ':' would point at a
            # non-existent sub-directory or be rejected by the filesystem.
            filename = os.path.join(output_dir, f"data_{_safe_filename_part(byline)}.json")
            with open(filename, "w", encoding="utf-8") as json_file:
                json.dump(extracted_data, json_file, indent=4, ensure_ascii=False)
            print(f"Saved data to {filename}")
        else:
            print(f"No data extracted for {byline}.")


# Parameters
year = 2023
base_dir = os.getcwd()
output_dir = os.path.join(base_dir, f"bylines_ads/{year}")
# bylines = ["SH media, spol. s r.o."]
bylines = ['Česká pirátská strana', 'EUROPEUM Institute for European Policy', 'FTV Prima', 'ANO', 'Naše zdravotnictví', 'Svoboda a přímá demokracie (SPD)', 'ČSOB', 'ODS', 'Lékaři bez hranic - Médecins Sans Frontières in Czech Republic, o. p. s.', 'Aliance pro budoucnost', 'STAN', 'Starostové a nezávislí • STAN', 'PRAHA SOBĚ', 'Člověk v tísni, o.p.s.', 'SH media, spol. s r.o.', 'Komunistická strana Čech a Moravy', 'Zdeněk Hraba - senátor', 'Ondrej Prokop', 'Ministerstvo práce a sociálních věcí', 'Tomáš Zdechovský', 'Milion Chvilek, z. s.', 'EU Social', 'EU Justice and Consumers', 'ODS - Občanská demokratická strana', 'Sociální demokracie', 'Amnesty International ČR', 'Občanská demokratická strana', 'Tipsport', 'Karel Janeček', 'Martin Kuba', 'Starostové a nezávislí', 'Svoboda zvířat Plzeň, z.s.', 'CARE Česká republika', 'Replastuj.cz', 'ANO 2011', 'Reportér magazín', 'Andrej Babiš', 'Člověk v tísni o.p.s.', 'DFMG', 'TOP 09', 'SEN 21', 'Kupředu do minulosti s.r.o.', 'Ministerstvo pro místní rozvoj ČR', 'Svoboda a přímá demokracie', 'Greenpeace Česká republika', 'STAROSTOVÉ A NEZÁVISLÍ', 'KDU-ČSL', 'Česká pirátská strana - Praha', 'XTV', 'Hnutí DUHA - Přátelé Země Česká republika', 'Zelení - Strana zelených', 'PŘÍSAHA - občanské hnutí Roberta Šlachty', 'Český rozhlas', 'CZECH NEWS CENTER a. s.', 'Oldřich Hájek', 'Transparency International ČR']

# SECURITY: the token must never be stored in the notebook. It is read from the
# environment instead. A token that was previously committed in this cell has to
# be treated as compromised and revoked in the Meta developer console.
access_token = os.environ.get("FB_ACCESS_TOKEN")

if not access_token:
    raise ValueError("Access token is missing. Set the 'FB_ACCESS_TOKEN' environment variable.")

# Execute extraction
extract_data(bylines, output_dir, access_token, year)


In [None]:
### Process
import os
import json
import re


def replace_id_with_underscore_id(json_data):
    """Recursively rename every 'id' key to '_id' in decoded JSON data.

    Dicts and lists are walked to any depth and modified in place; other
    values are left untouched.

    Parameters:
        json_data: A dict, list, or scalar as produced by ``json.load``.

    Returns:
        The same ``json_data`` object after renaming. Returning it lets the
        function be used as a transform whose result is written back to disk;
        without the return value such a caller would serialise ``None``.
    """
    if isinstance(json_data, dict):
        if "id" in json_data:
            # Rename before iterating so the dict is not resized mid-iteration
            json_data["_id"] = json_data.pop("id")
        for value in json_data.values():
            replace_id_with_underscore_id(value)
    elif isinstance(json_data, list):
        for item in json_data:
            replace_id_with_underscore_id(item)
    return json_data


def process_json_files(folder_path, process_function):
    """Apply a transformation to every ``.json`` file in a folder, rewriting each file.

    Parameters:
        folder_path (str): Directory whose ``*.json`` files are processed
            (non-recursive). Other files are ignored.
        process_function (callable): Called with the decoded content of each
            file. It may return the transformed data, or modify its argument
            in place and return ``None``; in the latter case the (mutated)
            input is written back instead of ``null``.
    """
    # Sorted so files are handled in a reproducible order
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)

        processed_data = process_function(data)
        if processed_data is None:
            # In-place transform: keep the mutated input rather than
            # overwriting the file with JSON null.
            processed_data = data

        with open(file_path, "w", encoding="utf-8") as json_file:
            json.dump(processed_data, json_file, indent=4, ensure_ascii=False)


def append_json_files(folder_path, exclude=("data_all.json", "data_all_cleaned.json")):
    """Combine the list contents of all JSON files in a directory into one list.

    Parameters:
        folder_path (str): Directory whose ``*.json`` files are read (non-recursive).
        exclude (iterable of str): File names to skip. Defaults to the combined
            output files this notebook writes into the same folder, so that
            re-running the processing step does not feed earlier aggregates
            back in and duplicate every record. Pass ``()`` to read every file.

    Returns:
        list: Items of every included file, files taken in sorted name order.
    """
    skipped = set(exclude)
    all_data = []
    # Sorted so the combined order does not depend on the filesystem
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json") or filename in skipped:
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as json_file:
            all_data.extend(json.load(json_file))
    return all_data


def remove_emojis(text):
    """Delete emoji, symbol code points and newlines from ``text``.

    Every character matched by the class below is removed outright (nothing is
    inserted in its place). Note that the ranges are broad: U+0200-U+2BFF and
    U+24C2-U+1F251 also contain non-emoji characters such as Greek, Cyrillic,
    general punctuation and CJK, so those are removed as well.

    Parameters:
        text (str): Input string.

    Returns:
        str: ``text`` without the matched characters.
    """
    character_class = "".join((
        "[",
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF",  # Transport and Map Symbols
        "\U0001F1E0-\U0001F1FF",  # Flags
        "\U00002702-\U000027B0",  # Dingbats
        "\U000024C2-\U0001F251",  # Enclosed characters
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        "\U00002600-\U000026FF",  # Miscellaneous Symbols
        "\U00000200-\U00002BFF",  # Additional symbols
        "\U0001F004",             # Mahjong tiles
        "\U0001F0CF",             # Playing cards
        "\n",                     # Line break
        "]",
    ))
    return re.sub(character_class, "", text, flags=re.UNICODE)


def clean_ad_creative_bodies(data):
    """Run ``remove_emojis`` over the 'ad_creative_bodies' field of every record.

    Records are modified in place. A string value is cleaned directly, a list
    value is replaced by a new list of cleaned entries, and any other value
    (or a missing key) is left as it is.

    Parameters:
        data (list): Records (dicts) to clean.

    Returns:
        list: The same ``data`` object.
    """
    field = "ad_creative_bodies"
    for record in data:
        if field not in record:
            continue
        bodies = record[field]
        if isinstance(bodies, str):
            record[field] = remove_emojis(bodies)
        elif isinstance(bodies, list):
            record[field] = list(map(remove_emojis, bodies))
    return data


# Run the three processing stages for each year that has an extraction folder
for year in range(2022, 2026):
    base_dir = os.getcwd()
    output_dir = os.path.join(base_dir, f"bylines_ads/{year}")

    if os.path.exists(output_dir):
        print(f"Processing data for year {year}...")

        # Stage 1: rename 'id' -> '_id' inside every JSON file of the folder
        process_json_files(output_dir, replace_id_with_underscore_id)
        print(f"Replaced 'id' with '_id' in {year} data.")

        # Stage 2: merge the per-byline files and store the combined list
        appended_data = append_json_files(output_dir)
        all_data_file = os.path.join(output_dir, "data_all.json")
        with open(all_data_file, "w", encoding="utf-8") as json_file:
            json.dump(appended_data, json_file, indent=4, ensure_ascii=False)
        print(f"Appended {len(appended_data)} records into {all_data_file}.")

        # Stage 3: strip emojis from 'ad_creative_bodies' and store the cleaned list
        cleaned_data = clean_ad_creative_bodies(appended_data)
        cleaned_data_file = os.path.join(output_dir, "data_all_cleaned.json")
        with open(cleaned_data_file, "w", encoding="utf-8") as json_file:
            json.dump(cleaned_data, json_file, indent=4, ensure_ascii=False)
        print(f"Cleaned 'ad_creative_bodies' and saved to {cleaned_data_file}.\n")
    else:
        # Nothing was extracted for this year
        print(f"Skipping {year} - No data found.")

print("Processing complete.")


# Remove the intermediate combined files (data_all.json) after completion.
# The path is built from separate components so it is valid on every OS; a
# literal backslash inside the path string only works on Windows.
base_dir = os.getcwd()
for year in range(2022, 2026):
    output_dir = os.path.join(base_dir, "bylines_ads", str(year))
    file_path = os.path.join(output_dir, "data_all.json")

    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Removed file: {file_path}")
    else:
        print(f"File not found, skipping: {file_path}")


In [None]:
### Push to mongo
from pymongo import MongoClient
import json
import os
from tqdm import tqdm

client = MongoClient('mongodb://localhost:27017')
db = client["spending_db"]
collection = db["spending"]

# try/finally guarantees the connection is released even if a year fails midway
try:
    for year in range(2022, 2026):
        base_dir = os.getcwd()
        # Separate path components keep this portable; a literal backslash in
        # the path string would only resolve on Windows.
        output_dir = os.path.join(base_dir, "bylines_ads", str(year))
        file_path = os.path.join(output_dir, "data_all_cleaned.json")

        # Years without a cleaned file have nothing to load
        if not os.path.exists(file_path):
            print(f"\nSkipping year {year} - file not found: {file_path}")
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)

        new_count = 0
        updated_count = 0

        print(f"\nProcessing year {year}...")
        for document in tqdm(json_data, desc="Inserting documents", unit="doc"):
            # Upsert keyed on '_id': inserts new ads, replaces existing ones
            result = collection.replace_one(
                {"_id": document["_id"]},
                document,
                upsert=True
            )
            if result.upserted_id is not None:
                new_count += 1
            elif result.modified_count == 1:
                # An existing document that was already identical is counted in neither bucket
                updated_count += 1

        print(f"Year {year} Results:")
        print(f"New documents inserted: {new_count}")
        print(f"Existing documents updated: {updated_count}\n")
finally:
    client.close()

# Remove the cleaned combined files (data_all_cleaned.json) after they were loaded.
# The path is built from separate components so it is valid on every OS; a
# literal backslash inside the path string only works on Windows.
base_dir = os.getcwd()
for year in range(2022, 2026):
    output_dir = os.path.join(base_dir, "bylines_ads", str(year))
    file_path = os.path.join(output_dir, "data_all_cleaned.json")

    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Removed file: {file_path}")
    else:
        print(f"File not found, skipping: {file_path}")

In [None]:
### Check distinct values in MongoDB
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
db = client["spending_db"]
collection = db["spending"]

# NOTE(review): the extraction step requests the field "beneficiary_payers";
# if payer/beneficiary are only stored nested inside it, these paths presumably
# need to be "beneficiary_payers.payer" / "beneficiary_payers.beneficiary" -
# confirm against a stored document.
distinct_payer = collection.distinct("payer")
distinct_beneficiary = collection.distinct("beneficiary")

# Labels name the field that was actually counted
print(f"Total distinct payers: {len(distinct_payer)}")
print(f"Total distinct beneficiaries: {len(distinct_beneficiary)}")

# pipeline = [
#     {"$group": {"_id": "$page_id", "page_names": {"$addToSet": "$page_name"}}},
#     {"$project": {"_id": 1, "page_names": 1, "count": {"$size": "$page_names"}}},
#     {"$match": {"count": {"$gt": 1}}}
# ]
#  
# results = collection.aggregate(pipeline)
# 
# for result in results:
#     print(f"Page ID: {result['_id']}, Page Names: {result['page_names']}")

In [None]:
### Download logos from Facebook Ads Library
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import requests
import os
from pymongo import MongoClient

def setup_mongo_connection(uri="mongodb://localhost:27017", db_name="spending_db", collection_name="spending"):
    """Open a MongoDB client for ``uri`` and return one of its collections.

    Parameters:
        uri (str): MongoDB connection string.
        db_name (str): Name of the database to select.
        collection_name (str): Name of the collection to select.

    Returns:
        The collection ``collection_name`` of database ``db_name``.
    """
    mongo_client = MongoClient(uri)
    database = mongo_client[db_name]
    return database[collection_name]

def setup_chrome_driver():
    """Create a Chrome WebDriver configured for headless use.

    The driver binary path is obtained from ``ChromeDriverManager().install()``
    and Chrome is started with the ``--headless``, ``--disable-gpu`` and
    ``--no-sandbox`` arguments.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    driver_service = Service(ChromeDriverManager().install())

    chrome_options = webdriver.ChromeOptions()
    for flag in (
        "--headless",      # Run in headless mode for efficiency
        "--disable-gpu",
        "--no-sandbox",
    ):
        chrome_options.add_argument(flag)

    return webdriver.Chrome(service=driver_service, options=chrome_options)

def navigate_to_page(driver, page_id, delay=2):
    """Open the Facebook Ads Library listing of one page and wait for it to load.

    Parameters:
        driver: Object exposing ``get(url)`` (a Selenium WebDriver).
        page_id: Page identifier placed in the ``view_all_page_id`` query parameter.
        delay (float): Seconds to sleep after issuing the navigation.
    """
    ads_library_url = (
        "https://www.facebook.com/ads/library/"
        "?active_status=all"
        "&ad_type=political_and_issue_ads"
        "&country=CZ"
        "&is_targeted_country=false"
        "&media_type=all"
        "&search_type=page"
        "&source=ad-report"
        f"&view_all_page_id={page_id}"
    )
    driver.get(ads_library_url)
    # Fixed pause so the page can render before elements are looked up
    time.sleep(delay)

def download_image(driver, page_id, output_dir="ad_data"):
    """Download the logo shown on the currently loaded page and save it locally.

    The first ``<img>`` inside the element matching the CSS selector below is
    fetched over HTTP and written to ``<output_dir>/<page_id>_logo.png``.
    Failures are reported on stdout and do not raise.

    Parameters:
        driver: Selenium WebDriver already navigated to the page.
        page_id: Identifier used in the output file name and in messages.
        output_dir (str): Directory for the image; created if missing.

    Returns:
        bool: True when the image was saved, False otherwise.
    """
    os.makedirs(output_dir, exist_ok=True)
    try:
        div_element = driver.find_element(By.CSS_SELECTOR, "div.x9f619.x1n2onr6.x1ja2u2z")
        image_element = div_element.find_element(By.TAG_NAME, "img")
        image_src = image_element.get_attribute("src")
        if not image_src:
            raise ValueError("logo <img> element has no 'src' attribute")

        response = requests.get(image_src, timeout=5)
        # Without this check the body of a 4xx/5xx error page would be saved as the logo
        response.raise_for_status()

        image_path = os.path.join(output_dir, f"{page_id}_logo.png")
        with open(image_path, 'wb') as handler:
            handler.write(response.content)
        print(f"Downloaded image for Page ID {page_id}")
        return True
    except Exception as e:
        # Best effort: one missing logo must not stop the whole run
        print(f"Error downloading image for {page_id}: {e}")
        return False

def _downloaded_page_ids(output_dir):
    """Return the page IDs that already have a '<page_id>_logo.png' file in ``output_dir``."""
    suffix = "_logo.png"
    if not os.path.isdir(output_dir):
        return set()
    return {name[: -len(suffix)] for name in os.listdir(output_dir) if name.endswith(suffix)}


def main(page_ids=None, output_dir="ad_data"):
    """Download the logo of every page that does not have one on disk yet.

    Replaces the previous hand-maintained ``rest`` / ``done`` ID lists: the
    candidate IDs come from MongoDB and already-downloaded logos are detected
    from the files in ``output_dir``, so an interrupted run can simply be
    started again.

    Parameters:
        page_ids (iterable, optional): Page IDs to consider. When omitted, the
            distinct ``page_id`` values of the MongoDB collection are used.
        output_dir (str): Directory the logos are stored in.
    """
    if page_ids is None:
        collection = setup_mongo_connection()
        page_ids = collection.distinct("page_id")

    candidates = {str(page_id) for page_id in page_ids if page_id is not None}
    # Sorted for a reproducible processing order
    pending = sorted(candidates - _downloaded_page_ids(output_dir))
    print(f"Remaining Page IDs to process: {len(pending)}")
    if not pending:
        return  # nothing to do; avoid starting a browser

    driver = setup_chrome_driver()
    try:
        for page_id in pending:
            print(f"Processing Page ID: {page_id}")
            try:
                navigate_to_page(driver, page_id)
                download_image(driver, page_id, output_dir)
            except Exception as e:
                # Keep going: one page failing to load should not abort the rest
                print(f"Unexpected error for Page ID {page_id}: {e}")
    finally:
        driver.quit()
        print("Driver closed.")

# Entry point: call main() only when this code is executed directly
# (__name__ is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


In [None]:
#### Create CSV with image URLs
import os
import pandas as pd

# Folder that holds the downloaded logo files
output_dir = os.path.join(os.getcwd(), "ad_data")

if os.path.exists(output_dir):
    # Keep only the files whose name ends in '_logo.png'
    image_names = list(filter(lambda file: file.endswith("_logo.png"), os.listdir(output_dir)))
else:
    # Stop here rather than failing later while listing a missing folder
    raise FileNotFoundError(f"Directory '{output_dir}' does not exist.")

# Build the raw.githubusercontent.com address of a file in the repo's ad_data folder
def github_to_raw_url(image_name: str) -> str:
    """Return the raw GitHub URL under which ``image_name`` is served."""
    raw_base = "https://raw.githubusercontent.com/Thepilli/political_ads/refs/heads/main/ad_data"
    return f"{raw_base}/{image_name}"

# Prepare data for the DataFrame: one row per logo file, page ID taken from the file name
records = [
    {"page_id": str(image.replace("_logo.png", "")), "url_key": github_to_raw_url(image)}
    for image in image_names
]

# Explicit columns: with no logo files the frame would otherwise have no
# 'page_id' column and the next statement would raise KeyError.
df = pd.DataFrame(records, columns=["page_id", "url_key"])

# Ensure 'page_id' is stored explicitly as a string
df["page_id"] = df["page_id"].astype(str)

# Save to CSV
csv_path = os.path.join(output_dir, "image_urls.csv")
df.to_csv(csv_path, index=False)
print(f"CSV file saved at: {csv_path}")

# Print raw URLs
for url in df["url_key"]:
    print(f"Raw URL: {url}")


In [1]:
### Circular crop images

from PIL import Image, ImageDraw
import numpy as np
import os

# Location of the logo files to be cropped
output_dir = os.path.join(os.getcwd(), "ad_data")

# Abort early when the folder is missing, before attempting to list it
directory_present = os.path.exists(output_dir)
if not directory_present:
    raise FileNotFoundError(f"Directory '{output_dir}' does not exist.")

# Collect the names of all files ending in '_logo.png'
image_names = []
for file in os.listdir(output_dir):
    if file.endswith("_logo.png"):
        image_names.append(file)


def circular_crop(image_path, output_path):
    """Crop an image to a centred circle and save the result as a PNG.

    The circle's diameter is the shorter side of the image. Pixels outside the
    circle are fully transparent in the output, which is a square of that
    diameter.

    Parameters:
        image_path (str): Path of the image to read.
        output_path (str): Path the cropped PNG is written to.
    """
    # The context manager closes the source file once the pixels are converted
    with Image.open(image_path) as source:
        img = source.convert("RGBA")

    # Create same size mask, fully black (0 = pixel is not copied)
    mask = Image.new("L", img.size, 0)
    draw = ImageDraw.Draw(mask)

    # Define the circular region (centered)
    size = min(img.size)
    left = (img.width - size) // 2
    top = (img.height - size) // 2
    right = left + size
    bottom = top + size

    # Draw a white filled circle on the mask (255 = pixel is copied)
    draw.ellipse((left, top, right, bottom), fill=255)

    # Apply mask to image: start from a transparent canvas and copy the circle only
    circular_img = Image.new("RGBA", img.size, (0, 0, 0, 0))
    circular_img.paste(img, (0, 0), mask=mask)

    # Crop the circular region and save
    circular_img = circular_img.crop((left, top, right, bottom))
    circular_img.save(output_path, format="PNG")


# Loop through all images and apply circular cropping.
# This must live outside the function: nested inside it, the loop never ran at
# cell level and made every call recurse over the whole file list.
if __name__ == "__main__":
    for image_name in image_names:
        input_path = os.path.join(output_dir, image_name)
        output_path = os.path.join(output_dir, image_name.replace("_logo.png", "_logo_circular.png"))
        circular_crop(input_path, output_path)
        print(f"Circular cropped image saved at: {output_path}")
