In [None]:
import requests
import json
import os
import time
from urllib.parse import quote


def get_next_page(response):
    """Return the URL of the following result page, or None if there is none.

    Parameters:
        response (dict): Decoded JSON body of an API response.

    The link is looked up under ``response["paging"]["next"]``; a missing
    ``"paging"`` object or a missing ``"next"`` key both yield None.
    """
    paging_info = response.get("paging", {})
    return paging_info.get("next")


def fetch_data(api_url, timeout=30):
    """Fetches paginated data from the API.

    Follows the ``paging.next`` link of each response until no further link
    is returned, collecting the ``data`` array of every page.

    Parameters:
        api_url (str): URL of the first page to request.
        timeout (float): Value passed to ``requests.get`` as ``timeout`` so a
            stalled connection cannot block the notebook forever (default: 30).

    Returns:
        list: All records collected so far. If a request fails, the error is
        printed and the records gathered up to that point are returned, so the
        result may be incomplete.
    """
    all_data = []

    while api_url:
        try:
            response = requests.get(api_url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
            json_data = response.json()
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            break

        data = json_data.get("data", [])
        all_data.extend(data)

        print(f"Extracted {len(data)} records. Total: {len(all_data)}")

        api_url = get_next_page(json_data)
        if api_url:
            # Respect API rate limits; no need to wait after the final page.
            time.sleep(2)

    return all_data


def extract_data(bylines, output_dir, access_token, year, api_version="v21.0", country="CZ", language="cs"):
    """
    Extracts and saves ad data from the Facebook Ads API for a list of bylines.

    For every byline one ``ads_archive`` query is built (political and issue
    ads, delivery date between ``{year}-01-01`` and ``{year+1}-01-01``), all
    result pages are fetched with ``fetch_data`` and the records are written
    to ``data_<byline>.json`` inside ``output_dir``. Nothing is written for a
    byline that yields no records.

    Parameters:
        bylines (list): List of bylines to query.
        output_dir (str): Directory to save the JSON files (created if missing).
        access_token (str): Facebook API access token.
        year (int): Year for filtering ads; must be an int because ``year + 1``
            is used as the upper date bound.
        api_version (str): API version (default: v21.0).
        country (str): Country code for filtering ads (default: CZ).
        language (str): Language code for filtering ads (default: cs).

    Returns:
        None
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists

    fields = ",".join([
        "id", "ad_snapshot_url", "ad_creation_time", "ad_creative_bodies", "ad_creative_link_captions",
        "ad_creative_link_descriptions", "ad_creative_link_titles", "ad_delivery_start_time",
        "ad_delivery_stop_time", "bylines", "currency", "delivery_by_region", "demographic_distribution",
        "estimated_audience_size", "impressions", "languages", "page_id", "page_name", "publisher_platforms",
        "spend", "target_locations", "target_gender", "target_ages", "eu_total_reach", "beneficiary_payers",
        "age_country_gender_reach_breakdown"
    ])

    # Bylines are free text; path separators and characters that are reserved
    # in Windows file names would make open() fail after the data was fetched.
    unsafe_filename_chars = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})

    for byline in bylines:
        print(f"Extracting Data for: {byline}")
        # The API expects a JSON-style list; percent-encode it for the query string.
        encoded_byline = quote(f'["{byline}"]')

        api_url = (
            f"https://graph.facebook.com/{api_version}/ads_archive?"
            f"bylines={encoded_byline}&ad_type=POLITICAL_AND_ISSUE_ADS"
            f"&ad_reached_countries=['{country}']&access_token={access_token}"
            f"&unmask_removed_content=true&fields={fields}&limit=199"
            f"&search_terms=''&languages=['{language}']"
            f"&ad_delivery_date_min={year}-01-01"
            f"&ad_delivery_date_max={year+1}-01-01"
        )

        extracted_data = fetch_data(api_url)
        time.sleep(5)  # Delay between different bylines

        if extracted_data:
            safe_name = byline.translate(unsafe_filename_chars)
            filename = os.path.join(output_dir, f"data_{safe_name}.json")
            with open(filename, "w", encoding="utf-8") as json_file:
                json.dump(extracted_data, json_file, indent=4, ensure_ascii=False)
            print(f"Saved data to {filename}")
        else:
            print(f"No data extracted for {byline}.")


# Parameters
year = 2023
base_dir = os.getcwd()
# Join the path segments separately so the result uses the platform's separator.
output_dir = os.path.join(base_dir, "bylines_ads", str(year))
# bylines = ["SH media, spol. s r.o."]
bylines = ['Česká pirátská strana', 'EUROPEUM Institute for European Policy', 'FTV Prima', 'ANO', 'Naše zdravotnictví', 'Svoboda a přímá demokracie (SPD)', 'ČSOB', 'ODS', 'Lékaři bez hranic - Médecins Sans Frontières in Czech Republic, o. p. s.', 'Aliance pro budoucnost', 'STAN', 'Starostové a nezávislí • STAN', 'PRAHA SOBĚ', 'Člověk v tísni, o.p.s.', 'SH media, spol. s r.o.', 'Komunistická strana Čech a Moravy', 'Zdeněk Hraba - senátor', 'Ondrej Prokop', 'Ministerstvo práce a sociálních věcí', 'Tomáš Zdechovský', 'Milion Chvilek, z. s.', 'EU Social', 'EU Justice and Consumers', 'ODS - Občanská demokratická strana', 'Sociální demokracie', 'Amnesty International ČR', 'Občanská demokratická strana', 'Tipsport', 'Karel Janeček', 'Martin Kuba', 'Starostové a nezávislí', 'Svoboda zvířat Plzeň, z.s.', 'CARE Česká republika', 'Replastuj.cz', 'ANO 2011', 'Reportér magazín', 'Andrej Babiš', 'Člověk v tísni o.p.s.', 'DFMG', 'TOP 09', 'SEN 21', 'Kupředu do minulosti s.r.o.', 'Ministerstvo pro místní rozvoj ČR', 'Svoboda a přímá demokracie', 'Greenpeace Česká republika', 'STAROSTOVÉ A NEZÁVISLÍ', 'KDU-ČSL', 'Česká pirátská strana - Praha', 'XTV', 'Hnutí DUHA - Přátelé Země Česká republika', 'Zelení - Strana zelených', 'PŘÍSAHA - občanské hnutí Roberta Šlachty', 'Český rozhlas', 'CZECH NEWS CENTER a. s.', 'Oldřich Hájek', 'Transparency International ČR']

# Read the token from the environment so the secret is never stored in the
# notebook. NOTE(review): a token was previously hard-coded on this line; it
# should be treated as leaked and revoked.
access_token = os.environ.get("FB_ACCESS_TOKEN")

if not access_token:
    raise ValueError("Access token is missing. Set the 'FB_ACCESS_TOKEN' environment variable.")

# Execute extraction
extract_data(bylines, output_dir, access_token, year)


Extracting Data for: SH media, spol. s r.o.
Extracted 199 records. Total: 199
Extracted 30 records. Total: 229
Extracted 0 records. Total: 229
Saved data to c:\Users\jirip\Documents\Developer\python\political_ads\bylines_ads/2023\data_SH media, spol. s r.o..json


In [6]:
import requests
import json
import os
from urllib.parse import quote
import time


def get_next_page(response):
    """Return the next-page URL found at ``response["paging"]["next"]``.

    ``response`` is the decoded JSON body (a dict). None is returned when
    either the ``"paging"`` object or its ``"next"`` entry is absent.
    """
    return response.get("paging", {}).get("next", None)

def main(api_url, timeout=30):
    """Download every page of a paginated API query and return all records.

    Parameters:
        api_url (str): URL of the first page to request.
        timeout (float): Value passed to ``requests.get`` as ``timeout`` so a
            stalled connection cannot hang the notebook (default: 30).

    Returns:
        list: Records from the ``data`` array of every page fetched. On a
        connection error or a non-200 response the error is printed and the
        records collected so far are returned, so the result may be partial.
    """
    all_data = []

    while api_url:
        try:
            response = requests.get(api_url, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Network failures and timeouts end pagination the same way an
            # HTTP error does, instead of aborting the whole cell.
            print("Error:", exc)
            break

        if response.status_code != 200:
            print("Error:", response.text)
            print("Error:", response.status_code)
            break

        json_data = response.json()
        data = json_data.get("data", [])
        all_data.extend(data)
        api_url = get_next_page(json_data)
        print(f"Extracted {len(data)} records. Total records: {len(all_data)}")
        time.sleep(1)  # 1-second delay between page requests

    return all_data

def extract_data(bylines, output_dir, access_token, year, api_version="v21.0", country="CZ", language="cs"):
    """
    Extracts data from the Facebook Ads API for a list of bylines.

    For every byline one ``ads_archive`` query is built (political and issue
    ads, delivery date between ``{year}-01-01`` and ``{year+1}-01-01``), all
    pages are downloaded with ``main`` and the records are written to
    ``data_<byline>.json`` inside ``output_dir``. No file is created for a
    byline that yields no records.

    Parameters:
        bylines (list): List of bylines to query.
        output_dir (str): Directory to save the JSON files (created if missing).
        access_token (str): Access token for the API.
        year (int): Year for filtering ads; must be an int because ``year + 1``
            is used as the upper date bound.
        api_version (str): Version of the Facebook Graph API (default: v21.0).
        country (str): Country code for filtering ads (default: CZ).
        language (str): Language code for filtering ads (default: cs).

    Returns:
        None
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    fields = ",".join([
        "id", "ad_snapshot_url", "ad_creation_time", "ad_creative_bodies", "ad_creative_link_captions",
        "ad_creative_link_descriptions", "ad_creative_link_titles", "ad_delivery_start_time",
        "ad_delivery_stop_time", "bylines", "currency", "delivery_by_region", "demographic_distribution",
        "estimated_audience_size", "impressions", "languages", "page_id", "page_name", "publisher_platforms",
        "spend", "target_locations", "target_gender", "target_ages", "eu_total_reach", "beneficiary_payers",
        "age_country_gender_reach_breakdown"
    ])

    # Bylines are free text; path separators and characters that are reserved
    # in Windows file names would make open() fail after the data was fetched.
    unsafe_filename_chars = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})

    for byline in bylines:
        print(f"Extracting Data for {byline}")
        # The API expects a JSON-style list; percent-encode it for the query string.
        encoded_byline = quote(f'["{byline}"]')

        api_url = (
            f"https://graph.facebook.com/{api_version}/ads_archive?"
            f"bylines={encoded_byline}&ad_type=POLITICAL_AND_ISSUE_ADS"
            f"&ad_reached_countries=['{country}']&access_token={access_token}"
            f"&unmask_removed_content=true&fields={fields}&limit=199"
            f"&search_terms=''&languages=['{language}']"
            f"&ad_delivery_date_min={year}-01-01"
            f"&ad_delivery_date_max={year+1}-01-01"
        )

        # main() performs the paginated download and returns the records
        extracted_data = main(api_url)
        time.sleep(5)  # 5-second delay between bylines

        # Target file for this byline's records
        safe_name = byline.translate(unsafe_filename_chars)
        filename = os.path.join(output_dir, f"data_{safe_name}.json")

        if extracted_data:  # Only create a file when something was returned
            with open(filename, "w", encoding='utf-8') as json_file:
                json.dump(extracted_data, json_file, indent=4, ensure_ascii=False)
            print(f"All Extracted Data written to {filename}")
        else:
            print(f"No data extracted. File {filename} not created.")


# Parameters
year = 2023
base_dir = os.getcwd()
# Join the path segments separately; a literal backslash only works on Windows.
output_dir = os.path.join(base_dir, "bylines_ads", str(year))
bylines = ["Milion Chvilek, z. s."]
# bylines = ['Česká pirátská strana', 'EUROPEUM Institute for European Policy', 'FTV Prima', 'ANO', 'Naše zdravotnictví', 'Svoboda a přímá demokracie (SPD)', 'ČSOB', 'ODS', 'Lékaři bez hranic - Médecins Sans Frontières in Czech Republic, o. p. s.', 'Aliance pro budoucnost', 'STAN', 'Starostové a nezávislí • STAN', 'PRAHA SOBĚ', 'Člověk v tísni, o.p.s.', 'SH media, spol. s r.o.', 'Komunistická strana Čech a Moravy', 'Zdeněk Hraba - senátor', 'Ondrej Prokop', 'Ministerstvo práce a sociálních věcí', 'Tomáš Zdechovský', 'Milion Chvilek, z. s.', 'EU Social', 'EU Justice and Consumers', 'ODS - Občanská demokratická strana', 'Sociální demokracie', 'Amnesty International ČR', 'Občanská demokratická strana', 'Tipsport', 'Karel Janeček', 'Martin Kuba', 'Starostové a nezávislí', 'Svoboda zvířat Plzeň, z.s.', 'CARE Česká republika', 'Replastuj.cz', 'ANO 2011', 'Reportér magazín', 'Andrej Babiš', 'Člověk v tísni o.p.s.', 'DFMG', 'TOP 09', 'SEN 21', 'Kupředu do minulosti s.r.o.', 'Ministerstvo pro místní rozvoj ČR', 'Svoboda a přímá demokracie', 'Greenpeace Česká republika', 'STAROSTOVÉ A NEZÁVISLÍ', 'KDU-ČSL', 'Česká pirátská strana - Praha', 'XTV', 'Hnutí DUHA - Přátelé Země Česká republika', 'Zelení - Strana zelených', 'PŘÍSAHA - občanské hnutí Roberta Šlachty', 'Český rozhlas', 'CZECH NEWS CENTER a. s.', 'Oldřich Hájek', 'Transparency International ČR']

# Read the token from the environment so the secret is never stored in the
# notebook. NOTE(review): a token was previously hard-coded on this line; it
# should be treated as leaked and revoked.
access_token = os.environ.get("FB_ACCESS_TOKEN")

if not access_token:
    raise ValueError("Access token is missing. Set the 'FB_ACCESS_TOKEN' environment variable.")

# Call the function
extract_data(bylines, output_dir, access_token, year)

Extracting Data for Milion Chvilek, z. s.
Extracted 199 records. Total records: 199
Extracted 199 records. Total records: 398
Extracted 199 records. Total records: 597
Extracted 199 records. Total records: 796
Extracted 199 records. Total records: 995
Extracted 31 records. Total records: 1026
Extracted 0 records. Total records: 1026
All Extracted Data written to c:\Users\jirip\Documents\Developer\python\political_ads\bylines_ads\2023\data_Milion Chvilek, z. s..json


In [None]:

import os
import json
import re


def replace_id_with_underscore_id(json_data):
    """Recursively replaces 'id' with '_id' in JSON data.

    Dicts and lists are walked at every nesting level and modified in place;
    any other value is left untouched. A renamed key is re-inserted, so '_id'
    ends up as the last key of its dict.

    Parameters:
        json_data: Decoded JSON value (dict, list or scalar).

    Returns:
        The same ``json_data`` object. Returning it (instead of None) lets the
        function be passed to ``process_json_files``, which writes the return
        value of its callback back to disk.
    """
    if isinstance(json_data, dict):
        if "id" in json_data:
            json_data["_id"] = json_data.pop("id")
        for value in json_data.values():
            replace_id_with_underscore_id(value)
    elif isinstance(json_data, list):
        for item in json_data:
            replace_id_with_underscore_id(item)
    return json_data


def process_json_files(folder_path, process_function):
    """Applies a transformation function to all JSON files in a given folder.

    Every file whose name ends with ``.json`` is loaded, passed to
    ``process_function`` and overwritten with the result.

    Parameters:
        folder_path (str): Directory whose ``.json`` files are rewritten.
        process_function (callable): Receives the decoded JSON content of one
            file. It may return the transformed data, or modify its argument
            in place and return None.

    Returns:
        None
    """
    for filename in os.listdir(folder_path):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)

        processed_data = process_function(data)
        if processed_data is None:
            # In-place transformers return None; writing that back would
            # replace the whole file with "null" and destroy the data.
            processed_data = data

        with open(file_path, "w", encoding="utf-8") as json_file:
            json.dump(processed_data, json_file, indent=4, ensure_ascii=False)


def append_json_files(folder_path, exclude=("data_all.json", "data_all_cleaned.json")):
    """Combines all JSON files in a directory into a single list.

    Parameters:
        folder_path (str): Directory to scan for files ending in ``.json``.
        exclude (iterable of str): File names to skip. The defaults are the
            aggregate files this notebook writes into the same folder; without
            skipping them a re-run would append the previous aggregate to
            itself and duplicate every record. Pass ``()`` to read every file.

    Returns:
        list: Concatenation of the top-level arrays of all files read, in
        file-name order. Each file is expected to contain a JSON array.
    """
    all_data = []
    # Sort for a reproducible record order; os.listdir order is unspecified.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json") or filename in exclude:
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as json_file:
            all_data.extend(json.load(json_file))
    return all_data


def remove_emojis(text):
    """Strip emoji, symbol characters and line breaks from ``text``.

    Every character matching one of the ranges below is deleted (replaced by
    the empty string), so words separated only by a line break are joined.

    NOTE(review): the ranges U+0200-U+2BFF and U+24C2-U+1F251 are very wide
    and also delete non-emoji characters that fall inside them (for example
    the euro sign and typographic quotes). Characters below U+0200, which
    includes the Czech accented letters, are kept.

    Parameters:
        text (str): Text to clean.

    Returns:
        str: ``text`` without the matched characters.
    """
    removable_ranges = (
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF",  # Transport and Map Symbols
        "\U0001F1E0-\U0001F1FF",  # Flags
        "\U00002702-\U000027B0",  # Dingbats
        "\U000024C2-\U0001F251",  # Enclosed characters
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        "\U00002600-\U000026FF",  # Miscellaneous Symbols
        "\U00000200-\U00002BFF",  # Additional symbols
        "\U0001F004",             # Mahjong tiles
        "\U0001F0CF",             # Playing cards
        "\n",                     # Line break
    )
    character_class = "[" + "".join(removable_ranges) + "]"
    stripper = re.compile(character_class, flags=re.UNICODE)
    return stripper.sub("", text)


def clean_ad_creative_bodies(data):
    """Apply ``remove_emojis`` to the 'ad_creative_bodies' field of each record.

    A string value is cleaned directly; a list value is replaced by a new list
    with every element cleaned. Records without the field, or whose field has
    another type, are left unchanged.

    Parameters:
        data (list): Ad records; they are modified in place.

    Returns:
        list: The same ``data`` object that was passed in.
    """
    for record in data:
        if "ad_creative_bodies" not in record:
            continue

        bodies = record["ad_creative_bodies"]
        if isinstance(bodies, str):
            record["ad_creative_bodies"] = remove_emojis(bodies)
        elif isinstance(bodies, list):
            record["ad_creative_bodies"] = [remove_emojis(text) for text in bodies]

    return data


# Run the three post-processing steps for each year that has a data folder.
# Years without a "bylines_ads/<year>" directory are reported and skipped.
for year in range(2022, 2026):
    base_dir = os.getcwd()
    output_dir = os.path.join(base_dir, f"bylines_ads/{year}")

    if os.path.exists(output_dir):
        print(f"Processing data for year {year}...")

        # Step 1: rename every 'id' key to '_id' in each JSON file of the folder
        process_json_files(output_dir, replace_id_with_underscore_id)
        print(f"Replaced 'id' with '_id' in {year} data.")

        # Step 2: merge the folder's JSON files and store the combined list
        appended_data = append_json_files(output_dir)
        all_data_file = os.path.join(output_dir, "data_all.json")
        with open(all_data_file, "w", encoding="utf-8") as json_file:
            json.dump(appended_data, json_file, indent=4, ensure_ascii=False)
        print(f"Appended {len(appended_data)} records into {all_data_file}.")

        # Step 3: clean 'ad_creative_bodies' (modifies appended_data in place)
        # and store the result next to the combined file
        cleaned_data = clean_ad_creative_bodies(appended_data)
        cleaned_data_file = os.path.join(output_dir, "data_all_cleaned.json")
        with open(cleaned_data_file, "w", encoding="utf-8") as json_file:
            json.dump(cleaned_data, json_file, indent=4, ensure_ascii=False)
        print(f"Cleaned 'ad_creative_bodies' and saved to {cleaned_data_file}.\n")
    else:
        print(f"Skipping {year} - No data found.")

print("Processing complete.")


In [None]:
# // Replace 'id' with '_id' in all JSON files

import os
import json

def replace_id_with_underscore_id(json_data):
    """Rename every 'id' key to '_id' throughout a decoded JSON structure.

    Dicts and lists are traversed recursively and modified in place; any
    other value is ignored. Nothing is returned.
    """
    if isinstance(json_data, list):
        for element in json_data:
            replace_id_with_underscore_id(element)
        return

    if not isinstance(json_data, dict):
        return

    # Iterate over a snapshot because the dict is modified during the walk.
    for name, child in tuple(json_data.items()):
        if name == "id":
            json_data["_id"] = json_data.pop("id")
        replace_id_with_underscore_id(child)

def replace_id_in_folder(folder_path):
    """Rewrite each ``.json`` file in ``folder_path`` with 'id' keys renamed to '_id'.

    Every matching file is loaded, transformed in place by
    ``replace_id_with_underscore_id`` and written back to the same path.
    Files with other extensions are not touched.
    """
    json_names = [name for name in os.listdir(folder_path) if name.endswith(".json")]

    for name in json_names:
        target = os.path.join(folder_path, name)

        with open(target, "r", encoding="utf-8") as source:
            payload = json.load(source)

        replace_id_with_underscore_id(payload)

        with open(target, "w", encoding="utf-8") as sink:
            json.dump(payload, sink, indent=4, ensure_ascii=False)

if __name__ == "__main__":
    # Rename 'id' to '_id' in the per-year data folders under the working directory.
    base_dir = os.getcwd()
    for year in range(2022, 2026):
        # Join the segments separately; a literal backslash only works on Windows.
        output_dir = os.path.join(base_dir, "bylines_ads", str(year))
        folder_path = output_dir
        if not os.path.isdir(folder_path):
            # A year that was never downloaded has no folder; skip it instead
            # of letting os.listdir raise FileNotFoundError.
            print(f"Skipping {year} - No data found.")
            continue
        replace_id_in_folder(folder_path)


In [None]:
# // Append all JSON files in the folder
import os
import json


def append_json_files(folder_path, exclude=("data_all.json", "data_all_cleaned.json")):
    """Concatenate the contents of all JSON files in a folder into one list.

    Parameters:
        folder_path (str): Directory to scan for files ending in ``.json``.
        exclude (iterable of str): File names to skip. The defaults are the
            aggregate files this notebook writes into the same folder; without
            skipping them a re-run would append the previous aggregate to
            itself and duplicate every record. Pass ``()`` to read every file.

    Returns:
        list: Concatenation of the top-level arrays of all files read, in
        file-name order. Each file is expected to contain a JSON array.
    """
    all_data = []

    # Sort for a reproducible record order; os.listdir order is unspecified.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json") or filename in exclude:
            continue

        file_path = os.path.join(folder_path, filename)

        with open(file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
            all_data.extend(data)

    return all_data

# Build data_all.json for the selected year(s); the range currently covers 2023 only
for year in range(2023, 2024):
    base_dir = os.getcwd()
    # Compute this year's folder BEFORE using it. Previously folder_path was
    # taken from whatever output_dir an earlier cell or iteration left behind.
    output_dir = os.path.join(base_dir, "bylines_ads", str(year))
    folder_path = output_dir

    # Append all JSON files in the folder
    appended_data = append_json_files(folder_path)

    # Print the total number of documents appended
    print(f"Year: {year}, Total documents appended: {len(appended_data)}")

    # Write the combined data to a single JSON file in the same folder
    filename = os.path.join(output_dir, "data_all.json")
    with open(filename, "w", encoding='utf-8') as json_all:
        json.dump(appended_data, json_all, indent=4, ensure_ascii=False)

Year: 2023, Total documents appended: 31915


In [None]:
# // Clean ad creative bodies
import json
import re

def remove_emojis(text):
    """Delete emoji, symbol characters and line breaks from ``text``.

    All characters matched by the character class built below are replaced by
    the empty string, so text on both sides of a line break is joined.

    NOTE(review): U+0200-U+2BFF and U+24C2-U+1F251 are very wide ranges and
    also remove non-emoji characters that fall inside them (for example the
    euro sign and typographic quotes). Characters below U+0200, including the
    Czech accented letters, are preserved.
    """
    unwanted = [
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF",  # Transport and Map Symbols
        "\U0001F1E0-\U0001F1FF",  # Flags
        "\U00002702-\U000027B0",  # Dingbats
        "\U000024C2-\U0001F251",  # Enclosed characters
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        "\U00002600-\U000026FF",  # Miscellaneous Symbols
        "\U00000200-\U00002BFF",  # Additional symbols
        "\U0001F004",             # Mahjong tiles
        "\U0001F0CF",             # Playing cards
        "\n",                     # line break
    ]
    matcher = re.compile(f"[{''.join(unwanted)}]", flags=re.UNICODE)
    return matcher.sub("", text)

def clean_ad_creative_bodies(data):
    for item in data:
        if "ad_creative_bodies" in item:
            # Handle strings and lists
            if isinstance(item["ad_creative_bodies"], str):
                item["ad_creative_bodies"] = remove_emojis(item["ad_creative_bodies"])
            elif isinstance(item["ad_creative_bodies"], list):
                item["ad_creative_bodies"] = [
                    remove_emojis(body) for body in item["ad_creative_bodies"]
                ]
    return data

# Create data_all_cleaned.json from data_all.json for every year that has one
for year in range(2022, 2026):
    base_dir = os.getcwd()
    # Compute this year's folder first. Previously folder_path was assigned
    # from output_dir before output_dir was set for the current iteration.
    output_dir = os.path.join(base_dir, "bylines_ads", str(year))
    folder_path = output_dir  # kept in sync with output_dir; not used in this cell
    input_file = os.path.join(output_dir, "data_all.json")
    output_file = os.path.join(output_dir, "data_all_cleaned.json")

    if not os.path.exists(input_file):
        # Years without a combined file are skipped instead of aborting the loop
        print(f"Skipping {year} - {input_file} not found.")
        continue

    with open(input_file, "r", encoding="utf-8") as f:
        ad_data = json.load(f)

    # Clean the data (records are modified in place and returned)
    cleaned_data = clean_ad_creative_bodies(ad_data)

    # Save the cleaned data to a new JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(cleaned_data, f, ensure_ascii=False, indent=4)

    print("Cleaned ad creative bodies saved successfully.")




# D_Beneficiary
# D_Bylines
# D_Payer
# D_Page_name
# D_Region
# D_Age range
# D_Target location

# MongoDB shell (JavaScript): find ads whose delivery start and stop times are identical.
# Note: the field downloaded from the API is "ad_delivery_stop_time" (not "ad_delivery_end_time").
# db.collection.find({
#   ad_delivery_start_time: { $exists: true },
#   ad_delivery_stop_time: { $exists: true },
#   $expr: { $eq: ["$ad_delivery_start_time", "$ad_delivery_stop_time"] }
# })
# Or as an aggregation pipeline stage (JavaScript):
# {
#   $match: {
#     ad_delivery_start_time: { $exists: true },
#     ad_delivery_stop_time: { $exists: true },
#     $expr: { $eq: ["$ad_delivery_start_time", "$ad_delivery_stop_time"] }
#   }
# }

Cleaned ad creative bodies saved successfully.
Cleaned ad creative bodies saved successfully.
Cleaned ad creative bodies saved successfully.
Cleaned ad creative bodies saved successfully.


### Push the cleaned data to MongoDB


In [13]:
from pymongo import MongoClient
import json
import os
from tqdm import tqdm

# Upsert every cleaned ad record into the local MongoDB collection, year by year.
client = MongoClient('mongodb://localhost:27017')
db = client["spending_db"]
collection = db["spending"]

try:
    for year in range(2022, 2026):
        base_dir = os.getcwd()
        # Join the segments separately; a literal backslash only works on Windows.
        output_dir = os.path.join(base_dir, "bylines_ads", str(year))
        file_path = os.path.join(output_dir, "data_all_cleaned.json")

        if not os.path.exists(file_path):
            # A year without a cleaned file is skipped instead of aborting the run
            print(f"\nSkipping year {year} - {file_path} not found.")
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)

        new_count = 0
        updated_count = 0

        print(f"\nProcessing year {year}...")
        for document in tqdm(json_data, desc="Inserting documents", unit="doc"):
            # Replace the stored document with the same _id, or insert it if absent
            result = collection.replace_one(
                {"_id": document["_id"]},
                document,
                upsert=True
            )
            if result.upserted_id is not None:
                new_count += 1
            elif result.modified_count == 1:
                # NOTE(review): a replace that matches but modifies nothing is
                # counted in neither total - confirm this is intended.
                updated_count += 1

        print(f"Year {year} Results:")
        print(f"New documents inserted: {new_count}")
        print(f"Existing documents updated: {updated_count}\n")
finally:
    # Release the connection even when a file or database error interrupts the loop
    client.close()



Processing year 2022...


Inserting documents: 100%|██████████| 22234/22234 [00:16<00:00, 1362.67doc/s]


Year 2022 Results:
New documents inserted: 0
Existing documents updated: 22234


Processing year 2023...


Inserting documents: 100%|██████████| 31915/31915 [00:24<00:00, 1290.46doc/s]


Year 2023 Results:
New documents inserted: 0
Existing documents updated: 31915


Processing year 2024...


Inserting documents: 100%|██████████| 15101/15101 [00:11<00:00, 1344.95doc/s]


Year 2024 Results:
New documents inserted: 0
Existing documents updated: 15101


Processing year 2025...


Inserting documents: 100%|██████████| 2838/2838 [00:02<00:00, 1380.74doc/s]

Year 2025 Results:
New documents inserted: 0
Existing documents updated: 2838






## Delete all documents from a collection (disabled; the code below is commented out)

In [None]:
# from pymongo import MongoClient
# 
# # Connection to MongoDB
# client = MongoClient('mongodb://localhost:27017')
# 
# # Select your database
# db = client['meta_ads_db']
# 
# # Select your collection
# collection = db['meta_ads_collection']
# 
# # Truncate (remove all documents from) the collection
# result = collection.delete_many({})
# 
# # Print the number of deleted documents
# print(f"Number of documents deleted: {result.deleted_count}")
# 
# # Close the connection
# client.close()
