In [16]:
import requests
import pandas as pd
import os


In [17]:
# Function to fetch API data
def fetch_api_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()['data']
    else:
        raise Exception(f"API request failed with status {response.status_code}")

In [18]:
# Function to normalize data
def normalize_data(data):
    # Safeguard for empty or None data
    if not data:
        print("No data to normalize.")
        return [], [], [], [], [],[]

    # Initialize tables
    fact_table = []
    demographics_data = []
    region_data = []
    creative_data = []
    platform_data = []
    reach_data = []

    for ad in data:
        ad_id = ad.get('id')
        if not ad_id:  # Skip invalid ads with no ID
            print("Skipping ad with missing ID:", ad)
            continue

        # Fact table
        fact_table.append({
            'id': ad_id,
            'spend_lower': ad.get('spend', {}).get('lower_bound'),
            'spend_upper': ad.get('spend', {}).get('upper_bound'),
            'impressions_lower': ad.get('impressions', {}).get('lower_bound'),
            'impressions_upper': ad.get('impressions', {}).get('upper_bound'),
            'currency': ad.get('currency'),
            'ad_delivery_start_time': ad.get('ad_delivery_start_time'),
            'ad_delivery_stop_time': ad.get('ad_delivery_stop_time'),
            'search_page_ids': ad.get('page_id'),
            'page_name': ad.get('page_name'),
            'bylines': ad.get('bylines',""),
            'target_gender': ad.get('target_gender',""),
            'eu_total_reach': ad.get('eu_total_reach', ""),
            'estimated_audience_size': ad.get('estimated_audience_size', {}).get('lower_bound'),
        })

        # Demographics
        for demo in ad.get('demographic_distribution', []):
            demographics_data.append({
                'ad_id': ad_id,
                'age': demo.get('age'),
                'gender': demo.get('gender'),
                'percentage': demo.get('percentage'),
            })

        # Regions
        for region in ad.get('delivery_by_region', []):
            region_data.append({
                'ad_id': ad_id,
                'region': region.get('region'),
                'percentage': region.get('percentage'),
            })
        # Reach
        for reach in ad.get('age_country_gender_reach_breakdown', []):
             if reach.get('country') == 'CZ':
                for age_gender_breakdown in reach.get('age_gender_breakdowns', []):
                    reach_data.append({
                        'ad_id': ad_id,
                        'country': reach.get('country'),
                        'age_range': age_gender_breakdown.get('age_range'),
                        'male': age_gender_breakdown.get('male'),
                        'female': age_gender_breakdown.get('female'),
                        'unknown': age_gender_breakdown.get('unknown'),
                    })


        # Creative data
        creative_data.append({
            'ad_id': ad_id,
            'link_titles': ', '.join(ad.get('ad_creative_link_titles', [])),
            'link_captions': ', '.join(ad.get('ad_creative_link_captions', [])),
            # 'link_bodies': ', '.join(f'"{body}"' for body in ad.get('ad_creative_bodies', [])),
        })

        # Platforms
        for platform in ad.get('publisher_platforms', []):
            platform_data.append({
                'ad_id': ad_id,
                'platform_name': platform,
            })

    return fact_table, demographics_data, region_data, creative_data, platform_data,reach_data

In [22]:
# Function to fetch API data
def get_facebook_ads_archive(
    access_token,
    search_terms="",
    ad_type="POLITICAL_AND_ISSUE_ADS",
    ad_reached_countries=None,
    search_page_ids="",
    fields="",
    limit=499,
    d_active_status="ALL",
    ad_delivery_date_min="",
    ad_delivery_date_max="",
):
    """
    Fetches Facebook ads archive based on given parameters.
    """
    if ad_reached_countries is None:
        ad_reached_countries = ['CZ']

    # Construct the API URL and parameters
    base_url = "https://graph.facebook.com/v20.0/ads_archive"
    params = {
        "search_terms": search_terms,
        "ad_type": ad_type,
        "ad_reached_countries": ad_reached_countries,
        "access_token": access_token,
        "unmask_removed_content": "true",
        "search_page_ids": search_page_ids,
        "fields": fields,
        "limit": limit,
        "d_active_status": d_active_status,
        "ad_delivery_date_min": ad_delivery_date_min,
        "ad_delivery_date_max": ad_delivery_date_max,
    }

    all_data = []  # To store the paginated data
    response = requests.get(base_url, params=params)

    # Process the response
    if response.status_code == 200:
        data = response.json()
        all_data.extend(data.get("data", []))

        # Handle pagination if more data is available
        while "paging" in data and "next" in data["paging"]:
            next_page_url = data["paging"]["next"]
            response = requests.get(next_page_url)

            if response.status_code == 200:
                data = response.json()
                all_data.extend(data.get("data", []))
            else:
                print(
                    f"Failed to retrieve data from next page. Status code: {response.status_code}"
                )
                break
    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    return all_data

# FINAL



In [23]:
import os
import pandas as pd
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Configuration
ACCESS_TOKEN = "EAALnc8im5MUBOZBXcyevhADuqZBvZBT8VZBr3bmodjXeycutV6XsFmxWa7BFKeK5UpFJ3RDgybaaweneTgXn8B0j94fxRHoUpm0VmHv9cMawbupl7bCERO7mEO43ftZBIMKxuZALmb4wCZBd7iZBxz7C8mYhE0HSOf148ZAH3rZAusnz9hCGfwn0EI2hZBJdolB4SFn0wVuyWTN"
FIELDS = "id,ad_snapshot_url,ad_creation_time,ad_creative_link_captions,ad_creative_link_descriptions,ad_creative_link_titles,ad_delivery_start_time,ad_delivery_stop_time,bylines,currency,delivery_by_region,demographic_distribution,estimated_audience_size,impressions,languages,page_id,page_name,publisher_platforms,spend,target_locations,target_gender,target_ages,eu_total_reach,beneficiary_payers,age_country_gender_reach_breakdown"
OUTPUT_DIR = "ad_data"
API_LIMIT = 99
AD_TYPE = "POLITICAL_AND_ISSUE_ADS"
AD_REACHED_COUNTRIES = ['CZ']
D_ACTIVE_STATUS = "ALL"
AD_DELIVERY_DATE_MIN = "2023-01-01"
AD_DELIVERY_DATE_MAX = "2024-12-29"
INPUT_CSV_PATH = "top500.csv"

def fetch_ads_data(page_id):
    """Fetch data from the Facebook Ads Archive API for a single page ID."""
    try:
        logging.info(f"Fetching data for Page ID: {page_id}")
        return get_facebook_ads_archive(
            ACCESS_TOKEN,
            "",
            AD_TYPE,
            AD_REACHED_COUNTRIES,
            page_id,
            FIELDS,
            API_LIMIT,
            D_ACTIVE_STATUS,
            AD_DELIVERY_DATE_MIN,
            AD_DELIVERY_DATE_MAX,
        )
    except Exception as e:
        logging.error(f"Error fetching data for Page ID {page_id}: {e}")
        return None

def save_to_csv(data,page_id, file_name):
    """Save DataFrame to a CSV file."""
    if data.empty:
        logging.warning(f"No data to save for {file_name}. Skipping.")
        return
    os.makedirs(os.path.join(OUTPUT_DIR, page_id), exist_ok=True)
    file_path = os.path.join(OUTPUT_DIR, page_id,file_name)
    data.to_csv(file_path, index=False,encoding='utf-8')
    logging.info(f"Data saved to {file_path}")

def main():
    # Read input page IDs
    
    # df = pd.read_csv(os.path.join(os.getcwd(), INPUT_CSV_PATH), dtype=str)
    # page_ids = df["Page ID"].tolist()
    page_ids = ["100723801653101","114840783409005","210778132327279","197010357446014","1681478228766601","176688316811","55408246161","1434319750128022","278451599256","490930117644709","1455431764712359","102338366221744","337703022758334","214827221987263","39371299263","344542362982337","1460486154172874","109323929038","60516116431","103430204491217","1104866262972886","278117815576023","34825122262","1934837773438430","1477535869227488","278212515809","120343698070515","109961708643319","218764824871418","43855944703"]

    # Initialize combined results
    combined_fact_data = []
    combined_demo_data = []
    combined_region_data = []
    combined_creative_data = []
    combined_platform_data = []
    combined_reach_data = []

    # Process each page ID
    for idx, page_id in enumerate(page_ids, start=1):
        logging.info(f"Processing {idx}/{len(page_ids)}: Page ID {page_id}")

        # Fetch API data
        api_data = fetch_ads_data(page_id)
        if not api_data:
            logging.warning(f"No data returned for Page ID {page_id}. Skipping.")
            continue

        # Normalize data
        fact_data, demo_data, region_data, creative_data, platform_data,reach_data= normalize_data(api_data)

        # Combine results
        combined_fact_data.extend(fact_data)
        combined_demo_data.extend(demo_data)
        combined_region_data.extend(region_data)
        combined_creative_data.extend(creative_data)
        combined_platform_data.extend(platform_data)
        combined_reach_data.extend(reach_data)

        # Convert combined data to DataFrames
        fact_df = pd.DataFrame(combined_fact_data)
        demo_df = pd.DataFrame(combined_demo_data)
        region_df = pd.DataFrame(combined_region_data)
        creative_df = pd.DataFrame(combined_creative_data)
        platform_df = pd.DataFrame(combined_platform_data)
        reach_df = pd.DataFrame(combined_reach_data)

        # Save DataFrames to CSV
        save_to_csv(fact_df,page_id, f"{page_id}_fact_data.csv")
        save_to_csv(demo_df,page_id, f"{page_id}_demo_data.csv")
        save_to_csv(region_df,page_id, f"{page_id}_region_data.csv")
        save_to_csv(creative_df,page_id, f"{page_id}_creative_data.csv")
        save_to_csv(platform_df,page_id, f"{page_id}_platform_data.csv")
        save_to_csv(reach_df,page_id, f"{page_id}_reach_data.csv")

    logging.info("Data fetching and processing complete.")

if __name__ == "__main__":
    main()

2025-01-05 09:10:47,582 - INFO - Processing 1/30: Page ID 100723801653101
2025-01-05 09:10:47,583 - INFO - Fetching data for Page ID: 100723801653101
2025-01-05 09:10:49,987 - INFO - Processing 2/30: Page ID 114840783409005
2025-01-05 09:10:49,988 - INFO - Fetching data for Page ID: 114840783409005


Failed to retrieve data. Status code: 400


2025-01-05 09:10:50,564 - INFO - Processing 3/30: Page ID 210778132327279
2025-01-05 09:10:50,564 - INFO - Fetching data for Page ID: 210778132327279


Failed to retrieve data. Status code: 400


2025-01-05 09:10:53,618 - INFO - Processing 4/30: Page ID 197010357446014
2025-01-05 09:10:53,619 - INFO - Fetching data for Page ID: 197010357446014


Failed to retrieve data. Status code: 400


KeyboardInterrupt: 