In [9]:
import json
import os
from urllib.parse import quote, urlencode

import requests

def get_next_page(response):
    """Return the URL of the next result page, or None when there is none.

    ``response`` is a decoded JSON payload (a dict). The URL is read from
    ``response["paging"]["next"]``; either key may be absent.
    """
    return response.get("paging", {}).get("next", None)

def main(api_url, timeout=30):
    """Collect the ``data`` items of every page of a paginated JSON listing.

    Starting at ``api_url``, each page is fetched with ``requests.get``; its
    ``"data"`` list is appended to the result and the next URL is taken from
    ``get_next_page``. The loop ends when there is no next URL.

    Parameters:
        api_url (str): URL of the first page. An empty value returns ``[]``.
        timeout (float): Seconds to wait for each request (default: 30).
            Without a timeout a stalled connection would block forever.

    Returns:
        list: Items gathered so far. On a non-200 response or a transport
        error the loop stops, an error line is printed and the pages already
        collected are returned (best-effort, partial result).
    """
    all_data = []

    while api_url:
        try:
            response = requests.get(api_url, timeout=timeout)
        except requests.RequestException as exc:
            # Only the exception type is printed: the message usually embeds
            # the full URL, which carries the access token.
            print("Error:", type(exc).__name__)
            break

        if response.status_code != 200:
            print("Error:", response.status_code)
            break

        json_data = response.json()
        all_data.extend(json_data.get("data", []))
        api_url = get_next_page(json_data)

    return all_data


In [None]:
def _ads_archive_url(byline, access_token, fields, api_version, country, language):
    """Build the first-page ``ads_archive`` URL for one byline.

    Every value is percent-encoded, so spaces, commas, ``&``, ``#``, ``+`` or
    non-ASCII characters in a byline or token cannot corrupt the query string.
    """
    query = urlencode(
        {
            "bylines": byline,
            "ad_type": "POLITICAL_AND_ISSUE_ADS",
            "ad_reached_countries": f"['{country}']",
            "access_token": access_token,
            "unmask_removed_content": "true",
            "fields": fields,
            "limit": 100,
            "search_terms": "''",
            "languages": f"['{language}']",
        },
        quote_via=quote,  # spaces become %20 rather than "+"
    )
    return f"https://graph.facebook.com/{api_version}/ads_archive?{query}"


def _byline_filename(byline):
    """Return ``data_<byline>.json`` with path-hostile characters replaced by "_"."""
    safe = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in byline)
    return f"data_{safe}.json"


def extract_data(bylines, output_dir, access_token, api_version="v21.0", country="CZ", language="cs"):
    """
    Extracts data from the Facebook Ads API for a list of bylines.

    For each byline, all result pages are fetched through ``main()`` and the
    combined list is written to ``<output_dir>/data_<byline>.json``. Characters
    that cannot appear in a file name (``\\ / : * ? " < > |``) are replaced by
    ``_`` in the file name only; the API is queried with the unmodified byline.

    Parameters:
        bylines (list): List of bylines to query.
        output_dir (str): Directory to save the JSON files (created if missing).
        access_token (str): Access token for the API.
        api_version (str): Version of the Facebook Graph API (default: v21.0).
        country (str): Country code for filtering ads (default: CZ).
        language (str): Language code for filtering ads (default: cs).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    fields = ",".join([
        "id", "ad_snapshot_url", "ad_creation_time", "ad_creative_bodies", "ad_creative_link_captions",
        "ad_creative_link_descriptions", "ad_creative_link_titles", "ad_delivery_start_time",
        "ad_delivery_stop_time", "bylines", "currency", "delivery_by_region", "demographic_distribution",
        "estimated_audience_size", "impressions", "languages", "page_id", "page_name", "publisher_platforms",
        "spend", "target_locations", "target_gender", "target_ages", "eu_total_reach", "beneficiary_payers",
        "age_country_gender_reach_breakdown"
    ])

    for byline in bylines:
        print(f"Extracting Data for {byline}")

        api_url = _ads_archive_url(byline, access_token, fields, api_version, country, language)

        # main() follows the pagination links and returns the combined items
        extracted_data = main(api_url)

        # Write the extracted data to a JSON file
        filename = os.path.join(output_dir, _byline_filename(byline))
        with open(filename, "w", encoding='utf-8') as json_file:
            json.dump(extracted_data, json_file, indent=4, ensure_ascii=False)

        print(f"All Extracted Data written to {filename}")


# Parameters
base_dir = os.getcwd()
output_dir = os.path.join(base_dir, "bylines_ads")
bylines = ["Milion Chvilek, z. s."]
# bylines = ["Milion Chvilek, z. s.","Svoboda zvířat Plzeň, z.s.","Člověk v tísni, o.p.s.","Lékaři bez hranic - Médecins Sans Frontières in Czech Republic, o. p. s.","SH media, spol. s r.o."]

# The access token is a credential and must not be stored in the notebook:
# it is read from the environment instead.
# NOTE(review): any token that was ever committed with this notebook should be revoked.
access_token = os.environ.get("META_ADS_ACCESS_TOKEN")
if not access_token:
    raise RuntimeError("Set the META_ADS_ACCESS_TOKEN environment variable to a valid Graph API access token.")


# Call the function
extract_data(bylines, output_dir, access_token)

In [None]:
# Directory to save the JSON files
base_dir = os.getcwd()
output_dir = os.path.join(base_dir, "bylines_ads")

bylines = ['Česká pirátská strana', 'EUROPEUM Institute for European Policy', 'FTV Prima', 'ANO', 'Naše zdravotnictví', 'Svoboda a přímá demokracie (SPD)', 'ČSOB', 'ODS', 'Lékaři bez hranic - Médecins Sans Frontières in Czech Republic, o. p. s.', 'Aliance pro budoucnost', 'STAN', 'Starostové a nezávislí • STAN', 'PRAHA SOBĚ', 'Člověk v tísni, o.p.s.', 'SH media, spol. s r.o.', 'Komunistická strana Čech a Moravy', 'Zdeněk Hraba - senátor', 'Ondrej Prokop', 'Ministerstvo práce a sociálních věcí', 'Tomáš Zdechovský', 'Milion Chvilek, z. s.', 'EU Social', 'EU Justice and Consumers', 'ODS - Občanská demokratická strana', 'Sociální demokracie', 'Amnesty International ČR', 'Občanská demokratická strana', 'Tipsport', 'Karel Janeček', 'Martin Kuba', 'Starostové a nezávislí', 'Svoboda zvířat Plzeň, z.s.', 'CARE Česká republika', 'Replastuj.cz', 'ANO 2011', 'Reportér magazín', 'Andrej Babiš', 'Člověk v tísni o.p.s.', 'DFMG', 'TOP 09', 'SEN 21', 'Kupředu do minulosti s.r.o.', 'Ministerstvo pro místní rozvoj ČR', 'Svoboda a přímá demokracie', 'Greenpeace Česká republika', 'STAROSTOVÉ A NEZÁVISLÍ', 'KDU-ČSL', 'Česká pirátská strana - Praha', 'XTV', 'Hnutí DUHA - Přátelé Země Česká republika', 'Zelení - Strana zelených', 'PŘÍSAHA - občanské hnutí Roberta Šlachty', 'Český rozhlas', 'CZECH NEWS CENTER a. s.', 'Oldřich Hájek', 'Transparency International ČR']

# The access token is a credential and must not be stored in the notebook:
# it is read from the environment instead.
# NOTE(review): any token that was ever committed with this notebook should be revoked.
access_token = os.environ.get("META_ADS_ACCESS_TOKEN")
if not access_token:
    raise RuntimeError("Set the META_ADS_ACCESS_TOKEN environment variable to a valid Graph API access token.")

# Reuse extract_data() instead of repeating its loop here. It sends the same
# query (v21.0, CZ, cs, same fields) and also creates output_dir if missing.
# It asks for pages of 100 items rather than 999; pagination is followed to
# the end, so the same ads are collected.
extract_data(bylines, output_dir, access_token)
    

In [22]:
import os
import json

def replace_id_with_underscore_id(json_data):
    """Rename every "id" key to "_id", in place, throughout a decoded JSON value.

    Dicts and lists are walked recursively; any other value is left alone.
    Nothing is returned: the argument itself is mutated. A renamed key moves
    to the end of its dict, and an existing "_id" entry is overwritten.
    """
    if isinstance(json_data, list):
        for element in json_data:
            replace_id_with_underscore_id(element)
        return

    if not isinstance(json_data, dict):
        return

    # Iterate over a snapshot because the dict is modified during the walk.
    for name, child in tuple(json_data.items()):
        if name == "id":
            json_data["_id"] = json_data.pop("id")
        replace_id_with_underscore_id(child)

def replace_id_in_folder(folder_path):
    """Rewrite each ``.json`` file directly inside ``folder_path`` with "id" keys renamed.

    Every matching file is loaded, transformed in memory by
    ``replace_id_with_underscore_id`` and written back to the same path
    (UTF-8, 4-space indent, non-ASCII characters kept as-is). Other files
    are skipped; sub-folders are not traversed.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith(".json"):
            continue

        target = os.path.join(folder_path, entry)

        with open(target, "r", encoding="utf-8") as source:
            payload = json.load(source)

        replace_id_with_underscore_id(payload)

        with open(target, "w", encoding="utf-8") as sink:
            json.dump(payload, sink, indent=4, ensure_ascii=False)

# Entry point of this cell: rewrite the JSON files of the chosen folder in place.
if __name__ == "__main__":
    folder_path = "bylines_ads"  # Folder to process, relative to the current working directory
    replace_id_in_folder(folder_path)


In [27]:
import os
import json

def append_json_files(folder_path, exclude=("data_all.json",)):
    """Concatenate the contents of the ``.json`` files found in ``folder_path``.

    Parameters:
        folder_path (str): Folder to scan (not recursive).
        exclude (iterable of str): File names to skip. The default skips
            ``data_all.json``, the aggregate this notebook writes into the
            same folder. Without that, every re-run would read the previous
            aggregate back in and duplicate all documents. Pass ``()`` to
            read every ``.json`` file.

    Returns:
        list: Items of every file, concatenated in file-name order. Each file
        is expected to hold a JSON array.
    """
    skipped = set(exclude or ())
    all_data = []

    # sorted(): os.listdir() returns names in arbitrary order, which would
    # make the order of the combined list vary between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json") or filename in skipped:
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as json_file:
            all_data.extend(json.load(json_file))

    return all_data

# Specify the folder containing JSON files
folder_path = 'bylines_ads'

# Append all JSON files in the folder
appended_data = append_json_files(folder_path)

# Print the total number of documents appended
print(f"Total documents appended: {len(appended_data)}")

# Write the combined data to a JSON file next to the per-byline files.
# The path is derived from folder_path rather than from `output_dir`, which
# only exists if one of the extraction cells was run earlier in this kernel.
filename = os.path.join(folder_path, "data_all.json")
with open(filename, "w", encoding='utf-8') as json_all:
    json.dump(appended_data, json_all, indent=4, ensure_ascii=False)

Total documents appended: 89500


### Push data to MongoDB

Update the document if one with the same `_id` already exists.

Insert it if it does not.


In [None]:
from pymongo import MongoClient, UpdateOne

# Specify the folder containing JSON files
folder_path = 'political_ads_delta'

# Connection to MongoDB. Used as a context manager so the connection is closed
# even when a file fails to parse or a write raises.
with MongoClient('mongodb://localhost:27017') as client:
    # Select your database
    db = client['meta_ads_db']

    # Select your collection
    collection = db['meta_ads_collection']

    # Loop through each file in the folder
    for filename in os.listdir(folder_path):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)

        # Load data from JSON file with 'utf-8' encoding
        with open(file_path, encoding='utf-8') as file:
            data = json.load(file)

        # One upsert per document, keyed by its _id: update if it exists,
        # insert if it does not. The operations are sent in a single bulk call
        # instead of one round trip per document.
        operations = [
            UpdateOne({"_id": document["_id"]}, {"$set": document}, upsert=True)
            for document in data
        ]
        # bulk_write() rejects an empty list of operations
        if operations:
            collection.bulk_write(operations)

        # Print the total number of documents added or updated for the current file
        print(f"File: {filename}, Total documents processed: {len(data)}")

## Delete all documents from the collection (the database itself is not dropped)

In [None]:
# from pymongo import MongoClient
# 
# # Connection to MongoDB
# client = MongoClient('mongodb://localhost:27017')
# 
# # Select your database
# db = client['meta_ads_db']
# 
# # Select your collection
# collection = db['meta_ads_collection']
# 
# # Truncate (remove all documents from) the collection
# result = collection.delete_many({})
# 
# # Print the number of deleted documents
# print(f"Number of documents deleted: {result.deleted_count}")
# 
# # Close the connection
# client.close()
