In [1]:

import json
import sys
import collections
import datetime

import dateutil.parser
from matplotlib import pyplot as plt

from facebook_fetch import fetch

In [2]:
# Paths of the two JSON snapshots to compare. The file names carry the
# timestamps 2019-05-20 (old) and 2019-05-27 (new).
# NOTE(review): these are absolute paths into one user's home directory; the
# notebook will not run on another machine without editing them.
filename_old = '/home/michel/projects/desinfo/political-ads-scraper/data/facebook/API/FR/facebook-ads-archive_FR_2019-05-20_14-03-03.json'
filename_new = '/home/michel/projects/desinfo/political-ads-scraper/data/facebook/API/FR/facebook-ads-archive_FR_2019-05-27_14-53-52.json'

In [3]:
# Load both snapshots into memory.
# The encoding is given explicitly: JSON interchange text is UTF-8, and the ad
# bodies contain accented characters and emoji, so relying on the platform's
# locale default encoding could fail or mis-decode on a non-UTF-8 system.
with open(filename_old, 'r', encoding='utf-8') as f:
    ads_old = json.load(f)
with open(filename_new, 'r', encoding='utf-8') as f:
    ads_new = json.load(f)

In [4]:
# Report the size of each snapshot before diffing them.
print(
    'Comparing {} old ads and {} new ads.'.format(
        len(ads_old),
        len(ads_new),
    )
)

Comparing 13550 old ads and 9288 new ads.


In [5]:

# For each name in fetch.FIELDS, report how many ads of the new snapshot
# contain that key.
total_new_ads = len(ads_new)
for field in fetch.FIELDS:
    ads_with_field = sum(1 for ad in ads_new if field in ad)
    print('{} ads have the field "{}" on a total of {}'.format(
        ads_with_field,
        field,
        total_new_ads
    ))
print()


9288 ads have the field "ad_creation_time" on a total of 9288
9236 ads have the field "ad_creative_body" on a total of 9288
8200 ads have the field "ad_creative_link_caption" on a total of 9288
5458 ads have the field "ad_creative_link_description" on a total of 9288
8467 ads have the field "ad_creative_link_title" on a total of 9288
9288 ads have the field "ad_delivery_start_time" on a total of 9288
2646 ads have the field "ad_delivery_stop_time" on a total of 9288
9288 ads have the field "ad_snapshot_url" on a total of 9288
9288 ads have the field "currency" on a total of 9288
9287 ads have the field "demographic_distribution" on a total of 9288
1129 ads have the field "funding_entity" on a total of 9288
9197 ads have the field "impressions" on a total of 9288
9288 ads have the field "page_id" on a total of 9288
9288 ads have the field "page_name" on a total of 9288
9288 ads have the field "region_distribution" on a total of 9288
9286 ads have the field "spend" on a total of 9288



In [6]:
# Index

def to_dict(ads):
    """Index a sequence of ads by the id returned by ``fetch.get_ad_id``.

    Args:
        ads: iterable of ad records.

    Returns:
        A dict mapping each ad id to its ad record.

    Raises:
        AssertionError: if two ads yield the same id; the offending ad is
            attached as the assertion message.
    """
    index = {}
    for entry in ads:
        key = fetch.get_ad_id(entry)
        # Ids are expected to be unique within one snapshot.
        assert key not in index, entry
        index[key] = entry
    return index

# Index both snapshots by ad id.
ads_old_by_id = to_dict(ads_old)
ads_new_by_id = to_dict(ads_new)


# Split the ad ids into three groups: only in the new snapshot (added),
# only in the old snapshot (removed), and present in both.

old_ids = set(ads_old_by_id)
new_ids = set(ads_new_by_id)

new_only_ids = new_ids.difference(old_ids)
old_only_ids = old_ids.difference(new_ids)
both_ids = old_ids.intersection(new_ids)

# Sanity check: each snapshot is exactly its exclusive ids plus the shared ones.
assert len(new_ids) == len(new_only_ids) + len(both_ids)
assert len(old_ids) == len(old_only_ids) + len(both_ids)

print('{} ads have been added.'.format(len(new_only_ids)))
print('{} ads have been removed.'.format(len(old_only_ids)))

695 ads have been added.
4957 ads have been removed.


In [7]:
# For every removed ad, collect the calendar date of each timestamp field it
# carries. An ad lacking a field contributes nothing to that field's list.
ad_creation_date_list = []
ad_delivery_start_date_list = []
ad_delivery_stop_date_list = []

# Pairs each timestamp key with the list that receives its parsed dates.
date_lists_by_field = (
    ('ad_creation_time', ad_creation_date_list),
    ('ad_delivery_start_time', ad_delivery_start_date_list),
    ('ad_delivery_stop_time', ad_delivery_stop_date_list),
)

for removed_ad_id in old_only_ids:
    removed_ad = ads_old_by_id[removed_ad_id]

    for field_name, date_list in date_lists_by_field:
        if field_name in removed_ad:
            d = dateutil.parser.parse(removed_ad[field_name])
            date_list.append(d.date())


Number of removed ads whose delivery stopped on or before April 15th, 2019, versus the total number of removed ads

In [8]:
# (number of removed ads whose stop date is on or before 2019-04-15,
#  total number of removed ads)
(
    sum(
        1
        for stop_date in ad_delivery_stop_date_list
        if stop_date <= datetime.date(2019, 4, 15)
    ),
    len(old_only_ids),
)

(4542, 4957)

Let's find the 415 removed ads whose removal is not explained by this reason

In [15]:
# Keep the removed ads that either have no delivery stop time or whose stop
# date falls after 2019-04-15.
removed_ads_unexplained = []

for removed_ad_id in old_only_ids:
    removed_ad = ads_old_by_id[removed_ad_id]

    # The `and` short-circuits, so the timestamp is only parsed when present.
    stopped_by_cutoff = (
        'ad_delivery_stop_time' in removed_ad
        and dateutil.parser.parse(removed_ad['ad_delivery_stop_time']).date()
        <= datetime.date(2019, 4, 15)
    )
    if not stopped_by_cutoff:
        removed_ads_unexplained.append(removed_ad)

len(removed_ads_unexplained)

415

In [26]:
# Print each unexplained ad: its id, then its body and stop time when present,
# followed by a separator line.
for ad in removed_ads_unexplained:
    print(fetch.get_ad_id(ad))
    for optional_field in ('ad_creative_body', 'ad_delivery_stop_time'):
        if optional_field in ad:
            print(ad[optional_field])
    print('-'*80)

2033573636739869
🔴 Après presque 5 ans de conflit, les Yéménites ne peuvent plus attendre: 4,55 millions de personnes en insécurité alimentaire aigüe.
--------------------------------------------------------------------------------
2085635328180836
Translate in chats, on webpages and in any other apps
--------------------------------------------------------------------------------
592518337928300
🏠 PROPRIÉTAIRES DE MAISONS INDIVIDUELLES 🏠
♻️ Faites de grosses économies  grâce à la pompe à chaleur financée par l’état 🇨🇵  👍
Bénéficiez du dispositif de l’État « Coup de pouce économies d’énergie », et économisez jusqu'à 10 500€* pour la pose d'une pompe à chaleur ! 
🎁 BONUS: Isolation des combles à 1€  pour la pose d'une pompe à chaleur si éligible au programme de l’État
--------------------------------------------------------------------------------
486058748596253
Translate in chats, on webpages and in any other apps
-----------------------------------------------------------------------

Ads whose delivery stopped on or before April 15th, 2019, but that are still present in the new snapshot, versus all the ads in the new snapshot.

In [9]:
# (number of ads in the new snapshot whose stop date is on or before
#  2019-04-15, total number of ads in the new snapshot)
(
    sum(
        1
        for ad in ads_new
        if 'ad_delivery_stop_time' in ad
        and dateutil.parser.parse(ad['ad_delivery_stop_time']).date()
        <= datetime.date(2019, 4, 15)
    ),
    len(ads_new),
)

(98, 9288)