In [13]:
# Path to a local checkout of the better.fyi content repository, available at
# https://github.com/UChicagoSUPERgroup/better-content
# This is a relative path, so it is resolved against the current working directory.
better_path = '../../better-content'

In [14]:
import os
import json
from urllib.parse import urlparse

In [15]:
# helpers for markdown parsing

from bs4 import BeautifulSoup
import mistune
def target_links(html):
    """Return `html` re-serialized with target="_blank" set on every <a> element.

    The markup is parsed with BeautifulSoup's 'html.parser' backend, each anchor
    is updated in place, and the whole parsed document is converted back to a
    string.
    """
    document = BeautifulSoup(html, 'html.parser')
    for anchor in document.find_all('a'):
        # Overwrites any target attribute the anchor already carries.
        anchor['target'] = '_blank'
    return str(document)

def markdown(md):
    """Render the markdown text `md` with mistune and post-process it with target_links."""
    rendered_html = mistune.markdown(md)
    return target_links(rendered_html)

In [21]:
# process better data
#
# Builds `better_data`, keyed by each entry name found under `<better_path>/trackers`,
# from that entry's `index.md`, then writes the result to `better.json`.
# NOTE(review): the parsing assumes the first line holds the name between `**`
# markers and the third line holds the description after a two-character
# prefix — confirm against the better-content file format.

better_data = {}
trackers_dir = os.path.join(better_path, 'trackers')
# os.listdir() returns entries in arbitrary order; sorting keeps the key order
# of better.json reproducible from run to run.
trackers = sorted(os.listdir(trackers_dir))
for tracker in trackers:
    filename = os.path.join(trackers_dir, tracker, 'index.md')
    # The context manager closes the handle on every iteration, and the explicit
    # encoding avoids depending on the platform's default.
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    name = lines[0].split('**')[1]
    description = markdown(lines[2][2:])

    # Reset per tracker: a file without one of these sections must not inherit
    # the previous tracker's value (or raise NameError on the first file).
    prevalence = ''
    notes = ''
    for i, line in enumerate(lines):
        if line == "<!-- prevalence -->\n":
            # The prevalence text is the single line following the marker.
            prevalence = markdown(lines[i+1])
        if line == "## Notes\n":
            # Notes run from two lines below the heading to the end of the file.
            notes = markdown(''.join(lines[i+2:]))

    better_data[tracker] = {
        'name': name,
        'description': description,
        'prevalence': prevalence,
        'notes': notes
    }

# ensure_ascii=False emits non-ASCII characters as-is, so the file is written
# explicitly as UTF-8.
with open('better.json', 'w', encoding='utf-8') as f:
    json.dump(better_data, f, ensure_ascii=False, indent=2)

In [23]:
# disconnect list processing

# originally based off of https://github.com/duckduckgo/duckduckgo-privacy-extension/blob/418e30d36e1c24e27930acb534caeb3ffc81c6a8/scripts/importers/companyList.js
# (with a few subsequent rewrites and translation to python)
#
# Reads `disconnect.json` and `better.json` and builds:
#   companyData:     entity name -> {'site', 'domains', 'type'}, plus
#                    'description' / 'prevalence' / 'notes' when the cleaned
#                    site is a key of better.json
#   domainEntityMap: domain -> entity name


def _clean_site(site):
    """Reduce an entity's site value to the bare host used as a better_data key.

    Uses the hostname parsed from `site`; when none can be parsed, falls back to
    the raw value with trailing slashes removed. A leading 'www.' is dropped.
    """
    host = urlparse(site).hostname
    if not host:
        # rstrip only removes trailing slashes, so a value without one is
        # left intact instead of losing its last character.
        host = site.rstrip('/')
    # Strip 'www.' only as a prefix; splitting on it would also cut hosts that
    # merely contain the substring.
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host


domainEntityMap = {}
companyData = {}

# Each file is closed as soon as it is parsed rather than held open for the
# whole loop. better.json is written with ensure_ascii=False, so read as UTF-8.
with open('disconnect.json', encoding='utf-8') as disconnect_json:
    disconnect_list = json.load(disconnect_json)

with open('better.json', encoding='utf-8') as better_json:
    better_data = json.load(better_json)

# `category` (not `type`) so the builtin is not shadowed for later cells.
for category, entries in disconnect_list['categories'].items():
    for entry in entries:
        # Only the first item at each level is used: the entry's first key is
        # taken as the entity name, and the first key of its value as the site
        # mapped to its list of domains.
        (name, info) = next(iter(entry.items()))
        (site, domains) = next(iter(info.items()))

        cleanedSite = _clean_site(site)

        data = {
            'site': cleanedSite,
            'domains': domains,
            'type': category
        }
        if cleanedSite in better_data:
            better = better_data[cleanedSite]
            data['description'] = better['description']
            data['prevalence'] = better['prevalence']
            data['notes'] = better['notes']

        # A later entry with the same name replaces the earlier one.
        companyData[name] = data

        for domain in domains:
            domainEntityMap[domain] = name


# facebook, twitter, and google are classified under the "disconnect" category for legacy reasons
# so we recategorize them under the correct categories
companyData['Facebook']['type'] = 'Social'
companyData['Twitter']['type'] = 'Social'
companyData['Google']['type'] = 'Advertising' # what is google?

In [24]:
# Write the processed tracker data to the JSON files under ../src/data/trackers.
# ensure_ascii=False emits non-ASCII characters as-is, so the files are opened
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('../src/data/trackers/companyData.json', 'w', encoding='utf-8') as f:
    json.dump(companyData, f, ensure_ascii=False, indent=2)
with open('../src/data/trackers/domainEntityMap.json', 'w', encoding='utf-8') as f:
    json.dump(domainEntityMap, f, ensure_ascii=False, indent=2)