## OONI data analysis case study for dnsping

### Downloading the data

We offer a tool called oonidata (that's currently in BETA), which can be installed by running:
```
pip install oonidata
```

To download all OONI data for this example notebook, run the following command:
```
oonidata sync --country-codes MY --since 2024-08-01 --until 2024-09-20 --output-dir ~/projects/imap/ooni-data/ --test-name dnsping
```



In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime, timedelta
from dateutil.parser import parse as parse_date
from urllib.parse import urlencode, quote, urlparse

from tqdm import tqdm
tqdm.pandas()

### OONI Explorer utility functions

Below are a couple of useful utility functions when dealing with measurements. They take a dataframe row and return (or print) the OONI Explorer URL. This is useful to get a link to OONI explorer to more easily inspect the raw measurement to better understand what is going on.

In [44]:
def get_explorer_url(e):
    query = ''
    if 'input' in e.keys() and e['input']:
        query = '?input={}'.format(quote(e['input'], safe=''))
    return 'https://explorer.ooni.org/measurement/{}{}'.format(e['report_id'], query)
    
def print_explorer_url(e):
    print(get_explorer_url(e))

# Extracting metadata from raw measurements

The OONI raw data is very rich, but for most analysis use-cases you just need a subset of the fields or some value that is derived from them.

Below are functions that will extract all the metadata we care about from the web_connectivity test.



In [45]:
import requests
from base64 import b64decode
import hashlib
import json
import re

def get_raw_measurement(row):
    r = requests.get("https://api.ooni.io/api/v1/measurement_meta", params={
        'report_id':row['report_id'],
        'input': row['input'],
        'full': True
    })
    j = r.json()
    return json.loads(j['raw_measurement'])

def get_resolved_ips(msmt):
    queries = msmt['test_keys'].get('queries', [])
    if not queries:
        return ''
    answers = queries[0].get('answers', [])
    if not answers:
        return []
    
    ip_list = []
    for a in answers:
        ip = a.get('ipv4', '')
        if ip:
            ip_list.append(ip)
    return ip_list

def get_control_failure(msmt):
    if 'test_keys' not in msmt:
        return 'missing_test_keys'
    return msmt['test_keys']['control_failure']

def get_test_keys_blocking(msmt):
    return str(msmt['test_keys']['blocking'])

#def get_http_experiment_failure(msmt):
#    if 'http_experiment_failure' not in str(msmt['test_keys']):
#        return None
#    else: 
#        return str(msmt['test_keys']['http_experiment_failure'])
    
def get_http_experiment_failure(msmt):
    if 'http_experiment_failure' not in str(msmt['test_keys']):
        return 'None'
    else: 
        return str(msmt['test_keys']['http_experiment_failure'])
    
def get_resolver_info(msmt):
    return {
        'resolver_ip': msmt.get('resolver_ip', ''),
        'resolver_asn': msmt.get('resolver_asn', ''),
        'resolver_network_name': msmt.get('resolver_network_name', '')
    }

def get_network_events(msmt):
    return msmt['test_keys'].get('network_events', [])

def get_pings(msmt):
    return msmt['test_keys'].get('pings', [])

def get_first_ping(msmt):
    pings = msmt['test_keys'].get('pings', [])
    return pings[0] if pings else None

def get_ip_from_second_ping(msmt):
    pings = msmt['test_keys'].get('pings', [])
    if len(pings) > 1:  # Ensure there is a second ping
        answers = pings[1]['query'].get('answers', [])
        if answers:  # Make sure answers is not None or an empty list
            for answer in answers:
                if 'ipv6' in answer:
                    return answer['ipv6']
                elif 'ipv4' in answer:
                    return answer['ipv4']
    return None  # Return None if no second ping or no IP address found


def get_tcp_connect(msmt):
    return msmt['test_keys'].get('tcp_connect', [])

def decode_body(body):
    if body is None:
        return ''
    if isinstance(body, dict):
        raw_body = b64decode(body['data'])
        try:
            return raw_body.decode('utf-8')
        except:
            return raw_body
    return body

def get_last_response_body(msmt):
    try:
        # The requests/response list sorts them from the newest to the oldest, 
        # hence the first item in the list is the last response we received.
        body = msmt['test_keys']['requests'][0]['response']['body']
        return decode_body(body)
    except (KeyError, TypeError, IndexError):
        return ''

TITLE_REGEXP = re.compile("<title.*?>(.*?)</title>", re.IGNORECASE | re.DOTALL)
# Doesn't take into account ordering
META_TITLE_REGEXP = re.compile("<meta.*?property=\"og:title\".*?content=\"(.*?)\"", re.IGNORECASE | re.DOTALL)

def get_http_title(msmt):
    body = get_last_response_body(msmt)
    # If the body is not a str object, it means it's binary (or an encoding we could not detect). 
    # No point in trying to extract the title.
    # Handling it like this is not very clean or nice.
    if not isinstance(body, str):
        return ''

    m = TITLE_REGEXP.search(body, re.IGNORECASE | re.DOTALL)
    if m:
        return m.group(1)
    return ''

    return extract_title(get_last_response_body(msmt))

def get_meta_http_title(msmt):
    body = get_last_response_body(msmt)
    if not isinstance(body, str):
        return ''

    m = META_TITLE_REGEXP.search(body, re.IGNORECASE | re.DOTALL)
    if m:
        return m.group(1)
    return ''

def get_http_body_hash(msmt):
    body = get_last_response_body(msmt)
    if body == '':
        return ''
    if isinstance(body, str):
        # We need the content of the body to be binary.
        body = body.encode('utf-8')
    return hashlib.md5(body[:2048]).hexdigest()

def base_metadata(msmt):
    base_keys = [
        'input',
        'measurement_start_time',
        'probe_asn',
        'probe_cc',
        'probe_network_name',
        'report_id',
        'resolver_asn',
        'resolver_ip',
        'resolver_network_name',
        'software_name',
        'software_version',
        'test_name',
        'test_runtime',
        'test_version'
    ]
    base_metadata = {}
    for k in base_keys:
        base_metadata[k] = msmt.get(k, '')
    annotations = msmt.pop('annotations')
    base_metadata['network_type'] = annotations.get('network_type', 'unknown')
    base_metadata['origin'] = annotations.get('origin', 'unknown')
    base_metadata['platform'] = annotations.get('platform', 'unknown')
    return base_metadata

In [46]:
def get_measurement_meta(msmt):
    m = base_metadata(msmt)
    m.update(get_resolver_info(msmt))
    m.update({
        'dns_resolved_ips': get_resolved_ips(msmt),
        'pings': get_pings(msmt),
        'get_ip_from_second_ping' : get_ip_from_second_ping(msmt)
    })
    return m

### Parsing raw files on disk, filtering and transforming them

Below are functions that will list the files on disk, given a search query, and return an iterator of the raw measurement dict.

These functions are then called by either `msmt_to_csv` or `get_msmt_df`, which write the processed data to a CSV file or load it in memory as a pandas DataFrame respectively.

It's generally recommended, when you are dealing with very large datasets, to write the minimised form of the data to a file on disk so that you don't have to re-parse everything if your notebook crashes.

In [47]:
from tqdm import tqdm
from pathlib import Path
import gzip
import ujson

data_dir = Path("/home/sitinurliza/projects/imap/ooni-data")

def iter_msmts(fp):
    with gzip.open(fp) as in_file:
        yield from [ujson.loads(line) for line in in_file]
            
def iter_jsonl_paths(query):
    for p in data_dir.glob('*/*/*/*'):
        ts, cc, tn = p.name.split('_')
        tn = tn.split('.')[0]
        ts = datetime.strptime(ts, '%Y%m%d%H')
        if query.get('probe_cc') and cc != query['probe_cc']:
            continue
        if query.get('test_name') and tn != query['test_name'].replace('_', ''):
            continue
        if query.get('since') and parse_date(query['since']) >= ts:
            continue
        if query.get('until') and parse_date(query['until']) <= ts:
            continue
        yield p
        
def iter_raw_measurements(query):
    path_list = list(iter_jsonl_paths(query))
    print(f"processing {len(path_list)}")
    for fp in tqdm(path_list):
        for msmt in iter_msmts(fp):
            if query.get('probe_asn') and msmt['probe_asn'] != query['probe_asn']:
                continue
            if query.get('domain'):
                domain = urlparse(msmt['input']).netloc
                if domain != query['domain']:
                    continue
            yield msmt

In [48]:
import csv

def msmt_to_csv(query, output_file="output.csv"):
    with open(output_file, 'w') as output_file:
        csv_writer = None
        for msmt in iter_raw_measurements(query):
            msmt_meta = get_measurement_meta(msmt)
            if csv_writer is None:
                fieldnames = msmt_meta.keys()
                csv_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
                csv_writer.writeheader()
            csv_writer.writerow(msmt_meta)
            

In [49]:
def get_msmt_df(query):
    msmt_list = []
    for msmt in iter_raw_measurements(query):
        mdf = pd.DataFrame([get_measurement_meta(msmt)])
        msmt_list.append(mdf)
    return pd.concat(msmt_list, ignore_index=True)

Here we do the actual conversion to CSV. Edit the dates and country codes accordingly.

In [60]:
msmt_to_csv({
    'since': '2024-08-01',
    'until': '2024-09-30',
    'probe_cc': 'MY',
    'test_name': 'dnsping'
}, output_file="ooni-data-dnsping-2024.csv")

processing 130


100%|████████████████████████████████████████| 130/130 [00:00<00:00, 142.62it/s]


We then load the CSV file in memory as a pandas dataframe for more analysis

In [61]:
df['explorer_url'] = "https://explorer.ooni.org/measurement/" + df['report_id'] + "?input=" + df['input']

In [62]:
import numpy as np

df['blocking'] = np.where(df['get_ip_from_second_ping'] == '175.139.142.25', True, False)

In [59]:
df_1 = df.drop('pings', axis=1)
df_1.to_csv('ooni-data-dnsping-2024-1.csv')

In [39]:
df = pd.read_csv('ooni-data-dnsping-2024.csv', low_memory=False)

In [34]:
len(df)

905

In [35]:
list(df)

['input',
 'measurement_start_time',
 'probe_asn',
 'probe_cc',
 'probe_network_name',
 'report_id',
 'resolver_asn',
 'resolver_ip',
 'resolver_network_name',
 'software_name',
 'software_version',
 'test_name',
 'test_runtime',
 'test_version',
 'network_type',
 'origin',
 'platform',
 'dns_resolved_ips',
 'pings',
 'first_ping',
 'get_ip_from_second_ping']

In [36]:
df

Unnamed: 0,input,measurement_start_time,probe_asn,probe_cc,probe_network_name,report_id,resolver_asn,resolver_ip,resolver_network_name,software_name,...,test_name,test_runtime,test_version,network_type,origin,platform,dns_resolved_ips,pings,first_ping,get_ip_from_second_ping
0,udp://1.1.1.1:53,2024-08-26 07:20:58,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240826T072108Z_dnsping_MY_4788_n1_tJk1bvfDAO...,AS15169,172.217.43.146,Google LLC,miniooni,...,dnsping,10.001480,0.4.0,unknown,unknown,macos,,"[{'query': {'answers': [{'asn': 15169, 'as_org...","{'query': {'answers': [{'asn': 15169, 'as_org_...",2404:6800:4003:c1c::64
1,udp://1.1.1.1:853,2024-08-26 07:21:14,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240826T072129Z_dnsping_MY_4788_n1_nM1WiRTRsk...,AS15169,172.253.211.92,Google LLC,miniooni,...,dnsping,14.001458,0.4.0,unknown,unknown,macos,,"[{'query': {'answers': None, 'engine': 'udp', ...","{'query': {'answers': None, 'engine': 'udp', '...",
2,udp://1.1.1.3:53,2024-08-26 07:21:44,AS9930,MY,TT DOTCOM SDN BHD,20240826T072157Z_dnsping_MY_9930_n1_SAmuLg66GM...,AS15169,172.253.211.92,Google LLC,miniooni,...,dnsping,13.003519,0.4.0,unknown,unknown,macos,,"[{'query': {'answers': [{'asn': 15169, 'as_org...","{'query': {'answers': [{'asn': 15169, 'as_org_...",142.251.175.138
3,udp://8.8.8.8:53,2024-08-26 13:24:10,AS9534,MY,Maxis Broadband Sdn Bhd,20240826T132421Z_dnsping_MY_9534_n1_iCsotY0ZKm...,AS9534,202.75.146.218,Maxis Broadband Sdn Bhd,miniooni,...,dnsping,10.000755,0.4.0,unknown,unknown,linux,,[{'query': {'answers': [{'answer_type': 'CNAME...,{'query': {'answers': [{'answer_type': 'CNAME'...,175.139.142.25
4,udp://1.1.1.1:53,2024-08-26 13:24:21,AS9534,MY,Maxis Broadband Sdn Bhd,20240826T132421Z_dnsping_MY_9534_n1_iCsotY0ZKm...,AS9534,202.75.146.218,Maxis Broadband Sdn Bhd,miniooni,...,dnsping,10.000447,0.4.0,unknown,unknown,linux,,[{'query': {'answers': [{'answer_type': 'CNAME...,{'query': {'answers': [{'answer_type': 'CNAME'...,175.139.142.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900,udp://208.67.222.222:53,2024-09-12 00:00:13,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240912T000013Z_dnsping_MY_4788_n1_Y9YP7JJ23x...,AS4788,202.188.1.176,TM TECHNOLOGY SERVICES SDN BHD,miniooni,...,dnsping,10.000706,0.4.0,unknown,unknown,linux,,"[{'query': {'answers': [{'asn': 13335, 'as_org...","{'query': {'answers': [{'asn': 13335, 'as_org_...",2606:4700:4400::ac40:9638
901,udp://9.9.9.9:53,2024-09-12 00:00:14,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240912T000014Z_dnsping_MY_4788_n1_D6mdZNRAnP...,AS4788,202.188.1.181,TM TECHNOLOGY SERVICES SDN BHD,miniooni,...,dnsping,11.001709,0.4.0,unknown,unknown,linux,,"[{'query': {'answers': [{'asn': 13335, 'as_org...","{'query': {'answers': [{'asn': 13335, 'as_org_...",2606:4700:4400::6812:25c8
902,udp://1.1.1.1:53,2024-09-12 00:00:14,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240912T000014Z_dnsping_MY_4788_n1_DktP3nZK57...,AS4788,202.188.1.176,TM TECHNOLOGY SERVICES SDN BHD,miniooni,...,dnsping,10.000562,0.4.0,unknown,unknown,linux,,"[{'query': {'answers': [{'asn': 13335, 'as_org...","{'query': {'answers': [{'asn': 13335, 'as_org_...",104.18.37.200
903,udp://8.8.8.8:53,2024-09-12 00:00:14,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240912T000014Z_dnsping_MY_4788_n1_XUKsogXexR...,AS4788,202.188.1.181,TM TECHNOLOGY SERVICES SDN BHD,miniooni,...,dnsping,10.001894,0.4.0,unknown,unknown,linux,,"[{'query': {'answers': [{'asn': 13335, 'as_org...","{'query': {'answers': [{'asn': 13335, 'as_org_...",104.18.37.200


In [57]:
import pandas as pd
from io import StringIO

# Replace this with your pasted table
table = """
input	resolver
udp://8.8.8.8:53	Google
udp://8.8.4.4:53	Google
udp://1.9.1.9:53	TM
udp://208.67.222.222:53	OpenDNS
udp://208.67.220.220:53	OpenDNS
udp://9.9.9.9:53	Quad9
udp://1.1.1.1:53	Cloudflare
udp://4.4.4.4:53	4.4.4.4
udp://3.3.2.4:53	3.3.2.4
udp://3.2.1.1:53	3.2.1.1
udp://1.0.0.1:53	1.0.0.1
udp://149.112.112.112:53	Quad9
udp://1.1.1.3:53	Cloudflare
udp://1.1.1.2:53	Cloudflare
udp://1.1.1.2:853	Cloudflare
udp://5.5.5.5:53	5.5.5.5
"""

# Use StringIO to convert the string into a file-like object
data = StringIO(table)

# Create a DataFrame from the pasted table
df_lookup = pd.read_csv(data, sep=r'\s+')

df_lookup.head()


Unnamed: 0,input,resolver
0,udp://8.8.8.8:53,Google
1,udp://8.8.4.4:53,Google
2,udp://1.9.1.9:53,TM
3,udp://208.67.222.222:53,OpenDNS
4,udp://208.67.220.220:53,OpenDNS


In [65]:
# Create a mapping dictionary from df_lookup
lookup_dict = pd.Series(df_lookup['resolver'].values, index=df_lookup['input']).to_dict()

# Create the 'resolver' column in df based on the lookup from df_lookup
df['resolver'] = df['input'].map(lookup_dict)

# Optionally, replace NaN values with empty strings if you want blanks
df['resolver'] = df['resolver'].fillna('')

df.tail()

Unnamed: 0,input,measurement_start_time,probe_asn,probe_cc,probe_network_name,report_id,resolver_asn,resolver_ip,resolver_network_name,software_name,...,test_version,network_type,origin,platform,dns_resolved_ips,pings,get_ip_from_second_ping,explorer_url,blocking,resolver
900,udp://208.67.222.222:53,2024-09-12 00:00:13,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240912T000013Z_dnsping_MY_4788_n1_Y9YP7JJ23x...,AS4788,202.188.1.176,TM TECHNOLOGY SERVICES SDN BHD,miniooni,...,0.4.0,unknown,unknown,linux,,"[{'query': {'answers': [{'asn': 13335, 'as_org...",2606:4700:4400::ac40:9638,https://explorer.ooni.org/measurement/20240912...,False,OpenDNS
901,udp://9.9.9.9:53,2024-09-12 00:00:14,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240912T000014Z_dnsping_MY_4788_n1_D6mdZNRAnP...,AS4788,202.188.1.181,TM TECHNOLOGY SERVICES SDN BHD,miniooni,...,0.4.0,unknown,unknown,linux,,"[{'query': {'answers': [{'asn': 13335, 'as_org...",2606:4700:4400::6812:25c8,https://explorer.ooni.org/measurement/20240912...,False,Quad9
902,udp://1.1.1.1:53,2024-09-12 00:00:14,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240912T000014Z_dnsping_MY_4788_n1_DktP3nZK57...,AS4788,202.188.1.176,TM TECHNOLOGY SERVICES SDN BHD,miniooni,...,0.4.0,unknown,unknown,linux,,"[{'query': {'answers': [{'asn': 13335, 'as_org...",104.18.37.200,https://explorer.ooni.org/measurement/20240912...,False,Cloudflare
903,udp://8.8.8.8:53,2024-09-12 00:00:14,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240912T000014Z_dnsping_MY_4788_n1_XUKsogXexR...,AS4788,202.188.1.181,TM TECHNOLOGY SERVICES SDN BHD,miniooni,...,0.4.0,unknown,unknown,linux,,"[{'query': {'answers': [{'asn': 13335, 'as_org...",104.18.37.200,https://explorer.ooni.org/measurement/20240912...,False,Google
904,udp://8.8.4.4:53,2024-09-12 00:00:15,AS4788,MY,TM TECHNOLOGY SERVICES SDN BHD,20240912T000015Z_dnsping_MY_4788_n1_odSRnmymOe...,AS4788,202.188.1.181,TM TECHNOLOGY SERVICES SDN BHD,miniooni,...,0.4.0,unknown,unknown,linux,,"[{'query': {'answers': [{'asn': 13335, 'as_org...",104.18.37.200,https://explorer.ooni.org/measurement/20240912...,False,Google


In [66]:
df_1 = df.drop('pings', axis=1)
df_1.to_csv('ooni-data-dnsping-2024-1.csv')