# Experiment Mesh Analysis


## Imports and constants declarations

In [1]:
# external imports
import pandas as pd
from ast import literal_eval
from shapely import (
    from_geojson
)
import ipaddress
import os
import json
import requests
from dotenv import load_dotenv

In [2]:
# internal imports
from src.utils.common_functions import (
    json_file_to_dict,
    get_list_files_in_path
)
from src.utils.constants import (
    EEE_COUNTRIES_FILEPATH,
    REPLICATION_PACKAGE_DIR,
    PARTIAL_RESULTS_DIR,
    TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH,
    ANYCAST_PII_TRAFFIC_LOGS_FILEPATH,
    APKS_METADATA_FILEPATH,
    RESULTS_MODES,
    IT_ANNOTATION_FILEPATH,
    TPLS_RESULTS_FILEPATH,
    ANYCAST_IP_CLASSIFICATION_FILEPATH,
    TPLS_MANUAL_POLICY_INFO,
    TPLS_POLICY_ANALYSIS
)

In [3]:
# Load the environment variables (e.g. CACHE_IP_URL) from the project's .env file
load_dotenv("./src/utils/.env")
# Base URL used by get_ip_details_via_cache. os.getenv returns None when the
# variable is unset, so `or ""` maps both "unset" and "empty" to "". Wrapping
# the value in str() first would turn None into the truthy string "None".
IP_URL = os.getenv("CACHE_IP_URL") or ""
# Constants
# Alpha-2 codes of every country listed in EEE_COUNTRIES_FILEPATH
EEE_countries_set = {country["alpha-2"] for country in json_file_to_dict(EEE_COUNTRIES_FILEPATH)}

In [4]:
# Analysis params
# Results mode to analyse; it is embedded in every folder and file name built below
ANALYSIS_MODE=RESULTS_MODES[1]
# When True, rebuild the raw routes CSV from the experiment result files
GENERATE_ROUTES_RAW = False
# When True, enrich/flag the routes CSV and regenerate the frequency CSVs
GENERATE_ROUTES_ADDONS = True
# When True, populate the traffic-log datasets and save the analysis CSVs
ANALYZE_DATA = True

In [5]:
# Filepaths variables
# Input folder (experiment result files) and output folder (analysis artefacts)
# for the selected ANALYSIS_MODE
EXPERIMENT_RESULTS_FOLDER = f"{REPLICATION_PACKAGE_DIR}/experiment_results_{ANALYSIS_MODE}"
ANALYSIS_FOLDER = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}"

# Per-probe routes: complete file plus its non-suspicious / suspicious splits
ROUTES_RESULTS_FILENAME = f"{ANALYSIS_FOLDER}/routes_results_{ANALYSIS_MODE}.csv"
ROUTES_RESULTS_NON_SUSPICIOUS_FILENAME = f"{ANALYSIS_FOLDER}/routes_results_non_suspicious_{ANALYSIS_MODE}.csv"
ROUTES_RESULTS_SUSPICIOUS_FILENAME = f"{ANALYSIS_FOLDER}/routes_results_suspicious_{ANALYSIS_MODE}.csv"
# Route frequencies (one row per target / origin country / result country)
ROUTES_FREQUENCY_FILENAME = f"{ANALYSIS_FOLDER}/routes_frequency_{ANALYSIS_MODE}.csv"
ROUTES_FREQUENCY_NON_SUSPICIOUS_FILENAME = f"{ANALYSIS_FOLDER}/routes_frequency_non_suspicious_{ANALYSIS_MODE}.csv"
ROUTES_FREQUENCY_SUSPICIOUS_FILENAME = f"{ANALYSIS_FOLDER}/routes_frequency_suspicious_{ANALYSIS_MODE}.csv"

# Enriched traffic-log datasets written by the population step
ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH = f"{ANALYSIS_FOLDER}/Anycast_PII_Traffic_Logs_{ANALYSIS_MODE}.csv"
TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH = f"{ANALYSIS_FOLDER}/Traffic_logs_10K_ip_classified_{ANALYSIS_MODE}.csv"

# Host-level aggregations of the datasets above (target files of aggregate_by_domain)
ANYCAST_PII_HOST_AGGREGATION_FILEPATH = f"{ANALYSIS_FOLDER}/Anycast_PII_host_aggregation_{ANALYSIS_MODE}.csv"
TRAFFIC_LOGS_IP_CLASSIFIED_HOST_AGGREGATION_FILEPATH = f"{ANALYSIS_FOLDER}/Traffic_logs_10K_ip_classified_host_aggregation_{ANALYSIS_MODE}.csv"


## Extractions of data from results

The first operation is to extract the routes and relevant info from every hunter execution and obtain a complete dataset with the result to make the analysis.

Auxiliary functions, kept in case they are needed

In [6]:
def populate_dataframe_with_ip_classification(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Copy the stored classification of every known IP into ``ip_anycast``.

    The mapping (IP -> classification) is read from
    ANYCAST_IP_CLASSIFICATION_FILEPATH. Only the rows whose ``ip_dest`` equals
    one of the mapped IPs are written; the frame is modified in place and also
    returned.
    """
    classification_by_ip = json_file_to_dict(ANYCAST_IP_CLASSIFICATION_FILEPATH)

    for known_ip, classification in classification_by_ip.items():
        rows_for_ip = dataframe["ip_dest"] == known_ip
        dataframe.loc[rows_for_ip, "ip_anycast"] = classification

    return dataframe

Generate a file with all the routes obtained from the execution of the experiment

In [7]:
# Auxiliary functions
def get_probe_location(probe_id: int, origin_list: []) -> (float, float):
    """Return the (latitude, longitude) of the probe with the given id.

    The first entry of ``origin_list`` whose ``"probe_id"`` equals ``probe_id``
    is used; its ``"location"`` is parsed with ``from_geojson`` and the ``y``
    and ``x`` attributes of the parsed geometry are returned, in that order.
    When no entry matches, ``(0, 0)`` is returned.
    """
    matching_origin = next(
        (candidate for candidate in origin_list if probe_id == candidate["probe_id"]),
        None,
    )
    if matching_origin is None:
        return 0, 0

    point = from_geojson(matching_origin["location"])
    return point.y, point.x

def get_result_country_route(hunter_result: dict) -> dict:
    """Summarise one hunter result as origin probe, origin country and result country.

    Reads ``origin_id``, ``location_result.country`` and ``origin_country_code``
    from ``hunter_result`` and returns them under the keys ``origin_id``,
    ``result_country`` and ``origin_country`` respectively.
    """
    origin_id = hunter_result["origin_id"]
    destination_country = hunter_result["location_result"]["country"]
    origin_country = hunter_result["origin_country_code"]

    return dict(
        origin_id=origin_id,
        origin_country=origin_country,
        result_country=destination_country,
    )

# (latitude, longitude) of the capital of each supported country, keyed by
# alpha-2 country code. Built once instead of on every call.
_CAPITAL_COORDS_BY_COUNTRY = {
    "DE": (52.52437, 13.41053),
    "BE": (50.85045, 4.34878),
    "HR": (45.81444, 15.97798),
    "DK": (55.67594, 12.56553),
    "ES": (40.4165, -3.70256),
    "FR": (48.85341, 2.3488),
    "IE": (53.33306, -6.24889),
    "LV": (56.946, 24.10589),
    "LU": (49.61167, 6.13),
    "NL": (52.37403, 4.88969),
    "BG": (42.69751, 23.32415),
    "SK": (48.14816, 17.10674),
    "SI": (46.05108, 14.50513),
    "EE": (59.43696, 24.75353),
    "GR": (37.98376, 23.72784),
    "MT": (35.89968, 14.5148),
    "PL": (52.22977, 21.01178),
    "CZ": (50.08804, 14.42076),
    "AT": (48.20849, 16.37208),
    "CY": (35.17531, 33.3642),
    "FI": (60.16952, 24.93545),
    "HU": (47.49835, 19.04045),
    "IT": (41.89193, 12.51133),
    "LT": (54.68916, 25.2798),
    "PT": (38.71667, -9.13333),
    "RO": (44.43225, 26.10626),
    "IS": (64.13548, -21.89541),
    "LI": (47.166, 9.555373),
    "NO": (59.91273, 10.74609),
    "SE": (59.32938, 18.06871),
}


def get_country_capital_coords(country_code: str) -> (float, float):
    """Return the (latitude, longitude) of the capital of ``country_code``.

    Args:
        country_code: alpha-2 country code (e.g. ``"ES"``).

    Returns:
        The coordinates stored for the country, or ``(0, 0)`` when the code is
        not in the table (including non-hashable or missing values).
    """
    try:
        return _CAPITAL_COORDS_BY_COUNTRY[country_code]
    except (KeyError, TypeError):
        # KeyError: unknown code; TypeError: unhashable value passed as code.
        return 0, 0

def get_traceroute_routes(traceroute_measurement: dict) -> dict:
    """Build, for every probe, the list of responding addresses at each hop.

    Args:
        traceroute_measurement: iterable of traceroute results; each one needs
            a ``"prb_id"`` and a ``"result"`` list of hops.

    Returns:
        A dict mapping probe id to a list with one entry per hop. Each entry is
        the list of distinct ``"from"`` addresses reported for that hop, in
        first-seen order, or ``[]`` when the hop carries no usable replies.
    """
    traceroute_routes = {}
    for traceroute in traceroute_measurement:
        hops = []
        for hop in traceroute["result"]:
            try:
                # dict.fromkeys removes duplicates while keeping the order of
                # appearance, so the output is deterministic between runs
                # (a set would return the addresses in arbitrary order).
                hop_directions = list(dict.fromkeys(
                    reply["from"]
                    for reply in hop["result"]
                    if "from" in reply
                ))
            except (KeyError, TypeError, AttributeError):
                # Hop without a "result" list or with malformed replies:
                # keep the hop position but record no addresses for it.
                hop_directions = []
            hops.append(hop_directions)
        traceroute_routes[traceroute["prb_id"]] = hops

    return traceroute_routes

# Data generation function
def generate_routes_results_raw():
    """Collect every hunter result of the experiment into the raw routes CSV.

    Each file of EXPERIMENT_RESULTS_FOLDER is read and one row is produced per
    entry of its ``"hunter_results"``: target, probe, previous IPs, traceroute
    route, origin/capital/result coordinates, result country, source file name
    and whether the result country lies outside the EEE. The rows are sorted by
    target, origin country and result country and written to
    ROUTES_RESULTS_FILENAME.
    """
    columns = [
        "target", "probe_id", "ips_previous_to_target", "route",
        "origin_country", "origin_latitude", "origin_longitude",
        "capital_origin_latitude", "capital_origin_longitude",
        "result_country", "result_latitude", "result_longitude",
        "result_filename", "outside_EEE"
    ]
    # Rows are accumulated in a plain list and turned into a DataFrame once:
    # concatenating a DataFrame per row copies the whole frame on every
    # iteration, which is quadratic in the number of rows.
    rows = []

    for result_filename in get_list_files_in_path(EXPERIMENT_RESULTS_FOLDER):
        print(result_filename)
        result = json_file_to_dict(f"{EXPERIMENT_RESULTS_FOLDER}/{result_filename}")
        target = result["target"]
        origin_list = result["measurements"]["origin"]

        ripe_measurement_results = result["measurements"]["ripe_measurement_results"]
        if "traceroute" in ripe_measurement_results:
            routes_traceroute_by_probe_id = get_traceroute_routes(ripe_measurement_results["traceroute"])
        else:
            # No traceroute in this file: every probe gets an empty route
            routes_traceroute_by_probe_id = {}

        for hunter_result in result["hunter_results"]:
            route = get_result_country_route(hunter_result)
            probe_id = hunter_result["origin_id"]
            origin_country = route["origin_country"]
            origin_latitude, origin_longitude = get_probe_location(probe_id, origin_list)
            capital_origin_latitude, capital_origin_longitude = get_country_capital_coords(origin_country)

            result_country = route["result_country"]
            # Coordinates are only recorded when exactly one airport is left
            # in the intersection; otherwise (0, 0) is stored.
            airports_intersection = hunter_result["location_result"]["airports_intersection"]
            if len(airports_intersection) == 1:
                result_location = from_geojson(airports_intersection[0]["location"])
                result_latitude = result_location.y
                result_longitude = result_location.x
            else:
                result_latitude = 0
                result_longitude = 0

            outside_eee = (result_country not in EEE_countries_set) and (result_country != "Indeterminate")

            if result_country == "Indeterminate":
                ips_previous_to_target = ["Indeterminate"]
            else:
                ips_previous_to_target = [
                    ip["ip"]
                    for ip in hunter_result["ips_previous_to_target"]
                ]

            # Probes without a traceroute entry get an empty route
            probe_route = routes_traceroute_by_probe_id.get(probe_id, [])

            rows.append([
                target, probe_id, str(ips_previous_to_target), str(probe_route),
                origin_country, origin_latitude, origin_longitude,
                capital_origin_latitude, capital_origin_longitude,
                result_country, result_latitude, result_longitude,
                result_filename, outside_eee
            ])

    # The newest row used to be placed first; reversing keeps that pre-sort
    # order, so rows that tie on the sort keys stay in the same relative order.
    routes_raw_df = pd.DataFrame(rows[::-1], columns=columns)

    # Sort and save
    routes_raw_df.sort_values(by=["target", "origin_country", "result_country"], inplace=True)
    routes_raw_df.to_csv(ROUTES_RESULTS_FILENAME, sep=",", index=False)


Once we have collected all the routes, we proceed to clean and filter the results

In [8]:
def get_probe_ip_from_route(route_str: str) -> str:
    """Return the first address of the first hop of a stringified route.

    ``route_str`` is the ``str()`` form of a list of hops (each hop a list of
    addresses) and is parsed back with ``literal_eval``. ``"0.0.0.0"`` is
    returned when the route has no hops or its first hop has no addresses.
    """
    hops = literal_eval(route_str)
    if len(hops) == 0 or len(hops[0]) == 0:
        return "0.0.0.0"
    return hops[0][0]

def get_ip_details_via_cache(ip_address: str, timeout: float = 30.0) -> dict:
    """Fetch the details of ``ip_address`` from the service at IP_URL.

    Args:
        ip_address: address appended to IP_URL as the last path segment.
        timeout: seconds to wait for the service before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive server.

    Returns:
        The decoded content of the ``"details"`` field of the JSON response
        (that field holds a JSON-encoded string, hence the second decoding).

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    url = f"{IP_URL}/{ip_address}"
    response = requests.get(url=url, timeout=timeout)
    details = json.loads(response.json()["details"])
    return details

def get_ip_country_via_cache(ip_address: str) -> str:
    """Return the country reported for ``ip_address``, or ``"bogon"``.

    The details are obtained through ``get_ip_details_via_cache``; when they
    contain a ``"bogon"`` key the literal ``"bogon"`` is returned, otherwise
    the value stored under ``"country"``.
    """
    details = get_ip_details_via_cache(ip_address)
    return "bogon" if "bogon" in details else details["country"]

def add_probe_id_ip_country_location(routes_results_df: pd.DataFrame) -> pd.DataFrame:
    """Add the probe IP and the country resolved from that IP to every route.

    ``probe_ip`` (default ``"0.0.0.0"``) and ``probe_country_with_ip`` (default
    ``"bogon"``) are reset when present, or inserted at column positions 2 and 3
    otherwise. The probe IP is taken from the first hop of the ``route`` column;
    the country is looked up once per distinct IP, skipping ``"0.0.0.0"`` and
    private addresses. The frame is modified in place and returned.
    """
    column_defaults = (("probe_ip", "0.0.0.0"), ("probe_country_with_ip", "bogon"))
    for position, (column, default_value) in enumerate(column_defaults, start=2):
        if column in routes_results_df.columns:
            routes_results_df[column] = default_value
        else:
            routes_results_df.insert(position, column, default_value)

    # The probe IP is the first address seen in the traceroute
    routes_results_df["probe_ip"] = routes_results_df["route"].apply(get_probe_ip_from_route)

    for probe_ip in routes_results_df["probe_ip"].unique():
        if probe_ip == "0.0.0.0" or ipaddress.ip_address(probe_ip).is_private:
            continue
        rows_with_ip = routes_results_df["probe_ip"] == probe_ip
        routes_results_df.loc[rows_with_ip, "probe_country_with_ip"] = get_ip_country_via_cache(probe_ip)

    return routes_results_df

def mark_suspicious_routes_results(routes_results_df: pd.DataFrame) -> pd.DataFrame:
    """Flag the routes that do not pass the validation criteria.

    A ``probes_count`` column (number of probes per target / result country) is
    attached, replacing any previous one, and ``suspicious`` is set to True when:

    * fewer than 2 probes obtained that result country for the target and the
      result country is not ``"Indeterminate"``; or
    * the country resolved from the probe IP differs from the origin country
      and is not ``"bogon"``.

    Returns the merged frame (a new object).
    """
    route_keys = ["target", "result_country"]
    routes_results_df["suspicious"] = False

    # How many probes produced each (target, result country) pair
    probes_per_route = (
        routes_results_df.groupby(route_keys)["probe_id"]
        .count()
        .rename("probes_count")
        .reset_index()
    )

    if "probes_count" in routes_results_df.columns:
        routes_results_df.drop(columns="probes_count", inplace=True)

    flagged_df = routes_results_df.merge(probes_per_route, on=route_keys, how="left")

    # Validation criteria
    seen_by_single_probe = (
        (flagged_df["probes_count"] < 2)
        & (flagged_df["result_country"] != "Indeterminate")
    )
    probe_ip_in_other_country = (
        (flagged_df["probe_country_with_ip"] != flagged_df["origin_country"])
        & (flagged_df["probe_country_with_ip"] != "bogon")
    )
    flagged_df.loc[seen_by_single_probe | probe_ip_in_other_country, "suspicious"] = True

    return flagged_df
    
def clean_routes_results():
    """Enrich the raw routes file and split it by the ``suspicious`` flag.

    ROUTES_RESULTS_FILENAME is read, completed with the probe IP/country and
    the suspicious flag, and written back. The non-suspicious and suspicious
    rows are additionally saved to their own files.
    """
    routes_df = pd.read_csv(ROUTES_RESULTS_FILENAME, sep=",")
    routes_df = mark_suspicious_routes_results(
        add_probe_id_ip_country_location(routes_df)
    )

    is_suspicious = routes_df["suspicious"] == True
    outputs = (
        (routes_df, ROUTES_RESULTS_FILENAME),
        (routes_df.loc[~is_suspicious], ROUTES_RESULTS_NON_SUSPICIOUS_FILENAME),
        (routes_df.loc[is_suspicious], ROUTES_RESULTS_SUSPICIOUS_FILENAME),
    )
    for frame, destination in outputs:
        frame.to_csv(destination, sep=",", index=False)
    

Aggregate the routes and count their repetitions

In [9]:
def generate_routes_frequency_aggregation(routes_results_file: str, routes_frequency_file: str):
    """Count how often each target / origin country / result country route occurs.

    Reads ``routes_results_file``, writes one row per distinct route with its
    ``count`` to ``routes_frequency_file`` and adds ``outside_EEE``, which is
    True when the result country is neither ``"Indeterminate"`` nor part of
    EEE_countries_set.
    """
    route_columns = ['target', 'origin_country', 'result_country']

    routes_df = pd.read_csv(routes_results_file, sep=",")
    frequency_df = (
        routes_df.value_counts(subset=route_columns)
        .rename_axis(route_columns)
        .reset_index(name="count")
    )

    destination_is_known = frequency_df["result_country"] != "Indeterminate"
    destination_in_eee = frequency_df["result_country"].isin(EEE_countries_set)
    frequency_df["outside_EEE"] = destination_is_known & ~destination_in_eee

    frequency_df.to_csv(routes_frequency_file, sep=",", index=False)


Execute the enrichment of data and generation

In [10]:
# These steps take a long time and only need to run once, so each of them is
# guarded by its own flag.
if GENERATE_ROUTES_RAW:
    # Build the raw routes file from the experiment results
    generate_routes_results_raw()
if GENERATE_ROUTES_ADDONS:
    # Clean and enrich the routes
    clean_routes_results()
    # Generate one frequency file per routes file (all / non suspicious / suspicious)
    results_and_frequency_files = [
        (ROUTES_RESULTS_FILENAME, ROUTES_FREQUENCY_FILENAME),
        (ROUTES_RESULTS_NON_SUSPICIOUS_FILENAME, ROUTES_FREQUENCY_NON_SUSPICIOUS_FILENAME),
        (ROUTES_RESULTS_SUSPICIOUS_FILENAME, ROUTES_FREQUENCY_SUSPICIOUS_FILENAME),
    ]
    for results_file, frequency_file in results_and_frequency_files:
        generate_routes_frequency_aggregation(results_file, frequency_file)

## Populate traffic logs

The next phase populates and filters the traffic logs datasets so that all the information needed to draw the conclusions is available in a single place.

Add the routes that end outside the EEE, and their counts, to the complete dataset

In [11]:
# Dataset population function
def populate_dataset_with_routes_results(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Attach to every traffic log the non-suspicious routes that leave the EEE.

    For each target IP of ROUTES_FREQUENCY_NON_SUSPICIOUS_FILENAME with
    ``outside_EEE`` routes, the rows whose ``ip_dest`` equals that IP receive
    the stringified lists of origin countries, result countries and counts,
    and ``outside_EEE`` is set to True. All other rows keep the defaults
    (``"[]"`` and False). The frame is modified in place and returned.
    """
    transfer_columns = [
        "origins_transfers_outside_EEE",
        "destinations_transfers_outside_EEE",
        "frequency_transfers_outside_EEE",
        "outside_EEE",
    ]

    frequency_df = pd.read_csv(ROUTES_FREQUENCY_NON_SUSPICIOUS_FILENAME, sep=",")
    outside_routes_df = frequency_df[frequency_df["outside_EEE"] == True]

    # Defaults for the rows whose destination IP has no route outside the EEE
    for list_column in transfer_columns[:3]:
        dataframe[list_column] = "[]"
    dataframe["outside_EEE"] = False

    # One update per target IP: origins, destinations and frequencies of its routes
    for target_ip in outside_routes_df["target"].unique().tolist():
        routes_of_ip = outside_routes_df[outside_routes_df["target"] == target_ip]
        dataframe.loc[dataframe["ip_dest"] == target_ip, transfer_columns] = [
            str(routes_of_ip["origin_country"].values.tolist()),
            str(routes_of_ip["result_country"].values.tolist()),
            str(routes_of_ip["count"].values.tolist()),
            True,
        ]

    return dataframe


Populate the datasets with metadata info

In [12]:
def populate_dataset_with_apks_metadata(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Left-join the APK metadata (APKS_METADATA_FILEPATH) on ``apk`` and ``version``."""
    metadata_df = pd.read_csv(APKS_METADATA_FILEPATH, sep=",")
    return pd.merge(dataframe, metadata_df, on=["apk", "version"], how="left")


Populate the datasets with the info extracted from every policy

In [13]:
def populate_dataset_with_policy_extracted_info(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Left-join the privacy-policy annotations on ``apk`` and ``version``.

    The annotations (IT_ANNOTATION_FILEPATH) are de-duplicated on
    ``apk``/``countries`` and their ``transfer``, ``adequacy_decision`` and
    ``countries`` columns are renamed to the ``*_by_policy`` names before the
    join. Rows without annotation get False / False / ``"[]"``.
    """
    renamed_columns = {
        "transfer": "it_mentioned_by_policy",
        "adequacy_decision": "adequacy_decision_by_policy",
        "countries": "countries_mentioned_by_policy",
    }
    policy_df = (
        pd.read_csv(IT_ANNOTATION_FILEPATH, sep=",")
        .drop_duplicates(["apk", "countries"])
        .rename(columns=renamed_columns)
    )

    policy_columns = [
        "apk", "version",
        "it_mentioned_by_policy", "adequacy_decision_by_policy", "countries_mentioned_by_policy",
    ]
    enriched_df = dataframe.merge(policy_df[policy_columns], on=["apk", "version"], how="left")

    # Defaults for the rows that found no policy annotation
    return enriched_df.fillna(
        value={
            "it_mentioned_by_policy": False,
            "adequacy_decision_by_policy": False,
            "countries_mentioned_by_policy": "[]",
        }
    )
    

Populate the datasets with the info about the libraries which carried out the communication 

In [14]:
def populate_dataset_with_libraries_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Left-join the library results (TPLS_RESULTS_FILEPATH) onto the dataset.

    The ``stackTrace`` column of ``dataframe`` is dropped in place and replaced
    by the one of the library results, which also provide ``TP-performed``,
    ``TP-library`` and ``FP-intended``. Missing values of those three columns
    in the library file are filled with False / ``"None"`` / False before the
    join on apk, version, source port, host, destination port and destination IP.
    """
    join_keys = ["apk", "version", "port_source", "host", "port_dest", "ip_dest"]
    library_columns = [
        "apk", "stackTrace", "version", "port_source", "host", "port_dest", "ip_dest",
        "TP-performed", "TP-library", "FP-intended",
    ]

    libraries_df = pd.read_csv(TPLS_RESULTS_FILEPATH, sep=",")
    libraries_df = libraries_df.fillna(
        {
            "TP-performed": False,
            "TP-library": "None",
            "FP-intended": False,
        }
    )

    # The stack trace comes from the libraries file, so the current one is discarded
    dataframe.drop(columns="stackTrace", inplace=True)

    return dataframe.merge(libraries_df[library_columns], on=join_keys, how="left")

Check GDPR compliance in terms of international transfers

In [15]:
def check_apk_it_gdpr_compliance(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Set ``apk_it_gdpr_compliance`` for every row of the dataset.

    A row is compliant when every country of its stringified
    ``destinations_transfers_outside_EEE`` list also appears in its stringified
    ``countries_mentioned_by_policy`` list. The frame is modified in place and
    returned.
    """
    def destinations_are_declared(row) -> bool:
        observed_countries = set(literal_eval(row["destinations_transfers_outside_EEE"]))
        declared_countries = set(literal_eval(row["countries_mentioned_by_policy"]))
        return observed_countries.issubset(declared_countries)

    dataframe["apk_it_gdpr_compliance"] = True
    dataframe["apk_it_gdpr_compliance"] = dataframe.apply(destinations_are_declared, axis=1)

    return dataframe
        

Aggregation of apk and destination IP by domain or host

In [16]:
def aggregate_by_domain(dataframe: pd.DataFrame, filepath: str):
    """Save the number of traffic logs per host, destination IP and APK.

    The result has the columns ``host``, ``ip_dest``, ``apk`` and
    ``traffic_logs``, is sorted by host, APK and destination IP, and is written
    to ``filepath`` as CSV.
    """
    (
        dataframe.groupby(["host", "ip_dest", "apk"])
        .size()
        .reset_index(name="traffic_logs")
        .sort_values(by=["host", "apk", "ip_dest"])
        .to_csv(filepath, sep=",", index=False)
    )

Make the analysis

In [17]:
# Load the dataframes to be used
traffic_logs_ip_classified_analysis_df = pd.read_csv(
    TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH, sep=","
)
anycast_pii_traffic_logs_df = pd.read_csv(
    ANYCAST_PII_TRAFFIC_LOGS_FILEPATH, sep=","
)

# The three lists are index-aligned: position i of each list belongs to the same dataset
datasets = [
    traffic_logs_ip_classified_analysis_df,
    anycast_pii_traffic_logs_df,
]
analysis_filepaths = [
    TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH,
    ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH,
]
aggregation_filepaths = [
    TRAFFIC_LOGS_IP_CLASSIFIED_HOST_AGGREGATION_FILEPATH,
    ANYCAST_PII_HOST_AGGREGATION_FILEPATH,
]

In [18]:
# Populate datasets
if ANALYZE_DATA:
    # Enrichment steps, applied in this order; each one prints its message first
    population_steps = (
        ("Populate with metadata", populate_dataset_with_apks_metadata),
        ("Populate with the privacy policy extracted data", populate_dataset_with_policy_extracted_info),
        ("Populate with routes", populate_dataset_with_routes_results),
        ("Populate with the libraries data", populate_dataset_with_libraries_data),
        ("Checking IT declarations accomplishment", check_apk_it_gdpr_compliance),
    )
    # Replacement values for the missing data of each column
    nan_replacements = {
        "loadsJNI": False,
        "stackTrace": "None",
        "remote_host": "None",
        "tls": False,
        "https": False,
        "error": "None",
        "TP-performed": False,
        "TP-library": "None",
        "FP-intended": False,
    }

    for index, dataset_to_improve in enumerate(datasets):
        for step_message, populate_step in population_steps:
            print(step_message)
            dataset_to_improve = populate_step(dataset_to_improve)

        print("Setting types and NaN")
        dataset_to_improve.fillna(nan_replacements, inplace=True)

        print("Saving data")
        dataset_to_improve.to_csv(analysis_filepaths[index], sep=",", index=False)

        # Generate the aggregations for better understanding
        # aggregate_by_domain(dataframe=dataset_to_improve, filepath=aggregation_filepaths[index])
    

Populate with metadata
Populate with the privacy policy extracted data


  dataframe.fillna(


Populate with routes
Populate with the libraries data


  tpls_results_df.fillna(


Checking IT declarations accomplishment
Setting types and NaN


  dataset_to_improve.fillna(


Saving data
Populate with metadata
Populate with the privacy policy extracted data


  dataframe.fillna(


Populate with routes
Populate with the libraries data


  tpls_results_df.fillna(


Checking IT declarations accomplishment
Setting types and NaN
Saving data


  dataset_to_improve.fillna(


## Analysis questions

Answers to the questions needed for the article

Acronyms:
- PII = Personally Identifiable Information

In [19]:
# Data load
# Read back the enriched datasets that the population step saved to the
# analysis file paths.
# NOTE(review): the cell output below echoes both read_csv lines, which looks
# like a pandas warning raised while parsing (possibly mixed column types) -
# confirm and consider passing explicit dtypes.
traffic_logs_ip_classified_analysis_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH, sep=",")
anycast_pii_traffic_logs_analysis_df = pd.read_csv(ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH, sep=",")

  traffic_logs_ip_classified_analysis_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH, sep=",")
  anycast_pii_traffic_logs_analysis_df = pd.read_csv(ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH, sep=",")


Traffic numbers

In [20]:
# Every row of a dataset is one intercepted connection
all_connections = traffic_logs_ip_classified_analysis_df.shape[0]
anycast_connections = anycast_pii_traffic_logs_analysis_df.shape[0]

print(f"Number of connections intercepted: {all_connections}")
print(f"Number of anycast connections intercepted: {anycast_connections}")

Number of connections intercepted: 4278823
Number of anycast connections intercepted: 195786


**IPs analysis**

In [45]:
# Row filters shared by the counts below
pii_rows = (
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII")
    & (traffic_logs_ip_classified_analysis_df["PII"].notna())
)
anycast_rows = traffic_logs_ip_classified_analysis_df["ip_anycast"]
it_rows = traffic_logs_ip_classified_analysis_df["outside_EEE"] == True

# Distinct destination IPs: all of them, then narrowed down step by step
ips_total = traffic_logs_ip_classified_analysis_df["ip_dest"].unique().tolist()
print(f"Number of IPs in traffic logs: {len(ips_total)}")

ips_total_pii = traffic_logs_ip_classified_analysis_df.loc[pii_rows, "ip_dest"].unique().tolist()
print(f"Number of IPs with PII: {len(ips_total_pii)}")

ips_anycast = traffic_logs_ip_classified_analysis_df.loc[anycast_rows, "ip_dest"].unique().tolist()
print(f"Number of IPs anycast: {len(ips_anycast)}")

ips_anycast_pii = traffic_logs_ip_classified_analysis_df.loc[
    anycast_rows & pii_rows, "ip_dest"
].unique().tolist()
print(f"Number of IPs anycast with PII: {len(ips_anycast_pii)}")

# NOTE: the same variable is reused, so from here on it only holds the IPs
# that also make international transfers (outside_EEE)
ips_anycast_pii = traffic_logs_ip_classified_analysis_df.loc[
    anycast_rows & pii_rows & it_rows, "ip_dest"
].unique().tolist()
print(f"Number of IPs anycast with PII that make IT: {len(ips_anycast_pii)}")

Number of IPs in traffic logs: 5647
Number of IPs with PII: 1807
Number of IPs anycast: 991
Number of IPs anycast with PII: 200
Number of IPs anycast with PII that make IT: 195


Check the case of IPs with only one destination country detected

In [67]:
routes_frequency_non_suspicious_df = pd.read_csv(ROUTES_FREQUENCY_NON_SUSPICIOUS_FILENAME, sep=",")

# Number of distinct result countries per target IP, ignoring "Indeterminate" results
determinate_routes_df = routes_frequency_non_suspicious_df[
    routes_frequency_non_suspicious_df["result_country"] != "Indeterminate"
]
ip_destinations_count_df = (
    determinate_routes_df
    .groupby(["target"], as_index=False)["result_country"]
    .nunique()
    .rename(columns={"result_country": "result_countries_count"})
    .sort_values(by="result_countries_count", ascending=True)
)

ip_destinations_count_df

Unnamed: 0,target,result_countries_count
62,13.107.246.43,1
136,204.79.197.200,1
61,13.107.21.200,2
76,170.33.12.222,4
120,185.151.204.40,4
...,...,...
25,104.19.255.161,30
6,104.17.73.8,30
3,104.17.209.240,30
14,104.18.39.116,30


In [68]:
# Target IPs for which at most one result country was detected
single_country_rows = ip_destinations_count_df["result_countries_count"] <= 1
ip_one_destination_country_list = ip_destinations_count_df.loc[
    single_country_rows, "target"
].unique().tolist()

ip_one_destination_country_list

['13.107.246.43', '204.79.197.200']

Repeat the previous analysis, but only for the IPs that reached a specific destination country at least once

In [79]:
result_country_to_filter = "US"

determinate_route_rows = routes_frequency_non_suspicious_df["result_country"] != "Indeterminate"
filter_country_rows = routes_frequency_non_suspicious_df["result_country"] == result_country_to_filter

# Target IPs with at least one route that ends in the selected country
targets_reaching_filter_country = routes_frequency_non_suspicious_df.loc[
    determinate_route_rows & filter_country_rows, "target"
].unique()

# Number of distinct result countries for those targets only
ip_destinations_count_in_country_filter_df = (
    routes_frequency_non_suspicious_df.loc[
        determinate_route_rows
        & routes_frequency_non_suspicious_df["target"].isin(targets_reaching_filter_country)
    ]
    .groupby(["target"], as_index=False)["result_country"]
    .nunique()
    .rename(columns={"result_country": "result_countries_count"})
    .sort_values(by="result_countries_count", ascending=True)
)

ip_destinations_count_in_country_filter_df

Unnamed: 0,target,result_countries_count
30,185.151.204.51,4
25,185.151.204.40,4
22,185.151.204.30,5
21,185.151.204.14,5
32,185.151.204.9,5
29,185.151.204.50,5
28,185.151.204.43,5
20,185.151.204.12,5
19,185.151.204.11,5
27,185.151.204.42,5


**APKS analysis**

In [23]:
# Row filters shared by the counts below
pii_rows = (
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII")
    & (traffic_logs_ip_classified_analysis_df["PII"].notna())
)
anycast_rows = traffic_logs_ip_classified_analysis_df["ip_anycast"]

# Distinct APKs: all, with PII, using anycast IPs, and both conditions at once
apks_total = traffic_logs_ip_classified_analysis_df["apk"].unique().tolist()
print(f"Number of APKs in traffic logs: {len(apks_total)}")

apks_total_pii = traffic_logs_ip_classified_analysis_df.loc[pii_rows, "apk"].unique().tolist()
print(f"Number of APKs with PII: {len(apks_total_pii)}")

apks_anycast = traffic_logs_ip_classified_analysis_df.loc[anycast_rows, "apk"].unique().tolist()
print(f"Number of APKs using anycast: {len(apks_anycast)}")

apks_anycast_pii = traffic_logs_ip_classified_analysis_df.loc[
    anycast_rows & pii_rows, "apk"
].unique().tolist()
print(f"Number of APKs anycast with PII: {len(apks_anycast_pii)}")

Number of APKs in traffic logs: 5759
Number of APKs with PII: 3478
Number of APKs using anycast: 1669
Number of APKs anycast with PII: 960


**Hosts Analysis**

In [24]:
# Row filters shared by the counts below
pii_rows = (
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII")
    & (traffic_logs_ip_classified_analysis_df["PII"].notna())
)
anycast_rows = traffic_logs_ip_classified_analysis_df["ip_anycast"]

# Distinct hosts: all, with PII, using anycast IPs, and both conditions at once
hosts_total = traffic_logs_ip_classified_analysis_df["host"].unique().tolist()
print(f"Number of hosts in traffic logs: {len(hosts_total)}")

hosts_total_pii = traffic_logs_ip_classified_analysis_df.loc[pii_rows, "host"].unique().tolist()
print(f"Number of hosts with PII: {len(hosts_total_pii)}")

hosts_anycast = traffic_logs_ip_classified_analysis_df.loc[anycast_rows, "host"].unique().tolist()
print(f"Number of hosts using anycast: {len(hosts_anycast)}")

hosts_anycast_pii = traffic_logs_ip_classified_analysis_df.loc[
    anycast_rows & pii_rows, "host"
].unique().tolist()
print(f"Number of hosts anycast with PII: {len(hosts_anycast_pii)}")

Number of hosts in traffic logs: 4738
Number of hosts with PII: 966
Number of hosts using anycast: 995
Number of hosts anycast with PII: 201


**Data Types**  

In [25]:
# Distinct PII values found in the anycast dataset
pii_data_types_anycast = anycast_pii_traffic_logs_analysis_df["PII"].unique().tolist()
print("Types of PII data treated by anycast IPs:")
print(pii_data_types_anycast)

# Same list restricted to the rows flagged as international transfer (outside_EEE)
it_rows = anycast_pii_traffic_logs_analysis_df["outside_EEE"]
pii_data_types_anycast_pii_it = anycast_pii_traffic_logs_analysis_df.loc[
    it_rows, "PII"
].unique().tolist()
print("Types of PII data treated by anycast IPs that make IT:")
print(pii_data_types_anycast_pii_it)

Types of PII data treated by anycast IPs:
['Device_Model', 'Google_Ad_ID', 'Build_No', 'Kernel_version', 'Fingerprint', 'Router_Wifi_BSSID', 'Router_Wifi_BSSID_Close', 'Router_Wifi_MAC', 'Device_location', 'Device_location_coarse']
Types of PII data treated by anycast IPs that make IT:
['Device_Model', 'Google_Ad_ID', 'Build_No', 'Kernel_version', 'Fingerprint', 'Router_Wifi_BSSID', 'Router_Wifi_BSSID_Close', 'Router_Wifi_MAC', 'Device_location', 'Device_location_coarse']


**GDPR Compliance**

In [26]:
# Row filters for the combined count at the end
declares_it_rows = anycast_pii_traffic_logs_analysis_df["it_mentioned_by_policy"] == True
not_compliant_rows = anycast_pii_traffic_logs_analysis_df["apk_it_gdpr_compliance"] == False

# APKs whose policy mentions international transfers (column used directly as mask)
apks_anycast_pii_declare_it = anycast_pii_traffic_logs_analysis_df.loc[
    anycast_pii_traffic_logs_analysis_df["it_mentioned_by_policy"], "apk"
].unique().tolist()
print(f"Number of APKs that use anycast IPs and treat PII that declare IT in privacy policy: {len(apks_anycast_pii_declare_it)}")

# APKs with at least one non-compliant row
apks_anycast_pii_not_compliance = anycast_pii_traffic_logs_analysis_df.loc[
    not_compliant_rows, "apk"
].unique().tolist()
print(f"Number of APKs that use anycast IPs and treat PII that has not compliance: {len(apks_anycast_pii_not_compliance)}")

# APKs that declare transfers in their policy and still have non-compliant rows
apks_anycast_pii_not_compliance_declare_it = anycast_pii_traffic_logs_analysis_df.loc[
    declares_it_rows & not_compliant_rows, "apk"
].unique().tolist()
print(f"Number of APKs that use anycast IPs and treat PII that has not compliance and declare IT in privacy policy: {len(apks_anycast_pii_not_compliance_declare_it)}")


Number of APKs that use anycast IPs and treat PII that declare IT in privacy policy: 196
Number of APKs that use anycast IPs and treat PII that has not compliance: 947
Number of APKs that use anycast IPs and treat PII that has not compliance and declare IT in privacy policy: 189


**TPLs**

In [27]:
# Row filters shared by the counts below
it_rows = anycast_pii_traffic_logs_analysis_df["outside_EEE"] == True
fp_intended_rows = anycast_pii_traffic_logs_analysis_df["FP-intended"] == True
tp_performed_rows = anycast_pii_traffic_logs_analysis_df["TP-performed"] == True

# Every count is the number of rows (connections) left after filtering
anycast_pii_connections = len(anycast_pii_traffic_logs_analysis_df)
print(f"Number of connections Anycast+PII: {anycast_pii_connections}")

anycast_pii_it_connections = len(anycast_pii_traffic_logs_analysis_df[it_rows])
print(f"Number of connections Anycast+PII+IT: {anycast_pii_it_connections}")

anycast_pii_it_connections_by_app = len(
    anycast_pii_traffic_logs_analysis_df[it_rows & fp_intended_rows]
)
print(f"Number of connections Anycast+PII+IT set up by APP: {anycast_pii_it_connections_by_app}")

anycast_pii_it_connections_by_tpl = len(
    anycast_pii_traffic_logs_analysis_df[it_rows & tp_performed_rows]
)
print(f"Number of connections Anycast+PII+IT set up by TPL: {anycast_pii_it_connections_by_tpl}")

anycast_pii_it_connections_by_tpl_fp_intended = len(
    anycast_pii_traffic_logs_analysis_df[it_rows & tp_performed_rows & fp_intended_rows]
)
print(f"Number of connections Anycast+PII+IT set up by TPL but allowed from FP: {anycast_pii_it_connections_by_tpl_fp_intended}")


Number of connections Anycast+PII: 195786
Number of connections Anycast+PII+IT: 195432
Number of connections Anycast+PII+IT set up by APP: 166976
Number of connections Anycast+PII+IT set up by TPL: 61974
Number of connections Anycast+PII+IT set up by TPL but allowed from FP: 56014


In [28]:
# Distinct third-party libraries seen in rows flagged as outside the EEE.
# Rows without a library (missing value or the literal string "None") are skipped.
tpl_column = anycast_pii_traffic_logs_analysis_df["TP-library"]
row_has_tpl = tpl_column.notna() & (tpl_column != "None")
row_outside_eee = anycast_pii_traffic_logs_analysis_df["outside_EEE"] == True

tpls_anycast_pii_ip = tpl_column[row_outside_eee & row_has_tpl].unique().tolist()

print("List of TPLs Anycast+PII+IT")
print(tpls_anycast_pii_ip)
print(f"Number of TPLs Anycast+PII+IT: {len(tpls_anycast_pii_ip)}")

List of TPLs Anycast+PII+IT
['com.google', 'com.unity3d', 'com.onesignal', 'com.mob', 'com.bugsnag', 'com.newrelic.agent.android', 'com.android.volley', 'com.adjust.sdk', 'io.bidmachine', 'com.appodeal.ads', 'com.applovin', 'com.mixpanel.android', 'com.google.android.gms', 'com.kochava', 'com.adcolony', 'com.chartboost', 'com.mopub.volley', 'io.sentry', 'com.urbanairship', 'com.emarsys', 'com.leanplum', 'com.fyber', 'io.grpc', 'com.dynatrace']
Number of TPLs Anycast+PII+IT: 24


In [29]:
# Rank the third-party libraries by the number of Anycast+PII rows they appear in.
print("Ranking TPLs")
rows_per_tpl = anycast_pii_traffic_logs_analysis_df.groupby(["TP-library"], as_index=False).size()
# Kept as the cell's final expression so the notebook renders the ranking table.
rows_per_tpl.sort_values(by=["size"], ascending=False, ignore_index=True)

Ranking TPLs


Unnamed: 0,TP-library,size
0,com.unity3d,35072
1,io.bidmachine,9200
2,com.google.android.gms,4998
3,com.adjust.sdk,3470
4,com.appodeal.ads,3127
5,com.google,1980
6,com.chartboost,1068
7,com.mob,568
8,com.adcolony,430
9,com.applovin,405


TPLs' privacy policy declarations and compliance analysis

In [30]:
# Build the per-TPL policy analysis table: every distinct combination of
# library, FP-intended flag, host, destination countries and PII observed in
# rows flagged as outside the EEE (with its number of occurrences), joined with
# the manually collected policy information of each library.
tpl_policy_key_columns = ["TP-library", "FP-intended", "host", "destinations_transfers_outside_EEE", "PII"]

tpls_policy_analysis_df = anycast_pii_traffic_logs_analysis_df.loc[
    (anycast_pii_traffic_logs_analysis_df["outside_EEE"] == True) &
    (anycast_pii_traffic_logs_analysis_df["TP-library"] != "None") &
    (~anycast_pii_traffic_logs_analysis_df["TP-library"].isnull())
].value_counts(
    # `subset` already restricts the count to these columns, so no separate
    # column selection is needed beforehand.
    subset=tpl_policy_key_columns
).reset_index(
).sort_values(
    by=["TP-library", "FP-intended", "host"]
).merge(
    pd.read_csv(TPLS_MANUAL_POLICY_INFO, sep=","),
    on="TP-library",
    how="left"
)
# NOTE(review): with a left merge, a library missing from TPLS_MANUAL_POLICY_INFO
# gets NaN in the policy columns, and literal_eval / boolean indexing below would
# then raise. Presumably every observed library has a manual entry -- confirm.

# Deduplicate the destination countries of each row. The set is sorted before
# being serialized: plain set iteration order depends on string hash
# randomization, which made the exported CSV differ from run to run.
tpls_policy_analysis_df["destinations_transfers_outside_EEE"] = (
    tpls_policy_analysis_df["destinations_transfers_outside_EEE"].apply(
        lambda country_list: str(sorted(set(literal_eval(country_list)), key=str)),
    ))

# A row is compliant when every destination country reached outside the EEE is
# among the countries listed in the "countries_mentioned" policy column.
tpls_policy_analysis_df["tpl_gdpr_compliance"] = tpls_policy_analysis_df.apply(
    lambda row: set(literal_eval(row["destinations_transfers_outside_EEE"])).issubset(set(literal_eval(row["countries_mentioned"]))),
    axis=1
)

# Export only the rows whose "PII_responsible" flag is set.
tpls_policy_analysis_df.loc[
    tpls_policy_analysis_df["PII_responsible"]
].to_csv(TPLS_POLICY_ANALYSIS, sep=",", index=False)

# Partial Results

A single result used to analyze some important APK examples

In [31]:
# Distinct APKs that are flagged as not GDPR compliant and whose download
# bucket is "1B+".
flagged_not_compliant = anycast_pii_traffic_logs_analysis_df["apk_it_gdpr_compliance"] == False
in_1b_plus_bucket = anycast_pii_traffic_logs_analysis_df["android_numDownloads"] == "1B+"

downloads = (
    anycast_pii_traffic_logs_analysis_df.loc[flagged_not_compliant & in_1b_plus_bucket, "apk"]
    .unique()
    .tolist()
)
print(downloads)

['com.fingersoft.hillclimb']


In [32]:
# Export every Anycast+PII log row of the example app to its own CSV file.
one_app_results_path = f"{PARTIAL_RESULTS_DIR}/one_app_results_{ANALYSIS_MODE}.csv"
apk = anycast_pii_traffic_logs_analysis_df[
    anycast_pii_traffic_logs_analysis_df["apk"] == "com.fingersoft.hillclimb"
]
apk.to_csv(one_app_results_path, sep=",", index=False)

Generation of the same results but separated by country

In [33]:
# Split the non-suspicious route results and route frequencies into one CSV
# per origin country, keeping only the rows flagged as outside the EEE and
# ordering them by result country.
routes_results = pd.read_csv(ROUTES_RESULTS_NON_SUSPICIOUS_FILENAME, sep=",")
routes_frequencies = pd.read_csv(ROUTES_FREQUENCY_NON_SUSPICIOUS_FILENAME, sep=",")

# DataFrame.to_csv does not create missing parent directories, so make sure
# the per-mode output folder exists before writing into it.
partial_results_mode_dir = f"{PARTIAL_RESULTS_DIR}/{ANALYSIS_MODE}"
os.makedirs(partial_results_mode_dir, exist_ok=True)

for origin_country in routes_results["origin_country"].unique():
    # The same export is applied to both tables; only the file prefix differs.
    for routes_df, file_prefix in (
        (routes_results, "routes_results"),
        (routes_frequencies, "routes_frequency"),
    ):
        routes_df.loc[
            (routes_df["origin_country"] == origin_country) &
            (routes_df["outside_EEE"] == True)
        ].sort_values(
            by=["result_country"]
        ).to_csv(
            f"{partial_results_mode_dir}/{file_prefix}_{ANALYSIS_MODE}_{origin_country}.csv",
            sep=",",
            index=False,
        )

Show the suspicious results divided by IP

In [34]:
# Write one CSV per target IP listed in the suspicious routes file, containing
# the rows of `routes_results` for that target that are flagged as outside the EEE.
# NOTE(review): `routes_results` is the dataframe loaded earlier in the notebook
# from ROUTES_RESULTS_NON_SUSPICIOUS_FILENAME, while the targets come from the
# suspicious file -- confirm this mix is intended.
suspicious_targets = pd.read_csv(ROUTES_RESULTS_SUSPICIOUS_FILENAME)["target"].unique()
route_outside_eee = routes_results["outside_EEE"] == True

for ip in suspicious_targets:
    ip_results_path = f"{PARTIAL_RESULTS_DIR}/{ANALYSIS_MODE}/{ip}_results.csv"
    routes_results[
        route_outside_eee & (routes_results["target"] == ip)
    ].to_csv(ip_results_path, sep=",", index=False)

# END