# Experiment Mesh Analysis


In [1]:
# external imports
import pandas as pd
from shapely import (
    from_geojson
)

In [2]:
# internal imports
from src.utils.common_functions import (
    json_file_to_dict,
    get_list_files_in_path,
)
from src.utils.constants import (
    EEE_COUNTRIES_FILEPATH,
    REPLICATION_PACKAGE_DIR,
    TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH,
    ANYCAST_PII_TRAFFIC_LOGS_FILEPATH,
    APKS_METADATA_FILEPATH,
    RESULTS_MODES,
    IT_ANNOTATION_FILEPATH
)

In [3]:
# Constants
# "alpha-2" code of every entry in the EEE countries file; the route functions
# below test result countries for membership in this set.
EEE_countries_set = {
    country_entry["alpha-2"]
    for country_entry in json_file_to_dict(EEE_COUNTRIES_FILEPATH)
}

In [4]:
# Analysis params
# Minimum value of the "count" column a route needs to be reported in the datasets
DESTINATION_REPETITIONS_LIMIT = 1
# Results mode under analysis; it is part of the experiment and analysis folder names
ANALYSIS_MODE = RESULTS_MODES[0]
# Set to True to rebuild the routes CSV files from the experiment results
GENERATE_ROUTES_DATA = False

In [5]:
# Filepaths variables
# Input folder (experiment results) and output folder (analysis) for the selected mode
EXPERIMENT_RESULTS_FOLDER = f"{REPLICATION_PACKAGE_DIR}/experiment_results_{ANALYSIS_MODE}"
ANALYSIS_FOLDER = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}"

# Routes files written by the data generation functions
ROUTES_RESULTS_FILENAME = f"{ANALYSIS_FOLDER}/routes_results_{ANALYSIS_MODE}.csv"
ROUTES_FREQUENCY_FILENAME = f"{ANALYSIS_FOLDER}/routes_frequency_{ANALYSIS_MODE}.csv"

# Populated datasets, stored inside the analysis folder as well
ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH = (
    f"{ANALYSIS_FOLDER}/Anycast_PII_Traffic_Logs_{ANALYSIS_MODE}.csv"
)
TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH = (
    f"{ANALYSIS_FOLDER}/Traffic_logs_10K_ip_classified_{ANALYSIS_MODE}.csv"
)

## Enrichment of data and generation of datasets

### Routes resume data extraction

Generate a file with all the routes obtained as a result of the experiment execution.

In [6]:
# Auxiliary functions
def get_probe_location(probe_id: int, origin_list: list) -> tuple:
    """Return the (latitude, longitude) of the probe with the given id.

    Args:
        probe_id: Identifier compared against the "probe_id" entry of each origin.
        origin_list: Origin records; each one is read through its "probe_id" key
            and, on a match, its "location" key, which is parsed with
            ``from_geojson``.

    Returns:
        ``(location.y, location.x)`` for the first origin whose "probe_id" equals
        ``probe_id``, or ``(0, 0)`` when no origin matches.
    """
    for origin in origin_list:
        if origin["probe_id"] == probe_id:
            location = from_geojson(origin["location"])
            return location.y, location.x
    # No origin matched: callers receive (0, 0) as the placeholder location
    return 0, 0

def get_result_country_route(hunter_result: dict) -> dict:
    """Summarise a hunter result as origin probe, origin country and result country.

    Args:
        hunter_result: Mapping providing "origin_id", "origin_country_code" and a
            nested "location_result" mapping with a "country" entry.

    Returns:
        A dict with the keys "origin_id", "origin_country" and "result_country",
        in that order.
    """
    origin_id = hunter_result["origin_id"]
    destination_country = hunter_result["location_result"]["country"]
    source_country = hunter_result["origin_country_code"]
    return dict(
        origin_id=origin_id,
        origin_country=source_country,
        result_country=destination_country,
    )

# Data generation function
def generate_routes_raw():
    """Build the raw routes table from the experiment result files and save it as CSV.

    Every file listed by ``get_list_files_in_path(EXPERIMENT_RESULTS_FOLDER)`` is
    loaded, and each entry of its "hunter_results" list becomes one row holding:

    * the file's "target" and the probe ("origin_id") of the hunter result;
    * the origin country and the probe coordinates looked up in the file's
      "measurements" -> "origin" list (``(0, 0)`` when the probe is not listed);
    * the result country and its coordinates, taken from the airport location
      only when "airports_intersection" holds exactly one airport, otherwise 0;
    * the result filename and the ``outside_EEE`` flag, which is True when the
      result country is neither in ``EEE_countries_set`` nor "Indeterminate".

    The rows are sorted by target, origin country and result country and written
    to ``ROUTES_RESULTS_FILENAME``. Nothing is returned.
    """
    columns = [
        "target", "probe_id",
        "origin_country", "origin_latitude", "origin_longitude",
        "result_country", "result_latitude", "result_longitude",
        "result_filename", "outside_EEE"
    ]
    # Rows are accumulated in a plain list and turned into a DataFrame once:
    # concatenating a one-row frame per route copies the whole table every time.
    records = []
    for result_filename in get_list_files_in_path(EXPERIMENT_RESULTS_FOLDER):
        print(result_filename)
        result = json_file_to_dict(f"{EXPERIMENT_RESULTS_FOLDER}/{result_filename}")
        target = result["target"]
        origin_list = result["measurements"]["origin"]
        for hunter_result in result["hunter_results"]:
            route = get_result_country_route(hunter_result)
            probe_id = hunter_result["origin_id"]
            origin_country = route["origin_country"]
            origin_latitude, origin_longitude = get_probe_location(probe_id, origin_list)

            result_country = route["result_country"]
            airports = hunter_result["location_result"]["airports_intersection"]
            if len(airports) == 1:
                # A single candidate airport: use its location as the result location
                result_location = from_geojson(airports[0]["location"])
                result_latitude = result_location.y
                result_longitude = result_location.x
            else:
                # Zero or several candidates: no single location, keep the 0 placeholder
                result_latitude = 0
                result_longitude = 0

            outside_eee = (result_country not in EEE_countries_set) and (result_country != "Indeterminate")

            records.append([
                target, probe_id,
                origin_country, origin_latitude, origin_longitude,
                result_country, result_latitude, result_longitude,
                result_filename, outside_eee
            ])

    routes_raw_df = pd.DataFrame(records, columns=columns)
    # Sort and save
    routes_raw_df = routes_raw_df.sort_values(by=["target", "origin_country", "result_country"])
    routes_raw_df.to_csv(ROUTES_RESULTS_FILENAME, sep=",", index=False)


Aggregate the routes and count how many times each one is repeated.

In [7]:
# Data generation function
def generate_routes_frequency_aggregation():
    """Count how often each route appears and save the result as CSV.

    Reads ``ROUTES_RESULTS_FILENAME``, counts the occurrences of every
    (target, origin_country, result_country) combination into a "count" column,
    adds an ``outside_EEE`` flag (True when the result country is neither in
    ``EEE_countries_set`` nor "Indeterminate"), sorts by the three route columns
    and writes the table to ``ROUTES_FREQUENCY_FILENAME``. Nothing is returned.
    """
    route_columns = ["target", "origin_country", "result_country"]

    # Only empty fields are treated as missing. With the pandas defaults the
    # country code "NA" would be parsed as NaN and its routes would then be
    # dropped by value_counts.
    routes_df = pd.read_csv(
        ROUTES_RESULTS_FILENAME, sep=",", keep_default_na=False, na_values=[""]
    )

    # Aggregate routes counting the repetitions; reset_index turns the counted
    # Series into a DataFrame whose counter column is explicitly named "count".
    routes_frequency_df = (
        routes_df[route_columns]
        .value_counts(subset=route_columns)
        .reset_index(name="count")
    )

    # Include the info about outside the EEE
    result_country = routes_frequency_df["result_country"]
    routes_frequency_df["outside_EEE"] = (
        ~result_country.isin(EEE_countries_set) & (result_country != "Indeterminate")
    )

    routes_frequency_df = routes_frequency_df.sort_values(by=route_columns)
    routes_frequency_df.to_csv(ROUTES_FREQUENCY_FILENAME, sep=",", index=False)

Add the routes that end outside the EEE, together with their repetition counts, to the complete datasets (the IP-classified traffic logs and the anycast PII traffic logs).

In [8]:
# Dataset population function
def populate_dataset_with_routes_results(dataframe: pd.DataFrame):
    """Annotate a traffic dataset, in place, with the routes that leave the EEE.

    The routes are read from ``ROUTES_FREQUENCY_FILENAME`` and restricted to rows
    flagged ``outside_EEE`` whose "count" is at least
    ``DESTINATION_REPETITIONS_LIMIT``. Four columns are (re)initialised on
    ``dataframe`` and then filled for every row whose "ip_dest" equals a route
    target:

    * ``origins_transfers_outside_EEE``: ``str()`` of the list of origin countries;
    * ``destinations_transfers_outside_EEE``: ``str()`` of the list of result countries;
    * ``frequency_transfers_outside_EEE``: ``str()`` of the list of counts (as strings);
    * ``outside_EEE``: True.

    Rows without a matching target keep ``""`` in the first three columns and
    False in ``outside_EEE``.

    Args:
        dataframe: Dataset to annotate; it must provide an "ip_dest" column.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    # Only empty fields are treated as missing, so a literal "NA" country code
    # is kept as text instead of being parsed as NaN.
    routes_valid_df = pd.read_csv(
        ROUTES_FREQUENCY_FILENAME, sep=",", keep_default_na=False, na_values=[""]
    )
    routes_valid_df = routes_valid_df.loc[
        routes_valid_df["outside_EEE"].eq(True)
        & (routes_valid_df["count"] >= DESTINATION_REPETITIONS_LIMIT)
    ]

    # Charge routes in the complete dataset
    dataframe["origins_transfers_outside_EEE"] = ""
    dataframe["destinations_transfers_outside_EEE"] = ""
    dataframe["frequency_transfers_outside_EEE"] = ""
    dataframe["outside_EEE"] = False

    # One group per target, in file order; the row mask is computed once per target
    for target, target_routes in routes_valid_df.groupby("target", sort=False):
        is_target = dataframe["ip_dest"] == target
        dataframe.loc[is_target, "origins_transfers_outside_EEE"] = str(
            target_routes["origin_country"].tolist()
        )
        dataframe.loc[is_target, "destinations_transfers_outside_EEE"] = str(
            target_routes["result_country"].tolist()
        )
        dataframe.loc[is_target, "frequency_transfers_outside_EEE"] = str(
            target_routes["count"].astype(str).tolist()
        )
        dataframe.loc[is_target, "outside_EEE"] = True


Populate the datasets with metadata info

In [9]:
def populate_dataset_with_apks_metadata(dataframe: pd.DataFrame):
    """Copy the APK metadata into the matching rows of a dataset, in place.

    For every row of the CSV at ``APKS_METADATA_FILEPATH``, the dataset rows
    with the same "apk" value receive that row's "android_rating",
    "android_numDownloads" and "android_category". When an APK appears several
    times in the metadata file, the values of its last row remain.

    Args:
        dataframe: Dataset to enrich; it must provide an "apk" column.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    metadata_columns = ("android_rating", "android_numDownloads", "android_category")
    apk_metadata_df = pd.read_csv(APKS_METADATA_FILEPATH, sep=",")

    for _, apk_metadata in apk_metadata_df.iterrows():
        same_apk = dataframe["apk"] == apk_metadata["apk"]
        for column in metadata_columns:
            dataframe.loc[same_apk, column] = apk_metadata[column]


Populate the datasets with the info extracted from every policy

In [10]:
def populate_dataset_with_policy_extracted_info(dataframe: pd.DataFrame):
    """Copy the privacy-policy annotations into the matching rows of a dataset, in place.

    The annotations are read from ``IT_ANNOTATION_FILEPATH`` and de-duplicated on
    ("apk", "countries"). For every remaining annotation, the dataset rows with
    the same "apk" value receive:

    * ``countries_mentioned_by_policy`` from the "countries" column;
    * ``it_mentioned_by_policy`` from the "transfer" column;
    * ``adequacy_decision_by_policy`` from the "adequacy_decision" column.

    When an APK still has several annotations, the values of its last one remain.

    Args:
        dataframe: Dataset to enrich; it must provide an "apk" column.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    # Dataset column -> annotation column it is filled from
    column_sources = {
        "countries_mentioned_by_policy": "countries",
        "it_mentioned_by_policy": "transfer",
        "adequacy_decision_by_policy": "adequacy_decision",
    }
    annotations_df = pd.read_csv(IT_ANNOTATION_FILEPATH, sep=",")

    # TODO: add a change to guarantee the content of the "countries" row

    annotations_df = annotations_df.drop_duplicates(["apk", "countries"])
    for _, annotation in annotations_df.iterrows():
        same_apk = dataframe["apk"] == annotation["apk"]
        for dataset_column, annotation_column in column_sources.items():
            dataframe.loc[same_apk, dataset_column] = annotation[annotation_column]


Execute the enrichment of data and generation

In [11]:
# Load the dataframes to be enriched
traffic_logs_ip_classified = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH, sep=",")
anycast_pii_traffic_logs_df = pd.read_csv(ANYCAST_PII_TRAFFIC_LOGS_FILEPATH, sep=",")

# Generating the routes files is a long process that only needs to run once,
# so it is guarded by a flag
if GENERATE_ROUTES_DATA:
    generate_routes_raw()
    generate_routes_frequency_aggregation()

# Populate datasets: each step (routes, APK metadata, privacy-policy data) is
# applied to the traffic logs first and to the anycast PII logs second
for populate_step in (
    populate_dataset_with_routes_results,
    populate_dataset_with_apks_metadata,
    populate_dataset_with_policy_extracted_info,
):
    for logs_df in (traffic_logs_ip_classified, anycast_pii_traffic_logs_df):
        populate_step(logs_df)

# Save the populated datasets
traffic_logs_ip_classified.to_csv(
    TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH, sep=",", index=False
)
anycast_pii_traffic_logs_df.to_csv(
    ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH, sep=",", index=False
)

## Analysis questions

Answers to the questions needed for the article

Acronyms:
- PII = Personally Identifiable Information

**IPs analysis**

In [12]:
all_traffic_logs_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH, sep=",")

# Row filters shared by the counts below: rows whose "PII" value is present and
# different from "No-PII", and rows flagged in the "ip_anycast" column
carries_pii = all_traffic_logs_df["PII"].ne("No-PII") & all_traffic_logs_df["PII"].notna()
uses_anycast = all_traffic_logs_df["ip_anycast"]

# Distinct destination IPs: overall, with PII, anycast, and anycast with PII
ips_total = all_traffic_logs_df["ip_dest"].unique().tolist()
print(f"Number of IPs in traffic logs: {len(ips_total)}")

ips_total_pii = all_traffic_logs_df.loc[carries_pii, "ip_dest"].unique().tolist()
print(f"Number of IPs with PII: {len(ips_total_pii)}")

ips_anycast = all_traffic_logs_df.loc[uses_anycast, "ip_dest"].unique().tolist()
print(f"Number of IPs anycast: {len(ips_anycast)}")

ips_anycast_pii = all_traffic_logs_df.loc[
    uses_anycast & carries_pii, "ip_dest"
].unique().tolist()
print(f"Number of IPs anycast with PII: {len(ips_anycast_pii)}")

Number of IPs in traffic logs: 5647
Number of IPs with PII: 1807
Number of IPs anycast: 991
Number of IPs anycast with PII: 200


**APKS analysis**

In [13]:
all_traffic_logs_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH, sep=",")

# Row filters shared by the counts below: rows whose "PII" value is present and
# different from "No-PII", and rows flagged in the "ip_anycast" column
carries_pii = all_traffic_logs_df["PII"].ne("No-PII") & all_traffic_logs_df["PII"].notna()
uses_anycast = all_traffic_logs_df["ip_anycast"]

# Distinct APKs: overall, with PII, using anycast, and using anycast with PII
apks_total = all_traffic_logs_df["apk"].unique().tolist()
print(f"Number of APKs in traffic logs: {len(apks_total)}")

apks_total_pii = all_traffic_logs_df.loc[carries_pii, "apk"].unique().tolist()
print(f"Number of APKs with PII: {len(apks_total_pii)}")

apks_anycast = all_traffic_logs_df.loc[uses_anycast, "apk"].unique().tolist()
print(f"Number of APKs using anycast: {len(apks_anycast)}")

apks_anycast_pii = all_traffic_logs_df.loc[
    uses_anycast & carries_pii, "apk"
].unique().tolist()
print(f"Number of APKs anycast with PII: {len(apks_anycast_pii)}")

Number of APKs in traffic logs: 5759
Number of APKs with PII: 3478
Number of APKs using anycast: 1669
Number of APKs anycast with PII: 960


**Hosts Analysis**

In [14]:
all_traffic_logs_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH, sep=",")

# Row filters shared by the counts below: rows whose "PII" value is present and
# different from "No-PII", and rows flagged in the "ip_anycast" column
carries_pii = all_traffic_logs_df["PII"].ne("No-PII") & all_traffic_logs_df["PII"].notna()
uses_anycast = all_traffic_logs_df["ip_anycast"]

# Distinct hosts: overall, with PII, using anycast, and using anycast with PII
hosts_total = all_traffic_logs_df["host"].unique().tolist()
print(f"Number of hosts in traffic logs: {len(hosts_total)}")

hosts_total_pii = all_traffic_logs_df.loc[carries_pii, "host"].unique().tolist()
print(f"Number of hosts with PII: {len(hosts_total_pii)}")

hosts_anycast = all_traffic_logs_df.loc[uses_anycast, "host"].unique().tolist()
print(f"Number of hosts using anycast: {len(hosts_anycast)}")

hosts_anycast_pii = all_traffic_logs_df.loc[
    uses_anycast & carries_pii, "host"
].unique().tolist()
print(f"Number of hosts anycast with PII: {len(hosts_anycast_pii)}")

Number of hosts in traffic logs: 4738
Number of hosts with PII: 966
Number of hosts using anycast: 995
Number of hosts anycast with PII: 201
