# Experiment Mesh Analysis


In [2]:
# external imports
import pandas as pd
import requests
from shapely import (
    Point, from_geojson, to_geojson
)

In [3]:
# internal imports
from src.utils.common_functions import (
    json_file_to_dict,
    get_list_files_in_path,
    get_nearest_airport_to_point
)
from src.utils.constants import (
    RIPE_ATLAS_PROBES_BASE_URL,
    EEE_MESH_3_FILEPATH,
    EEE_COUNTRIES_FILEPATH,
    REPLICATION_PACKAGE_DIR,
    TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH,
    ANYCAST_PII_TRAFFIC_LOGS_FILEPATH,
    APKS_METADATA_FILEPATH,
    RESULTS_MODES
)

In [3]:
# Constants
# Set of the "alpha-2" values of every entry in the EEE countries file.
EEE_countries_set = {
    country["alpha-2"]
    for country in json_file_to_dict(EEE_COUNTRIES_FILEPATH)
}

In [17]:
# Analysis params
# Minimum "count" a route needs to be kept when populating the traffic dataset.
DESTINATION_REPETITIONS_LIMIT = 1
# Results mode; it is embedded in every folder and file name built below.
ANALYSIS_MODE = RESULTS_MODES[0]
# When True, the generation cell re-creates the routes and analysis files.
GENERATE = True

In [18]:
# Filepaths variables
# Folder scanned by generate_routes_raw() for the experiment result files.
EXPERIMENT_RESULTS_FOLDER = f"{REPLICATION_PACKAGE_DIR}/experiment_results_{ANALYSIS_MODE}"
# Folder that holds every CSV produced by this notebook.
ANALYSIS_FOLDER = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}"

# Written by generate_routes_raw(), read by generate_routes_frequency_aggregation().
ROUTES_RESULTS_FILENAME = f"{ANALYSIS_FOLDER}/routes_results_{ANALYSIS_MODE}.csv"
# Written by generate_routes_frequency_aggregation(), read when populating the traffic dataset.
ROUTES_FREQUENCY_FILENAME = f"{ANALYSIS_FOLDER}/routes_frequency_{ANALYSIS_MODE}.csv"

## Analysis output filepaths (TODO: confirm whether these are still needed or can be deleted)

In [ ]:
# Traffic dataset populated with the routes results (written by the `if GENERATE:` cell).
ANALYSIS_EXPERIMENT_FILEPATH = f"{ANALYSIS_FOLDER}/analysis_results_repetitions_{DESTINATION_REPETITIONS_LIMIT}_{ANALYSIS_MODE}.csv"
# Subset of the populated dataset whose "outside_EEE" flag is True (written by the `if GENERATE:` cell).
ANALYSIS_OUT_EEE_EXPERIMENT_FILEPATH = f"{ANALYSIS_FOLDER}/analysis_results_out_EEE_repetitions_{DESTINATION_REPETITIONS_LIMIT}_{ANALYSIS_MODE}.csv"
# The outside-EEE subset after the metadata columns have been added (written by the metadata cell).
ANALYSIS_OUT_EEE_METADATA_EXPERIMENT_FILEPATH = f"{ANALYSIS_FOLDER}/analysis_results_out_EEE_metadata_repetitions_{DESTINATION_REPETITIONS_LIMIT}_{ANALYSIS_MODE}.csv"

## Enrichment of data and generation of datasets

Load the dataset of analyzed IPs (anycast with PII); the following steps work with it and complete it.

In [19]:
# Traffic logs read from ANYCAST_PII_TRAFFIC_LOGS_FILEPATH (comma-separated).
anycast_pii_traffic_logs_df = pd.read_csv(
    ANYCAST_PII_TRAFFIC_LOGS_FILEPATH,
    sep=",",
)

### Routes summary data extraction

In [20]:
# Results extraction
def get_probe_location(probe_id: int, origin_list: []) -> (float, float):
    """Return the (latitude, longitude) of the probe identified by ``probe_id``.

    ``origin_list`` is scanned in order and the first entry whose "probe_id"
    equals ``probe_id`` is used: its "location" GeoJSON is parsed and the
    geometry's ``y`` and ``x`` are returned, in that order.

    When no entry matches, ``(0, 0)`` is returned.
    """
    matching_origin = next(
        (candidate for candidate in origin_list if probe_id == candidate["probe_id"]),
        None,
    )
    if matching_origin is None:
        return 0, 0

    point = from_geojson(matching_origin["location"])
    return point.y, point.x

def get_result_country_route(hunter_result: dict) -> dict:
    """Summarise one hunter result as an origin -> result country route.

    Returns a dict with three keys:
      - "origin_id": ``hunter_result["origin_id"]``
      - "origin_country": ``hunter_result["origin_country_code"]``
      - "result_country": ``hunter_result["location_result"]["country"]``
    """
    # The lookups keep the original order (id, result country, origin country)
    # so a missing key raises the same KeyError as before.
    route = {"origin_id": hunter_result["origin_id"]}
    located_country = hunter_result["location_result"]["country"]
    route["origin_country"] = hunter_result["origin_country_code"]
    route["result_country"] = located_country
    return route

Generate a file with all the routes obtained from the experiment execution.

In [5]:
def generate_routes_raw():
    """Build the raw routes table from every experiment result file and save it.

    Every file listed in EXPERIMENT_RESULTS_FOLDER is loaded, and each entry of
    its "hunter_results" list becomes one row holding the target, the probe id,
    the origin country and coordinates, the result country and coordinates, the
    source filename and the ``outside_EEE`` flag.

    The result coordinates are taken from the airport location only when
    "airports_intersection" has exactly one element; otherwise they are 0.
    The origin coordinates are whatever ``get_probe_location`` returns.

    ``outside_EEE`` is True when the result country is neither in
    ``EEE_countries_set`` nor equal to "Indeterminate".

    The rows are sorted by target, origin country and result country and
    written to ROUTES_RESULTS_FILENAME.
    """
    columns = [
        "target", "probe_id",
        "origin_country", "origin_latitude", "origin_longitude",
        "result_country", "result_latitude", "result_longitude",
        "result_filename", "outside_EEE"
    ]
    # Collect plain rows and build the frame once; concatenating a DataFrame
    # per row inside the loop is quadratic in the number of routes.
    records = []
    for result_filename in get_list_files_in_path(EXPERIMENT_RESULTS_FOLDER):
        print(result_filename)
        result = json_file_to_dict(f"{EXPERIMENT_RESULTS_FOLDER}/{result_filename}")
        target = result["target"]
        origin_list = result["measurements"]["origin"]
        for hunter_result in result["hunter_results"]:
            route = get_result_country_route(hunter_result)
            probe_id = hunter_result["origin_id"]
            origin_country = route["origin_country"]
            origin_latitude, origin_longitude = get_probe_location(probe_id, origin_list)

            result_country = route["result_country"]
            # Fix: the location used to be read from "location_results" (plural),
            # a key used nowhere else, so a single-airport result raised KeyError.
            airports = hunter_result["location_result"]["airports_intersection"]
            if len(airports) == 1:
                result_location = from_geojson(airports[0]["location"])
                result_latitude = result_location.y
                result_longitude = result_location.x
            else:
                result_latitude = 0
                result_longitude = 0

            outside_eee = (result_country not in EEE_countries_set) and (result_country != "Indeterminate")

            records.append([
                target, probe_id,
                origin_country, origin_latitude, origin_longitude,
                result_country, result_latitude, result_longitude,
                result_filename, outside_eee
            ])

    # The previous implementation prepended each new row; reversing reproduces
    # that row order before sorting.
    records.reverse()
    routes_raw_df = pd.DataFrame(records, columns=columns)

    # Sort and save
    routes_raw_df = routes_raw_df.sort_values(by=["target", "origin_country", "result_country"])
    routes_raw_df.to_csv(ROUTES_RESULTS_FILENAME, sep=",", index=False)


Aggregate the routes and count how many times each one is repeated.

In [22]:
def generate_routes_frequency_aggregation():
    """Count how often each (target, origin_country, result_country) route occurs.

    Reads ROUTES_RESULTS_FILENAME, counts the occurrences of every distinct
    route, adds an ``outside_EEE`` flag (True when the result country is
    neither in ``EEE_countries_set`` nor "Indeterminate"), sorts by the three
    route columns and writes the table to ROUTES_FREQUENCY_FILENAME with the
    columns target, origin_country, result_country, count, outside_EEE.

    NOTE(review): ``read_csv`` parses the literal string "NA" as missing by
    default and ``value_counts`` drops missing rows, so a country code "NA"
    would be silently excluded — confirm whether that can occur in the data.
    """
    route_columns = ["target", "origin_country", "result_country"]

    routes_df = pd.read_csv(ROUTES_RESULTS_FILENAME, sep=",")
    # Fix: selecting several columns needs a list; the previous tuple-style
    # index (df["a", "b", "c"]) raises KeyError.
    routes_frequency_df = (
        routes_df[route_columns]
        .value_counts()
        # Naming the counts explicitly gives a "count" column whatever the
        # pandas version calls the value_counts result.
        .reset_index(name="count")
    )

    # Include the info about outside the EEE
    result_country = routes_frequency_df["result_country"]
    routes_frequency_df["outside_EEE"] = (
        ~result_country.isin(EEE_countries_set) & (result_country != "Indeterminate")
    )

    routes_frequency_df = routes_frequency_df.sort_values(by=route_columns)
    routes_frequency_df.to_csv(ROUTES_FREQUENCY_FILENAME, sep=",", index=False)

Add the routes that end outside the EEE, and their counts, to the complete dataset. **TODO:** clarify which datasets are populated here.

In [23]:
def populate_complete_dataset_with_routes_results(traffic_complete_df: pd.DataFrame):
    """Annotate ``traffic_complete_df`` in place with the routes that leave the EEE.

    Routes are read from ROUTES_FREQUENCY_FILENAME and kept only when their
    ``outside_EEE`` flag is True and their ``count`` is at least
    DESTINATION_REPETITIONS_LIMIT.

    Four columns are (re)initialised on the frame: "origins",
    "destinations_outside_EEE" and "routes_count" as empty strings, and
    "outside_EEE" as False. For every target with at least one kept route, the
    rows whose "ip_dest" equals that target receive the comma-joined origin
    countries, result countries and counts of those routes, and their
    "outside_EEE" is set to True.

    Nothing is returned; the caller's frame is modified.
    """
    frequency_df = pd.read_csv(ROUTES_FREQUENCY_FILENAME, sep=",")
    leaves_eee = frequency_df["outside_EEE"] == True
    repeated_enough = frequency_df["count"] >= DESTINATION_REPETITIONS_LIMIT
    kept_routes_df = frequency_df.loc[leaves_eee & repeated_enough]

    # Group the kept routes per target, preserving the file's row order.
    routes_by_target = {}
    for _, route in kept_routes_df.iterrows():
        bucket = routes_by_target.setdefault(
            route["target"],
            {"origins": [], "destinations": [], "count": []},
        )
        bucket["origins"].append(route["origin_country"])
        bucket["destinations"].append(route["result_country"])
        bucket["count"].append(str(route["count"]))

    # Load the routes into the complete dataset
    traffic_complete_df["origins"] = ""
    traffic_complete_df["destinations_outside_EEE"] = ""
    traffic_complete_df["routes_count"] = ""
    traffic_complete_df["outside_EEE"] = False

    for target, bucket in routes_by_target.items():
        # None of the assignments below touches "ip_dest", so one mask serves all four.
        hits_target = traffic_complete_df["ip_dest"] == target
        traffic_complete_df.loc[hits_target, ["origins"]] = ",".join(bucket["origins"])
        traffic_complete_df.loc[hits_target, ["destinations_outside_EEE"]] = ",".join(bucket["destinations"])
        traffic_complete_df.loc[hits_target, ["routes_count"]] = ",".join(bucket["count"])
        traffic_complete_df.loc[hits_target, ["outside_EEE"]] = True


Populate the datasets with metadata info

In [ ]:
# NOTE(review): this cell used to read COMPLETE_DATA_METADATA_FILEPATH, a name that is
# neither imported nor defined in this notebook, so it failed with NameError.
# APKS_METADATA_FILEPATH is imported from src.utils.constants and not used anywhere
# else, so it is presumably the intended metadata file — confirm before trusting the output.
original_complete_metadata_df = pd.read_csv(APKS_METADATA_FILEPATH)
# Keep one metadata row per APK.
original_complete_metadata_df.drop_duplicates(["apk"], inplace=True)

# This file is written by the `if GENERATE:` cell, which must have run at least once.
anycast_analysis_out_EEE_df = pd.read_csv(ANALYSIS_OUT_EEE_EXPERIMENT_FILEPATH)

# Copy each APK's metadata onto every traffic row of that APK.
metadata_columns = ["android_rating", "android_numDownloads", "android_category"]
for index, row in original_complete_metadata_df.iterrows():
    same_apk = anycast_analysis_out_EEE_df["apk"] == row["apk"]
    for column in metadata_columns:
        anycast_analysis_out_EEE_df.loc[same_apk, column] = row[column]


anycast_analysis_out_EEE_df.to_csv(ANALYSIS_OUT_EEE_METADATA_EXPERIMENT_FILEPATH, sep=",", index=False)

Run the data enrichment and the dataset generation

In [24]:
# This is a long process that only needs to run once, so it is guarded by GENERATE
if GENERATE:
    # Load the original traffic interception data
    anycast_traffic_df = pd.read_csv(ANYCAST_PII_TRAFFIC_LOGS_FILEPATH, sep=",")

    # Build the routes files, then annotate the traffic frame in place
    generate_routes_raw()
    generate_routes_frequency_aggregation()
    populate_complete_dataset_with_routes_results(anycast_traffic_df)

    # Save the populated dataset: first every row, then only the outside_EEE rows
    anycast_traffic_df.to_csv(ANALYSIS_EXPERIMENT_FILEPATH, sep=",", index=False)
    outside_eee_rows = anycast_traffic_df["outside_EEE"] == True
    anycast_traffic_out_EEE_df = anycast_traffic_df.loc[outside_eee_rows]
    anycast_traffic_out_EEE_df.to_csv(ANALYSIS_OUT_EEE_EXPERIMENT_FILEPATH, sep=",", index=False)

## Analysis questions

Answers to the questions needed for the article

Acronyms:
- PII = Personally Identifiable Information

**IPs analysis**

In [31]:
# Count distinct destination IPs overall, with PII, on anycast, and on anycast with PII.
all_traffic_logs_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH, sep=",")

ips_total = all_traffic_logs_df["ip_dest"].unique().tolist()
print(f"Number of IPs in traffic logs: {len(ips_total)}")

# Rows whose PII value is present and different from "No-PII".
pii_not_excluded = all_traffic_logs_df["PII"] != "No-PII"
pii_present = all_traffic_logs_df["PII"].notna()
ips_total_pii = all_traffic_logs_df.loc[
    pii_not_excluded & pii_present, "ip_dest"
].unique().tolist()
print(f"Number of IPs with PII: {len(ips_total_pii)}")

# Rows flagged in the "ip_anycast" column.
anycast_rows = all_traffic_logs_df["ip_anycast"]
ips_anycast = all_traffic_logs_df.loc[anycast_rows, "ip_dest"].unique().tolist()
print(f"Number of IPs anycast: {len(ips_anycast)}")

ips_anycast_pii = all_traffic_logs_df.loc[
    anycast_rows & pii_not_excluded & pii_present, "ip_dest"
].unique().tolist()
print(f"Number of IPs anycast with PII: {len(ips_anycast_pii)}")

Number of IPs in traffic logs: 5647
Number of IPs with PII: 1807
Number of IPs anycast: 991
Number of IPs anycast with PII: 200


**APKS analysis**

In [32]:
# Count distinct APKs overall, with PII, using anycast, and using anycast with PII.
all_traffic_logs_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH, sep=",")

apks_total = all_traffic_logs_df["apk"].unique().tolist()
print(f"Number of APKs in traffic logs: {len(apks_total)}")

# Rows whose PII value is present and different from "No-PII".
pii_not_excluded = all_traffic_logs_df["PII"] != "No-PII"
pii_present = all_traffic_logs_df["PII"].notna()
apks_total_pii = all_traffic_logs_df.loc[
    pii_not_excluded & pii_present, "apk"
].unique().tolist()
print(f"Number of APKs with PII: {len(apks_total_pii)}")

# Rows flagged in the "ip_anycast" column.
anycast_rows = all_traffic_logs_df["ip_anycast"]
apks_anycast = all_traffic_logs_df.loc[anycast_rows, "apk"].unique().tolist()
print(f"Number of APKs using anycast: {len(apks_anycast)}")

apks_anycast_pii = all_traffic_logs_df.loc[
    anycast_rows & pii_not_excluded & pii_present, "apk"
].unique().tolist()
print(f"Number of APKs anycast with PII: {len(apks_anycast_pii)}")

Number of APKs in traffic logs: 5759
Number of APKs with PII: 3478
Number of APKs using anycast: 1669
Number of APKs anycast with PII: 960


**Hosts Analysis**

In [33]:
# Count distinct hosts overall, with PII, using anycast, and using anycast with PII.
all_traffic_logs_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH, sep=",")

hosts_total = all_traffic_logs_df["host"].unique().tolist()
print(f"Number of hosts in traffic logs: {len(hosts_total)}")

# Rows whose PII value is present and different from "No-PII".
pii_not_excluded = all_traffic_logs_df["PII"] != "No-PII"
pii_present = all_traffic_logs_df["PII"].notna()
hosts_total_pii = all_traffic_logs_df.loc[
    pii_not_excluded & pii_present, "host"
].unique().tolist()
print(f"Number of hosts with PII: {len(hosts_total_pii)}")

# Rows flagged in the "ip_anycast" column.
anycast_rows = all_traffic_logs_df["ip_anycast"]
hosts_anycast = all_traffic_logs_df.loc[anycast_rows, "host"].unique().tolist()
print(f"Number of hosts using anycast: {len(hosts_anycast)}")

hosts_anycast_pii = all_traffic_logs_df.loc[
    anycast_rows & pii_not_excluded & pii_present, "host"
].unique().tolist()
print(f"Number of hosts anycast with PII: {len(hosts_anycast_pii)}")

Number of hosts in traffic logs: 4738
Number of hosts with PII: 966
Number of hosts using anycast: 995
Number of hosts anycast with PII: 201
