# Experiment Mesh Analysis


In [14]:
# external imports
import pandas as pd
import requests

In [15]:
# internal imports
from src.utils.common_functions import (
    json_file_to_dict,
    get_list_files_in_path,
)
from src.utils.constants import (
    RIPE_ATLAS_PROBES_BASE_URL
)

In [16]:
# Constants: locations of the replication package and of the datasets inside it
REPLICATION_PACKAGE_PATH = "./replication_package"
COMPLETE_DATA_FILEPATH = f"{REPLICATION_PACKAGE_PATH}/Traffic_logs_10K.csv"
COMPLETE_DATA_METADATA_FILEPATH = f"{REPLICATION_PACKAGE_PATH}/Traffic_logs_10K_metadata.csv"
ANYCAST_TRAFFIC_FILEPATH = f"{REPLICATION_PACKAGE_PATH}/anycast_traffic_logs.csv"

# NOTE(review): this path is relative to the parent directory ("../") while
# REPLICATION_PACKAGE_PATH is relative to the current one ("./") — confirm the
# working directory the notebook is expected to run from.
EEE_COUNTRIES_FILE_PATH = "../src/resources/countries_sets/EEE_countries.json"
# "alpha-2" value of every entry of the EEE countries resource file
EEE_countries_set = {
    country_entry["alpha-2"]
    for country_entry in json_file_to_dict(EEE_COUNTRIES_FILE_PATH)
}

In [17]:
# Analysis params
# Minimum "count" a route needs in the frequency file to be taken into account
DESTINATION_REPETITIONS_LIMIT = 1
# Available result modes; the selected one is embedded in every folder/file name
MODES_LIST = ["first_ip", "voting"]
ANALYSIS_MODE = MODES_LIST[0]
# When True, the (slow) generation cell rebuilds the route and analysis files
GENERATE = False

In [18]:
# Filepaths variables
# Every path below lives inside the replication package and embeds ANALYSIS_MODE
# in its name, so each analysis mode reads and writes its own set of files.
ANALYSIS_FOLDER = f"{REPLICATION_PACKAGE_PATH}/analysis_{ANALYSIS_MODE}"
EXPERIMENT_RESULTS_FOLDER = f"{REPLICATION_PACKAGE_PATH}/experiment_results_{ANALYSIS_MODE}"
# Route files: one row per probe result (raw) and one row per distinct route (frequency)
ROUTES_RAW_FILENAME = f"{ANALYSIS_FOLDER}/routes_raw_{ANALYSIS_MODE}.csv"
ROUTES_FREQUENCY_FILENAME = f"{ANALYSIS_FOLDER}/routes_frequency_{ANALYSIS_MODE}.csv"
# Analysis outputs; their names also embed DESTINATION_REPETITIONS_LIMIT
ANALYSIS_EXPERIMENT_FILEPATH = f"{ANALYSIS_FOLDER}/analysis_results_repetitions_{DESTINATION_REPETITIONS_LIMIT}_{ANALYSIS_MODE}.csv"
ANALYSIS_OUT_EEE_EXPERIMENT_FILEPATH = f"{ANALYSIS_FOLDER}/analysis_results_out_EEE_repetitions_{DESTINATION_REPETITIONS_LIMIT}_{ANALYSIS_MODE}.csv"
ANALYSIS_OUT_EEE_METADATA_EXPERIMENT_FILEPATH = f"{ANALYSIS_FOLDER}/analysis_results_out_EEE_metadata_repetitions_{DESTINATION_REPETITIONS_LIMIT}_{ANALYSIS_MODE}.csv"

In [19]:
# Build the anycast-only dataset: keep the traffic rows whose destination IP
# has a truthy entry in the IPs classification file, and save them to disk.
ips_classification_dict = json_file_to_dict(f"{REPLICATION_PACKAGE_PATH}/ips_classification.json")
anycast_ips_set = {
    ip
    for ip, classification in ips_classification_dict.items()
    if classification
}
traffic_complete_df = pd.read_csv(COMPLETE_DATA_FILEPATH, sep=",")
is_anycast_destination = traffic_complete_df["ip_dest"].isin(anycast_ips_set)
anycast_traffic_df = traffic_complete_df.loc[is_anycast_destination]
anycast_traffic_df.to_csv(ANYCAST_TRAFFIC_FILEPATH, sep=",", index=False)


## Results data extraction and complete datasets

Data extraction functions

In [20]:
# Results extraction
def get_probe_country_code(probe_id: int, timeout: float = 30.0) -> str:
    """Look up the country code of a probe through the RIPE Atlas probes API.

    Sends a GET request to ``RIPE_ATLAS_PROBES_BASE_URL/<probe_id>`` and reads
    the ``country_code`` field of the JSON body.

    Args:
        probe_id: Identifier of the probe; appended to the base URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            server, which would hang the whole notebook.

    Returns:
        The ``country_code`` value of the response, or an empty string when
        the response has no such field.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    petition_url = f"{RIPE_ATLAS_PROBES_BASE_URL}/{probe_id}"
    probe = requests.get(url=petition_url, timeout=timeout).json()
    # The HTTP status is deliberately not checked: a response without
    # "country_code" simply yields the empty string.
    return probe.get("country_code", "")
    

def get_result_country_route(hunter_result: dict) -> dict:
    """Summarise one hunter result as an origin -> result country route.

    Args:
        hunter_result: Mapping with the keys ``origin_id``,
            ``origin_country_code`` and ``location_result`` (itself a mapping
            holding ``country``).

    Returns:
        A dict with the keys ``origin_id``, ``origin_country`` and
        ``result_country``.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    origin_id, destination_country, source_country = (
        hunter_result["origin_id"],
        hunter_result["location_result"]["country"],
        hunter_result["origin_country_code"],
    )
    return dict(
        origin_id=origin_id,
        origin_country=source_country,
        result_country=destination_country,
    )

Extract the routes followed from every origin probe into a CSV file

In [21]:
def generate_routes_raw():
    """Flatten every experiment result file into the raw routes CSV.

    Reads each file of ``EXPERIMENT_RESULTS_FOLDER``, emits one row per entry
    of its ``hunter_results`` list and writes the rows, sorted by target,
    origin country and result country, to ``ROUTES_RAW_FILENAME``.

    A route is flagged ``outside_EEE`` when its result country is neither in
    ``EEE_countries_set`` nor the literal ``"Indeterminate"``.
    """
    columns = ["target", "probe_id", "origin_country", "result_country", "result_filename", "outside_EEE"]

    # Collect plain records and build the DataFrame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies it on every iteration.
    records = []
    for result_filename in get_list_files_in_path(EXPERIMENT_RESULTS_FOLDER):
        print(result_filename)
        result = json_file_to_dict(f"{EXPERIMENT_RESULTS_FOLDER}/{result_filename}")
        target = result["target"]
        for hunter_result in result["hunter_results"]:
            route = get_result_country_route(hunter_result)
            result_country = route["result_country"]
            outside_EEE = (result_country not in EEE_countries_set) and (result_country != "Indeterminate")
            records.append([
                target,
                route["origin_id"],
                route["origin_country"],
                result_country,
                result_filename,
                outside_EEE,
            ])

    # The previous implementation prepended each new row, so the records are
    # reversed here to keep the same relative order for rows that tie on the
    # sort keys.
    routes_raw_df = pd.DataFrame(records[::-1], columns=columns)

    # Sort and save
    routes_raw_df = routes_raw_df.sort_values(by=["target", "origin_country", "result_country"])
    routes_raw_df.to_csv(ROUTES_RAW_FILENAME, sep=",", index=False)


Aggregate the routes and count their repetitions

In [22]:
def generate_routes_frequency_aggregation():
    """Aggregate the raw routes into one row per distinct route with its count.

    Reads ``ROUTES_RAW_FILENAME``, counts how many times each
    (target, origin_country, result_country) combination appears and writes
    the result, sorted by those three columns, to ``ROUTES_FREQUENCY_FILENAME``
    with the columns ``target``, ``origin_country``, ``result_country``,
    ``count`` and ``outside_EEE``.

    A route is flagged ``outside_EEE`` when its result country is neither in
    ``EEE_countries_set`` nor the literal ``"Indeterminate"``.

    NOTE(review): ``pd.read_csv`` with default settings parses the literal
    string "NA" as a missing value, and ``value_counts`` drops rows with
    missing values, so a country code "NA" would not be counted — confirm
    whether this affects the data.
    """
    route_columns = ["target", "origin_country", "result_country"]

    # Aggregate routes counting the repetitions. Naming the counts column
    # explicitly avoids depending on the pandas-version-specific name of the
    # value_counts result, and removes the need for a CSV round trip.
    routes_raw_df = pd.read_csv(ROUTES_RAW_FILENAME, sep=",")
    routes_frequency_df = (
        routes_raw_df
        .value_counts(subset=route_columns)
        .reset_index(name="count")
    )

    # Include the info about outside the EEE (vectorised over the column)
    result_countries = routes_frequency_df["result_country"]
    routes_frequency_df["outside_EEE"] = (
        ~result_countries.isin(EEE_countries_set)
        & (result_countries != "Indeterminate")
    )

    routes_frequency_df = routes_frequency_df.sort_values(by=route_columns)
    routes_frequency_df.to_csv(ROUTES_FREQUENCY_FILENAME, sep=",", index=False)

Add the routes that end outside the EEE, and their counts, to the complete dataset

In [23]:
def populate_complete_dataset_with_routes_results(traffic_complete_df: pd.DataFrame):
    """Annotate a traffic dataset, in place, with the routes that leave the EEE.

    Reads ``ROUTES_FREQUENCY_FILENAME`` and keeps the routes flagged
    ``outside_EEE`` whose ``count`` is at least
    ``DESTINATION_REPETITIONS_LIMIT``. For every row of ``traffic_complete_df``
    whose ``ip_dest`` equals the target of such routes, the function fills:

    * ``origins``: comma-separated origin countries of the routes,
    * ``destinations_outside_EEE``: comma-separated result countries,
    * ``routes_count``: comma-separated route counts,
    * ``outside_EEE``: ``True``.

    Rows without a matching target get empty strings and ``False``.

    Args:
        traffic_complete_df: Dataset with an ``ip_dest`` column. It is
            modified in place; nothing is returned.
    """
    routes_valid_df = pd.read_csv(ROUTES_FREQUENCY_FILENAME, sep=",")
    routes_valid_df = routes_valid_df.loc[
        (routes_valid_df["outside_EEE"] == True) &
        (routes_valid_df["count"] >= DESTINATION_REPETITIONS_LIMIT)
    ]

    # Group the valid routes by target, keeping the file's row order
    routes_valid_dict = {}
    for target, origin_country, result_country, count in zip(
        routes_valid_df["target"],
        routes_valid_df["origin_country"],
        routes_valid_df["result_country"],
        routes_valid_df["count"],
    ):
        target_routes = routes_valid_dict.setdefault(
            target, {"origins": [], "destinations": [], "count": []}
        )
        target_routes["origins"].append(origin_country)
        target_routes["destinations"].append(result_country)
        target_routes["count"].append(str(count))

    # Charge routes in the complete dataset
    traffic_complete_df["origins"] = ""
    traffic_complete_df["destinations_outside_EEE"] = ""
    traffic_complete_df["routes_count"] = ""
    traffic_complete_df["outside_EEE"] = False

    for target, target_routes in routes_valid_dict.items():
        # Compute the row mask once per target instead of once per column
        is_target = traffic_complete_df["ip_dest"] == target
        traffic_complete_df.loc[is_target, "origins"] = ",".join(target_routes["origins"])
        traffic_complete_df.loc[is_target, "destinations_outside_EEE"] = ",".join(target_routes["destinations"])
        traffic_complete_df.loc[is_target, "routes_count"] = ",".join(target_routes["count"])
        traffic_complete_df.loc[is_target, "outside_EEE"] = True


Save the data together with the full analysis

In [24]:
# This is a long process that only needs to run once, so it is gated behind
# the GENERATE flag.
if GENERATE:
    # Load the anycast traffic written earlier in the notebook
    anycast_traffic_df = pd.read_csv(ANYCAST_TRAFFIC_FILEPATH, sep=",")

    # Rebuild the route files, then annotate the traffic with them (in place)
    generate_routes_raw()
    generate_routes_frequency_aggregation()
    populate_complete_dataset_with_routes_results(anycast_traffic_df)

    # Save the annotated dataset twice: complete, and restricted to the rows
    # flagged as outside_EEE
    anycast_traffic_df.to_csv(ANALYSIS_EXPERIMENT_FILEPATH, sep=",", index=False)
    is_outside_EEE = anycast_traffic_df["outside_EEE"] == True
    anycast_traffic_out_EEE_df = anycast_traffic_df.loc[is_outside_EEE]
    anycast_traffic_out_EEE_df.to_csv(ANALYSIS_OUT_EEE_EXPERIMENT_FILEPATH, sep=",", index=False)

## Statistics of results

In [25]:
# Read each dataset once (it was previously parsed twice), loading only the
# two columns the statistics need.
complete_stats_df = pd.read_csv(COMPLETE_DATA_FILEPATH, sep=",", usecols=["apk", "ip_dest"])
anycast_stats_df = pd.read_csv(ANYCAST_TRAFFIC_FILEPATH, sep=",", usecols=["apk", "ip_dest"])

# Distinct APKs in the complete dataset vs. the anycast-only dataset
traffic_complete_apks = complete_stats_df["apk"].unique().tolist()
anycast_traffic_apks = anycast_stats_df["apk"].unique().tolist()

print("Number of APKs: ", len(traffic_complete_apks))
print("Number of anycast APKs: ", len(anycast_traffic_apks))
print("Percentage of anycast APKs: ", (len(anycast_traffic_apks)/len(traffic_complete_apks))*100)

# Distinct destination IPs in the complete dataset vs. the anycast-only dataset
traffic_complete_ips = complete_stats_df["ip_dest"].unique().tolist()
anycast_traffic_ips = anycast_stats_df["ip_dest"].unique().tolist()

print("Number of IPs: ", len(traffic_complete_ips))
print("Number of anycast IPs: ", len(anycast_traffic_ips))
print("Percentage of anycast IPs: ", (len(anycast_traffic_ips)/len(traffic_complete_ips))*100)

Number of APKs:  5759
Number of anycast APKs:  1323
Percentage of anycast APKs:  22.972738322625457
Number of IPs:  5647
Number of anycast IPs:  218
Percentage of anycast IPs:  3.8604568797591643


In [26]:
# Distinct APKs, hosts and destination IPs in the analysis results, overall
# and restricted to the rows flagged as outside_EEE.
analysis_df = pd.read_csv(ANALYSIS_EXPERIMENT_FILEPATH, sep=",")
# Filter once and reuse the subset for the three out-of-EEE lists
analysis_out_EEE_df = analysis_df.loc[analysis_df["outside_EEE"] == True]

apks_list = analysis_df["apk"].unique().tolist()
hosts_list = analysis_df["host"].unique().tolist()
ips_list = analysis_df["ip_dest"].unique().tolist()

apks_out_EEE_list = analysis_out_EEE_df["apk"].unique().tolist()
hosts_out_EEE_list = analysis_out_EEE_df["host"].unique().tolist()
ips_out_EEE_list = analysis_out_EEE_df["ip_dest"].unique().tolist()

print("Result mode: ", ANALYSIS_MODE)
print("Number of apks: ", len(apks_list))
print("Number of apks out of EEE: ", len(apks_out_EEE_list))
print("Percentage of apks out of EEE: ", (len(apks_out_EEE_list) / len(apks_list))*100)
print("Number of hosts: ", len(hosts_list))
print("Number of hosts out of EEE: ", len(hosts_out_EEE_list))
# Label fixed: this line reports hosts, not apks
print("Percentage of hosts out of EEE: ", (len(hosts_out_EEE_list) / len(hosts_list))*100)
print("Number of ips: ", len(ips_list))
print("Number of ips out of EEE: ", len(ips_out_EEE_list))
# Label fixed: this line reports ips, not apks
print("Percentage of ips out of EEE: ", (len(ips_out_EEE_list) / len(ips_list))*100)
print("##########################################")

Result mode:  first_ip
Number of apks:  1323
Number of apks out of EEE:  1313
Percentage of apks out of EEE:  99.24414210128496
Number of hosts:  362
Number of hosts out of EEE:  351
Percentage of apks out of EEE:  96.96132596685084
Number of ips:  218
Number of ips out of EEE:  214
Percentage of apks out of EEE:  98.1651376146789
##########################################


## Generate other data

In [27]:
# Export the host lists for David: every host, and only the hosts seen in rows
# flagged as outside_EEE, each as a single-column CSV.
all_hosts_df = pd.DataFrame({"host": hosts_list})
all_hosts_df.to_csv(f"{ANALYSIS_FOLDER}/hosts_{ANALYSIS_MODE}.csv", sep=",", index=False)

out_EEE_hosts_df = pd.DataFrame({"host": hosts_out_EEE_list})
out_EEE_hosts_df.to_csv(f"{ANALYSIS_FOLDER}/hosts_out_EEE_{ANALYSIS_MODE}.csv", sep=",", index=False)

## Complete dataset

Metadata from Play Store

In [28]:
# Attach the Play Store metadata of each APK to the out-of-EEE analysis rows.
METADATA_COLUMNS = ["android_rating", "android_numDownloads", "android_category"]

# One metadata row per APK (the first occurrence wins)
original_complete_metadata_df = pd.read_csv(COMPLETE_DATA_METADATA_FILEPATH).drop_duplicates(["apk"])

anycast_analysis_out_EEE_df = pd.read_csv(ANALYSIS_OUT_EEE_EXPERIMENT_FILEPATH)

# Look the values up by APK in one vectorised pass per column. The previous
# row-by-row loop re-scanned the whole frame three times per APK and wrote
# strings into NaN-initialised float columns.
metadata_by_apk = original_complete_metadata_df.dropna(subset=["apk"]).set_index("apk")
has_metadata = anycast_analysis_out_EEE_df["apk"].isin(metadata_by_apk.index)
for metadata_column in METADATA_COLUMNS:
    looked_up = anycast_analysis_out_EEE_df["apk"].map(metadata_by_apk[metadata_column])
    if metadata_column in anycast_analysis_out_EEE_df.columns:
        # Rows whose APK has no metadata keep whatever value they already had
        looked_up = looked_up.where(has_metadata, anycast_analysis_out_EEE_df[metadata_column])
    anycast_analysis_out_EEE_df[metadata_column] = looked_up


anycast_analysis_out_EEE_df.to_csv(ANALYSIS_OUT_EEE_METADATA_EXPERIMENT_FILEPATH, sep=",", index=False)

  anycast_analysis_out_EEE_df.loc[
  anycast_analysis_out_EEE_df.loc[
