# Experiment Mesh Analysis


In [1]:
# external imports
import pandas as pd
from ast import literal_eval
from shapely import (
    from_geojson
)

In [2]:
# internal imports
from src.utils.common_functions import (
    json_file_to_dict,
    get_list_files_in_path,
)
from src.utils.constants import (
    EEE_COUNTRIES_FILEPATH,
    REPLICATION_PACKAGE_DIR,
    TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH,
    ANYCAST_PII_TRAFFIC_LOGS_FILEPATH,
    APKS_METADATA_FILEPATH,
    RESULTS_MODES,
    IT_ANNOTATION_FILEPATH
)

In [3]:
# Constants
EEE_countries_set = set([country["alpha-2"] for country in json_file_to_dict(EEE_COUNTRIES_FILEPATH)])

In [4]:
# Analysis params
DESTINATION_REPETITIONS_LIMIT = 1
ANALYSIS_MODE=RESULTS_MODES[0]
GENERATE_ROUTES_DATA=False

In [5]:
# Filepaths variables
EXPERIMENT_RESULTS_FOLDER = f"{REPLICATION_PACKAGE_DIR}/experiment_results_{ANALYSIS_MODE}"
ANALYSIS_FOLDER = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}"

ROUTES_RESULTS_FILENAME = f"{ANALYSIS_FOLDER}/routes_results_{ANALYSIS_MODE}.csv"
ROUTES_FREQUENCY_FILENAME = f"{ANALYSIS_FOLDER}/routes_frequency_{ANALYSIS_MODE}.csv"

ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}/Anycast_PII_Traffic_Logs_{ANALYSIS_MODE}.csv"
TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}/Traffic_logs_10K_ip_classified_{ANALYSIS_MODE}.csv"
ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_AGGREGATION_FILEPATH = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}/Anycast_PII_Traffic_Logs_aggregation_{ANALYSIS_MODE}.csv"
TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_AGGREGATION_FILEPATH = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}/Traffic_logs_10K_ip_classified_aggregation_{ANALYSIS_MODE}.csv"

## Enrichment of data and generation of datasets

Generate a file with all the routes got as the result of the experiment execution

In [6]:
# Auxiliary functions
def get_probe_location(probe_id: int, origin_list: []) -> (float, float):
    for origin in origin_list:
        if probe_id == origin["probe_id"]:
            location = from_geojson(origin["location"])
            return location.y, location.x
        else:
            continue
    return 0, 0

def get_result_country_route(hunter_result: dict) -> dict:
    probe_id = hunter_result["origin_id"]
    result_country = hunter_result["location_result"]["country"]
    probe_country = hunter_result["origin_country_code"]

    return {
        "origin_id": probe_id,
        "origin_country": probe_country,
        "result_country": result_country
    }

# Data generation function
def generate_routes_raw():
    routes_raw_df = pd.DataFrame(
        columns=[
            "target", "probe_id", 
            "origin_country", "origin_latitude", "origin_longitude", 
            "result_country", "result_latitude", "result_longitude",
            "result_filename", "outside_EEE"
        ]
    )
    for result_filename in get_list_files_in_path(EXPERIMENT_RESULTS_FOLDER):
        print(result_filename)
        result = json_file_to_dict(f"{EXPERIMENT_RESULTS_FOLDER}/{result_filename}")
        target = result["target"]
        origin_list = result["measurements"]["origin"]
        for hunter_result in result["hunter_results"]:
            route = get_result_country_route(hunter_result)
            probe_id = hunter_result["origin_id"]
            origin_country = route["origin_country"]
            origin_latitude, origin_longitude = get_probe_location(probe_id, origin_list)
            
            result_country = route["result_country"]
            if len(hunter_result["location_result"]["airports_intersection"]) == 1:
                result_location = from_geojson(hunter_result["location_result"]["airports_intersection"][0]["location"])
                result_latitude = result_location.y
                result_longitude = result_location.x
            else:
                result_latitude = 0
                result_longitude = 0
                
            outside_eee = (result_country not in EEE_countries_set) and (result_country != "Indeterminate")
                    
            routes_raw_df = pd.concat(
                [pd.DataFrame([[
                    target, probe_id, 
                    origin_country, origin_latitude, origin_longitude, 
                    result_country, result_latitude, result_longitude,
                    result_filename, outside_eee
                ]], columns=routes_raw_df.columns), routes_raw_df], 
                ignore_index=True
            )
    # Sort and save
    routes_raw_df.sort_values(by=["target", "origin_country", "result_country"], inplace=True)
    routes_raw_df.to_csv(ROUTES_RESULTS_FILENAME, sep=",", index=False)


Aggregate and count routes repetitions

In [7]:
# Data generation function
def generate_routes_frequency_aggregation():
    # Aggregate routes counting the repetitions
    routes_frequency_df = pd.read_csv(ROUTES_RESULTS_FILENAME, sep=",")
    routes_frequency_df = routes_frequency_df[["target", "origin_country", "result_country"]]
    routes_frequency_df = routes_frequency_df.value_counts(subset=['target', 'origin_country', 'result_country'])
    routes_frequency_df.to_csv(ROUTES_FREQUENCY_FILENAME, sep=",")

    # Include the info about outside the EEE
    routes_frequency_df = pd.read_csv(ROUTES_FREQUENCY_FILENAME, sep=",")
    routes_frequency_df["outside_EEE"] = False
    
    for index, row in routes_frequency_df.iterrows():
        result_country = row["result_country"]
        routes_frequency_df.loc[index, "outside_EEE"] = (result_country not in EEE_countries_set) and (result_country != "Indeterminate")
    
    routes_frequency_df.sort_values(by=["target", "origin_country", "result_country"], inplace=True)
    routes_frequency_df.to_csv(ROUTES_FREQUENCY_FILENAME, sep=",", index=False)

Introduce routes outside EEE and its count in the complete dataset WHAT DATASETS POPULATE????

In [8]:
# Dataset population function
def populate_dataset_with_routes_results(dataframe: pd.DataFrame):
    routes_valid_df = pd.read_csv(ROUTES_FREQUENCY_FILENAME, sep=",")
    routes_valid_df = routes_valid_df.loc[
        (routes_valid_df["outside_EEE"] == True) & 
        (routes_valid_df["count"] >= DESTINATION_REPETITIONS_LIMIT)
    ]
    
    routes_valid_dict = {}
    for index, row in routes_valid_df.iterrows():
        target = row["target"]
        if target not in routes_valid_dict.keys():
            routes_valid_dict[row["target"]] = {
                "origins": [],
                "destinations": [],
                "count": []
            }
        
        routes_valid_dict[target]["origins"].append(row["origin_country"])
        routes_valid_dict[target]["destinations"].append(row["result_country"])
        routes_valid_dict[target]["count"].append(str(row["count"]))
    
    # Charge routes in the complete dataset
    dataframe["origins_transfers_outside_EEE"] = "[]"
    dataframe["destinations_transfers_outside_EEE"] = "[]"
    dataframe["frequency_transfers_outside_EEE"] = "[]"
    dataframe["outside_EEE"] = False
    
    for target in routes_valid_dict.keys():
        dataframe.loc[
            (dataframe["ip_dest"] == target), ["origins_transfers_outside_EEE"]
        ] = str(routes_valid_dict[target]["origins"])
        dataframe.loc[
            (dataframe["ip_dest"] == target), ["destinations_transfers_outside_EEE"]
        ] = str(routes_valid_dict[target]["destinations"])
        dataframe.loc[
            (dataframe["ip_dest"] == target), ["frequency_transfers_outside_EEE"]
        ] = str(routes_valid_dict[target]["count"])
        dataframe.loc[
            (dataframe["ip_dest"] == target), ["outside_EEE"]
        ] = True


Populate the datasets with metadata info

In [9]:
def populate_dataset_with_apks_metadata(dataframe: pd.DataFrame):
    apk_metadata_df = pd.read_csv(APKS_METADATA_FILEPATH, sep=",")
    
    for index, row in apk_metadata_df.iterrows():
        dataframe.loc[
            dataframe["apk"] == row["apk"], "android_rating"
        ] = row["android_rating"]
    
        dataframe.loc[
            dataframe["apk"] == row["apk"], "android_numDownloads"
        ] = row["android_numDownloads"]
    
        dataframe.loc[
            dataframe["apk"] == row["apk"], "android_category"
        ] = row["android_category"]


Populate the datasets with the info extrated from every policy

In [10]:
def populate_dataset_with_policy_extracted_info(dataframe: pd.DataFrame):
    it_annotation_results_df = pd.read_csv(IT_ANNOTATION_FILEPATH, sep=",")
    
    it_annotation_results_df.drop_duplicates(["apk", "countries"], inplace=True)
    for index, row in it_annotation_results_df.iterrows():
        if row["countries"]:
            countries_mentioned_by_policy = row["countries"]
        else:
            countries_mentioned_by_policy = "[]"
        dataframe.loc[
            dataframe["apk"] == row["apk"], "countries_mentioned_by_policy"
        ] = countries_mentioned_by_policy

        dataframe.loc[
            dataframe["apk"] == row["apk"], "it_mentioned_by_policy"
        ] = row["transfer"]
        
        dataframe.loc[
            dataframe["apk"] == row["apk"], "adequacy_decision_by_policy"
        ] = row["adequacy_decision"]
    

Check GDPR compliance in terms of international transfers

In [11]:
def check_it_gdpr_compliance(dataframe: pd.DataFrame):
    for index, row in dataframe.iterrows():
        destinations_transfers_outside_eee_raw = row["destinations_transfers_outside_EEE"]
        if type(destinations_transfers_outside_eee_raw) is str:
            try:
                destinations_transfers_outside_eee = set(literal_eval(destinations_transfers_outside_eee_raw))
            except:
                destinations_transfers_outside_eee = set()
        else:
            destinations_transfers_outside_eee = set()
        
        countries_mentioned_by_policy_raw = row["countries_mentioned_by_policy"]
        if type(countries_mentioned_by_policy_raw) is str:
            try:
                countries_mentioned_by_policy = set(literal_eval(countries_mentioned_by_policy_raw))
            except:
                countries_mentioned_by_policy = set()
        else:
            countries_mentioned_by_policy = set()
        
        if not destinations_transfers_outside_eee:
            dataframe.loc[index, "it_gdpr_compliance"] = True
        else:
            if destinations_transfers_outside_eee.issubset(countries_mentioned_by_policy):
                dataframe.loc[index, "it_gdpr_compliance"] = True
            else:
                dataframe.loc[index, "it_gdpr_compliance"] = False
        

Aggregate analysis

In [12]:
def aggregate_analysis(dataframe: pd.DataFrame, to_filepath: str):
    aggregation_df = dataframe[
        ["apk", "version", "phase", 
         "tls", "https", "host", "ip_dest", 
         "PII", "ip_anycast", 
         "origins_transfers_outside_EEE", "destinations_transfers_outside_EEE", "frequency_transfers_outside_EEE", "outside_EEE",
         "android_rating", "android_numDownloads", "android_category", 
         "countries_mentioned_by_policy", "it_mentioned_by_policy", "adequacy_decision_by_policy", "it_gdpr_compliance"]
    ].copy()
    
    aggregation_df.drop_duplicates(inplace=True)

    aggregation_df.fillna(
        value={
            "countries_mentioned_by_policy": "[]",
            "it_mentioned_by_policy": False,
            "adequacy_decision_by_policy": False
        }, inplace=True
    )
    
    aggregation_df.to_csv(to_filepath, sep=",", index=False)
    

Execute the enrichment of data and generation

In [13]:
# Charge the dataframes to be used
traffic_logs_ip_classified_analysis_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH, sep=",")
anycast_pii_traffic_logs_df = pd.read_csv(ANYCAST_PII_TRAFFIC_LOGS_FILEPATH, sep=",")

In [14]:
# Because is a long process and is only necessary to run once I include the condition
if GENERATE_ROUTES_DATA:
    # Generate the results
    generate_routes_raw()
    generate_routes_frequency_aggregation()

In [15]:
# Populate datasets
# Populate with routes 
populate_dataset_with_routes_results(traffic_logs_ip_classified_analysis_df)
populate_dataset_with_routes_results(anycast_pii_traffic_logs_df)
# Populate with metadata
populate_dataset_with_apks_metadata(traffic_logs_ip_classified_analysis_df)
populate_dataset_with_apks_metadata(anycast_pii_traffic_logs_df)
# Populate with the privacy policy extracted data
populate_dataset_with_policy_extracted_info(traffic_logs_ip_classified_analysis_df)
populate_dataset_with_policy_extracted_info(anycast_pii_traffic_logs_df)

In [16]:
# Check conditions
check_it_gdpr_compliance(traffic_logs_ip_classified_analysis_df)
check_it_gdpr_compliance(anycast_pii_traffic_logs_df)

In [17]:
# Generate datasets aggregated
aggregate_analysis(traffic_logs_ip_classified_analysis_df, TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_AGGREGATION_FILEPATH)
aggregate_analysis(anycast_pii_traffic_logs_df, ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_AGGREGATION_FILEPATH)

  aggregation_df.fillna(
  aggregation_df.fillna(


In [18]:
traffic_logs_ip_classified_analysis_df = traffic_logs_ip_classified_analysis_df.fillna(
    value={
        "countries_mentioned_by_policy": "[]",
        "it_mentioned_by_policy": False,
        "adequacy_decision_by_policy": False
    }, inplace=True
)

anycast_pii_traffic_logs_df = anycast_pii_traffic_logs_df.fillna(
    value={
        "countries_mentioned_by_policy": "[]",
        "it_mentioned_by_policy": False,
        "adequacy_decision_by_policy": False
    }, inplace=True
)

  traffic_logs_ip_classified_analysis_df = traffic_logs_ip_classified_analysis_df.fillna(
  anycast_pii_traffic_logs_df = anycast_pii_traffic_logs_df.fillna(


In [19]:
# Save the datasets populated
traffic_logs_ip_classified_analysis_df.to_csv(TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH, sep=",", index=False)
anycast_pii_traffic_logs_df.to_csv(ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH, sep=",", index=False)

AttributeError: 'NoneType' object has no attribute 'to_csv'

## Analysis questions

Answers to the questions needed for the article

Acronyms:
- PII = Personal Identificable Information

In [None]:
# Data load
traffic_logs_ip_classified_analysis_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH, sep=",")
anycast_pii_traffic_logs_analysis_df = pd.read_csv(ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH, sep=",")

**IPs analysis**

In [None]:
ips_total = traffic_logs_ip_classified_analysis_df["ip_dest"].unique().tolist()
print(f"Number of IPs in traffic logs: {len(ips_total)}")

ips_total_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["ip_dest"].unique().tolist()
print(f"Number of IPs with PII: {len(ips_total_pii)}")

ips_anycast = traffic_logs_ip_classified_analysis_df.loc[
    traffic_logs_ip_classified_analysis_df["ip_anycast"]
]["ip_dest"].unique().tolist()
print(f"Number of IPs anycast: {len(ips_anycast)}")

ips_anycast_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["ip_anycast"]) &
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["ip_dest"].unique().tolist()
print(f"Number of IPs anycast with PII: {len(ips_anycast_pii)}")

**APKS analysis**

In [None]:
apks_total = traffic_logs_ip_classified_analysis_df["apk"].unique().tolist()
print(f"Number of APKs in traffic logs: {len(apks_total)}")

apks_total_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["apk"].unique().tolist()
print(f"Number of APKs with PII: {len(apks_total_pii)}")

apks_anycast = traffic_logs_ip_classified_analysis_df.loc[
    traffic_logs_ip_classified_analysis_df["ip_anycast"]
]["apk"].unique().tolist()
print(f"Number of APKs using anycast: {len(apks_anycast)}")

apks_anycast_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["ip_anycast"]) &
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["apk"].unique().tolist()
print(f"Number of APKs anycast with PII: {len(apks_anycast_pii)}")

**Hosts Analysis**

In [None]:
hosts_total = traffic_logs_ip_classified_analysis_df["host"].unique().tolist()
print(f"Number of hosts in traffic logs: {len(hosts_total)}")

hosts_total_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["host"].unique().tolist()
print(f"Number of hosts with PII: {len(hosts_total_pii)}")

hosts_anycast = traffic_logs_ip_classified_analysis_df.loc[
    traffic_logs_ip_classified_analysis_df["ip_anycast"]
]["host"].unique().tolist()
print(f"Number of hosts using anycast: {len(hosts_anycast)}")

hosts_anycast_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["ip_anycast"]) &
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["host"].unique().tolist()
print(f"Number of hosts anycast with PII: {len(hosts_anycast_pii)}")

**Data Types**  

In [None]:
pii_data_types_anycast = anycast_pii_traffic_logs_analysis_df["PII"].unique().tolist()
print(f"Types of PII data treated by anycast IPs:")
print(pii_data_types_anycast)

pii_data_types_anycast_pii_it = anycast_pii_traffic_logs_analysis_df.loc[
    anycast_pii_traffic_logs_analysis_df["outside_EEE"]
]["PII"].unique().tolist()
print(f"Types of PII data treated by anycast IPs that make IT:")
print(pii_data_types_anycast_pii_it)

**GDPR Compliance**

In [None]:
apks_anycast_pii_declare_it = anycast_pii_traffic_logs_analysis_df.loc[
    anycast_pii_traffic_logs_analysis_df["it_mentioned_by_policy"]
]["apk"].unique().tolist()
print(f"Number of APKs that use anycast IPs and treat PII that declare IT in privacy policy: {len(apks_anycast_pii_declare_it)}")

apks_anycast_pii_not_compliance = anycast_pii_traffic_logs_analysis_df.loc[
    anycast_pii_traffic_logs_analysis_df["it_gdpr_compliance"] == False
]["apk"].unique().tolist()
print(f"Number of APKs that use anycast IPs and treat PII that has not compliance: {len(apks_anycast_pii_not_compliance)}")

apks_anycast_pii_not_compliance_declare_it = anycast_pii_traffic_logs_analysis_df.loc[
    (anycast_pii_traffic_logs_analysis_df["it_mentioned_by_policy"] == True) &
    (anycast_pii_traffic_logs_analysis_df["it_gdpr_compliance"] == False)
    ]["apk"].unique().tolist()
print(f"Number of APKs that use anycast IPs and treat PII that has not compliance and declare IT in privacy policy: {len(apks_anycast_pii_not_compliance_declare_it)}")

**Libraries**

In [None]:
print("Libraries")