# Experiment Mesh Analysis


In [1]:
# external imports
import pandas as pd
from ast import literal_eval
from shapely import (
    from_geojson
)

In [2]:
# internal imports
from src.utils.common_functions import (
    json_file_to_dict,
    get_list_files_in_path,
)
from src.utils.constants import (
    EEE_COUNTRIES_FILEPATH,
    REPLICATION_PACKAGE_DIR,
    TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH,
    ANYCAST_PII_TRAFFIC_LOGS_FILEPATH,
    APKS_METADATA_FILEPATH,
    RESULTS_MODES,
    IT_ANNOTATION_FILEPATH,
    TPLS_RESULTS_FILEPATH,
    ANYCAST_IP_CLASSIFICATION_FILEPATH
)

In [3]:
# Constants
EEE_countries_set = set([country["alpha-2"] for country in json_file_to_dict(EEE_COUNTRIES_FILEPATH)])

In [4]:
# Analysis params
DESTINATION_REPETITIONS_LIMIT = 1
ANALYSIS_MODE=RESULTS_MODES[0]
GENERATE_ROUTES_DATA = False
ANALYZE_DATA = True

In [5]:
# Filepaths variables
EXPERIMENT_RESULTS_FOLDER = f"{REPLICATION_PACKAGE_DIR}/experiment_results_{ANALYSIS_MODE}"
ANALYSIS_FOLDER = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}"

ROUTES_RESULTS_FILENAME = f"{ANALYSIS_FOLDER}/routes_results_{ANALYSIS_MODE}.csv"
ROUTES_FREQUENCY_FILENAME = f"{ANALYSIS_FOLDER}/routes_frequency_{ANALYSIS_MODE}.csv"

ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}/Anycast_PII_Traffic_Logs_{ANALYSIS_MODE}.csv"
TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}/Traffic_logs_10K_ip_classified_{ANALYSIS_MODE}.csv"

ANYCAST_PII_HOST_AGGREGATION_FILEPATH = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}/Anycast_PII_host_aggregation_{ANALYSIS_MODE}.csv"
TRAFFIC_LOGS_IP_CLASSIFIED_HOST_AGGREGATION_FILEPATH = f"{REPLICATION_PACKAGE_DIR}/analysis_{ANALYSIS_MODE}/Traffic_logs_10K_ip_classified_host_aggregation_{ANALYSIS_MODE}.csv"


## Enrichment of data and generation of datasets

Auxiliar just if needed

In [6]:
def populate_dataframe_with_ip_classification(dataframe: pd.DataFrame) -> pd.DataFrame:
    ips_classified = json_file_to_dict(ANYCAST_IP_CLASSIFICATION_FILEPATH)

    for ip_classified in ips_classified.keys():
        dataframe.loc[dataframe["ip_dest"] == ip_classified, "ip_anycast"] = ips_classified[ip_classified]
    
    return dataframe    

Generate a file with all the routes got as the result of the experiment execution

In [7]:
# Auxiliary functions
def get_probe_location(probe_id: int, origin_list: []) -> (float, float):
    for origin in origin_list:
        if probe_id == origin["probe_id"]:
            location = from_geojson(origin["location"])
            return location.y, location.x
        else:
            continue
    return 0, 0

def get_result_country_route(hunter_result: dict) -> dict:
    probe_id = hunter_result["origin_id"]
    result_country = hunter_result["location_result"]["country"]
    probe_country = hunter_result["origin_country_code"]

    return {
        "origin_id": probe_id,
        "origin_country": probe_country,
        "result_country": result_country
    }

# Data generation function
def generate_routes_raw():
    routes_raw_df = pd.DataFrame(
        columns=[
            "target", "probe_id", "ips_previous_to_target",
            "origin_country", "origin_latitude", "origin_longitude", 
            "result_country", "result_latitude", "result_longitude",
            "result_filename", "outside_EEE"
        ]
    )
    for result_filename in get_list_files_in_path(EXPERIMENT_RESULTS_FOLDER):
        print(result_filename)
        result = json_file_to_dict(f"{EXPERIMENT_RESULTS_FOLDER}/{result_filename}")
        target = result["target"]
        origin_list = result["measurements"]["origin"]
        for hunter_result in result["hunter_results"]:
            route = get_result_country_route(hunter_result)
            probe_id = hunter_result["origin_id"]
            origin_country = route["origin_country"]
            origin_latitude, origin_longitude = get_probe_location(probe_id, origin_list)
            
            result_country = route["result_country"]
            if len(hunter_result["location_result"]["airports_intersection"]) == 1:
                result_location = from_geojson(hunter_result["location_result"]["airports_intersection"][0]["location"])
                result_latitude = result_location.y
                result_longitude = result_location.x
            else:
                result_latitude = 0
                result_longitude = 0
                
            outside_eee = (result_country not in EEE_countries_set) and (result_country != "Indeterminate")
            
            if result_country == "Indeterminate":
                ips_previous_to_target = ["Indeterminate"]
            else:
                ips_previous_to_target = [
                    ip["ip"]
                    for ip in hunter_result["ips_previous_to_target"]
                ]
            
            routes_raw_df = pd.concat(
                [pd.DataFrame([[
                    target, probe_id, str(ips_previous_to_target),
                    origin_country, origin_latitude, origin_longitude, 
                    result_country, result_latitude, result_longitude,
                    result_filename, outside_eee
                ]], columns=routes_raw_df.columns), routes_raw_df], 
                ignore_index=True
            )
    # Sort and save
    routes_raw_df.sort_values(by=["target", "origin_country", "result_country"], inplace=True)
    routes_raw_df.to_csv(ROUTES_RESULTS_FILENAME, sep=",", index=False)


Aggregate and count routes repetitions

In [8]:
# Data generation function
def generate_routes_frequency_aggregation():
    # Aggregate routes counting the repetitions
    routes_frequency_df = pd.read_csv(ROUTES_RESULTS_FILENAME, sep=",")
    routes_frequency_df = routes_frequency_df[["target", "origin_country", "result_country"]]
    routes_frequency_df = routes_frequency_df.value_counts(subset=['target', 'origin_country', 'result_country'])
    routes_frequency_df.to_csv(ROUTES_FREQUENCY_FILENAME, sep=",")
    
    # Include the info about outside the EEE
    routes_frequency_df = pd.read_csv(ROUTES_FREQUENCY_FILENAME, sep=",")
    routes_frequency_df["outside_EEE"] = False
    
    routes_frequency_df.loc[
        (routes_frequency_df["result_country"] != "Indeterminate") &
        (~routes_frequency_df["result_country"].isin(EEE_countries_set)),
        ["outside_EEE"]
    ] = True
    routes_frequency_df.to_csv(ROUTES_FREQUENCY_FILENAME, sep=",", index=False)


Introduce routes outside EEE and its count in the complete dataset

In [9]:
# Dataset population function
def populate_dataset_with_routes_results(dataframe: pd.DataFrame) -> pd.DataFrame:
    routes_valid_df = pd.read_csv(ROUTES_FREQUENCY_FILENAME, sep=",")
    routes_valid_df = routes_valid_df.loc[
        (routes_valid_df["outside_EEE"] == True) & 
        (routes_valid_df["count"] >= DESTINATION_REPETITIONS_LIMIT)
    ]

    # Charge default routes in the dataset to populate
    dataframe["origins_transfers_outside_EEE"] = "[]"
    dataframe["destinations_transfers_outside_EEE"] = "[]"
    dataframe["frequency_transfers_outside_EEE"] = "[]"
    dataframe["outside_EEE"] = False
    
    # For every IP get the list of origins, destinations and frequency and save it
    for ip in routes_valid_df["target"].unique().tolist():
        ip_routes = routes_valid_df.loc[routes_valid_df["target"] == ip]
        origins_transfers_outside_eee = ip_routes["origin_country"].values.tolist()
        destinations_transfers_outside_eee = ip_routes["result_country"].values.tolist()
        frequency_transfers_outside_eee = ip_routes["count"].values.tolist()

        dataframe.loc[
            (dataframe["ip_dest"] == ip), 
            ["origins_transfers_outside_EEE", 
             "destinations_transfers_outside_EEE", 
             "frequency_transfers_outside_EEE",
             "outside_EEE"]
        ] = [str(origins_transfers_outside_eee),
             str(destinations_transfers_outside_eee),
             str(frequency_transfers_outside_eee),
             True]
    
    return dataframe


Populate the datasets with metadata info

In [10]:
def populate_dataset_with_apks_metadata(dataframe: pd.DataFrame) -> pd.DataFrame:
    apk_metadata_df = pd.read_csv(APKS_METADATA_FILEPATH, sep=",")
    
    return dataframe.merge(
        apk_metadata_df,
        on=["apk", "version"],
        how="left"
    )


Populate the datasets with the info extrated from every policy

In [11]:
def populate_dataset_with_policy_extracted_info(dataframe: pd.DataFrame) -> pd.DataFrame:
    it_annotation_results_df = pd.read_csv(IT_ANNOTATION_FILEPATH, sep=",")
    
    it_annotation_results_df.drop_duplicates(["apk", "countries"], inplace=True)
    it_annotation_results_df.rename(
        columns={
            "transfer": "it_mentioned_by_policy",
            "adequacy_decision": "adequacy_decision_by_policy",
            "countries": "countries_mentioned_by_policy"
        },
        inplace=True
    )
    
    dataframe = pd.merge(
        dataframe,
        it_annotation_results_df[[
            "apk", "version", 
            "it_mentioned_by_policy", "adequacy_decision_by_policy", "countries_mentioned_by_policy"
        ]], 
        on=["apk", "version"],
        how="left"
    )

    dataframe.fillna(
        value={
            "it_mentioned_by_policy": False,
            "adequacy_decision_by_policy": False,
            "countries_mentioned_by_policy": "[]"
        },
        inplace=True
    )    
    return dataframe
    

Populate the datasets with the info about the libraries which carried out the communication 

In [12]:
def populate_dataset_with_libraries_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    tpls_results_df = pd.read_csv(TPLS_RESULTS_FILEPATH, sep=",")
    tpls_results_df.fillna(
        {
            "TP-performed": False,
            "TP-library": "None",
            "FP-intended": False,
        }, inplace=True
    )
    
    dataframe.drop("stackTrace", axis=1, inplace=True)
    
    dataframe = pd.merge(
        dataframe,
        tpls_results_df[[
            "apk", "stackTrace", "version", "port_source", "host", "port_dest", "ip_dest",
            "TP-performed", "TP-library", "FP-intended"
        ]],
        on=["apk", "version", "port_source", "host", "port_dest", "ip_dest"],
        how="left"
    )

    return dataframe

Check GDPR compliance in terms of international transfers

In [13]:
def check_apk_it_gdpr_compliance(dataframe: pd.DataFrame) -> pd.DataFrame:
    dataframe["apk_it_gdpr_compliance"] = True
    
    dataframe["apk_it_gdpr_compliance"] = dataframe.apply(
        lambda row_to_check: 
        set(literal_eval(row_to_check["destinations_transfers_outside_EEE"])).issubset(
            set(literal_eval(row_to_check["countries_mentioned_by_policy"]))
        ),
        axis=1
    )
    
    return dataframe
        

Aggregation of apk and destination IP by domain or host

In [14]:
def aggregate_by_domain(dataframe: pd.DataFrame, filepath: str):
    group_df = dataframe[["host", "ip_dest", "apk"]]
    group_df = group_df.groupby(["host", "ip_dest", "apk"]).size().reset_index(name="traffic_logs")

    group_df.sort_values(by=["host", "apk", "ip_dest"], inplace=True)
    
    group_df.to_csv(filepath, sep=",", index=False)

Execute the enrichment of data and generation

In [15]:
# Because is a long process and is only necessary to run once I include the condition
if GENERATE_ROUTES_DATA:
    # Generate the results
    generate_routes_raw()
    generate_routes_frequency_aggregation()

Make the analysis

In [16]:
# Charge the dataframes to be used
traffic_logs_ip_classified_analysis_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_FILEPATH, sep=",")
anycast_pii_traffic_logs_df = pd.read_csv(ANYCAST_PII_TRAFFIC_LOGS_FILEPATH, sep=",")
datasets = [traffic_logs_ip_classified_analysis_df, anycast_pii_traffic_logs_df]
analysis_filepaths = [TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH, ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH]
aggregation_filepaths = [TRAFFIC_LOGS_IP_CLASSIFIED_HOST_AGGREGATION_FILEPATH, ANYCAST_PII_HOST_AGGREGATION_FILEPATH]

In [17]:
# Populate datasets
if ANALYZE_DATA:
    for index in range(0, len(datasets)):
        dataset_to_improve = datasets[index]
        
        # Populate with metadata
        print("Populate with metadata")
        dataset_to_improve = populate_dataset_with_apks_metadata(dataset_to_improve)
        # Populate with the privacy policy extracted data
        print("Populate with the privacy policy extracted data")
        dataset_to_improve = populate_dataset_with_policy_extracted_info(dataset_to_improve)
        # Populate with routes
        print("Populate with routes")
        dataset_to_improve = populate_dataset_with_routes_results(dataset_to_improve)
        # Populate with libraries data
        print("Populate with the libraries data")
        dataset_to_improve = populate_dataset_with_libraries_data(dataset_to_improve)
    
        # Check conditions
        print("Checking IT declarations accomplishment")
        dataset_to_improve = check_apk_it_gdpr_compliance(dataset_to_improve)
    
        print("Setting types and NaN")
        dataset_to_improve.fillna(
            {
                "loadsJNI": False,
                "stackTrace": "None",
                "remote_host": "None",
                "tls": False,
                "https": False,
                "error": "None",
                "TP-performed": False,
                "TP-library": "None",
                "FP-intended": False,
            }, inplace=True
        )
        
        print("Saving data")
        dataset_to_improve.to_csv(analysis_filepaths[index], sep=",", index=False)
        
        # Generate the aggregations for better understanding
        aggregate_by_domain(dataframe=dataset_to_improve, filepath=aggregation_filepaths[index])
    

Populate with metadata
Populate with the privacy policy extracted data


  dataframe.fillna(


Populate with routes
Populate with the libraries data


  tpls_results_df.fillna(


Checking IT declarations accomplishment
Setting types and NaN


  dataset_to_improve.fillna(


Saving data
Populate with metadata
Populate with the privacy policy extracted data


  dataframe.fillna(


Populate with routes
Populate with the libraries data


  tpls_results_df.fillna(


Checking IT declarations accomplishment
Setting types and NaN
Saving data


  dataset_to_improve.fillna(


## Analysis questions

Answers to the questions needed for the article

Acronyms:
- PII = Personal Identificable Information

In [18]:
# Data load
traffic_logs_ip_classified_analysis_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH, sep=",")
anycast_pii_traffic_logs_analysis_df = pd.read_csv(ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH, sep=",")

  traffic_logs_ip_classified_analysis_df = pd.read_csv(TRAFFIC_LOGS_IP_CLASSIFIED_ANALYSIS_FILEPATH, sep=",")
  anycast_pii_traffic_logs_analysis_df = pd.read_csv(ANYCAST_PII_TRAFFIC_LOGS_ANALYSIS_FILEPATH, sep=",")


Traffic numbers

In [19]:
all_connections = len(traffic_logs_ip_classified_analysis_df.index)
print(f"Number of connections intercepted: {all_connections}")

anycast_connections = len(anycast_pii_traffic_logs_analysis_df.index)
print(f"Number of anycast connections intercepted: {anycast_connections}")

Number of connections intercepted: 4278823
Number of anycast connections intercepted: 195786


**IPs analysis**

In [20]:
ips_total = traffic_logs_ip_classified_analysis_df["ip_dest"].unique().tolist()
print(f"Number of IPs in traffic logs: {len(ips_total)}")

ips_total_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["ip_dest"].unique().tolist()
print(f"Number of IPs with PII: {len(ips_total_pii)}")

ips_anycast = traffic_logs_ip_classified_analysis_df.loc[
    traffic_logs_ip_classified_analysis_df["ip_anycast"]
]["ip_dest"].unique().tolist()
print(f"Number of IPs anycast: {len(ips_anycast)}")

ips_anycast_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["ip_anycast"]) &
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["ip_dest"].unique().tolist()
print(f"Number of IPs anycast with PII: {len(ips_anycast_pii)}")

Number of IPs in traffic logs: 5647
Number of IPs with PII: 1807
Number of IPs anycast: 991
Number of IPs anycast with PII: 200


**APKS analysis**

In [21]:
apks_total = traffic_logs_ip_classified_analysis_df["apk"].unique().tolist()
print(f"Number of APKs in traffic logs: {len(apks_total)}")

apks_total_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["apk"].unique().tolist()
print(f"Number of APKs with PII: {len(apks_total_pii)}")

apks_anycast = traffic_logs_ip_classified_analysis_df.loc[
    traffic_logs_ip_classified_analysis_df["ip_anycast"]
]["apk"].unique().tolist()
print(f"Number of APKs using anycast: {len(apks_anycast)}")

apks_anycast_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["ip_anycast"]) &
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["apk"].unique().tolist()
print(f"Number of APKs anycast with PII: {len(apks_anycast_pii)}")

Number of APKs in traffic logs: 5759
Number of APKs with PII: 3478
Number of APKs using anycast: 1669
Number of APKs anycast with PII: 960


**Hosts Analysis**

In [22]:
hosts_total = traffic_logs_ip_classified_analysis_df["host"].unique().tolist()
print(f"Number of hosts in traffic logs: {len(hosts_total)}")

hosts_total_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["host"].unique().tolist()
print(f"Number of hosts with PII: {len(hosts_total_pii)}")

hosts_anycast = traffic_logs_ip_classified_analysis_df.loc[
    traffic_logs_ip_classified_analysis_df["ip_anycast"]
]["host"].unique().tolist()
print(f"Number of hosts using anycast: {len(hosts_anycast)}")

hosts_anycast_pii = traffic_logs_ip_classified_analysis_df.loc[
    (traffic_logs_ip_classified_analysis_df["ip_anycast"]) &
    (traffic_logs_ip_classified_analysis_df["PII"] != "No-PII") &
    (traffic_logs_ip_classified_analysis_df["PII"].notna())
    ]["host"].unique().tolist()
print(f"Number of hosts anycast with PII: {len(hosts_anycast_pii)}")

Number of hosts in traffic logs: 4738
Number of hosts with PII: 966
Number of hosts using anycast: 995
Number of hosts anycast with PII: 201


**Data Types**  

In [23]:
pii_data_types_anycast = anycast_pii_traffic_logs_analysis_df["PII"].unique().tolist()
print(f"Types of PII data treated by anycast IPs:")
print(pii_data_types_anycast)

pii_data_types_anycast_pii_it = anycast_pii_traffic_logs_analysis_df.loc[
    anycast_pii_traffic_logs_analysis_df["outside_EEE"]
]["PII"].unique().tolist()
print(f"Types of PII data treated by anycast IPs that make IT:")
print(pii_data_types_anycast_pii_it)

Types of PII data treated by anycast IPs:
['Device_Model', 'Google_Ad_ID', 'Build_No', 'Kernel_version', 'Fingerprint', 'Router_Wifi_BSSID', 'Router_Wifi_BSSID_Close', 'Router_Wifi_MAC', 'Device_location', 'Device_location_coarse']
Types of PII data treated by anycast IPs that make IT:
['Device_Model', 'Google_Ad_ID', 'Build_No', 'Kernel_version', 'Fingerprint', 'Router_Wifi_BSSID', 'Router_Wifi_BSSID_Close', 'Router_Wifi_MAC', 'Device_location', 'Device_location_coarse']


**GDPR Compliance**

In [24]:
apks_anycast_pii_declare_it = anycast_pii_traffic_logs_analysis_df.loc[
    anycast_pii_traffic_logs_analysis_df["it_mentioned_by_policy"]
]["apk"].unique().tolist()
print(f"Number of APKs that use anycast IPs and treat PII that declare IT in privacy policy: {len(apks_anycast_pii_declare_it)}")

apks_anycast_pii_not_compliance = anycast_pii_traffic_logs_analysis_df.loc[
    anycast_pii_traffic_logs_analysis_df["apk_it_gdpr_compliance"] == False
]["apk"].unique().tolist()
print(f"Number of APKs that use anycast IPs and treat PII that has not compliance: {len(apks_anycast_pii_not_compliance)}")

apks_anycast_pii_not_compliance_declare_it = anycast_pii_traffic_logs_analysis_df.loc[
    (anycast_pii_traffic_logs_analysis_df["it_mentioned_by_policy"] == True) &
    (anycast_pii_traffic_logs_analysis_df["apk_it_gdpr_compliance"] == False)
    ]["apk"].unique().tolist()
print(f"Number of APKs that use anycast IPs and treat PII that has not compliance and declare IT in privacy policy: {len(apks_anycast_pii_not_compliance_declare_it)}")


Number of APKs that use anycast IPs and treat PII that declare IT in privacy policy: 196
Number of APKs that use anycast IPs and treat PII that has not compliance: 947
Number of APKs that use anycast IPs and treat PII that has not compliance and declare IT in privacy policy: 189


**TLPs**

In [25]:
anycast_pii_connections = len(anycast_pii_traffic_logs_analysis_df.index)
print(f"Number of connections Anycast+PII: {anycast_pii_connections}")

anycast_pii_it_connections = len(
    anycast_pii_traffic_logs_analysis_df.loc[
        (anycast_pii_traffic_logs_analysis_df["outside_EEE"] == True)
    ].index)
print(f"Number of connections Anycast+PII+IT: {anycast_pii_it_connections}")

anycast_pii_it_connections_by_app = len(
    anycast_pii_traffic_logs_analysis_df.loc[
        (anycast_pii_traffic_logs_analysis_df["outside_EEE"] == True) &
        (anycast_pii_traffic_logs_analysis_df["TP-performed"] == False) &
        (anycast_pii_traffic_logs_analysis_df["FP-intended"] == True)
    ].index)
print(f"Number of connections Anycast+PII+IT set up by APP: {anycast_pii_it_connections_by_app}")

anycast_pii_it_connections_unknown = len(
    anycast_pii_traffic_logs_analysis_df.loc[
        (anycast_pii_traffic_logs_analysis_df["outside_EEE"] == True) &
        (anycast_pii_traffic_logs_analysis_df["TP-performed"] == False) &
        (anycast_pii_traffic_logs_analysis_df["TP-library"] == "None") &
        (anycast_pii_traffic_logs_analysis_df["FP-intended"] == False)
        ].index)
print(f"Number of connections Anycast+PII+IT unknow: {anycast_pii_it_connections_unknown}")

anycast_pii_it_connections_by_tpl = len(
    anycast_pii_traffic_logs_analysis_df.loc[
        (anycast_pii_traffic_logs_analysis_df["outside_EEE"] == True) &
        (anycast_pii_traffic_logs_analysis_df["TP-performed"] == True)
        ].index)
print(f"Number of connections Anycast+PII+IT set up by TPL: {anycast_pii_it_connections_by_tpl}")

anycast_pii_it_connections_by_tpl_fp_intended = len(
    anycast_pii_traffic_logs_analysis_df.loc[
        (anycast_pii_traffic_logs_analysis_df["outside_EEE"] == True) &
        (anycast_pii_traffic_logs_analysis_df["TP-performed"] == True) &
        (anycast_pii_traffic_logs_analysis_df["FP-intended"] == True)
        ].index)
print(f"Number of connections Anycast+PII+IT set up by TPL but performed from FP: {anycast_pii_it_connections_by_tpl_fp_intended}")


Number of connections Anycast+PII: 195786
Number of connections Anycast+PII+IT: 195432
Number of connections Anycast+PII+IT set up by APP: 89103
Number of connections Anycast+PII+IT unknow: 0
Number of connections Anycast+PII+IT set up by TPL: 83833
Number of connections Anycast+PII+IT set up by TPL but performed from FP: 77873


In [26]:
tpls_anycast_pii_ip = anycast_pii_traffic_logs_analysis_df.loc[
    (anycast_pii_traffic_logs_analysis_df["outside_EEE"] == True) &
    (anycast_pii_traffic_logs_analysis_df["TP-library"] != "None") &
    (~anycast_pii_traffic_logs_analysis_df["TP-library"].isnull())
]["TP-library"].unique().tolist()

print("List of TPLs Anycast+PII+IT")
print(tpls_anycast_pii_ip)
print(f"Number of TPLs Anycast+PII+IT: {len(tpls_anycast_pii_ip)}")

List of TPLs Anycast+PII+IT
['com.google', 'com.unity3d', 'com.onesignal', 'com.mob', 'com.bugsnag', 'com.newrelic.agent.android', 'com.android.volley', 'com.adjust.sdk', 'io.bidmachine', 'com.appodeal.ads', 'com.applovin', 'com.mixpanel.android', 'com.google.android.gms', 'com.kochava.core', 'com.adcolony', 'com.safedk', 'com.chartboost', 'com.mopub.volley', 'io.sentry', 'com.urbanairship', 'com.emarsys', 'com.leanplum', 'com.kochava', 'com.fyber', 'io.grpc', 'com.dynatrace']
Number of TPLs Anycast+PII+IT: 26


# STOP

In [27]:
downloads = anycast_pii_traffic_logs_analysis_df.loc[
    (anycast_pii_traffic_logs_analysis_df["apk_it_gdpr_compliance"] == False) &
    (anycast_pii_traffic_logs_analysis_df["android_numDownloads"] == "1B+")
]["apk"].unique().tolist()
print(downloads)

apk = anycast_pii_traffic_logs_analysis_df.loc[
    (anycast_pii_traffic_logs_analysis_df["apk"] == "com.fingersoft.hillclimb")
]
apk

['com.fingersoft.hillclimb']


Unnamed: 0,levelname,asctime,message,apk,version,phase,loadsJNI,local_port,remote_host,container,...,countries_mentioned_by_policy,origins_transfers_outside_EEE,destinations_transfers_outside_EEE,frequency_transfers_outside_EEE,outside_EEE,stackTrace,TP-performed,TP-library,FP-intended,apk_it_gdpr_compliance
158968,DEBUG,2023-12-28 21:19:39.701,Connection intercepted,com.fingersoft.hillclimb,610,idle,False,,,new_traffic,...,[],"['BE', 'IS', 'FR', 'FR']","['GB', 'GB', 'GB', 'CH']","[5, 3, 1, 1]",True,,False,,False,False
158969,DEBUG,2023-12-28 21:19:39.701,Connection intercepted,com.fingersoft.hillclimb,610,idle,False,,,new_traffic,...,[],"['BE', 'IS', 'FR', 'FR']","['GB', 'GB', 'GB', 'CH']","[5, 3, 1, 1]",True,,False,,False,False
158970,DEBUG,2023-12-28 21:19:39.701,Connection intercepted,com.fingersoft.hillclimb,610,idle,False,,,new_traffic,...,[],"['BE', 'IS', 'FR', 'FR']","['GB', 'GB', 'GB', 'CH']","[5, 3, 1, 1]",True,,False,,False,False
158971,DEBUG,2023-12-28 21:19:39.701,Connection intercepted,com.fingersoft.hillclimb,610,idle,False,,,new_traffic,...,[],"['BE', 'IS', 'FR', 'FR']","['GB', 'GB', 'GB', 'CH']","[5, 3, 1, 1]",True,,False,,False,False
158972,DEBUG,2023-12-28 21:19:39.701,Connection intercepted,com.fingersoft.hillclimb,610,idle,False,,,new_traffic,...,[],"['BE', 'IS', 'FR', 'FR']","['GB', 'GB', 'GB', 'CH']","[5, 3, 1, 1]",True,,False,,False,False
158973,DEBUG,2023-12-28 21:19:39.701,Connection intercepted,com.fingersoft.hillclimb,610,idle,False,,,new_traffic,...,[],"['BE', 'IS', 'FR', 'FR']","['GB', 'GB', 'GB', 'CH']","[5, 3, 1, 1]",True,,False,,False,False
158974,DEBUG,2023-12-28 21:19:39.701,Connection intercepted,com.fingersoft.hillclimb,610,idle,False,,,new_traffic,...,[],"['BE', 'IS', 'FR', 'FR']","['GB', 'GB', 'GB', 'CH']","[5, 3, 1, 1]",True,,False,,False,False
158975,DEBUG,2023-12-28 21:19:40.560,Connection intercepted,com.fingersoft.hillclimb,610,idle,False,,,new_traffic,...,[],"['BE', 'IS', 'FR', 'FR']","['GB', 'GB', 'GB', 'CH']","[5, 3, 1, 1]",True,,False,,False,False
158976,DEBUG,2023-12-28 21:19:40.560,Connection intercepted,com.fingersoft.hillclimb,610,idle,False,,,new_traffic,...,[],"['BE', 'IS', 'FR', 'FR']","['GB', 'GB', 'GB', 'CH']","[5, 3, 1, 1]",True,,False,,False,False
158977,DEBUG,2023-12-28 21:19:40.560,Connection intercepted,com.fingersoft.hillclimb,610,idle,False,,,new_traffic,...,[],"['BE', 'IS', 'FR', 'FR']","['GB', 'GB', 'GB', 'CH']","[5, 3, 1, 1]",True,,False,,False,False


In [28]:
apk.to_csv(f"{REPLICATION_PACKAGE_DIR}/one__result.csv", sep=",", index=False)

In [31]:
tpl = anycast_pii_traffic_logs_analysis_df.loc[
    anycast_pii_traffic_logs_analysis_df["TP-library"] == "com.mob"
]
tpl.to_csv(f"{REPLICATION_PACKAGE_DIR}/one_tpl_result.csv", sep=",", index=False)