## Exploring the ASes (autonomous systems) traversed during a series of traceroutes to various websites

In [123]:
import csv
import glob
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import pyasn

from parse_output import parse_output

In [127]:
# Collect the traceroute result CSVs that sit directly inside DATA_DIR.
# NOTE: later cells pick files out of data_files by position, so they depend
# on the order in which glob happens to return the paths.
DATA_DIR = "final-data/"
csv_pattern = f"{DATA_DIR}*.csv"
data_files = glob.glob(csv_pattern)
print(f"Data files: {data_files}")
print(f"Number of data_files: {len(data_files)}")

Data files: ['final-data/google-toronto.csv', 'final-data/home-cambridge-frances.csv', 'final-data/aws-ohio-results-1637264695.csv', 'final-data/google-virginia.csv', 'final-data/home-cleveland-1.csv', 'final-data/home-cleveland-2.csv', 'final-data/aws-virginia-results-1637263092.csv', 'final-data/home-cambridge-victor-results-1637268720.csv']
Number of data_files: 8


In [115]:
# Build the lookup from ASN key (e.g. "AS396982") to a short owner name.
# The map starts with two hand-written entries and is extended from the file.
ASN_TO_OWNER = {"user": "user", "AS396982": "GOOGLE"}
with open('data/ASN.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        # Only the first CSV field is read; its first whitespace-separated
        # token is the key and the remainder is the owner description.
        asn, owner = row[0].split(maxsplit=1)
        # Keep just the first word of the owner description as the label.
        ASN_TO_OWNER[asn] = owner.split(maxsplit=1)[0]

In [135]:
# DEFINE HELPER FUNCTIONS

def was_traceroute_successful(traceroute):
    """Return True if the resolved destination IP shows up among the hop IPs.

    ``traceroute`` is one row of the parsed results (a pandas Series). Every
    entry whose label contains "ip" is collected in row order; the first one
    is treated as the resolved destination and the remaining ones as hop
    addresses (assumes the destination column precedes the hop columns --
    confirm against ``parse_output``).

    A missing (null) destination never counts as a success. Without this
    guard a NaN destination could match a NaN hop through the identity
    shortcut of the ``in`` operator and be reported as reached.

    Raises:
        IndexError: if the row has no label containing "ip".
    """
    ips = list(traceroute.filter(regex=".*ip.*"))
    dest, hop_ips = ips[0], ips[1:]
    if pd.isnull(dest):
        return False
    return dest in hop_ips

def get_destination_df(df, destination):
    """Select the rows of ``df`` whose "dest name" equals ``destination``."""
    matches_destination = df["dest name"] == destination
    return df.loc[matches_destination]

def get_completed_df(df):
    """Keep only the rows that ``was_traceroute_successful`` accepts."""
    success_mask = df.apply(was_traceroute_successful, axis=1)
    return df[success_mask]

def get_asn_path_df(df):
    """Collapse every traceroute row into its sequence of distinct ASN nodes.

    Each output row starts with ``("user", 0)`` and is followed by one
    ``(asn, visit)`` tuple per ASN change seen in the "hop i probe 1 asn"
    fields for hops 1 to 30. Null hops are skipped and a hop repeating the
    previously recorded ASN is ignored. ``visit`` counts how often that ASN
    has already appeared on the path, so a path A -> B -> A -> C yields two
    different nodes for A. Rows are padded with ``None`` up to 31 entries and
    returned as a DataFrame with columns "asn 0" to "asn 30".
    """
    n_nodes = 31
    paths = []
    for _, trace in df.iterrows():
        path = [("user", 0)]
        last_asn = None
        visits = {}
        for hop in range(1, n_nodes):
            hop_asn = trace[f"hop {hop} probe 1 asn"]
            if pd.isnull(hop_asn) or hop_asn == last_asn:
                continue
            # First sighting gets index 0, every return visit the next index.
            visits[hop_asn] = visits.get(hop_asn, -1) + 1
            path.append((hop_asn, visits[hop_asn]))
            last_asn = hop_asn
        path.extend([None] * (n_nodes - len(path)))
        paths.append(path)
    return pd.DataFrame(paths, columns=[f"asn {i}" for i in range(n_nodes)])

def get_site_df(df, destination):
    """Return the ASN-path table for successful traceroutes to ``destination``."""
    site_rows = get_destination_df(df, destination)
    completed_rows = get_completed_df(site_rows)
    return get_asn_path_df(completed_rows)

def get_source_target_pairs_count(asn_path_df):
    """Count how often each adjacent (source, target) node pair occurs.

    Walks the columns "asn 0" to "asn 30" of every row and tallies each pair
    of neighbouring cells in which both cells are non-null. Returns a dict
    mapping ``(source, target)`` to its number of occurrences.
    """
    edge_counts = {}
    for _, path in asn_path_df.iterrows():
        for pos in range(30):
            src = path[f"asn {pos}"]
            dst = path[f"asn {pos+1}"]
            if pd.isnull(src) or pd.isnull(dst):
                continue
            edge = (src, dst)
            edge_counts[edge] = edge_counts.get(edge, 0) + 1
    return edge_counts

def get_unique_asns(asn_path_df):
    """List the distinct non-null nodes found in columns "asn 0" to "asn 30".

    Cells are scanned row by row and each node is reported once, in the
    order of its first appearance.
    """
    node_columns = [f"asn {i}" for i in range(31)]
    flat_nodes = asn_path_df[node_columns].values.ravel()
    present_nodes = flat_nodes[pd.notnull(flat_nodes)]
    return list(pd.unique(present_nodes))

def plot_sankey_diagram(data_file, site="all", labelled=True):
    """Show one Sankey diagram of ASN-level paths per destination site.

    Args:
        data_file: path of the results file handed to ``parse_output``.
        site: ``"all"`` plots every destination that has at least one
            successful traceroute, in the order given by ``value_counts()``;
            any other value is taken as a single destination name.
        labelled: when True, nodes are labelled with the owner found in
            ``ASN_TO_OWNER``; an ASN missing from that map falls back to its
            plain "AS<number>" label instead of raising ``KeyError``. When
            False, every node gets the "AS<number>" label.

    Only successful traceroutes contribute to a diagram. Each figure is
    displayed with ``fig.show()``; nothing is returned.
    """
    df = parse_output(data_file)
    if site == "all":
        sites = list(get_completed_df(df)["dest name"].value_counts().index)
    else:
        sites = [site]

    # Use the final path component for the title so that nested directories
    # or a bare filename work as well as "dir/file.csv".
    filename = os.path.basename(data_file)

    def node_label(node):
        # A node is an (asn, visit) tuple; only the ASN part is displayed.
        asn = node[0]
        if not (asn != "user"):
            return "user"
        as_name = "AS" + str(int(asn))
        if labelled:
            return ASN_TO_OWNER.get(as_name, as_name)
        return as_name

    for s in sites:
        asn_path_df = get_site_df(df, s)
        unique_asns = get_unique_asns(asn_path_df)
        source_target_pairs_count = get_source_target_pairs_count(asn_path_df)

        # Sankey links refer to nodes by their position in the label list.
        node_to_ind = {node: ind for ind, node in enumerate(unique_asns)}
        labels = [node_label(node) for node in unique_asns]

        sources = []
        targets = []
        values = []
        for (source_node, target_node), count in source_target_pairs_count.items():
            sources.append(node_to_ind[source_node])
            targets.append(node_to_ind[target_node])
            values.append(count)

        fig = go.Figure(data=[go.Sankey(node=dict(label=labels), link=dict(source=sources, target=targets, value=values))])

        fig.update_layout(title=f"{filename}, {s} ASN Path", font_size=10)
        fig.show()
    

In [136]:
# GOOGLE-TORONTO.csv: one Sankey diagram per destination with a successful traceroute.
# NOTE: index 0 is this file only for the glob ordering shown in the data-files listing.
plot_sankey_diagram(data_files[0])

In [137]:
# HOME-CAMBRIDGE-FRANCES: one Sankey diagram per destination with a successful traceroute.
# NOTE: index 1 is this file only for the glob ordering shown in the data-files listing.
plot_sankey_diagram(data_files[1])

In [None]:
# AWS-OHIO: one Sankey diagram per destination with a successful traceroute.
# NOTE: index 2 is this file only for the glob ordering shown in the data-files listing.
plot_sankey_diagram(data_files[2])

In [None]:
# GOOGLE-VIRGINIA: one Sankey diagram per destination with a successful traceroute.
# NOTE: index 3 is this file only for the glob ordering shown in the data-files listing.
plot_sankey_diagram(data_files[3])

In [120]:
# HOME-CLEVELAND-1: one Sankey diagram per destination with a successful traceroute.
# NOTE: index 4 is this file only for the glob ordering shown in the data-files listing.
plot_sankey_diagram(data_files[4])

In [138]:
# HOME-CLEVELAND-2: one Sankey diagram per destination with a successful traceroute.
# NOTE: index 5 is this file only for the glob ordering shown in the data-files listing.
plot_sankey_diagram(data_files[5])

In [139]:
# AWS-VIRGINIA: one Sankey diagram per destination with a successful traceroute.
# NOTE: index 6 is this file only for the glob ordering shown in the data-files listing.
plot_sankey_diagram(data_files[6])

In [140]:
# HOME-CAMBRIDGE-VICTOR: one Sankey diagram per destination with a successful traceroute.
# NOTE: index 7 is this file only for the glob ordering shown in the data-files listing.
plot_sankey_diagram(data_files[7])

In [142]:
# Report the (rows, columns) shape of the parsed table for every data file.
for path in data_files:
    print(path, parse_output(path).shape)

final-data/google-toronto.csv (31458, 154)
final-data/home-cambridge-frances.csv (31458, 154)
final-data/aws-ohio-results-1637264695.csv (36860, 154)
final-data/google-virginia.csv (31311, 154)
final-data/home-cleveland-1.csv (4802, 154)
final-data/home-cleveland-2.csv (10192, 154)
final-data/aws-virginia-results-1637263092.csv (33957, 154)
final-data/home-cambridge-victor-results-1637268720.csv (29421, 154)
