## Exploring the autonomous systems (ASes) traversed during a series of traceroutes to various websites

In [30]:
from parse_output import parse_output
import pandas as pd
import pyasn
import numpy as np
import plotly.graph_objects as go

In [2]:
# Traceroute results file, given relative to the notebook's working directory.
data_path = "data/home-victor-results-1637268720-incomplete.csv"
# parse_output is a project-local helper; the frame it returns is the `df` used by
# every later cell. NOTE(review): presumably one row per traceroute - confirm
# against parse_output.
df = parse_output(data_path)
# Preview the first rows of the parsed frame.
df.head()

Unnamed: 0,dest name,dest ip,num hops,hop 1 probe 1 name,hop 1 probe 1 ip,hop 1 probe 1 asn,hop 1 probe 1 rtt,hop 1 probe 1 annotation,hop 2 probe 1 name,hop 2 probe 1 ip,...,hop 29 probe 1 name,hop 29 probe 1 ip,hop 29 probe 1 asn,hop 29 probe 1 rtt,hop 29 probe 1 annotation,hop 30 probe 1 name,hop 30 probe 1 ip,hop 30 probe 1 asn,hop 30 probe 1 rtt,hop 30 probe 1 annotation
0,google.com,142.250.80.46,14,DESKTOP-BBCCJU7.mshome.net,172.26.208.1,,0.298,,Linksys02715.hsd1.ma.comcast.net,192.168.1.1,...,,,,,,,,,,
1,youtube.com,142.251.35.174,14,DESKTOP-BBCCJU7.mshome.net,172.26.208.1,,0.162,,Linksys02715.hsd1.ma.comcast.net,192.168.1.1,...,,,,,,,,,,
2,amazon.com,205.251.242.103,30,DESKTOP-BBCCJU7.mshome.net,172.26.208.1,,0.256,,Linksys02715.hsd1.ma.comcast.net,192.168.1.1,...,,,,,,,,,,
3,yahoo.com,74.6.143.25,16,DESKTOP-BBCCJU7.mshome.net,172.26.208.1,,0.156,,Linksys02715.hsd1.ma.comcast.net,192.168.1.1,...,,,,,,,,,,
4,facebook.com,157.240.220.35,10,DESKTOP-BBCCJU7.mshome.net,172.26.208.1,,0.279,,Linksys02715.hsd1.ma.comcast.net,192.168.1.1,...,,,,,,,,,,


In [3]:
# (rows, columns) of the parsed frame.
df.shape

(9665, 153)

In [96]:
def was_traceroute_successful(traceroute):
    """Return True if the destination IP appears among the hop IPs of a traceroute.

    Parameters
    ----------
    traceroute : pandas.Series
        One traceroute row. Every label matching the regex ``.*ip.*`` is treated
        as an IP field. The destination is read from the ``"dest ip"`` label when
        it is present; otherwise the first matching field is used as the
        destination and the remaining ones as hops.

    Returns
    -------
    bool
        True when the destination IP equals at least one hop IP. False when the
        row has no IP fields, when the destination IP is missing, or when no hop
        matches it.
    """
    ip_fields = traceroute.filter(regex=".*ip.*")
    if ip_fields.empty:
        return False

    if "dest ip" in ip_fields.index:
        # Look the destination up by label so the result does not depend on
        # the column order of the incoming row.
        dest = ip_fields["dest ip"]
        hop_ips = ip_fields.drop(labels="dest ip")
    else:
        dest = ip_fields.iloc[0]
        hop_ips = ip_fields.iloc[1:]

    # A missing destination must never count as reached: NaN placeholders in an
    # object row can be the very same object, and `in` matches by identity
    # before it falls back to equality.
    if pd.isnull(dest):
        return False
    return dest in hop_ips.tolist()

def get_destination_df(df, destination):
    """Select the rows of `df` whose "dest name" column equals `destination`."""
    is_requested_site = df["dest name"] == destination
    return df[is_requested_site]

def get_completed_df(df):
    """Keep only the rows for which was_traceroute_successful returns True."""
    reached_destination = df.apply(was_traceroute_successful, axis=1)
    return df[reached_destination]

def get_asn_path_df(df, max_hops=30):
    """Collapse each traceroute row into its ordered path of autonomous systems.

    For every row the ``"hop {i} probe 1 asn"`` columns are read for hops
    ``1..max_hops``. Hops with a missing ASN are skipped, and consecutive hops
    with the same ASN are merged into a single step. Each step is stored as a
    tuple ``(asn, revisit_index)``, where ``revisit_index`` is 0 the first time
    the ASN appears in the path and grows by one on every later re-entry, so a
    path A -> B -> A -> C keeps the two visits to A apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Traceroute rows containing the ``"hop {i} probe 1 asn"`` columns for
        every hop from 1 to ``max_hops``.
    max_hops : int, default 30
        Number of hop columns to read, and number of ``"asn {i}"`` columns in
        the result. The other helpers in this notebook read ``"asn 1"`` to
        ``"asn 30"``, so keep the default when feeding them.

    Returns
    -------
    pandas.DataFrame
        One row per input row (with a fresh default index) and columns
        ``"asn 1"`` .. ``"asn {max_hops}"``. Paths shorter than ``max_hops`` are
        right-padded with ``None``.
    """
    cols = [f"asn {i}" for i in range(1, max_hops + 1)]
    hop_cols = [f"hop {i} probe 1 asn" for i in range(1, max_hops + 1)]
    rows = []
    for _, row in df.iterrows():
        path = []
        prev_asn = None
        # Count re-entries per ASN to track paths such as A -> B -> A -> C.
        visits = {}
        for hop_col in hop_cols:
            asn = row[hop_col]
            if not pd.isnull(asn) and asn != prev_asn:
                visits[asn] = visits.get(asn, -1) + 1
                path.append((asn, visits[asn]))
                # Only updated on a recorded step, so a gap of missing hops
                # inside one AS does not split it into two steps.
                prev_asn = asn
        # Pad to a fixed width so every row fits the same columns.
        rows.append(path + [None] * (max_hops - len(path)))
    return pd.DataFrame(rows, columns=cols)

def get_site_df(df, destination):
    """Build the ASN-path frame for the completed traceroutes to `destination`.

    Chains the three helpers in order: select the rows for `destination`, keep
    the completed ones, then convert them with get_asn_path_df.
    """
    site_rows = get_destination_df(df, destination)
    completed_rows = get_completed_df(site_rows)
    return get_asn_path_df(completed_rows)

def get_source_target_pairs_count(asn_path_df):
    pairs_count = {}
    for ind, row in asn_path_df.iterrows():
        for i in range(1, 30):
            source_asn, target_asn = row[f"asn {i}"], row[f"asn {i+1}"]
            if not pd.isnull(source_asn) and not pd.isnull(target_asn):
                pair = (source_asn, target_asn)
                if pair not in pairs_count:
                    pairs_count[pair] = 1
                else:
                    pairs_count[pair] += 1
    return pairs_count

def get_unique_asns(asn_path_df):
    """List the distinct non-null cells found in the "asn 1" .. "asn 30" columns.

    The cells are scanned row by row, and the result keeps the order in which
    each value is first seen.
    """
    hop_columns = [f"asn {hop}" for hop in range(1, 31)]
    flattened = asn_path_df[hop_columns].values.ravel()
    present = flattened[pd.notnull(flattened)]
    return list(pd.unique(present))

def plot_sankey_diagram(site):
    """Render a Sankey diagram of the AS-level paths of traceroutes to `site`.

    Reads the notebook-level `df`, builds the ASN-path frame for `site` with
    get_site_df, and draws one node per distinct (asn, revisit_index) step and
    one link per consecutive pair of steps, weighted by how often that pair
    occurs. The figure is shown with `fig.show()`; nothing is returned.
    """
    asn_path_df = get_site_df(df, site)
    nodes = get_unique_asns(asn_path_df)
    link_counts = get_source_target_pairs_count(asn_path_df)

    # Nodes are labelled with the raw (asn, revisit_index) tuples. To show
    # readable names instead, define an ASN -> label mapping, e.g. for google.com:
    # asn_dict = {
    #     7922: "Comcast (AS7922)",
    #     33657: "Comcast (AS33657)",
    #     33491: "Comcast (AS33491)",
    #     15169: "Google AS(15169)"
    # }
    labels = nodes
    # Sankey links refer to nodes by their position in the label list.
    node_index = {node: position for position, node in enumerate(labels)}

    # Relabel only after node_index is built, since it is keyed by the tuples:
    # labels = [asn_dict[node[0]] for node in nodes]

    sources = [node_index[source] for source, _ in link_counts]
    targets = [node_index[target] for _, target in link_counts]
    values = list(link_counts.values())

    sankey = go.Sankey(
        node=dict(label=labels),
        link=dict(source=sources, target=targets, value=values),
    )
    fig = go.Figure(data=[sankey])

    fig.update_layout(title=f"{site} asn", font_size=10)
    fig.show()
    

In [97]:
# Count the completed traceroutes per destination. value_counts() orders the
# result from most to fewest occurrences.
# NOTE: despite its name, `filtered_df` is a Series of counts, not a DataFrame.
filtered_df = get_completed_df(df)["dest name"].value_counts()
# The index of that Series holds the destination names.
sites = list(filtered_df.index)
print(sites)

['google.com', 'yahoo.com', 'facebook.com', 'force.com', 'myshopify.com', 'youtube.com', 'dropbox.com', 'craigslist.com', 'canva.com', 'apple.com', 'adobe.com', 'salesforce.com', 'twitter.com', 'instagram.com', 'chaturbate.com', 'homedepot.com', 'zillow.com', 'espn.com', 'tiktok.com']


In [98]:
# Draw the AS-path Sankey diagram for one destination that has completed traceroutes.
plot_sankey_diagram("google.com")