In [None]:
import collections
import copy
import csv
import itertools
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import scipy.stats

import linear_geodesic_optimization.data.input_network as input_network
import linear_geodesic_optimization.plot as plot

# Data Input

We start by reading in data from RIPE. This data has been pre-compiled by pulling `ping` measurements over the course of one week, from midnight 2024-05-01 until midnight 2024-05-08. The ping measurements are aggregated over each consecutive hour, giving us 168 sets of measurements (7 days × 24 hours).

In [None]:
# Locations of the pre-compiled probe metadata and latency files
data_directory = os.path.join('..', 'data', 'animation_US')
probes_file_path = os.path.join(data_directory, 'probes.csv')
latencies_directory = os.path.join(data_directory, 'latencies')
clustering_distance = None

# Sorting the file names fixes the order of the per-file snapshots
latencies_filenames = sorted(os.listdir(latencies_directory))
latencies_file_paths = [
    os.path.join(latencies_directory, latencies_filename)
    for latencies_filename in latencies_filenames
]

# Build one NetworkX graph and one latency lookup per latencies file.
# `gcls` is shared across all files; later files overwrite earlier
# entries for the same edge.
graphs = []
gcls = {}
latencies = []
for latencies_file_path in latencies_file_paths:
    graph = input_network.get_graph_from_paths(
        probes_file_path, latencies_file_path,
        clustering_distance=clustering_distance,
        should_compute_curvatures=False
    )
    graphs.append(graph)

    # Store both orientations of every edge so that lookups succeed for
    # (u, v) as well as (v, u)
    latency_dict = {}
    for u, v, d in graph.edges(data=True):
        gcl = float(d['gcl'])
        rtt = float(d['rtt'])
        gcls[(u, v)] = gcls[(v, u)] = gcl
        latency_dict[(u, v)] = latency_dict[(v, u)] = rtt
    latencies.append(latency_dict)

# Mapping from probes to their cluster, in case we enable clustering.
# Nodes without an 'elements' attribute map to themselves.
probes_mapping = {}
for v, d in graphs[0].nodes(data=True):
    members = d['elements'] if 'elements' in d else [v]
    for u in members:
        probes_mapping[u] = v

We additionally create a combined graph by aggregating across the entire week. This will be useful in later analyses.

In [None]:
# Keep the smallest RTT observed for each directed edge over all snapshots
minimum_rtts = {}
for latency_dict in latencies:
    for edge, latency in latency_dict.items():
        minimum_rtts[edge] = min(minimum_rtts.get(edge, latency), latency)

# Re-shape the aggregated data into the record format passed to
# input_network.get_graph
latency_records = []
for (source_id, target_id), rtt in minimum_rtts.items():
    latency_records.append({
        'source_id': source_id,
        'target_id': target_id,
        'rtt': rtt,
    })
probes = []
for v, d in graphs[0].nodes(data=True):
    probes.append({
        'id': v,
        'latitude': d['lat'],
        'longitude': d['long'],
        'city': d['city'],
        'country': d['country'],
    })

# Cluster this one for more analysis later
graph_combined = input_network.get_graph(
    probes, latency_records,
    clustering_distance=clustering_distance,
    should_compute_curvatures=False
)

# Read the edges back out of the combined graph, again storing both
# orientations of every edge
edges_combined = set()
latencies_combined = {}
for u, v, d in graph_combined.edges(data=True):
    rtt = float(d['rtt'])
    for edge in ((u, v), (v, u)):
        edges_combined.add(edge)
        latencies_combined[edge] = rtt

# Threshold Crossing Stability

Ideally, we would have some way to determine whether a link's latency crosses a given threshold (say, $\epsilon = 10\;\text{ms}$). To avoid undesired oscillatory behavior on a small time scale caused by random noise, we need some way of smoothing the measurements. Let's start by looking at a single edge in particular. The edge between LA and San Jose is a decent candidate.

In [None]:
source_id = probes_mapping['6492']  # LA
target_id = probes_mapping['6411']  # San Jose
edge_key = (source_id, target_id)

# List every snapshot that lacks this edge before the assertion fails,
# so the offending indices are visible
for i, element in enumerate(latencies):
    if edge_key not in element:
        print(i)
assert all(edge_key in element for element in latencies)

# Time series of the edge's RTT minus its 'gcl' value
z = np.array([element[edge_key] for element in latencies]) - gcls[edge_key]
plt.plot(range(len(z)), z, 'b-')
# plt.plot([0, 167], [26.25, 26.25], 'g--')
plt.show()

If the threshold were at $26.25\;\text{ms}$, then a naive thresholding strategy would see the "high" portion of the series occasionally dip below the threshold, which is undesirable.

Now, let's check what the distribution of latencies looks like.

In [None]:
# Histogram of the series with its first 40 samples dropped
# (presumably to leave out the initial regime seen in the plot above -
# TODO confirm)
fig, ax = plt.subplots()
ax.hist(z[40:])
plt.show()

This looks nearly normal. This is (roughly) confirmed with a normal probability plot.

In [None]:
# Normal probability plot of the same samples used for the histogram
# (the first 40 samples are dropped)
scipy.stats.probplot(z[40:], dist='norm', plot=plt)
plt.show()

Now let's work on smoothing out the latencies.

Consider a time series of points $(t_i, z_i)$, where the $z_i$'s are measured quantities. We imagine that the $z_i$'s are drawn from some probability distribution.

Suppose $z_i$ is drawn from $\mathcal{N}(\mu_i, \sigma_i^2)$ at time $t_i$. The log-likelihood of this observation is $$-\frac{1}{2}\log(2 \pi \sigma_i^2) - \frac{(z_i - \mu_i)^2}{2 \sigma_i^2}.$$ [Some algebra and calculus](https://en.wikipedia.org/wiki/Normal_distribution#Estimation_of_parameters) show that, for a sample of $n$ measurements sharing a common mean and variance, the maximum likelihood estimates of the parameters are just the sample mean and the (biased) sample variance $\widehat{\sigma}^2$. The resulting log-likelihood of the sample is $$-\frac{n}{2}(\log(2 \pi \widehat{\sigma}^2) + 1).$$

With this measurement in hand, we can compute the BIC for every potential model following this generative model:
* Pick $k - 1$ changepoints from $\{1, \dots, n - 1\}$. In other words, partition $\{1, \dots, n\}$ into $k$ contiguous blocks.
* For each block, select a mean and variance. Generate each point in the partition according to a normal distribution with the corresponding parameters.

This model has $2 k$ parameters, so its BIC is $2 k \log(n) - 2 \log \widehat{L}$, where $\widehat{L}$ is the maximized likelihood. We then just minimize the BIC across all of these models. For a given partition, we need only consider the maximum likelihood estimates described above. To find the best partition for a given number of blocks $k$, we can use a dynamic programming approach.

In [None]:
def _get_segment_statistics(z):
    """Compute the mean and Gaussian log-likelihood of every segment.

    For 0 <= i < j <= n, entry [i, j] of each returned (n, n + 1) array
    describes the segment z[i:j]. Entries with j <= i are left as NaN.

    The log-likelihood is evaluated at the maximum likelihood estimates
    (sample mean and biased sample variance). Two conventions are used:
    a single-sample segment is assigned a log-likelihood of 0, and a
    longer segment whose computed variance is not positive is assigned
    -inf.
    """
    n = len(z)
    means = np.full((n, n + 1), np.nan)
    log_likelihoods = np.full((n, n + 1), np.nan)
    for i in range(n):
        # Running sums of z[i:j] and z[i:j]**2 as j grows
        running_sum = 0.
        running_sum_square = 0.
        for j in range(i + 1, n + 1):
            running_sum += z[j - 1]
            running_sum_square += z[j - 1]**2
            length = j - i
            means[i, j] = running_sum / length
            if length == 1:
                log_likelihoods[i, j] = 0.
                continue
            variance = running_sum_square / length - means[i, j]**2
            if variance <= 0:
                log_likelihoods[i, j] = -np.inf
            else:
                log_likelihoods[i, j] = -(length / 2) * (
                    np.log(2 * np.pi * variance) + 1
                )
    return means, log_likelihoods


def get_changepoints(z, max_count=None):
    """Split a series into piecewise-constant Gaussian blocks using BIC.

    Every partition of z into k contiguous blocks is modeled as k
    independent normal distributions (2 * k parameters). For each k the
    partition with the largest log-likelihood is found by dynamic
    programming, and the k minimizing
    2 * k * log(n) - 2 * log-likelihood is selected.

    Parameters
    ----------
    z : sequence of float
        The measurements, in time order.
    max_count : int, optional
        Largest number of blocks to consider. Defaults to len(z); larger
        values are clamped to len(z), since a block cannot be empty.

    Returns
    -------
    changepoints : list of int
        Sorted indices c_1 < ... < c_{k-1} such that the blocks are
        z[:c_1], z[c_1:c_2], ..., z[c_{k-1}:].
    means : list of float
        The mean of each block, in order (one more entry than
        changepoints). Both lists are empty when z is empty.

    Raises
    ------
    ValueError
        If max_count is given and is less than 1.
    """
    z = np.asarray(z, dtype=float)
    n = len(z)
    if max_count is not None and max_count < 1:
        raise ValueError('max_count must be at least 1')
    if n == 0:
        return [], []
    # More blocks than samples is impossible; without the clamp the
    # unreachable table rows would contribute NaN scores to the argmin.
    k_max = n if max_count is None else min(max_count, n)

    segment_means, segment_log_likelihoods = _get_segment_statistics(z)

    # Element (l, i) is the best log-likelihood of z[:i] when it is split
    # using l changepoints (l + 1 blocks). Element (l, i) of the
    # backtracking table is the position of the last of those
    # changepoints.
    best_log_likelihood = np.full((k_max, n + 1), np.nan)
    last_changepoint = np.full((k_max, n + 1), -1, dtype=int)
    best_log_likelihood[0, 1:] = segment_log_likelihoods[0, 1:]
    for l in range(1, k_max):
        for i in range(l + 1, n + 1):
            # Candidate m (for l <= m < i) ends the first l blocks at m
            # and uses z[m:i] as the final block.
            candidates = (
                best_log_likelihood[l - 1, l:i]
                + segment_log_likelihoods[l:i, i]
            )
            best = np.argmax(candidates)
            last_changepoint[l, i] = l + best
            best_log_likelihood[l, i] = candidates[best]

    # Iterate over the number of blocks in the partition. Note that the
    # number of parameters in the model is twice the number of blocks
    bics = [
        2 * k * np.log(n) - 2 * best_log_likelihood[k - 1, n]
        for k in range(1, k_max + 1)
    ]
    k_best = int(np.argmin(bics)) + 1

    # Backtrack to find the changepoints. Each step strips the final
    # block, so the remaining prefix has one fewer changepoint and the
    # lookup must move up one row of the table.
    changepoints = []
    end = n
    for l in range(k_best - 1, 0, -1):
        end = int(last_changepoint[l, end])
        changepoints.append(end)
    changepoints.reverse()

    boundaries = [0] + changepoints + [n]
    means = [
        float(segment_means[start, stop])
        for start, stop in zip(boundaries[:-1], boundaries[1:])
    ]

    return changepoints, means

In [None]:
changepoints, means = get_changepoints(z)
endpoints = [0] + changepoints + [len(z)]

x = list(range(len(z)))
plt.plot(x, z, 'b-')
# Draw each block's mean as a horizontal segment. The half-sample offset
# puts the segment boundaries midway between neighboring samples.
for start, stop, mean in zip(endpoints[:-1], endpoints[1:], means):
    plt.plot([start - 0.5, stop - 0.5], [mean, mean], 'r-')
# plt.plot([0, 167], [26.25, 26.25], 'g--')
plt.show()

With this plot, we see that the smoothed latencies (in red) are more stable.

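Up next is a quick investigation of how many edges have residual latencies that straddle a fixed threshold (10 ms) at some point during the week; these are the edges for which changepoint detection actually matters.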

In [None]:
epsilon = 10

# Count the edges whose residual (RTT minus 'gcl') lies strictly below
# epsilon in some snapshot and strictly above it in another
count = 0
for edge in graph_combined.edges:
    z = [
        latency_dict[edge]
        for latency_dict in latencies
        if edge in latency_dict
    ]
    # `initial` keeps the reductions defined for edges with no samples
    lowest_residual = np.amin(z, initial=np.inf) - gcls[edge]
    highest_residual = np.amax(z, initial=-np.inf) - gcls[edge]
    if lowest_residual < epsilon < highest_residual:
        count += 1

print(
    f'{count} edges around the threshold {epsilon} '
    f'({100 * count / len(graph_combined.edges):0.4f}%)'
)

The upshot is that we need to consider around 5% of the edges over the course of the week. The other edges are either always in or always out of the network graph (due to having sufficiently low or sufficiently high residuals).

# The Search for a Skeleton Graph

Ideally, we would be able to pick a latency threshold $\epsilon$ automatically. The general idea would be to identify some subset of the edges of our nonthresholded network graph that are stable. We would then pick a threshold so that the "skeleton graph" formed by these edges is present.

Let's start by investigating how the latencies of the edges vary over time. We plot the log variances (as these follow a more understandable distribution in this case). We additionally plot the log of the variances normalized by the square of each edge's `gcl` value.

In [None]:
# Log variance of each edge's RTT over the snapshots, both raw and
# divided by the square of the edge's 'gcl' value. `gcls` holds both
# orientations of every edge, so each edge is counted twice below.
variances = {}
variances_normalized = {}
for edge, gcl in gcls.items():
    z = [
        latency_dict[edge]
        for latency_dict in latencies
        if edge in latency_dict
    ]
    variance = np.var(z)
    variances[edge] = np.log(variance)
    variances_normalized[edge] = np.log(variance / gcl**2)

fig, (ax1, ax2) = plt.subplots(2, dpi=200)
fig.subplots_adjust(hspace=0.5)
panels = (
    (ax1, 'Log Variances', variances, r'$\log(\sigma^2)$'),
    (
        ax2, 'Log Normalized Variances', variances_normalized,
        r'$\log(\sigma^2)$ (Normalized)',
    ),
)
for ax, title, values, xlabel in panels:
    ax.set_title(title)
    ax.hist(values.values(), 40)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
plt.show()

We also plot the residuals.

In [None]:
# Minimal residual of each edge: its smallest RTT over all snapshots
# minus its 'gcl' value
residuals = {}
for edge, gcl in gcls.items():
    smallest_rtt = min(
        latency_dict[edge]
        for latency_dict in latencies
        if edge in latency_dict
    )
    residuals[edge] = smallest_rtt - gcl

fig, ax = plt.subplots(dpi=200)
ax.hist(residuals.values(), 40)
ax.set_xlabel('Residual')
ax.set_ylabel('Count')
plt.show()

Finally, we combine the above plots, and we additionally add some coloration for when links have the potential to have negative curvature (i.e. have negative curvature at some (integer) latency threshold).

In [None]:
# Integer thresholds, visited from the largest residual down to 1
epsilons = list(range(int(np.ceil(max(residuals.values()))), 0, -1))
curvatures = []
negative_edges = collections.defaultdict(list)
# Work on a copy so that graph_combined itself is never passed to the
# thresholding routine
graph = copy.deepcopy(graph_combined)
for epsilon in epsilons:
    # Each threshold is applied to the result of the previous (larger)
    # threshold
    graph = input_network.threshold_graph(graph, epsilon)
    curvatures_at_epsilon = []
    if len(graph.edges) != 0:
        graph = input_network.compute_ricci_curvatures(graph)
        for u, v, d in graph.edges(data=True):
            curvature = d['ricciCurvature']
            curvatures_at_epsilon.append(curvature)
            if curvature < 0.:
                # Record both orientations, matching the keys of gcls
                negative_edges[u, v].append((epsilon, curvature))
                negative_edges[v, u].append((epsilon, curvature))
    curvatures.append(curvatures_at_epsilon)

    # plot.get_network_plot(graph)
    # plt.savefig(os.path.join('figs', 'networks', f'{epsilon}.png'), dpi=200)
    # plt.close()

# Flip both lists into increasing-threshold order. They now contain the
# latency thresholds and the corresponding curvatures of the edges at
# those thresholds.
epsilons = epsilons[::-1]
curvatures = curvatures[::-1]

In [None]:
# One point per directed edge: minimal residual against log normalized
# variance. Red marks edges that had negative curvature at some
# threshold; blue marks all others.
edge_list = list(gcls)
x_values = [residuals[edge] for edge in edge_list]
y_values = [variances_normalized[edge] for edge in edge_list]
colors = ['r' if edge in negative_edges else 'b' for edge in edge_list]

fig, ax = plt.subplots(dpi=200)
ax.scatter(x_values, y_values, s=1., c=colors)
ax.set_xlabel('Minimal Residual')
ax.set_ylabel(r'$\log(\sigma^2)$ (Normalized)')
# fig.savefig(os.path.join('figs', 'variances.png'), dpi=200)
plt.show()

From this we see that the "stability" of edges doesn't really correlate with the residual. Therefore, the idea of a skeleton graph is not viable. In particular, if we try to include all of the stable edges, then our $\epsilon$ would have to be very large (so large that there would be no negative curvature edges). Conversely, small values of $\epsilon$ include unstable edges.

Additionally, being negatively (red) or positively (blue) curved doesn't imply stability.

We can also do more analysis by separating based on the lengths of the links.

In [None]:
# Distribution of the 'gcl' values (each edge appears once per
# orientation in gcls)
fig, ax = plt.subplots()
ax.hist(gcls.values())
plt.show()

In [None]:
# Cutoffs on the 'gcl' values used to bin the edges.
# NOTE(review): presumably 186 is the speed of light in miles per
# millisecond, 2/3 scales it to the speed in fiber, and the factor of 2
# accounts for the round trip, making these the GCLs of 40-mile and
# 500-mile links - confirm.
near_cuttoff = 40. * 2. / (186. * 2. / 3.)
far_cuttoff = 500. * 2. / (186. * 2. / 3.)

edges_near = [
    edge
    for edge, gcl in gcls.items()
    if gcl < near_cuttoff
]
edges_mid = [
    edge
    for edge, gcl in gcls.items()
    if near_cuttoff <= gcl < far_cuttoff
]
edges_far = [
    edge
    for edge, gcl in gcls.items()
    if gcl >= far_cuttoff
]

fig, (ax_near, ax_mid, ax_far) = plt.subplots(3, dpi=200)
fig.subplots_adjust(hspace=0.9)
# The three panels differ only in their title and edge subset, so they
# are drawn by a single loop. Red marks edges that had negative curvature
# at some threshold; blue marks all others.
for ax, title, edges in (
    (ax_near, 'Near', edges_near),
    (ax_mid, 'Mid', edges_mid),
    (ax_far, 'Far', edges_far),
):
    ax.set_title(title)
    ax.scatter(
        [residuals[edge] for edge in edges],
        [variances_normalized[edge] for edge in edges],
        s=1.,
        c=[
            'r' if edge in negative_edges else 'b'
            for edge in edges
        ]
    )
    ax.set_xlabel('Minimal Residual')
    ax.set_ylabel(r'$\log(\sigma^2)$ (Normalized)')
plt.show()

In [None]:
# Only plot up to the largest threshold at which some edge still has
# negative curvature. `initial` keeps the minimum defined for thresholds
# with no edges.
negative_indices = [
    index
    for index, element in enumerate(curvatures)
    if np.amin(element, initial=np.inf) < 0.
]
box_count = max(negative_indices) + 1

fig, ax = plt.subplots(dpi=200)
ax.boxplot(curvatures[:box_count], notch=0, sym='kD')
# Boxes sit at positions 1..box_count; label them with their thresholds
ax.set_xticks(
    list(range(1, box_count + 1)),
    labels=epsilons[:box_count])
ax.set_xlabel('Threshold')
ax.set_ylabel('Ricci Curvature')
# fig.savefig(os.path.join('figs', 'boxplot.png'), dpi=200)
plt.show()

Since box plots don't necessarily give the whole picture, let's look at one particular (arbitrary) distribution of the curvatures.

In [None]:
# Curvature distribution at one arbitrarily chosen position in the
# (increasing-threshold) list
selected_curvatures = curvatures[18]

fig, ax = plt.subplots(dpi=200)
ax.hist(selected_curvatures)
ax.set_xlabel('Ricci Curvature')
ax.set_ylabel('Count')
plt.show()

We see that edges with negative curvature are rare, especially as the threshold increases.

To observe as many of the potentially negative edges as possible, we need the threshold to be as large as possible. This, however, forces some of the edges to be positive.

Now, for a toy example to be convincing, the made-up data should share as many of the properties we see above as possible. Fortunately for us, the most challenging of these (the distribution of the variance) can be brushed under the rug. As in the earlier figure, the latency versus time is oftentimes piecewise constant (plus noise). Our toy example can thus be the same toy example from before, but with an added edge. Essentially, we want the lone negative curvature edge to "appear" when a different edge drops out. In particular, we can add a link from F to G that later goes down.

We do also want around 5% of the edges to appear/disappear. With the small size of the old toy example, this agrees with only allowing one edge to pop in/out.