In [None]:
using Glob
using JSON
using LargeScaleAnalysis
using ProgressMeter
using PyCall
using PyPlot
using Statistics
using StatsBase

In [None]:
include("../../ParsimoniousMonitoring/notebooks/thesis.jl")

### Helpers

In [None]:
# Load the anchoring mesh description from its JSON dump.
mesh = parsefile(AnchoringMesh, "../data/mesh_20200520.json")
# Mapping built from the mesh for the "ping" and "traceroute" measurement kinds; it is
# later queried with ids parsed from ping file names.
# NOTE(review): the `4` presumably selects IPv4 -- confirm against `measurement_mapping`.
ping_traceroute = measurement_mapping(mesh, 4, "ping", "traceroute");

In [None]:
"""
    parsename(f)

Extract the two integers of the first `<digits>_<digits>.` pattern found in the path `f`.

Returns a two-element `Vector{Int}`; callers in this notebook destructure it as
`(msm_id, prb_id)`.

Throws an `ArgumentError` when `f` contains no such pattern.
"""
function parsename(f)
    m = match(r"/?(\d+)_(\d+)\.", f)
    # `match` returns `nothing` on failure; report the offending path explicitly instead
    # of failing on the `.captures` field access.
    m === nothing && throw(ArgumentError("cannot parse two integer ids from \"$f\""))
    return parse.(Int, m.captures)
end

In [None]:
"""
    measurement_pairs(d1, d2)

Pair every `*.ndjson` file found in directory `d1` with a counterpart path in `d2`.

The first id returned by `parsename` for each file is looked up in the global
`ping_traceroute` mapping; the counterpart name is the file's base name with that id
swapped for the mapped one. Files whose id is absent from the mapping are skipped and
their id is printed as `#<id> `.

Returns a vector of `(file_in_d1, path_in_d2)` tuples. The `d2` paths are only built;
their existence is not checked here.
"""
function measurement_pairs(d1, d2)
    result = Tuple{String,String}[]
    for file in glob("*.ndjson", d1)
        msm_id, _ = parsename(file)
        if !haskey(ping_traceroute, msm_id)
            print("#$(msm_id) ")
            continue
        end
        # Replace only the first occurrence: a second id that contains the same digits
        # (e.g. "12_6123.ndjson") must not be rewritten as well.
        # NOTE(review): assumes the mapped id is the first number in the base name.
        new = replace(
            basename(file),
            string(msm_id) => string(ping_traceroute[msm_id]);
            count = 1,
        )
        push!(result, (file, joinpath(d2, new)))
    end
    return result
end;

### Analysis of self measurements

In [None]:
# Collect every "*.model.json" file of the self-pairs ping dataset directory.
files = glob("*.model.json", "../data/ping_v4_1580511600_1581116400_self_pairs/");

In [None]:
# Parse each collected file into a DataSegmentationModel.
models = [parsefile(DataSegmentationModel, path) for path in files];

In [None]:
# First dimension of each model's `model` field (presumably its number of states).
nstates = [size(entry.model, 1) for entry in models];

In [None]:
# Occurrences of each value 1..maximum(nstates) in `nstates`, drawn as a bar chart.
nstatesdist = counts(nstates, maximum(nstates))
bar(1:length(nstatesdist), nstatesdist);

In [None]:
# TODO: Check in traceroute that there are no "spurious" hops

In [None]:
# Plot the data series of every model whose `nstates` entry is at least 4,
# one wide figure per model.
for m in models[nstates .>= 4]
    figure(figsize = (12,2))
    # Missing samples are replaced by NaN before plotting.
    plot(coalesce.(m.data, NaN))
end

### Analysis of non-self measurements

In [None]:
# TODO: Re-do analysis with 10% of the pairs instead of 1%.
# TODO: ACF inside the states?

In [None]:
# TODO: Number of states vs. number of hops?
# TODO: Compare models learned on 1 and 3 days vs. the 1- and 3-day sub-sequences
# of a model learned on 7 days (do they match?).

In [None]:
# Collect every "*.model.json" file of the non-self-pairs ping dataset directory.
# This rebinds `files`, replacing the list from the self-measurement analysis.
files = glob("*.model.json", "../data/ping_v4_1580511600_1581116400_noself_pairs/");

In [None]:
# Parse each collected file into a DataSegmentationModel (rebinds `models`).
models = [parsefile(DataSegmentationModel, path) for path in files];

In [None]:
# First dimension of each model's `model` field (presumably its number of states).
nstates = [size(entry.model, 1) for entry in models];

In [None]:
# Occurrences of each value 1..maximum(nstates) in `nstates`, drawn as a bar chart.
nstatesdist = counts(nstates, maximum(nstates))
bar(1:length(nstatesdist), nstatesdist);

In [None]:
# Compute avg. durations by states.
# For each model and each group yielded by `group(segments(model.state))`, record the
# mean length of the group's segments and the standard deviation of the non-missing data
# selected by those segments.
# NOTE(review): `group`/`segments` come from outside this notebook; the (state, segments)
# shape of each yielded item is taken from how the loop destructures it -- confirm.
durations, stds = Float64[], Float64[]
for model in models
    # The segment list is named `segs` so it does not shadow the `segments` function
    # called in this very loop header; the state label itself is not used.
    for (_, segs) in group(segments(model.state))
        data = model.data[segs]
        push!(durations, mean(length, segs))
        push!(stds, std(skipmissing(data)))
    end
end

In [None]:
# fig, ax = subplots()
# ax.scatter(durations, stds, alpha = 0.1)
# ax.set_xscale("log")
# ax.set_yscale("log")

#### Traceroutes

In [None]:
# How many AS paths are associated with a given state?
# Extreme case => 1 new state per timestep = exactly 1 AS/IP path per state
# How many states are associated with a given AS path?
# Extreme case => a single state for the whole series = exactly 1 state per AS/IP path
# => We want these two conditions to be as close as possible (= good segmentation)
# TODO: 3 levels: IP, router (alias resolution), AS

In [None]:
# Pair each ping "*.ndjson" file of the non-self dataset with the path of its traceroute
# counterpart in the matching traceroute dataset directory.
ping_traceroute_pairs = measurement_pairs(
    "../data/ping_v4_1580511600_1581116400_noself_pairs/",
    "../data/traceroute_v4_1580511600_1581116400_noself_pairs/"
);

In [None]:
# For every ping/traceroute file pair, relate the segments of the ping model to the
# segments of the labelled traceroute records with `bidirectional_mapping`, and accumulate
# the group sizes of each direction into `counts_ab` (element 1) and `counts_ba`
# (element 2).
counts_ab = Int[]
counts_ba = Int[]
@showprogress for (ping_file, traceroute_file) in ping_traceroute_pairs
    try
        # Derive the processed artefact names from the raw ".ndjson" paths.
        ping_file = "$(ping_file).model.json"
        traceroute_file = "$(splitext(traceroute_file)[1]).processed.json"

        model = parsefile(DataSegmentationModel, ping_file)
        traceroute = parsefile(Vector{TracerouteRecord}, traceroute_file)

        ping_segments = segments(model)
        traceroute_segments = segments(labelize(traceroute))

        # Compute the mapping once and reuse it for both directions. `local` keeps the
        # helper from touching any notebook-level binding of the same name.
        local mapping = bidirectional_mapping(ping_segments, traceroute_segments)
        res1 = map(length, values(reduce(mapping[1])))
        res2 = map(length, values(reduce(mapping[2])))
        # `append!` avoids splatting a collection of unknown length into varargs.
        append!(counts_ab, res1)
        append!(counts_ba, res2)
    catch e
        # Only `SystemError`s are tolerated: report them and move on to the next pair.
        # Anything else is re-thrown with its original backtrace.
        e isa SystemError || rethrow()
        println(e.prefix)
    end
end

In [None]:
# Bar chart of how often each value 1..maximum(counts_ab) occurs in `counts_ab`,
# with the x-axis limited to [1, 25].
d = counts(counts_ab, maximum(counts_ab))
bar(1:length(d), d);
xlim(1, 25)

In [None]:
# Bar chart of how often each value 1..maximum(counts_ba) occurs in `counts_ba`.
d = counts(counts_ba, maximum(counts_ba))
bar(1:length(d), d);

In [None]:
# Same statements as the earlier `counts_ab` histogram cell: bar chart of the value
# occurrences in `counts_ab`, x-axis limited to [1, 25].
d = counts(counts_ab, maximum(counts_ab))
bar(1:length(d), d);
xlim(1, 25)

In [None]:
# Same statements as the earlier `counts_ba` histogram cell: bar chart of the value
# occurrences in `counts_ba`.
d = counts(counts_ba, maximum(counts_ba))
bar(1:length(d), d);

In [None]:
# Average of the group sizes accumulated in `counts_ab`.
mean(counts_ab)

In [None]:
# Average of the group sizes accumulated in `counts_ba`.
mean(counts_ba)

In [None]:
# Take the first ping/traceroute file pair and keep the `hops` of its first traceroute
# record.
ping_file, traceroute_file = ping_traceroute_pairs[1]
# NOTE(review): `asntree` is not defined in the cells shown here -- confirm it is provided
# elsewhere (e.g. by the included thesis.jl) before running this cell.
traceroute = TracerouteRecord.(load_traceroute(traceroute_file, asntree))[1].hops