In [None]:
using Glob
using JSON
using LargeScaleAnalysis
using ProgressMeter
using PyCall
using PyPlot
using Statistics
using StatsBase

In [None]:
include("../../ParsimoniousMonitoring/notebooks/thesis.jl")

### Analysis of self measurements

In [None]:
# List the `*.model.json` files found in the self-pairs data directory.
files = let datadir = "../data/ping_v4_1580511600_1581116400_self_pairs/"
    glob("*.model.json", datadir)
end;

In [None]:
# Parse every listed file into a `DataSegmentationModel`.
models = [parsefile(DataSegmentationModel, path) for path in files];

In [None]:
# Size of the first dimension of each model's `model` field, used below as the number of states.
nstates = [size(m.model, 1) for m in models];

In [None]:
# Histogram of the number of states, counted over the levels 1..maximum(nstates).
nstatesdist = counts(nstates, 1:maximum(nstates))
bar(eachindex(nstatesdist), nstatesdist);

In [None]:
# TODO: Check in the traceroutes that there are no "spurious" hops

In [None]:
# Plot the series of every model that has at least four states, one figure per model.
# Missing samples are replaced by NaN before plotting.
foreach(models[nstates .>= 4]) do m
    figure(figsize = (12, 2))
    plot(coalesce.(m.data, NaN))
end

### Analysis of non-self measurements

In [None]:
# TODO: Re-do analysis with 10% of the pairs instead of 1%.
# TODO: ACF inside the states?

In [None]:
# TODO: Number of states vs. number of hops ?
# TODO: Comparer modeles appris sur 1 et 3 jours vs. les sous-sequences
# de 1 et 3 jours sur un modele appris sur 7 jours (est-ce que ca match ?).

In [None]:
# List the model files of the non-self pairs for each variant that is compared below:
# the base directory (full, `_360` and `_1080` file suffixes), the `_L20` directory,
# and the directory covering the longer 1580511600-1581721200 window.
files, files_1d, files_3d = let dir = "../data/ping_v4_1580511600_1581116400_noself_pairs/"
    (
        glob("*ndjson.model.json", dir),
        glob("*ndjson_360.model.json", dir),
        glob("*ndjson_1080.model.json", dir),
    )
end
files_L20 = glob("*ndjson.model.json", "../data/ping_v4_1580511600_1581116400_noself_pairs_L20/")
files_14d = glob("*ndjson.model.json", "../data/ping_v4_1580511600_1581721200_noself_pairs/");

In [None]:
# Parse each list of files into `DataSegmentationModel`s.
models = [parsefile(DataSegmentationModel, path) for path in files];
models_L20 = [parsefile(DataSegmentationModel, path) for path in files_L20];
models_1d = [parsefile(DataSegmentationModel, path) for path in files_1d];
models_3d = [parsefile(DataSegmentationModel, path) for path in files_3d];
models_14d = [parsefile(DataSegmentationModel, path) for path in files_14d];

In [None]:
# Size of the first dimension of each model's `model` field, per variant
# (used below as the number of states).
nstates = [size(m.model, 1) for m in models];
nstates_L20 = [size(m.model, 1) for m in models_L20];
nstates_1d = [size(m.model, 1) for m in models_1d];
nstates_3d = [size(m.model, 1) for m in models_3d];
nstates_14d = [size(m.model, 1) for m in models_14d];

In [None]:
# Side-by-side box plots of the number of states: base models vs. the `_L20` models.
fig, ax = subplots()
ax.boxplot([nstates, nstates_L20])
ax.set(;
    ylabel = "Nombre d'états",
    ylim = (0, 20),
)

In [None]:
# Mean number of states: base models vs. the `_L20` models.
map(mean, [nstates, nstates_L20])

In [None]:
# Median number of states: base models vs. the `_L20` models.
map(median, [nstates, nstates_L20])

In [None]:
# Box plots of the number of states for the 1-, 3-, 7- and 14-day variants.
# `whis = (0, 100)` is passed so the whiskers are set from the 0th and 100th percentiles.
fig, ax = subplots()
ax.boxplot(
    [nstates_1d, nstates_3d, nstates, nstates_14d];
    labels = ["1 jour", "3 jours", "7 jours", "14 jours"],
    whis = (0, 100),
)
ax.set(; ylabel = "Nombre d'états", ylim = (0, 20))
# The tick labels are repeated in the extra axis parameters handed to `save_thesis`.
save_thesis(
    "atlas_nstates_dist_boxplot";
    hwr = 0.75,
    extra_axis_params = [
        "xtick={1,2,3,4}",
        "xticklabels={1 jour, 3 jours, 7 jours, 14 jours}",
    ],
)

In [None]:
# Verifier si pour une series donnee le nombre d'etats augmente toujours avec le temps.
# Chercher les series pour lesquelles le nombre d'etats entre 7 et 14 jours change peu/change beaucoup.

In [None]:
# Histogram of the number of states (base models), restricted to the levels 1..20.
nstatesdist = counts(nstates, 1:20)
bar(eachindex(nstatesdist), nstatesdist);
# save_thesis("test")

In [None]:
# Histogram of the number of states (14-day models), restricted to the levels 1..20.
nstatesdist = counts(nstates_14d, 1:20)
bar(eachindex(nstatesdist), nstatesdist);
# save_thesis("test")

In [None]:
# Histogram of the number of states (1-day models), restricted to the levels 1..15.
nstatesdist = counts(nstates_1d, 1:15)
bar(eachindex(nstatesdist), nstatesdist);

In [None]:
# For every group returned by `group(segments(model.state))`, record the mean segment
# length and the standard deviation of the non-missing samples indexed by that group.
# NOTE(review): `segments` is called on `model.state` here but on the model itself in the
# traceroute analysis further down — confirm both forms are intended.
durations = []
stds = []
for model in models
    # The loop variable is deliberately not named `segments`, so it does not shadow
    # the `segments` function used to build the iterable.
    for (_, state_segments) in group(segments(model.state))
        samples = model.data[state_segments]
        push!(durations, mean(length, state_segments))
        push!(stds, std(skipmissing(samples)))
    end
end

In [None]:
# fig, ax = subplots()
# ax.scatter(durations, stds, alpha = 0.1)
# ax.set_xscale("log")
# ax.set_yscale("log")

#### Traceroutes

In [None]:
# How many AS paths are associated to a given state?
# Cas extreme => 1 nouvel etats par timestep = exactement 1 AS/IP path par état
# How many states are associated to a given AS path?
# Cas extreme => 1 seul etat pour toute la serie = exactement 1 etat par AS/IP path
# => On veut que ces deux conditions soit le plus rapproche (= bonne segmentation)
# TODO: 3 levels : IP, router (alias resolution), AS

In [None]:
# Pair up the measurements found in the ping directory and in the traceroute directory.
ping_traceroute_pairs = let
    ping_dir = "../data/ping_v4_1580511600_1581116400_noself_pairs/"
    traceroute_dir = "../data/traceroute_v4_1580511600_1581116400_noself_pairs/"
    measurement_pairs(ping_dir, traceroute_dir)
end;

In [None]:
# For every ping/traceroute pair, load the ping segmentation model and the processed
# traceroute records, segment both, and accumulate the sizes of the entries of the two
# directions of `bidirectional_mapping` into `counts_ab` and `counts_ba`.
# NOTE(review): presumably `[1]` maps ping segments to traceroute segments and `[2]` is
# the reverse direction — confirm against `bidirectional_mapping`.
counts_ab = Int[]
counts_ba = Int[]
@showprogress for (ping_file, traceroute_file) in ping_traceroute_pairs
    try
        model_file = "$(ping_file).model.json"
        processed_file = "$(splitext(traceroute_file)[1]).processed.json"

        model = parsefile(DataSegmentationModel, model_file)
        traceroute = parsefile(Vector{TracerouteRecord}, processed_file)

        ping_segments = segments(model)
        traceroute_segments = segments(labelize(traceroute))

        # Compute the mapping once and reuse it for both directions.
        mapping = bidirectional_mapping(ping_segments, traceroute_segments)
        res1 = map(length, values(reduce(mapping[1])))
        res2 = map(length, values(reduce(mapping[2])))

        # Append only after both directions succeeded, so the two vectors stay in sync.
        append!(counts_ab, res1)
        append!(counts_ba, res2)
    catch e
        # Only file-system errors (e.g. a missing model or processed file) are tolerated;
        # anything else is re-thrown with its original backtrace.
        e isa SystemError || rethrow()
        println(e.prefix)
    end
end

In [None]:
# Histogram of `counts_ab` over the levels 1..maximum(counts_ab); the x axis is cut at 25.
d = counts(counts_ab, 1:maximum(counts_ab))
bar(eachindex(d), d);
xlim(1, 25)

In [None]:
# Histogram of `counts_ba` over the levels 1..maximum(counts_ba).
d = counts(counts_ba, 1:maximum(counts_ba))
bar(eachindex(d), d);

In [None]:
# Histogram of `counts_ab` over the levels 1..maximum(counts_ab); the x axis is cut at 25.
d = counts(counts_ab, 1:maximum(counts_ab))
bar(eachindex(d), d);
xlim(1, 25)

In [None]:
# Histogram of `counts_ba` over the levels 1..maximum(counts_ba).
d = counts(counts_ba, 1:maximum(counts_ba))
bar(eachindex(d), d);

In [None]:
# Average of the accumulated `counts_ab` values.
counts_ab |> mean

In [None]:
# Average of the accumulated `counts_ba` values.
counts_ba |> mean

In [None]:
# Take the first ping/traceroute pair, load its traceroute file, convert the loaded
# entries to `TracerouteRecord`s and keep the `hops` of the first record.
# NOTE(review): `asntree` is not defined in the visible part of this notebook — confirm
# it is created elsewhere (e.g. by the included thesis.jl) before running this cell.
ping_file, traceroute_file = ping_traceroute_pairs[1]
traceroute = TracerouteRecord.(load_traceroute(traceroute_file, asntree))[1].hops