In [None]:
import pandas as pd
import networkx as nx
from pathlib import Path
# import pathpy as pp
import numpy as np
from collections import Counter
from datetime import timedelta
import random
from more_itertools import distinct_combinations

import pygraphviz
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import altair as alt
alt.data_transformers.disable_max_rows()

In [None]:
from simulation.dataset import Dataset

In [None]:
# Project dataset wrapper; later cells read `dataset.period` and `dataset.start_date` from it.
dataset = Dataset("copenhagen_interactions")

In [None]:
# Folder layout, relative to the notebook's working directory.
# DATA_FOLDER holds the input CSV read below.
DATA_FOLDER = Path("./data")
# OUTPUT_FOLDER receives the saved chart and holds the pickled frame read later on.
OUTPUT_FOLDER = Path("./output")
# TEST_FOLDER is not referenced by any visible cell.
TEST_FOLDER = Path("./tests")

In [None]:
# Load the pairwise interaction records; `datetime` is parsed so the frame can be resampled.
df = pd.read_csv(DATA_FOLDER / "copenhagen_interactions_distance.csv", parse_dates=["datetime"])
# Cut-off on the `distance` column: only edges strictly below it count as direct contact.
thresh = 90
# One DataFrame per calendar day.
days = [snapshot for _, snapshot in df.resample("D", on="datetime")]
# Per day, one DataFrame per 5-minute bin (bins without records come back as empty frames).
# "5min" replaces the "5T" alias, which recent pandas versions deprecate.
five_minutes = [
    [snapshot for _, snapshot in day_df.resample("5min", on="datetime")]
    for day_df in days
]

Use this to visualize a graph. The `labels` parameter toggles labels on the edges; the nodes are always labelled.

In [None]:
def show_graph(G, labels=False):
    """Draw ``G`` with node labels, optionally annotating edges with their distance.

    Args:
        G: networkx graph to draw.
        labels: when truthy, nodes are positioned with a spring layout weighted
            by the ``distance`` edge attribute and every edge that has a
            ``distance`` is labelled with it. Otherwise the graphviz ``neato``
            layout is used and edges carry no labels.
    """
    plt.figure(1, figsize=(4, 4))
    # One truthiness check drives both the layout and the edge labels, so the
    # two can no longer disagree for truthy values other than ``True``.
    if labels:
        pos = nx.spring_layout(G, k=1, weight="distance")
    else:
        pos = graphviz_layout(G, prog="neato")
    nx.draw(G, pos, node_size=40, vmin=0.0, vmax=1.0, with_labels=True)
    if labels:
        # Edges that only carry a "hops" attribute have no distance to show;
        # skip them instead of failing with a KeyError.
        edge_labels = {
            (u, v): dist
            for u, v, dist in G.edges(data="distance")
            if dist is not None
        }
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    plt.show()
#     plt.savefig(f"{'-'.join(list(map(str,shortest_path)))}.png")

1. Take each connected component with more than two nodes in each timestamp (i.e. group interactions).
2. Drop nodes that become completely disconnected from the component once the distance threshold is applied.
3. Remove the edges at or above the threshold.
4. Build the complementary graph (all edges missing from a complete graph on the remaining nodes).
5. For each pair of nodes joined in the complementary graph, find the length of the shortest path between them in the original component, and add it to the original component as an edge attribute (`hops`).
6. Append the resulting graph to the list of meetings.

In [None]:
# Build one graph per group interaction (connected component with more than two
# nodes) per 5-minute bin, and annotate node pairs that are not in direct
# below-threshold contact with the length of the shortest path between them.
meetings = []
for day in range(dataset.period):
    print(day)
    for fm in five_minutes[day]:
        # Resampling yields an empty frame for every bin without records; there
        # is no timestamp to read from those, so skip them instead of crashing.
        if fm.empty:
            continue
        G = nx.from_pandas_edgelist(fm, target="destination", edge_attr="distance")
        timestamp = fm["datetime"].iloc[0]
        # Every connected component with more than 2 nodes in the current bin.
        subgraphs = [G.subgraph(c).copy() for c in nx.connected_components(G) if len(c) > 2]
        for SG in subgraphs:
            # Nodes that keep at least one edge once edges are filtered to be
            # below the threshold.
            close_nodes = {
                node
                for u, v, dist in SG.edges(data="distance")
                if dist < thresh
                for node in (u, v)
            }
            G_relevant_nodes = G.subgraph(close_nodes).copy()
            # Only continue if what is left is still a group (more than 2 nodes).
            if len(G_relevant_nodes) > 2:
                # G_sub keeps only the below-threshold edges among those nodes.
                G_sub = G_relevant_nodes.edge_subgraph(
                    [(u, v) for u, v, dist in G_relevant_nodes.edges(data="distance") if dist < thresh]
                ).copy()
                # H = complementary graph: every edge missing from a complete graph on G_sub.
                H = nx.complement(G_sub)
                # Measure all paths BEFORE touching SG. Adding edges while still
                # measuring would let earlier shortcut edges shorten later paths,
                # making the result depend on iteration order.
                # The stored value is the number of nodes on the shortest path
                # (i.e. edge count + 1), as returned by len(nx.shortest_path(...)).
                hop_edges = [(u, v, len(nx.shortest_path(SG, u, v))) for u, v in H.edges()]
                for u, v, path_len in hop_edges:
                    # Adds the edge, or only the attribute if the edge already exists.
                    SG.add_edge(u, v, hops=path_len)
                meetings.append([SG, timestamp])

cdf = pd.DataFrame(meetings, columns=["meetings", "timestamp"])

Make a dataframe out of all meetings, grouping consecutive 5-minute timestamps of the same node pair so that they form continuous meetings.

In [None]:
# Flatten every meeting graph into edge rows stamped with that meeting's timestamp.
cdf_dict = cdf.to_dict()
stamp_by_row = cdf_dict["timestamp"]
edge_frames = []
for row_id, graph in cdf_dict["meetings"].items():
    edge_frames.append(
        nx.to_pandas_edgelist(graph).assign(**{"timestamp": stamp_by_row[row_id]})
    )
cdf_exploded = pd.concat(edge_frames).rename(columns={"target": "destination"})

# Direction-independent key for each pair, so (a, b) and (b, a) land together.
cdf_exploded["meeting_nodes"] = cdf_exploded[["source", "destination"]].apply(
    lambda pair: tuple(sorted(pair)), axis=1
)
cdf_exploded = cdf_exploded.sort_values(["meeting_nodes", "timestamp"]).reset_index(drop=True)

# A new meeting id starts whenever the step from the previous row is not exactly 5 minutes.
step = cdf_exploded["timestamp"].diff()
cdf_exploded["meeting_id"] = step.ne(pd.Timedelta('5m')).cumsum()

# One row per continuous meeting of a pair: length in minutes, start time, mean hops/distance.
meeting_aggs = {
    "duration": pd.NamedAgg(column='timestamp', aggfunc=lambda x: x.count() * 5),
    "datetime": pd.NamedAgg(column='timestamp', aggfunc='min'),
    "hops": pd.NamedAgg(column='hops', aggfunc='mean'),
    "distance": pd.NamedAgg(column='distance', aggfunc='mean'),
}
cdf_exploded = (
    cdf_exploded
    .groupby(["meeting_id", "meeting_nodes"])
    .agg(**meeting_aggs)
    .reset_index()
)

# Split the pair key back into its two columns and drop the helper columns.
cdf_exploded[["source", "destination"]] = pd.DataFrame(cdf_exploded["meeting_nodes"].tolist())
cdf_exploded = cdf_exploded.drop(columns=["meeting_nodes", "meeting_id"])

Get all one-on-one meetings from the original CSV and aggregate consecutive 5-minute records of each pair into continuous meetings.

In [None]:
def group_by_time_diff(df):
    """Collapse consecutive 5-minute records of each pair into single meetings.

    Rows are ordered by ``source``, ``destination`` and ``datetime``; a new
    meeting starts whenever the gap to the previous row is not exactly five
    minutes. The input frame is left unmodified.

    Args:
        df: DataFrame with ``source``, ``destination``, ``datetime`` and
            ``distance`` columns.

    Returns:
        DataFrame with one row per meeting and the columns ``source``,
        ``destination``, ``duration`` (record count * 5), ``datetime``
        (first record of the meeting) and ``distance`` (mean over the meeting).
    """
    # Sorting by datetime as well is what makes diff() compare neighbouring
    # timestamps; sorting by the pair columns alone leaves rows in input order.
    ordered = df.sort_values(["source", "destination", "datetime"])
    starts_new_meeting = ordered["datetime"].diff() != pd.Timedelta('5m')
    # assign() works on a copy, so the caller's frame does not gain a "groups" column.
    # A 5-minute step across two different pairs cannot merge them, because the
    # pair columns are part of the group key below.
    ordered = ordered.assign(groups=starts_new_meeting.cumsum())
    return (ordered.groupby(["source", "destination", "groups"])
            .agg(**{
                "duration": pd.NamedAgg(column='datetime', aggfunc=lambda x: x.count() * 5),
                "datetime": pd.NamedAgg(column='datetime', aggfunc='min'),
                "distance": pd.NamedAgg(column='distance', aggfunc='mean')
            })
            .reset_index()
            .drop(columns=["groups"])
           )

In [None]:
# Aggregate the raw records of every (source, destination) pair into meetings; %time reports how long it takes.
%time new_df = df.groupby(["source", "destination"]).apply(group_by_time_diff)

combine and save

In [None]:
# Stack the group-derived rows (which carry `hops`) on top of the one-on-one rows
# and drop exact duplicates. `DataFrame.append` was removed in pandas 2.0;
# `pd.concat` produces the same result.
final = pd.concat([cdf_exploded, new_df]).drop_duplicates(ignore_index=True)

In [None]:
# Persist the combined table without the index. Note this path is relative to the
# working directory, not OUTPUT_FOLDER.
final.to_csv("copenhagen_hops.csv", index=False)

In [None]:
# Overlap between rows that have a `distance` value and rows that have a `hops` value.
# venn2 returns a VennDiagram rather than an Axes, so assigning `.figsize` to its
# result had no effect; size the figure up front and pass its Axes to venn2.
fig, ax = plt.subplots(figsize=(8, 8))
diagram = venn2(
    [
        set(final.index[final["distance"].notna()]),
        set(final.index[final["hops"].notna()]),
    ],
    ("distance", "hops"),
    ax=ax,
)

In [None]:
# 2-D histogram of hops against distance, limited to rows with distance >= 80 and hops < 5.
in_range = (final["distance"]>=80) & (final["hops"]<5)
chart = (
    alt.Chart(final[in_range])
    .mark_rect()
    .encode(
        alt.X('hops:Q', bin=alt.Bin(maxbins=6)),
        alt.Y('distance:Q', bin=alt.Bin(maxbins=20)),
        alt.Color('count()', scale=alt.Scale(scheme='greenblue')),
    )
)
# Keep a standalone HTML copy next to the other outputs, then render inline.
chart.save(str(OUTPUT_FOLDER / "hops_vs_distance.html"), format="html")
chart

## leftovers

In [None]:
# meetings_df = cdf.copy()
# meetings_df["meeting_nodes"] = meetings_df["meetings"].apply(lambda x: tuple(sorted(x.nodes)))
# meetings_df = meetings_df.sort_values(["meeting_nodes", "timestamp"]).reset_index(drop=True)
# meetings_df["meeting_id"] = (meetings_df["timestamp"].diff() !=  pd.Timedelta('5m')).cumsum()

# meetings_df = (meetings_df.groupby(["meeting_id", "meeting_nodes"])
#             .agg(**{
#                 "duration": pd.NamedAgg(column='timestamp', aggfunc=lambda x: x.count() * 5), 
#                 "datetime": pd.NamedAgg(column='timestamp', aggfunc='min')
#             }).reset_index()
#            )
# meetings_df["participants"] = meetings_df["meeting_nodes"].str.len()
# # meetings_index = cdf[["group", "meetings"]].drop_duplicates("meetings")
# meetings_df["meeting_nodes"] = meetings_df["meeting_nodes"].apply(lambda x: list(distinct_combinations(x, r=2)))
# meetings_df = meetings_df.explode("meeting_nodes").reset_index(drop=True)
# meetings_df[["source", "destination"]] = pd.DataFrame(meetings_df["meeting_nodes"].tolist())
# meetings_df = meetings_df.set_index(["datetime", "meeting_nodes"])

In [None]:
def is_full(G):
    """Return True when every node of ``G`` has degree ``len(G) - 1``.

    An empty graph yields False; a single node with degree 0 yields True.
    """
    node_count = len(G)
    if node_count == 0:
        return False
    # G.degree iterates as (node, degree) pairs.
    return all(deg == node_count - 1 for _, deg in G.degree)

# infection in meetings

In [None]:
# Per day and per node: minutes spent in cliques of more than two nodes
# ("meeting_duration") as a share of all interaction minutes of that node.
all_days = []
for day in range(dataset.period):
    groups = []
    for fm in five_minutes[day]:
        # Only `import networkx as nx` is in scope, so both calls must be
        # qualified; the bare names raised NameError.
        G = nx.from_pandas_edgelist(fm, target="destination")
        # Each node counts once per 5-minute bin, however many large cliques it is in.
        groups += list({node for clique in nx.find_cliques(G) if len(clique) > 2 for node in clique})
    # Every record stands for 5 minutes; count each node's appearances on either side.
    interaction_minutes = (
        days[day][["source", "destination"]]
        .stack()
        .value_counts()
        .rename("all_interactions") * 5
    )
    # Named `day_stats` rather than `meetings` so the list of meeting graphs
    # built earlier in the notebook is not overwritten.
    day_stats = (
        pd.DataFrame.from_dict(
            {"meeting_duration": {node: bins * 5 for node, bins in Counter(groups).items()}}
        )
        .join(interaction_minutes)
        .assign(
            percent=lambda x: x["meeting_duration"] / x["all_interactions"],
            infection_date=dataset.start_date + timedelta(days=day),
        )
    )
    all_days.append(day_stats)
percents = pd.concat(all_days).reset_index().rename(columns={"index": "id"}).set_index(["infection_date", "id"])

In [None]:
# Load a pickled frame from OUTPUT_FOLDER and index it by (infection_date, id),
# the same index `percents` uses.
# NOTE(review): unpickling can execute arbitrary code; only load files you produced yourself.
output = pd.read_pickle(OUTPUT_FOLDER / "9427637205343771_df.pkl").reset_index().set_index(["infection_date", "id"])

In [None]:
# Look up `percent` for every row of `output`; rows with no match in `percents` get 0.
output.join(percents, how="left")["percent"].fillna(0)

## Trig and triangulation

In [None]:
def find_angle(a, b, c):
    """Angle opposite side ``a`` in a triangle with sides ``a``, ``b``, ``c``, in degrees.

    Uses the law of cosines: ``cos(A) = (b**2 + c**2 - a**2) / (2*b*c)``.
    """
    cos_of_angle = (b**2 + c**2 - a**2) / (2*b*c)
    angle_rad = np.arccos(cos_of_angle)
    # Radians to degrees.
    return angle_rad * 180 / np.pi

import math 

def find_side(a, b, C):
    """Length of the third side of a triangle from two sides and their included angle.

    Uses the law of cosines: ``sqrt(a**2 + b**2 - 2*a*b*cos(C))``.

    Args:
        a: length of the first known side.
        b: length of the second known side.
        C: angle between ``a`` and ``b`` in degrees, the unit ``find_angle`` returns.

    Returns:
        The length of the side opposite ``C``.
    """
    # cos() works in radians. C used to be passed through unconverted, so a
    # degree value such as 62.697 was read as radians and gave a wrong length.
    return math.sqrt(a**2 + b**2 - 2*b*a*math.cos(math.radians(C)))

find_side(90, 78, 62.697)  # ~88, the inverse of find_angle(88, 90, 78)



## pathpy

In [None]:
# Tag every snapshot with its 1-based position so it can serve as a discrete time step.
# NOTE(review): `split` is not defined in any visible cell; presumably a list of
# snapshot DataFrames. Confirm before running.
# The loop variable is deliberately not called `df`: that name holds the full
# interactions frame, which a later cell still reads.
for i, snapshot in enumerate(split):
    snapshot["time"] = i+1

In [None]:
# Edge list of the first ten snapshots with their time step, handed to pathpy.
# NOTE(review): `pp` is only imported in a commented-out line at the top of the
# notebook, and `split` is not defined in any visible cell.
dff = pd.concat(split[:10])[["source", "destination", "time"]]
tn = pp.io.from_dataframe(dff, directed=False)

In [None]:
# Rebuild `tn` edge by edge from the first 288 snapshots (this replaces the `tn` created above).
# NOTE(review): 288 is presumably the number of 5-minute bins in one day (24 * 60 / 5); confirm.
# NOTE(review): the unpacked timestamp `t` is never passed to add_edge; confirm
# whether the edges of this temporal network are meant to carry a time.
tn = pp.Network(temporal=True)
for i in range(288):
    for _, s, d, t in split[i][["source", "destination", "datetime"]].itertuples():
        tn.add_edge(pp.Edge(pp.Node(s), pp.Node(d)))

In [None]:
# Keyword options forwarded to the HTML export of the network.
style = {"width": 1500, "height": 600, "label_opacity": 0}
#          , "ms_per_frame": 500}
# Writes the visualisation to ./temporal_network.html in the working directory.
pp.visualisation.export_html(tn, './temporal_network.html', **style)

## RSSI max

In [None]:
# For every integer cut-off from 0 up to the largest distance, count the records
# strictly below it and plot the counts on a log-scaled y axis.
distance_col = df["distance"]
amount_below = {
    cutoff: len(df[distance_col < cutoff])
    for cutoff in range(distance_col.max() + 1)
}
pd.DataFrame.from_dict({"amount": amount_below}).plot(logy=True)
# df.groupby("distance")["source"].count().plot.bar(figsize=(16,8))