In [1]:
import numpy as np
import pandas as pd
from simulation.helpers import timing
import altair as alt
from simulation.basic_configuration import BasicConfiguration
from simulation.google_cloud import GoogleCloud
from simulation.visualizer import Visualizer
from simulation.output import Output
from simulation.dataset import Dataset
from simulation.constants import *
%matplotlib inline

Make sure to initialize all of these:

In [2]:
# Configuration object; handed to the Google Cloud helper on the next line.
bc = BasicConfiguration()
gcloud = GoogleCloud(bc)
# Dataset is selected by name, and load_dataset() is called before Output wraps it.
dataset = Dataset("copenhagen_interactions")
dataset.load_dataset()
# Output is built on the dataset and Visualizer on that Output. Later cells read
# output.colors, visualizer.colors, dataset.period and dataset.nodes.
output = Output(dataset)
visualizer = Visualizer(output)

You must have a __concated__ (concatenated, not averaged) output for this to work: the code examines each repetition separately, so an averaged version cannot be used.

Make sure you add `output.df.export(how="concated")` at the end of the full run. 

Then change `REPETITIONS` here to match the number of repetitions you used for this run.

In [3]:
# Number of repetitions in the concated output. Used below both in the CSV file
# name and as the number of chunks the loaded file is split into.
REPETITIONS = 30


## Kullback Leibler

This code computes the KL divergence between the average of repetitions `0->i` and the average of repetitions `0->i+1`, for each day. Within a day, the proportions P of all colors sum to 1.0.

Change `SUM_KLD` to `True` if you want a summed chart.

In [4]:
# When True, the KLD chart is aggregated to sum(KLD) per repetition.
SUM_KLD = False

In [5]:
def KL(P, Q):
    """Row-wise Kullback-Leibler divergence D(P || Q).

    Parameters
    ----------
    P, Q : array-like, 2-D, same shape
        Each row is treated as one distribution; the sum runs over axis 1.

    Returns
    -------
    numpy.ndarray
        One divergence value per row.

    Notes
    -----
    Terms with ``P == 0`` contribute 0 (the usual ``0 * log 0 = 0`` convention)
    instead of NaN. A term with ``P > 0`` and ``Q == 0`` yields ``inf``.
    """
    P = np.asarray(P, dtype=float)
    Q = np.asarray(Q, dtype=float)
    # np.where evaluates both branches, so silence the warnings raised by the
    # discarded log(0) / 0-division entries.
    with np.errstate(divide="ignore", invalid="ignore"):
        terms = np.where(P > 0, P * np.log(P / Q), 0.0)
    return np.sum(terms, axis=1)


def avg(dfs):
    """Average ``amount`` per (color, day) over several repetitions.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        One frame per repetition, each with ``color``, ``day`` and ``amount``
        columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dataset.period, len(output.colors))``: one row per
        day, one column per color, divided by ``dataset.nodes``.

    Notes
    -----
    Reads the notebook-level ``output`` and ``dataset`` objects. A (color, day)
    pair that is absent from the data is treated as a zero mean, and every zero
    mean is replaced by a small epsilon so the KL divergence stays finite.
    """
    epsilon = .00001  # stand-in for 0, keeps log() in KL well defined
    n_colors = len(output.colors)
    full_grid = pd.MultiIndex.from_product([output.colors, range(dataset.period)])
    mean_amounts = (pd.concat(dfs)
                    .groupby(["color", "day"])["amount"]
                    .mean()
                    # fill missing pairs BEFORE the epsilon replacement, so they
                    # do not turn into NaN and poison that day's divergence
                    .reindex(full_grid, fill_value=0)
                    .replace(0, epsilon)
                   )
    # values are color-major after the reindex; transpose to (day, color)
    return mean_amounts.values.reshape(n_colors, dataset.period).T / dataset.nodes

In [6]:
concated = pd.read_csv(OUTPUT_FOLDER / f"concated_output_{REPETITIONS}.csv")
# One chunk per repetition. Assumes the file stores repetitions one after the
# other with equally many rows each — TODO confirm against the export format.
df_list = np.array_split(concated, REPETITIONS)

# Running averages over the first 1, 2, ..., REPETITIONS repetitions. Each one
# is computed once, although it is compared with both of its neighbours.
running_avgs = [avg(df_list[:count]) for count in range(1, REPETITIONS + 1)]

# Symmetrised KL divergence between consecutive running averages (one value per day).
KLDs = [(KL(prev, this) + KL(this, prev)) / 2
        for prev, this in zip(running_avgs, running_avgs[1:])]

# Long format: one row per (comparison step, day).
final_KLDs = (pd.DataFrame(np.array(KLDs), columns=range(dataset.period))
                  .reset_index()
                  .melt(id_vars="index")
                  .rename(columns={"index": "repetition", "value": "KLD", "variable": "day"})
                 )
# Label step k as "kk-(k+1)", zero-padded so the labels sort correctly on the axis.
final_KLDs["repetition"] = final_KLDs["repetition"].map(
    lambda step: f"{int(step):02}-{int(step) + 1:02}"
)

In [7]:
# One line per day by default; a single summed line when SUM_KLD is set.
kld_lines = alt.Chart(final_KLDs).mark_line()
if SUM_KLD:
    # The aggregate keeps only "repetition" and the summed "KLD", so the
    # encodings must not reference "day" (it no longer exists downstream).
    KLD_chart = (kld_lines
                 .transform_aggregate(
                     KLD='sum(KLD)',
                     groupby=["repetition"]
                 )
                 .encode(x="repetition", y="KLD:Q",
                         tooltip=["repetition:N", "KLD:Q"])
                )
else:
    KLD_chart = kld_lines.encode(x="repetition", y="KLD:Q",
                                 color="day:O",
                                 tooltip=final_KLDs.columns.tolist())
KLD_chart = KLD_chart.interactive()
KLD_chart

## Proportion Test

This is the beginning of the work on the proportion tests; it is not finished yet.

Currently plotting the proportion at the last day only.

According to the visualization, the amounts seem to be fairly stable across repetitions.

In [8]:

def last_percent(df):
    """Return the ``amount`` of every color taken from the tail of one repetition.

    The last ``len(visualizer.colors)`` rows of ``df`` are indexed by ``color``
    and reordered to follow ``visualizer.colors``. Amounts are returned as
    stored; no percentage scaling is applied.

    NOTE(review): this presumably relies on the final rows holding exactly one
    row per color for the last day — confirm against the export format.
    """
    n_colors = len(visualizer.colors)
    closing_rows = df.tail(n_colors).set_index("color")
    ordered = closing_rows.reindex(visualizer.colors.keys())
    return ordered["amount"]

In [9]:
# One Series of end-of-run amounts per repetition.
percents = [last_percent(df) for df in df_list]
# Each Series is ordered by visualizer.colors.keys() (see the reindex in
# last_percent), so the columns are labelled with those same keys; labelling
# with a differently ordered color list would silently swap the colors.
final_percents = (pd.DataFrame(np.array(percents), columns=list(visualizer.colors.keys()))
                    .reset_index()
                  .melt(id_vars="index")
                  .rename(columns={"index": "repetition", "value": "amount", "variable": "color"})
                 )

In [10]:
# Pin every color name to its display color, in the order given by visualizer.colors.
color_domain = list(visualizer.colors.keys())
color_range = list(visualizer.colors.values())
color_encoding = alt.Color("color", scale=alt.Scale(domain=color_domain, range=color_range))

# One line per color: amount at the end of the run, across repetitions.
(alt.Chart(final_percents)
 .mark_line()
 .encode(x="repetition:O", y="amount:Q", color=color_encoding)
 .interactive())