## Variance between Infomap Runs

In this notebook, we check the robustness of our results against the randomness inherent in the `infomap` algorithm
as reported in the SI.

In short, we investigate the question:
How much variance is there between `infomap` runs that use different seeds?

### Preparations

In [None]:
from collections import defaultdict
import itertools
import multiprocessing
import networkx as nx
import pandas as pd
import matplotlib.pylab as plt
import matplotlib.colors as colors
import seaborn as sns
from cdlib import NodeClustering, evaluation

from legal_data_clustering.pipeline.cd_cluster import cluster

In [None]:
def _cluster(run_idx):
    """Cluster the module-level graph once, seeded from ``run_idx``.

    Reads the module-level names ``g`` and ``config`` and passes them to
    ``cluster`` together with the seed ``run_idx * 10000``. Defined at module
    level because it is handed to ``multiprocessing.Pool.map``.

    Returns:
        The ``communities`` attribute of the object returned by ``cluster``.
    """
    seed = run_idx * 10000
    clustering = cluster(g, config, return_tree=False, seed=seed)
    return clustering.communities

def analyze_cluster_run_diff(year, graphfile, n_runs=100, n_processes=4):
    """Score how much repeated clusterings of one graph agree with each other.

    The graph is clustered ``n_runs`` times (one seed per run, see
    ``_cluster``) and every unordered pair of clusterings is compared.

    Args:
        year: Label of the snapshot. It is not used in the computation and is
            kept so that existing callers continue to work.
        graphfile: Path of the gpickle file that holds the graph.
        n_runs: Number of clusterings to compute; run indices are
            ``range(n_runs)``.
        n_processes: Number of worker processes in the pool.

    Returns:
        A dict with the keys ``"NMI"`` and ``"Rand"``. Each value is a list
        holding one score per pair of clusterings, in the order produced by
        ``itertools.combinations``.
    """
    # ``_cluster`` reads these module-level names, so they are assigned before
    # the worker pool is created.
    # NOTE(review): this presumably relies on workers inheriting the parent's
    # globals (fork start method) - confirm on platforms that spawn workers.
    global g, config
    # NOTE(review): ``nx.read_gpickle`` is not available in NetworkX >= 3.0 -
    # confirm the pinned NetworkX version.
    g = nx.read_gpickle(graphfile)
    config = dict(method='infomap', markov_time=1.0, number_of_modules=100)

    with multiprocessing.Pool(n_processes) as p:
        clusterings = p.map(_cluster, range(n_runs))

    # Wrap every clustering exactly once; the original rebuilt the wrapper
    # for each pair and for each metric.
    partitions = [NodeClustering(communities, g, None) for communities in clusterings]

    scores_nmi = []
    scores_rand = []
    for first, second in itertools.combinations(partitions, 2):
        scores_nmi.append(
            evaluation.normalized_mutual_information(first, second).score
        )
        scores_rand.append(
            evaluation.adjusted_rand_index(first, second).score
        )

    return {
        "NMI": scores_nmi,
        "Rand": scores_rand,
    }

In [None]:
# Snapshot years covered by the analysis: 1994 through 2018, both inclusive.
years = [*range(1994, 2019)]

### Compute statistics

#### US

In [None]:
# Collect, per metric, one list of pairwise scores for every year (US graphs).
scores = defaultdict(list)
for year in years:
    graph_path = f"../../legal-networks-data/us/10_preprocessed_graph/{year}_0-0_1-0_-1.gpickle.gz"
    yearly_scores = analyze_cluster_run_diff(year, graph_path)
    for method, method_scores in yearly_scores.items():
        scores[method].append(method_scores)

    print(year, 'done')

In [None]:
# Build one frame per metric whose columns are (Method, year) and whose rows
# are the pairs of clusterings.
dfs = []
for method in scores:
    method_df = pd.DataFrame(dict(zip(years, scores[method]))).T
    method_df['Method'] = method
    method_df = method_df.reset_index().set_index(['Method', 'index']).T
    dfs.append(method_df)

# The per-metric frames share their row index but have disjoint columns, so
# they are joined side by side. Stacking them row-wise (the default) would pad
# every column with NaN and duplicate the row index.
df = pd.concat(dfs, axis=1)
df.to_pickle('../graphics/variance_infomap_runs_us.pickle')

In [None]:
# Reload the stored US scores so the cells below do not need the computation
# above, then show summary statistics for every column.
df = pd.read_pickle('../graphics/variance_infomap_runs_us.pickle')
df.describe()

In [None]:
# Plot styling shared by all figures below.
cm = plt.get_cmap('viridis')
cNorm = colors.Normalize(vmin=0, vmax=len(years) - 1)
plt.rcParams.update({
    'figure.figsize': (12, 9),
    'font.size': 16,
})
sns.set_style('darkgrid')

In [None]:
def create_variance_plot(df_col, xlabel, save_path=None):
    """Draw one smoothed score distribution per column of ``df_col``.

    Args:
        df_col: DataFrame whose columns are plotted one by one; each column
            label is used as the legend entry of its curve.
        xlabel: Label of the x axis.
        save_path: If not ``None``, the figure is saved to this path.
    """
    # Spread the colours over however many curves there are instead of
    # assuming exactly 25 columns.
    n_curves = len(df_col.columns)
    # ``items`` is used because ``iteritems`` was removed in pandas 2.0.
    for idx, (year, col) in enumerate(df_col.items()):
        # NOTE(review): ``distplot`` is deprecated in recent seaborn releases;
        # kept here so that the figures stay unchanged.
        sns.distplot(
            col,
            norm_hist=True,
            hist=False,
            rug=False,
            color=cm(idx / n_curves),
            label=year,
        )
    plt.legend(ncol=2, loc='upper left', bbox_to_anchor=(0,1))
    plt.xlabel(xlabel)
    plt.ylabel('Smoothed Frequency among 4950 Pairs of Clusterings')
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)

In [None]:
# Distribution of pairwise NMI scores per year (US).
create_variance_plot(
    df_col=df['NMI'],
    xlabel='Normalised Mutual Information',
    save_path='../graphics/variance_infomap_runs_us_nmi.pdf',
)

In [None]:
# Distribution of pairwise adjusted Rand index scores per year (US).
create_variance_plot(
    df_col=df['Rand'],
    xlabel='Adjusted Rand Index',
    save_path='../graphics/variance_infomap_runs_us_rand.pdf',
)

#### DE

In [None]:
# Collect, per metric, one list of pairwise scores for every year (DE graphs).
scores = defaultdict(list)
for year in years:
    graph_path = f"../../legal-networks-data/de/10_preprocessed_graph/{year}-01-01_0-0_1-0_-1.gpickle.gz"
    yearly_scores = analyze_cluster_run_diff(year, graph_path)
    for method, method_scores in yearly_scores.items():
        scores[method].append(method_scores)

    print(year, 'done')

In [None]:
# Build one frame per metric whose columns are (Method, year) and whose rows
# are the pairs of clusterings.
dfs = []
for method in scores:
    method_df = pd.DataFrame(dict(zip(years, scores[method]))).T
    method_df['Method'] = method
    method_df = method_df.reset_index().set_index(['Method', 'index']).T
    dfs.append(method_df)

# The per-metric frames share their row index but have disjoint columns, so
# they are joined side by side. Stacking them row-wise (the default) would pad
# every column with NaN and duplicate the row index.
df = pd.concat(dfs, axis=1)
df.to_pickle('../graphics/variance_infomap_runs_de.pickle')

In [None]:
# Reload the stored DE scores so the cells below do not need the computation
# above, then show summary statistics for every column.
df = pd.read_pickle('../graphics/variance_infomap_runs_de.pickle')
df.describe()

In [None]:
# Distribution of pairwise NMI scores per year (DE).
create_variance_plot(
    df_col=df['NMI'],
    xlabel='Normalised Mutual Information',
    save_path='../graphics/variance_infomap_runs_de_nmi.pdf',
)

In [None]:
# Distribution of pairwise adjusted Rand index scores per year (DE).
create_variance_plot(
    df_col=df['Rand'],
    xlabel='Adjusted Rand Index',
    save_path='../graphics/variance_infomap_runs_de_rand.pdf',
)