## Matched Nodes Statistics

In this notebook, we compute the percentage of nodes that we managed to map between snapshots
using our multi-pass node alignment heuristic, as reported in the text of the paper.

### Preparations

In [None]:
import networkx as nx
import os
import json
import pandas as pd
from quantlaw.utils.networkx import get_leaves

In [None]:
# Collect one record per snapshot-mapping file: the leaf counts of both
# snapshots involved and the number of entries in the mapping between them.
data = []
for dataset in ['us', 'de']:
    # Step 1: count the leaves (as returned by get_leaves) of every snapshot
    # graph, keyed by the part of the file name before the first dot.
    graph_dir = f'../../legal-networks-data/{dataset}/4_crossreference_graph/subseqitems/'
    leaves_dict = {}
    for graph_file in sorted(os.listdir(graph_dir)):
        if not graph_file.endswith('.gpickle.gz'):
            continue
        # NOTE(review): nx.read_gpickle was removed in networkx 3.0, so this
        # cell needs networkx < 3. It also unpickles the file, which is only
        # safe for trusted, locally generated data.
        snapshot_graph = nx.read_gpickle(graph_dir + graph_file)
        snapshot_name = graph_file.split('.')[0]
        leaves_dict[snapshot_name] = len(get_leaves(snapshot_graph))
        print(graph_file, 'done')

    # Step 2: for every mapping file named '<snapshot_1>_<snapshot_2>.json',
    # record the mapping size next to the leaf counts of both snapshots.
    mapping_dir = f'../../legal-networks-data/{dataset}/5_snapshot_mapping_edgelist/subseqitems/'
    for mapping_file in sorted(os.listdir(mapping_dir)):
        if not mapping_file.endswith('.json'):
            continue
        with open(mapping_dir + mapping_file) as f:
            mappings = json.load(f)
        # Exactly one underscore is expected in the base name; anything else
        # makes the unpacking below raise ValueError.
        snapshot_1, snapshot_2 = os.path.splitext(mapping_file)[0].split('_')
        data.append(dict(
            dataset=dataset,
            year1=snapshot_1,
            year2=snapshot_2,
            count1=leaves_dict[snapshot_1],
            count2=leaves_dict[snapshot_2],
            mapped=len(mappings),
        ))

In [None]:
# Turn the collected records into a table ordered by dataset and first snapshot.
df = pd.DataFrame(data).sort_values(by=['dataset', 'year1'])

# Ratio of mapped entries to the leaf count of the smaller of the two snapshots.
# NOTE: the column name 'Mappend Ratio' (sic) is read back by name in the
# summary cell below, so its spelling must stay in sync with that cell.
df['Mappend Ratio'] = [
    mapped / min(count1, count2)
    for mapped, count1, count2 in zip(df['mapped'], df['count1'], df['count2'])
]

# Persist the table so the summary can be recomputed without reloading graphs.
df.to_csv('../graphics/matched_stats.csv')

In [None]:
# Reload the persisted statistics and show summary statistics of the
# mapped ratio separately for each dataset.
df = pd.read_csv('../graphics/matched_stats.csv')
ratio_by_dataset = df.groupby('dataset')['Mappend Ratio']
ratio_by_dataset.describe()

### End

