## Summary Statistics

In this notebook, we compute the statistics for Table 1 and create Figure 2.
We also derive some further statistics, not all of which are discussed in the paper.

### Preparation

In [None]:
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from quantlaw.utils.files import list_dir
from quantlaw.utils.networkx import induced_subgraph, hierarchy_graph

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'darkgrid' style for all following plots.
sns.set_style('darkgrid')

In [None]:
def abs_to_rel(abs_array):
    """
    Express a series relative to its first element (the baseline).

    Parameters
    ----------
    abs_array : array-like
        Absolute values; numpy arrays as well as plain lists and tuples
        are accepted.

    Returns
    -------
    numpy.ndarray
        ``abs_array`` divided element-wise by ``abs_array[0]``, so the
        first entry of the result is 1.0 for a non-zero baseline.

    Raises
    ------
    IndexError
        If ``abs_array`` is empty.
    """
    # Coerce first: a plain Python list does not support `list / number`.
    abs_array = np.asarray(abs_array)
    rel_array = abs_array / abs_array[0]
    return rel_array

In [None]:
def plot_point_statistic(xs, de, us, xlabel, ylabel, savepath=None):
    """
    Plot one statistic for Germany and the United States over shared x values.

    xs: x values (also used as tick positions)
    de: values for Germany, drawn in black with upward triangles
    us: values for the United States, drawn in red with downward triangles
    xlabel, ylabel: axis labels
    savepath: if given, the figure is additionally written to this path
    """
    # Labels are attached to the lines themselves. Passing a bare label list to
    # plt.legend pairs labels with artists purely by creation order, which
    # breaks as soon as any other artist ends up on the axes.
    sns.lineplot(x=xs, y=de, color='k', marker='^', markersize=5, label='DE')
    sns.lineplot(x=xs, y=us, color='r', marker='v', markersize=5, label='US')
    plt.xticks(xs, rotation=45)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.legend()
    if savepath is not None:
        plt.savefig(savepath, dpi=1000)

In [None]:
def plot_two_point_statistics(xs, stat1, stat2,
                              xlabel, ylabel, color1, color2,
                              savepath=None,
                              label1='Tokens', label2='Cross-references'):
    """
    Plot two statistics over shared x values in one figure.

    xs: x values (also used as tick positions)
    stat1, stat2: the two series; stat1 uses upward, stat2 downward triangles
    xlabel, ylabel: axis labels
    color1, color2: line colors of stat1 and stat2
    savepath: if given, the figure is additionally written to this path
    label1, label2: legend entries of stat1 and stat2; the defaults reproduce
        the previously hard-coded legend
    """
    # Labels are attached to the lines themselves rather than passed to
    # plt.legend as a positional list, which is matched by artist order only.
    sns.lineplot(x=xs, y=stat1, marker='^', color=color1, markersize=5, label=label1)
    sns.lineplot(x=xs, y=stat2, marker='v', color=color2, markersize=5, label=label2)
    plt.xticks(xs, rotation=45)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.legend()
    if savepath is not None:
        plt.savefig(savepath, dpi=1000)

In [None]:
# Directories with the seqitem-level cross-reference graphs of each country,
# given relative to the notebook's working directory.
us_crossreference_path = '../../legal-networks-data/us/4_crossreference_graph/seqitems'
de_crossreference_path = '../../legal-networks-data/de/4_crossreference_graph/seqitems'

In [None]:
# Load all German graphs, ordered by file name.
# list_dir presumably returns the names of files ending in 'gpickle.gz' — verify against quantlaw.
# NOTE(review): nx.read_gpickle was deprecated in networkx 2.6 and removed in 3.0;
# this cell needs networkx < 3. It also unpickles the files, so only load trusted data.
de_graph_files = sorted(list_dir(de_crossreference_path, 'gpickle.gz'))
de_graphs = [nx.read_gpickle(f'{de_crossreference_path}/{gf}') for gf in de_graph_files]

In [None]:
# Load all US graphs, ordered by file name.
# list_dir presumably returns the names of files ending in 'gpickle.gz' — verify against quantlaw.
# NOTE(review): nx.read_gpickle was deprecated in networkx 2.6 and removed in 3.0;
# this cell needs networkx < 3. It also unpickles the files, so only load trusted data.
us_graph_files = sorted(list_dir(us_crossreference_path, 'gpickle.gz'))
us_graphs = [nx.read_gpickle(f'{us_crossreference_path}/{gf}') for gf in us_graph_files]

In [None]:
# Shared x axis: one calendar year per graph, starting with the first snapshot.
FIRST_YEAR = 1994
# Every per-country series below has one value per loaded graph. If the two
# countries differ in length, the series no longer match `years`, so fail here
# with a clear message instead of later inside a plotting call.
assert len(de_graphs) == len(us_graphs), (
    f'DE has {len(de_graphs)} graphs but US has {len(us_graphs)}; '
    'a shared year axis needs the same number of snapshots for both'
)
years = list(range(FIRST_YEAR, FIRST_YEAR + min(len(de_graphs), len(us_graphs))))  # usually our xs

In [None]:
# Default figure size (width, height) for all figures created from here on.
plt.rcParams['figure.figsize'] = (9,6)

### Number of tokens (~ "words") over time

In [None]:
# Tokens per German graph, summed over the nodes at level 0 only,
# plus the same series relative to the first graph.
de_tokens_n_abs = np.array([
    sum(attrs['tokens_n'] for _, attrs in graph.nodes(data=True) if attrs['level'] == 0)
    for graph in de_graphs
])
de_tokens_n_rel = abs_to_rel(de_tokens_n_abs)

In [None]:
# Tokens per US graph, summed over the nodes at level 0 only,
# plus the same series relative to the first graph.
us_tokens_n_abs = np.array([
    sum(attrs['tokens_n'] for _, attrs in graph.nodes(data=True) if attrs['level'] == 0)
    for graph in us_graphs
])
us_tokens_n_rel = abs_to_rel(us_tokens_n_abs)

##### Table 1, Line 1

In [None]:
# Token counts of the first and last snapshot and their ratio.
# The year labels are taken from `years` so they cannot drift from the data.
print(f'DE #Tokens {years[0]}: {de_tokens_n_abs[0]}, '
      f'DE #Tokens {years[-1]}: {de_tokens_n_abs[-1]}, '
      f'Ratio: {de_tokens_n_abs[-1] / de_tokens_n_abs[0]}')
print(f'US #Tokens {years[0]}: {us_tokens_n_abs[0]}, '
      f'US #Tokens {years[-1]}: {us_tokens_n_abs[-1]}, '
      f'Ratio: {us_tokens_n_abs[-1] / us_tokens_n_abs[0]}')

##### Single-line plots

In [None]:
# Token growth of both countries relative to the first year.
plot_point_statistic(
    xs=years,
    de=de_tokens_n_rel,
    us=us_tokens_n_rel,
    xlabel='Year',
    ylabel=f'Tokens relative to {years[0]} baseline',
    savepath='../graphics/tokens-growth-relative.pdf',
)

In [None]:
# Absolute token counts of both countries.
# The ylabel is a plain string: the f-prefix had no placeholders to fill.
plot_point_statistic(years, de_tokens_n_abs, us_tokens_n_abs,
                     'Year', 'Number of tokens',
                     savepath='../graphics/tokens-growth-absolute.pdf'
                    )

### Number of structural elements (aka nodes) over time

#### Without subseqitems

In [None]:
# Node count of every German graph, absolute and relative to the first graph.
de_structures_n_abs = np.array(
    [graph.number_of_nodes() for graph in de_graphs]
)
de_structures_n_rel = abs_to_rel(de_structures_n_abs)

In [None]:
# Node count of every US graph, absolute and relative to the first graph.
us_structures_n_abs = np.array(
    [graph.number_of_nodes() for graph in us_graphs]
)
us_structures_n_rel = abs_to_rel(us_structures_n_abs)

In [None]:
# Display the absolute node counts of both countries.
de_structures_n_abs, us_structures_n_abs

##### Single-line plots

In [None]:
# Growth of the node counts (graphs without subseqitems) relative to the first year.
plot_point_statistic(
    xs=years,
    de=de_structures_n_rel,
    us=us_structures_n_rel,
    xlabel='Year',
    ylabel=f'Structural elements relative to {years[0]} baseline',
    savepath='../graphics/structures-growth-relative-nosubseqitems.pdf',
)

In [None]:
# Absolute node counts (graphs without subseqitems) of both countries.
# The ylabel is a plain string: the f-prefix had no placeholders to fill.
plot_point_statistic(years, de_structures_n_abs, us_structures_n_abs,
                     'Year', 'Number of structural elements',
                     savepath='../graphics/structures-growth-absolute-nosubseqitems.pdf'
                    )

#### With subseqitems

In [None]:
def get_ssi_stats(country):
    """
    Count the nodes of every subseqitem-level graph of one country.

    country: 'us' (compared case-insensitively) selects the US data;
        any other value selects the German data.

    Returns a pair of numpy arrays, ordered by file name: the absolute node
    counts and the counts relative to the first graph.
    """
    if country.lower() == 'us':
        ssi_dir = '../../legal-networks-data/us/4_crossreference_graph/subseqitems'
    else:
        ssi_dir = '../../legal-networks-data/de/4_crossreference_graph/subseqitems'
    # Each graph is loaded only to read off its node count.
    node_counts = np.array([
        nx.read_gpickle(f'{ssi_dir}/{file_name}').number_of_nodes()
        for file_name in sorted(list_dir(ssi_dir, 'gpickle.gz'))
    ])
    return node_counts, abs_to_rel(node_counts)

In [None]:
# Node counts of the German graphs that include subseqitems (absolute and relative).
de_n_structures_abs_ssi, de_n_structures_rel_ssi = get_ssi_stats('de')

In [None]:
# Node counts of the US graphs that include subseqitems (absolute and relative).
us_n_structures_abs_ssi, us_n_structures_rel_ssi = get_ssi_stats('us')

##### Table 1, Line 2

In [None]:
# Node counts (including subseqitems) of the first and last snapshot and their ratio.
# The year labels are taken from `years` so they cannot drift from the data.
print(f'DE #Structures (incl. subseqitems) {years[0]}: {de_n_structures_abs_ssi[0]}, '
      f'DE #Structures (incl. subseqitems) {years[-1]}: {de_n_structures_abs_ssi[-1]}, '
      f'Ratio: {de_n_structures_abs_ssi[-1] / de_n_structures_abs_ssi[0]}')
print(f'US #Structures (incl. subseqitems) {years[0]}: {us_n_structures_abs_ssi[0]}, '
      f'US #Structures (incl. subseqitems) {years[-1]}: {us_n_structures_abs_ssi[-1]}, '
      f'Ratio: {us_n_structures_abs_ssi[-1] / us_n_structures_abs_ssi[0]}')

##### Single-line plots

In [None]:
# Growth of the node counts (including subseqitems) relative to the first year.
plot_point_statistic(
    xs=years,
    de=de_n_structures_rel_ssi,
    us=us_n_structures_rel_ssi,
    xlabel='Year',
    ylabel=f'Structural elements relative to {years[0]} baseline',
    savepath='../graphics/structures-growth-relative.pdf',
)

In [None]:
# Absolute node counts (including subseqitems) of both countries.
# The ylabel is a plain string: the f-prefix had no placeholders to fill.
plot_point_statistic(years, de_n_structures_abs_ssi, us_n_structures_abs_ssi,
                     'Year', 'Number of structural elements',
                     savepath='../graphics/structures-growth-absolute.pdf'
                    )

#### Just seqitems (~ "number of sections")

In [None]:
# Per German graph: number of nodes whose 'citekey' attribute is set (not None),
# absolute and relative to the first graph.
de_seqitems_n_abs = np.array([
    sum(1 for _, attrs in graph.nodes(data=True) if attrs.get('citekey') is not None)
    for graph in de_graphs
])
de_seqitems_n_rel = abs_to_rel(de_seqitems_n_abs)

In [None]:
# Per US graph: number of nodes whose 'citekey' attribute is set (not None),
# absolute and relative to the first graph.
us_seqitems_n_abs = np.array([
    sum(1 for _, attrs in graph.nodes(data=True) if attrs.get('citekey') is not None)
    for graph in us_graphs
])
us_seqitems_n_rel = abs_to_rel(us_seqitems_n_abs)

In [None]:
# Display the absolute seqitem counts of both countries.
de_seqitems_n_abs, us_seqitems_n_abs

##### Single-line plot

In [None]:
# Growth of the seqitem counts relative to the first year.
plot_point_statistic(
    xs=years,
    de=de_seqitems_n_rel,
    us=us_seqitems_n_rel,
    xlabel='Year',
    ylabel=f'Numbered elements relative to {years[0]} baseline',
    savepath='../graphics/presentation-both-seqitems-growth.pdf',
)

### Number of cross-references over time

In [None]:
# Per German graph: number of edges whose 'edge_type' is 'reference',
# absolute and relative to the first graph.
de_crossrefs_n_abs = np.array([
    sum(1 for *_, edge_type in graph.edges(data='edge_type') if edge_type == 'reference')
    for graph in de_graphs
])
de_crossrefs_n_rel = abs_to_rel(de_crossrefs_n_abs)

In [None]:
# Per US graph: number of edges whose 'edge_type' is 'reference',
# absolute and relative to the first graph.
us_crossrefs_n_abs = np.array([
    sum(1 for *_, edge_type in graph.edges(data='edge_type') if edge_type == 'reference')
    for graph in us_graphs
])
us_crossrefs_n_rel = abs_to_rel(us_crossrefs_n_abs)

##### Table 1, Line 3

In [None]:
# Cross-reference counts of the first and last snapshot and their ratio.
# The year labels are taken from `years` so they cannot drift from the data.
print(f'DE #Crossreferences {years[0]}: {de_crossrefs_n_abs[0]}, '
      f'DE #Crossreferences {years[-1]}: {de_crossrefs_n_abs[-1]}, '
      f'Ratio: {de_crossrefs_n_abs[-1] / de_crossrefs_n_abs[0]}')
print(f'US #Crossreferences {years[0]}: {us_crossrefs_n_abs[0]}, '
      f'US #Crossreferences {years[-1]}: {us_crossrefs_n_abs[-1]}, '
      f'Ratio: {us_crossrefs_n_abs[-1] / us_crossrefs_n_abs[0]}')

##### Single-line plots

In [None]:
# Growth of the cross-reference counts relative to the first year.
plot_point_statistic(
    xs=years,
    de=de_crossrefs_n_rel,
    us=us_crossrefs_n_rel,
    xlabel='Year',
    ylabel=f'Cross-references relative to {years[0]} baseline',
    savepath='../graphics/crossrefs-growth-relative.pdf',
)

In [None]:
# Absolute cross-reference counts of both countries.
# The ylabel is a plain string: the f-prefix had no placeholders to fill.
plot_point_statistic(years, de_crossrefs_n_abs, us_crossrefs_n_abs,
                     'Year', 'Number of cross-references',
                    savepath='../graphics/crossrefs-growth-absolute.pdf'
                    )

### Tokens vs Structures vs References Growth

In [None]:
def plot_statistics(xs, statistics, xlabel, ylabel, savepath=None):
    """
    Plot several series over shared x values in one figure.

    xs: x values (also used as tick positions)
    statistics: list of (stat, marker, color, label) tuples, one per series
    xlabel, ylabel: axis labels
    savepath: if given, the figure is additionally written to this path

    The y axis is fixed to the range 0.95-2.05 with ticks every 0.1, so values
    outside that range are not visible.
    """
    linestyles = ['-', '--', ':']
    for idx, (stat, marker, color, label) in enumerate(statistics):
        # The label is attached to the line itself; a positional label list
        # given to plt.legend is matched to artists by creation order only.
        sns.lineplot(x=xs, y=stat, marker=marker, color=color, markersize=7.5, label=label)
        # Cycle through the styles so a fourth series does not raise IndexError.
        plt.gca().lines[-1].set_linestyle(linestyles[idx % len(linestyles)])
    plt.xticks(xs, rotation=90, fontsize=14)
    plt.yticks(np.arange(0.9,2.1,0.1), fontsize=14)
    plt.ylim(0.95,2.05)
    plt.xlabel(xlabel, fontsize=16)
    plt.ylabel(ylabel, fontsize=16)
    plt.tight_layout()
    # Built last so the legend handles show the line styles set in the loop.
    plt.legend(loc='upper left', fontsize=16)
    if savepath is not None:
        plt.savefig(savepath, dpi=1000)

In [None]:
# Germany: relative growth of tokens, structures and cross-references.
# The baseline year in the ylabel comes from `years`, as in the other plot cells.
plot_statistics(years, [(de_tokens_n_rel, 'o', 'k', 'Tokens'),
                        (de_n_structures_rel_ssi, '^', 'b', 'Structures'),
                        (de_crossrefs_n_rel, 'v', 'r', 'Cross-References')
                       ],
                'Year', f'Growth relative to {years[0]} baseline',
                savepath='../graphics/de-tokens-structures-crossrefs-growth-rel.pdf'
               )

In [None]:
# United States: relative growth of tokens, structures and cross-references.
# The baseline year in the ylabel comes from `years`, as in the other plot cells.
plot_statistics(years, [(us_tokens_n_rel, 'o', 'k', 'Tokens'),
                        (us_n_structures_rel_ssi, '^', 'b', 'Structures'),
                        (us_crossrefs_n_rel, 'v', 'r', 'Cross-References')
                       ],
                'Year', f'Growth relative to {years[0]} baseline',
                savepath='../graphics/us-tokens-structures-crossrefs-growth-rel.pdf'
               )

### Additional explorations (not discussed in the paper)

#### Top k indegree and outdegree

In [None]:
# Select the 2017 snapshots by year instead of by position from the end, so the
# choice stays correct if later snapshots are added. This relies on the i-th
# graph of each country belonging to years[i]; a missing year raises ValueError.
SNAPSHOT_YEAR = 2017
de_G = de_graphs[years.index(SNAPSHOT_YEAR)]
us_G = us_graphs[years.index(SNAPSHOT_YEAR)]

In [None]:
def get_top_k(G, func, k):
    """
    Return the k nodes of G with the largest func(node), largest first.

    Each entry is a (node, func(node), heading) tuple; heading is the node's
    'heading' attribute, or None if the node has none. Nodes with equal values
    keep the order in which G.nodes yields them.
    """
    scored = []
    for node in G.nodes:
        scored.append((node, func(node), G.nodes[node].get('heading')))
    scored.sort(key=lambda entry: entry[1], reverse=True)
    return scored[:k]

In [None]:
# Restrict both graphs to their cross-reference edges.
# Presumably induced_subgraph keeps only edges whose 'edge_type' is in
# filter_values — verify against quantlaw.utils.networkx.
de_iG = induced_subgraph(de_G, filter_type='edge', 
                         filter_attribute='edge_type', filter_values=['reference'])

us_iG = induced_subgraph(us_G, filter_type='edge', 
                         filter_attribute='edge_type', filter_values=['reference'])

In [None]:
# Ten nodes with the largest out-degree in de_iG.
get_top_k(de_iG, de_iG.out_degree, 10)

In [None]:
# Ten nodes with the largest in-degree in de_iG.
get_top_k(de_iG, de_iG.in_degree, 10)

In [None]:
# Ten nodes with the largest out-degree in us_iG.
get_top_k(us_iG, us_iG.out_degree, 10)

In [None]:
# Ten nodes with the largest in-degree in us_iG.
get_top_k(us_iG, us_iG.in_degree, 10)

#### Indegree and Outdegree Histograms

In [None]:
def plot_degree_distribution(G, kind='indegree'):
    """
    Plot a histogram (log-scaled counts) of the degrees of all nodes in G
    that carry a 'citekey' attribute.

    G: directed graph providing nodes(data=True), in_degree and out_degree
    kind: 'indegree' | 'outdegree'

    Raises ValueError for any other value of kind. Previously such a value was
    silently plotted as out-degree under a misleading axis label.
    """
    if kind not in ('indegree', 'outdegree'):
        raise ValueError(f"kind must be 'indegree' or 'outdegree', got {kind!r}")
    # Choose the degree view once instead of re-testing `kind` for every node.
    degree_of = G.in_degree if kind == 'indegree' else G.out_degree
    seqitems = [n for n, ndata in G.nodes(data=True) if 'citekey' in ndata]
    seqitems_degrees = [degree_of(n) for n in seqitems]
    # 55 bins of width 10; degrees above 550 lie outside `range` and are not drawn.
    plt.hist(seqitems_degrees, log=True, bins=55, range=(0,550), color='k')
    plt.ylim(10**-0.25,10**5)
    plt.xlabel(kind.capitalize())
    plt.xticks(range(0,551,50))
    plt.ylabel('Number of Sequence Items')
    plt.tight_layout()

In [None]:
# In-degree histogram for de_iG.
plot_degree_distribution(de_iG, 'indegree')

In [None]:
# In-degree histogram for us_iG.
plot_degree_distribution(us_iG, 'indegree')

In [None]:
# Out-degree histogram for de_iG.
plot_degree_distribution(de_iG, 'outdegree')

In [None]:
# Out-degree histogram for us_iG.
plot_degree_distribution(us_iG, 'outdegree')

#### Depth histograms

In [None]:
def _leaf_depth_counts(graph_path):
    """
    Load one graph and count the leaves of its hierarchy per depth.

    Returns (counts, n_leaves): counts is a Counter mapping depth to the number
    of leaves at that depth, where depth is the distance from the 'root' node
    minus 1 and a leaf is a hierarchy node without outgoing edges.
    """
    # NOTE(review): nx.read_gpickle unpickles the file (trusted data only) and
    # was removed in networkx 3.0.
    hierarchy = hierarchy_graph(nx.read_gpickle(graph_path))
    # A single traversal from the root yields the distance of every node,
    # replacing one shortest-path search per leaf.
    dist_from_root = nx.single_source_shortest_path_length(hierarchy, 'root')
    leaves = [n for n in hierarchy.nodes if hierarchy.out_degree(n) == 0]
    return Counter(dist_from_root[leaf] - 1 for leaf in leaves), len(leaves)


def plot_depth_hist_change(country_code, years, save_path=None):
    """
    Compare the leaf depth distributions of two snapshots of one country.

    country_code: 'DE' selects the German data; any other value selects the US data
    years: the two years to compare, as (first, second); the first is drawn
        in red, the second in black
    save_path: if given, the figure is additionally written to this path

    Bar heights are the fraction of leaf nodes at each depth.
    """
    if country_code == 'DE':
        path_template = '../../legal-networks-data/de/4_crossreference_graph/subseqitems/{}-01-01.gpickle.gz'
    else:
        path_template = '../../legal-networks-data/us/4_crossreference_graph/subseqitems/{}.gpickle.gz'
    c, n_leaves = _leaf_depth_counts(path_template.format(years[0]))
    c2, n_leaves2 = _leaf_depth_counts(path_template.format(years[1]))

    # Size only this figure; assigning plt.rcParams here would silently change
    # the size of every figure created afterwards in the notebook.
    plt.figure(figsize=(9, 9))
    plt.bar(x=c.keys(), height=[x/n_leaves for x in c.values()], color='r', width=0.8, alpha=0.5)
    plt.bar(x=c2.keys(), height=[x/n_leaves2 for x in c2.values()], color='k', width=0.8, alpha=0.5)
    plt.legend([f'{years[0]}', f'{years[1]}'])
    plt.xlim(0,15)
    plt.xticks(range(15))
    plt.ylim(0,0.35)
    plt.yticks(np.arange(0,0.36, 0.05))
    plt.xlabel('Depth of Leaf Nodes')
    plt.ylabel('Fraction of Leaf Nodes')
    plt.title(('Germany' if country_code == 'DE' else 'United States') + f' ({years[0]} vs. {years[1]})')
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=600)

In [None]:
# Leaf depth distribution of the US data, 1994 vs. 2018.
plot_depth_hist_change('US', [1994,2018])

In [None]:
# Leaf depth distribution of the German data, 1994 vs. 2018.
plot_depth_hist_change('DE', [1994,2018])

### The End