# Results section 1: growth
## Basic statistics and degree distributions

### Preparations

In [1]:
%run fix_notebook_imports.py

In [2]:
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import powerlaw
import seaborn as sns

from analysis.utils import get_crossreference_path
from analysis.statics import YEARS, COUNTRIES
from analysis._01_basic_statistics import get_networkx_graph_components, get_node_and_edge_files

In [3]:
# Notebook-wide plotting defaults.
# The seaborn style is applied first, then the base font size is set on top of it.
sns.set_style('whitegrid')
plt.rcParams.update({'font.size': 12})

In [4]:
def plot_point_statistic(xs, country_data, xlabel, ylabel, color="k", marker="^", savepath=None):
    """
    Draw a single statistic as a marked line on the current axes.

    :param xs: x values; every third one is used as an x tick
    :param country_data: y values, one per entry of ``xs``
    :param xlabel: label of the x axis
    :param ylabel: label of the y axis
    :param color: line and marker colour
    :param marker: marker style
    :param savepath: if given, the figure is written there and then closed;
        if ``None`` the figure is left open
    :return: None
    """
    tick_size, label_size = 18, 24
    plt.rcParams['figure.figsize'] = (9, 6)

    line_options = dict(color=color, marker=marker, markersize=5)
    sns.lineplot(x=xs, y=country_data, **line_options)

    plt.xticks(xs[::3], fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.xlabel(xlabel, fontsize=label_size)
    plt.ylabel(ylabel, fontsize=label_size)
    plt.tight_layout()

    if savepath is None:
        return
    plt.savefig(savepath)
    plt.close()
        
def plot_statistics(xs, statistics, xlabel, ylabel, ylim=None, savepath=None):
    """
    Plot several statistics over the same x values on the current axes.

    :param xs: x values shared by all statistics; every third one is used as an x tick
    :param statistics: iterable of (stat, marker, color, linestyle, label) tuples,
        where ``stat`` holds the y values of one line
    :param xlabel: label of the x axis
    :param ylabel: label of the y axis
    :param ylim: optional (lower, upper) pair passed on to ``plt.ylim``
    :param savepath: if given, the figure is written there and then closed;
        if ``None`` the figure is left open
    :return: None
    """

    plt.rcParams['figure.figsize'] = (9,6)
    handles, labels = [], []
    for stat, marker, color, linestyle, label in statistics:
        sns.lineplot(x=xs, y=stat, marker=marker, color=color, markersize=7.5)
        # The line just drawn is the last one on the axes; the line style is
        # set on it directly after drawing.
        line = plt.gca().lines[-1]
        line.set_linestyle(linestyle)
        # Remember which artist belongs to which label so that the legend does
        # not depend on the order or kind of artists present on the axes.
        handles.append(line)
        labels.append(label)
    plt.xticks(xs[::3], fontsize=24)
    plt.yticks(fontsize=24)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel(xlabel, fontsize=24)
    plt.ylabel(ylabel, fontsize=24)
    plt.tight_layout()
    plt.legend(handles, labels, loc='upper left', fontsize=18)
    if savepath is not None:
        plt.savefig(savepath)
        plt.close()
        
def plot_statistics_with_subplots(xs, statistics, xlabel, ylabel, ylim=None, savepath=None):
    """
    Plot statistics on two side-by-side panels, split by line colour.

    Statistics with colour ``'k'`` go to the left panel, all others to the
    right panel. Each panel gets its own legend listing exactly the lines
    drawn on it.

    :param xs: x values shared by all statistics; every third one is used as an x tick
    :param statistics: iterable of (stat, marker, color, linestyle, label) tuples,
        where ``stat`` holds the y values of one line
    :param xlabel: label of the x axis (set on every panel that receives a line)
    :param ylabel: label of the y axis (set on every panel that receives a line)
    :param ylim: optional (lower, upper) pair; when given, y ticks are placed
        every 0.2 starting at 0.8 up to the upper limit
    :param savepath: if given, the figure is written there and then closed;
        if ``None`` the figure is left open
    :return: None
    """
    _, ax = plt.subplots(1, 2, figsize=(9*2,6), sharex=True, sharey=False)
    # Per panel: (line, label) pairs in drawing order, used to build the legends.
    legend_entries = ([], [])
    for stat, marker, color, linestyle, label in statistics:
        panel = 0 if color == 'k' else 1
        cax = ax[panel]
        sns.lineplot(x=xs, y=stat, marker=marker, color=color, markersize=7.5, ax=cax)
        line = cax.lines[-1]
        line.set_linestyle(linestyle)
        legend_entries[panel].append((line, label))
        # Axis formatting is re-applied after every line drawn on the panel.
        plt.sca(cax)
        plt.xticks(xs[::3], fontsize=24)
        plt.yticks(fontsize=24)
        if ylim is not None:
            plt.ylim(*ylim)
            plt.yticks(np.arange(0.8,ylim[-1]+0.01,0.2), fontsize=24)
        plt.xlabel(xlabel, fontsize=24)
        plt.ylabel(ylabel, fontsize=24)
    for cax, entries in zip(ax, legend_entries):
        plt.sca(cax)
        # Explicit handles keep every label attached to its own line, also for
        # colours other than 'k' and 'b'.
        plt.legend([line for line, _ in entries], [label for _, label in entries],
                   loc='upper left', fontsize=18)
    plt.tight_layout()
    if savepath is not None:
        plt.savefig(savepath)
        plt.close()
        
def _reference_degrees(nodes, edges, group_col, count_col):
    """
    Count the rows of ``edges`` per ``group_col`` value, aligned to the index of ``nodes``.

    Nodes without any matching edge get a count of 0.
    """
    per_node = edges.groupby(group_col).count()
    return nodes.join(per_node)[[count_col]].fillna(0)[count_col].values


def get_degree_distributions(countries, years, normalized=False):
    """
    Collect in- and out-degree arrays of reference edges for the first and last year.

    For every country and for ``years[0]`` and ``years[-1]`` the graph
    components are loaded and six per-node degree arrays are computed:
    ``sta_*`` (statute -> statute references), ``reg_*`` (regulation ->
    regulation references) and ``all_*`` (all references), each as ``*_out``
    (counted per source ``u``) and ``*_in`` (counted per target ``v``).

    :param countries: iterable of country identifiers
    :param years: sequence of years, in the order expected by ``get_node_and_edge_files``
    :param normalized: if True, every degree is divided by the node's
        ``tokens_n`` (zeros are replaced by machine epsilon to avoid division by zero)
    :return: two DataFrames (first year, last year) with one column per country
        and one row per array name (the six degree arrays plus ``tokens_n``
        and ``key``); each cell holds a numpy array
    """
    distributions = {years[0]: {}, years[-1]: {}}
    same_type = "u_document_type == '{0}' and v_document_type == '{0}'"
    for country in countries:
        crossreference_path = get_crossreference_path(country)
        nodefiles, edgefiles = get_node_and_edge_files(crossreference_path, years)
        for year in distributions:
            year_idx = years.index(year)
            nodes, edges = get_networkx_graph_components(crossreference_path, nodefiles[year_idx], edgefiles[year_idx])
            # Annotate every edge with the document type of both endpoints.
            edges = edges.join(nodes[["document_type"]], on="u").rename(dict(document_type="u_document_type"), axis=1)
            edges = edges.join(nodes[["document_type"]], on="v").rename(dict(document_type="v_document_type"), axis=1)

            references = edges.query("edge_type == 'reference'")
            subsets = dict(
                sta=references.query(same_type.format('statute')),
                reg=references.query(same_type.format('regulation')),
                all=references,
            )
            # Out-degree: group by source u and count targets v; in-degree: the reverse.
            degrees = {}
            for direction, group_col, count_col in (('out', 'u', 'v'), ('in', 'v', 'u')):
                for prefix, subset in subsets.items():
                    degrees[f'{prefix}_{direction}'] = _reference_degrees(nodes, subset, group_col, count_col)

            if normalized:
                tokens = nodes['tokens_n'].replace(0, np.finfo(float).eps).values
                degrees = {name: values / tokens for name, values in degrees.items()}

            distributions[year][country] = dict(
                **degrees,
                tokens_n=nodes.tokens_n.values,
                key=nodes.index.values,
            )
    df1 = pd.DataFrame.from_dict(distributions[years[0]])
    df2 = pd.DataFrame.from_dict(distributions[years[-1]])
    return df1, df2
    
def plot_ccdfs(df1, df2, year1, year2, country, degtype, normalized=False, save_path=None):
    """
    Plot the complementary cumulative degree distributions of two years on log2 axes.

    Six curves are drawn: statutes only (dashed), regulations only (dotted)
    and all references (solid), each for ``df1`` (blue) and ``df2`` (red).

    :param df1: frame for the first year; read via ``df1.at[f'<sta|reg|all>_{degtype}', country]``
    :param df2: frame for the second year, same layout as ``df1``
    :param year1: legend label for the ``df1`` curves
    :param year2: legend label for the ``df2`` curves
    :param country: column of ``df1``/``df2`` to plot
    :param degtype: ``'in'`` or ``'out'``; selects the rows and the x-axis label
    :param normalized: selects the axis limits and ticks; for normalized data
        with ``country == 'us'`` and ``degtype == 'in'`` the limits are left
        to matplotlib
    :param save_path: if given, the figure is written there and then closed;
        if ``None`` the figure is left open
    :return: None
    """
    year_handles = [Line2D([0], [0], color='b', lw=6), Line2D([0], [0], color='r', lw=6)]
    subset_handles = [Line2D([0], [0], color='k', lw=3, linestyle='--'),
                      Line2D([0], [0], color='k', lw=3, linestyle=':'),
                      Line2D([0], [0], color='k', lw=3, linestyle='-')]
    _, ax = plt.subplots(figsize=(12,9))

    subset_styles = (('sta', '--'), ('reg', ':'), ('all', '-'))
    for df, color in ((df1, 'b'), (df2, 'r')):
        for prefix, linestyle in subset_styles:
            powerlaw.plot_ccdf(df.at[f'{prefix}_{degtype}', country], color=color, ax=ax,
                               linestyle=linestyle, lw=2)

    try:
        # Keyword used by Matplotlib >= 3.3; 'basex'/'basey' were removed in 3.5.
        ax.set_xscale('log', base=2)
        ax.set_yscale('log', base=2)
    except (TypeError, ValueError):
        # Older Matplotlib releases only accept the axis-specific spelling.
        ax.set_xscale('log', basex=2)
        ax.set_yscale('log', basey=2)

    if not normalized:
        plt.xlim(2**-0.5,2**12.5)
        plt.xticks([2**x for x in range(13)],fontsize=24)
        plt.ylim(2**-22,2**1.25)
        plt.yticks([2**(x) for x in range(-22,1,2)],fontsize=24)
    elif country != 'us' or degtype != 'in':
        plt.xlim(2**-15,2**3)
        plt.ylim(2**-22,2**0)
        plt.xticks([2**x for x in range(-15,4)],fontsize=24)
        plt.yticks([2**(x) for x in range(-22,1,2)],fontsize=24)
    else:
        plt.xticks(fontsize=24)
        plt.yticks(fontsize=24)

    plt.xlabel(f"{degtype.capitalize()}-Degree", fontsize=24)
    plt.ylabel("Fraction of sections with at least the given degree", fontsize=24)
    # A second plt.legend call replaces the axes' legend, so the first one is
    # re-attached as a plain artist; the second stays the axes' own legend.
    year_legend = plt.legend(year_handles, [year1, year2], loc="upper left", fontsize=18)
    ax.add_artist(year_legend)
    plt.legend(subset_handles, ['Statutes only','Regulations only', 'All'], loc="upper right", fontsize=18)
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
        plt.close()

### Basic statistics

In [5]:
# For every country: read its basic-statistics table and write three figures
# (growth split into two panels, growth combined, and reference growth).
for country in COUNTRIES:
    # NOTE(review): the path is not used further in this cell — confirm the call can be dropped.
    crossreference_path = get_crossreference_path(country)
    df = pd.read_csv(f"../results/basic-statistics-{country}.csv")

    baseline_label = f'Growth relative to {YEARS[0]} baseline'
    growth_range = (0.95,2.7)

    # Same six series for the separated and the combined figure:
    # black = statutes, blue = regulations.
    growth_statistics = [
        (df.tokens_n_rel_sta, 'o', 'k', '-', 'Statute Tokens'),
        (df.tokens_n_rel_reg, 'o', 'b', '-', 'Regulation Tokens'),
        (df.structures_rel_sta_ssi, '^', 'k', '--', 'Statute Structural Elements'),
        (df.structures_rel_reg_ssi, '^', 'b', '--', 'Regulation Structural Elements'),
        (df.crossrefs_n_rel_sta, 'v', 'k', ':', 'Intra-Statute References'),
        (df.crossrefs_n_rel_reg, 'v', 'b', ':', 'Intra-Regulation References'),
    ]
    plot_statistics_with_subplots(
        YEARS, growth_statistics, 'Year', baseline_label,
        ylim=growth_range,
        savepath=f'../graphics/growth-statistics-comparison-separated-{country}.pdf',
    )
    plot_statistics(
        YEARS, growth_statistics, 'Year', baseline_label,
        ylim=growth_range,
        savepath=f'../graphics/growth-statistics-comparison-combined-{country}.pdf',
    )

    reference_statistics = [
        (df.crossrefs_n_rel_sta, 'v', 'k', ':', 'Statute → Statute'),
        (df.crossrefs_n_rel_reg, 'v', 'b', ':', 'Regulation → Regulation'),
        (df.crossrefs_n_rel_reg_sta, 'v', 'dodgerblue', ':', 'Regulation → Statute'),
    ]
    plot_statistics(
        YEARS, reference_statistics, 'Year', baseline_label,
        ylim=growth_range,
        savepath=f'../graphics/reference-statistics-comparison-{country}.pdf',
    )

### Degree distributions

In [6]:
# Raw (unnormalized) degree distributions: one CCDF figure per country and
# degree direction, comparing the first and the last year.
df1, df2 = get_degree_distributions(COUNTRIES, YEARS, normalized=False)
for country in COUNTRIES:
    for degtype in ("in", "out"):
        plot_ccdfs(
            df1, df2, YEARS[0], YEARS[-1], country, degtype,
            normalized=False,
            save_path=f"../graphics/{degtype}-degree-{country}-{YEARS[0]}-{YEARS[-1]}.pdf",
        )

In [7]:
# Token-normalized degree distributions: same figures as for the raw degrees,
# written to files with a "-normalized" suffix.
df1, df2 = get_degree_distributions(COUNTRIES, YEARS, normalized=True)
for country in COUNTRIES:
    for degtype in ("in", "out"):
        plot_ccdfs(
            df1, df2, YEARS[0], YEARS[-1], country, degtype,
            normalized=True,
            save_path=f"../graphics/{degtype}-degree-{country}-{YEARS[0]}-{YEARS[-1]}-normalized.pdf",
        )

### The end.