# Results section 3: profiles
## Tracking individual units of law over time

### Preparations

In [1]:
%run fix_notebook_imports.py

In [2]:
import textwrap
import re

import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

from quantlaw.utils.files import list_dir

from analysis.statics import YEARS, COUNTRIES
from analysis.utils import get_crossreference_path, get_node_and_edge_files, get_preprocessed_graph_path, get_preprocessed_graph_files

In [3]:
# Default size (in inches) for figures created without an explicit figsize.
plt.rcParams['figure.figsize'] = (16,16)
# Use seaborn's "whitegrid" theme for all subsequent plots.
sns.set_style("whitegrid")

In [4]:
def create_responsibility_graph(ego_node, qG_weighted_edges, threshold=0.05):
    """Build the ego graph of the edges that point *to* ``ego_node``.

    ``qG_weighted_edges`` is a DataFrame with the columns ``u``, ``v`` and
    ``weight``.  Rows whose ``v`` equals ``ego_node`` are selected, their
    weights are cast to ``int``, and every edge whose weight is at least the
    ``threshold`` quantile of the selected weights is added to a new
    ``nx.OrderedGraph``.  If no row matches, the returned graph is empty.
    """
    incoming = qG_weighted_edges.query("v == @ego_node")
    ego_edges = [(row.u, row.v, int(row.weight)) for row in incoming.itertuples()]

    if ego_edges:
        weights = np.array([weight for _, _, weight in ego_edges])
        cutoff = np.quantile(weights, threshold)
    else:
        cutoff = 0.

    graph = nx.OrderedGraph()
    graph.add_weighted_edges_from([edge for edge in ego_edges if edge[-1] >= cutoff])
    return graph

def create_reliance_graph(ego_node, qG_weighted_edges,threshold=0.05):
    """Build the ego graph of the edges that start *at* ``ego_node``.

    ``qG_weighted_edges`` is a DataFrame with the columns ``u``, ``v`` and
    ``weight``.  Rows whose ``u`` equals ``ego_node`` are selected, their
    weights are cast to ``int``, and every edge whose weight is at least the
    ``threshold`` quantile of the selected weights is added to a new
    ``nx.OrderedGraph``.  If no row matches, the returned graph is empty.
    """
    outgoing = qG_weighted_edges.query("u == @ego_node")
    ego_edges = [(row.u, row.v, int(row.weight)) for row in outgoing.itertuples()]

    if ego_edges:
        weights = np.array([weight for _, _, weight in ego_edges])
        cutoff = np.quantile(weights, threshold)
    else:
        cutoff = 0.

    reliance_graph = nx.OrderedGraph()
    reliance_graph.add_weighted_edges_from([edge for edge in ego_edges if edge[-1] >= cutoff])
    return reliance_graph

In [5]:
def get_edge_color(u,v,nodes,node,graph_type):
    """Return the matplotlib colour for one edge of an ego graph.

    Parameters
    ----------
    u, v : endpoints of the edge; one of them must be ``node``.
    nodes : DataFrame indexed by node key with a ``document_type`` column.
    node : key of the ego node.
    graph_type : ``'responsibility'`` or ``'reliance'``.

    Returns
    -------
    ``'k'`` if both endpoints are statutes, ``'b'`` if both are regulations.
    For any other combination: ``'dodgerblue'`` if the ego node is a statute
    in a responsibility graph or a regulation in a reliance graph, and
    ``'silver'`` otherwise.

    Raises
    ------
    ValueError
        If the endpoints are not both statutes or both regulations and
        ``graph_type`` is neither ``'responsibility'`` nor ``'reliance'``.
    """
    # Orient the edge so that the ego node is always the first endpoint.
    if v == node:
        u, v = v, u
    assert u == node, "edge must be incident to the ego node"

    u_type = nodes.at[u, 'document_type']
    v_type = nodes.at[v, 'document_type']
    if u_type == 'statute' == v_type:
        return 'k'
    if u_type == 'regulation' == v_type:
        return 'b'

    # The graph type only matters for the remaining combinations.
    if graph_type not in ('responsibility', 'reliance'):
        # A bare `raise` here would only produce "No active exception to re-raise".
        raise ValueError(
            f"graph_type must be 'responsibility' or 'reliance', got {graph_type!r}"
        )
    if ((graph_type == 'responsibility' and u_type == 'statute')
            or (graph_type == 'reliance' and u_type == 'regulation')):
        return 'dodgerblue'
    return 'silver'

def get_labels_us(r_graph, nodes, document_type, node):
    """Return ``{node key: label}`` for the non-ego nodes of one document type.

    Only nodes of ``r_graph`` whose ``document_type`` in ``nodes`` equals the
    requested one and that differ from ``node`` receive a label.  The label is
    ``"<law token>/<heading token>"``, where each token is the last
    space-separated word of the text before the first hyphen of ``law_name``
    and of ``heading`` respectively (in the heading, ``&ndash;`` is first
    replaced by ``_`` and the em dash by ``-``).  Labels are wrapped at 10
    characters.
    """
    labels = {}
    for n in r_graph.nodes():
        if nodes.at[n, 'document_type'] != document_type or n == node:
            continue
        law_token = nodes.at[n, 'law_name'].split("-")[0].split(" ")[-1]
        heading = re.sub("&ndash;", "_", nodes.at[n, 'heading'])
        heading = re.sub("—", "-", heading)
        heading_token = heading.split("-")[0].split(" ")[-1]
        labels[n] = textwrap.fill(law_token + "/" + heading_token, 10)
    return labels

def get_labels_de(r_graph, nodes, document_type, node):
    """Return ``{node key: label}`` for the non-ego nodes of one document type.

    Only nodes of ``r_graph`` whose ``document_type`` in ``nodes`` equals the
    requested one and that differ from ``node`` receive a label.  A node with
    an ``abbr_1`` value is labelled with it.  Otherwise the label is the
    ``abbr_1`` of the first row in ``nodes`` that has the same ``law_name``
    and a non-missing ``abbr_1``, followed by ``"/"`` and the first two
    space-separated words of the node's ``heading``.  Labels are wrapped at
    10 characters.
    """
    labels = {}
    for n in r_graph.nodes():
        if nodes.at[n, 'document_type'] != document_type or n == node:
            continue
        if not pd.isna(nodes.at[n, 'abbr_1']):
            text = nodes.at[n, 'abbr_1']
        else:
            same_law = nodes[(nodes.law_name == nodes.at[n, 'law_name']) & ~ pd.isna(nodes.abbr_1)]
            heading_start = " ".join(nodes.at[n, 'heading'].split(" ")[:2])
            text = same_law.iloc[0].abbr_1 + "/" + heading_start
        labels[n] = textwrap.fill(text, 10)
    return labels
            
def draw_graph(r_graph, nodes, node, graph_type='responsibility', node_name=None, labels=None, save_path=None, country=None):
    """Draw an ego graph with matplotlib.

    The ego node is placed at the origin and all other nodes on a circle
    around it.  Node sizes are ``tokens_n / 100`` capped at 2000, edge widths
    are ``weight / 10`` and edge colours come from ``get_edge_color``.

    Parameters
    ----------
    r_graph : graph to draw; the ``weight`` attribute of its edges is read.
    nodes : DataFrame indexed by node key; ``tokens_n`` and ``document_type``
        are read, plus the label columns when labels are drawn.
    node : key of the ego node.
    graph_type : ``'responsibility'`` or ``'reliance'``.
    node_name : label of the ego node; defaults to its ``abbr_1`` value.
    labels : any truthy value turns label drawing on; otherwise no labels
        are drawn at all.
    save_path : if given, the figure is saved there and then closed.
    country : ``'us'`` selects ``get_labels_us``, any other value selects
        ``get_labels_de``.  When ``None`` (the default) the module-level
        variable ``country`` is used, which is what this function relied on
        before the parameter existed.

    Raises
    ------
    ValueError
        If labels are requested and ``graph_type`` is not one of the two
        supported values, or if labels are requested and no country is
        available from the argument or the module namespace.
    """
    pos = nx.circular_layout(r_graph, center=(0,0), dim=2)
    pos[node] = (0,0)
    edge_colors = [get_edge_color(u,v,nodes,node,graph_type) for u,v in r_graph.edges()]
    node_sizes = [min(nodes.at[n,'tokens_n']/100,2000) for n in r_graph.nodes()]
    nx.draw_networkx_nodes(r_graph, pos=pos, nodelist=r_graph.nodes(),
                           node_size=node_sizes,
                           node_color='silver'
                          )
    nx.draw_networkx_edges(r_graph, pos=pos, edgelist=r_graph.edges(), arrows=True,
                           width=[w/10 for u,v,w in r_graph.edges(data='weight')],
                           nodelist=r_graph.nodes(), edge_color=edge_colors,
                           node_size=node_sizes
                          )
    if labels:
        if country is None:
            # Backward compatibility: fall back to the notebook-level variable
            # that this function used to read implicitly.
            try:
                country = globals()['country']
            except KeyError:
                raise ValueError(
                    "draw_graph needs a country: pass country=... or define a "
                    "module-level `country` variable"
                ) from None
        get_labels = get_labels_us if country == 'us' else get_labels_de
        sta_labels = get_labels(r_graph, nodes, document_type='statute', node=node)
        reg_labels = get_labels(r_graph, nodes, document_type='regulation', node=node)
        nx.draw_networkx_labels(r_graph, pos=pos, font_color='b', labels=reg_labels)
        nx.draw_networkx_labels(r_graph, pos=pos, font_color='k', labels=sta_labels)
        # draw label of central node: green for responsibility, red for reliance
        if graph_type == 'responsibility':
            center_color = 'g'
        elif graph_type == 'reliance':
            center_color = 'r'
        else:
            raise ValueError(
                f"graph_type must be 'responsibility' or 'reliance', got {graph_type!r}"
            )
        center_label = node_name if node_name is not None else nodes.at[node,'abbr_1']
        nx.draw_networkx_labels(r_graph, pos=pos, font_color=center_color,
                                labels={node: center_label}, font_size=20)
    plt.axis('off')
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
        plt.close()

In [6]:
def plot_profile(sdf, years, save_path=None):
    """Plot a 2x5 grid of line charts for one profile table.

    ``sdf`` must have a ``year`` column plus the ten attribute columns
    plotted below.  The top row shows the two token counts and the three
    item counts, the bottom row the self-loop count and the weighted and
    binary degrees.  Panels within the same group (token counts, weighted
    counts, binary counts) share a y-axis upper limit of 105% of the group
    maximum, rounded up.  The x-axis is fixed to 1996-2021 and every third
    y tick is kept and labelled with a "K" suffix from 1000 upwards.  If
    ``save_path`` is given, the figure is saved there and closed.
    """
    ticklabelfontsize = 50
    diversity_columns = ['reliance_diversity_n', 'responsibility_diversity_n']
    degree_columns = ['self_loops_n','reliance_n','responsibility_n']
    token_columns = ['tokens_n','tokens_unique']
    line_style = dict(color='k', xticks=years, marker='o', legend=False, lw=5, markersize=10)

    def shared_upper_limit(columns):
        # 105% of the largest value across the given columns, rounded up.
        peak = sdf[columns].max().max()
        return np.ceil(peak + peak*0.05)

    def tick_text(value):
        # Plain integers below 1000, thousands with a "K" suffix above.
        if value < 1000:
            return int(value)
        if value % 1000 == 0:
            return str(int(round(value/1000,1)))+"K"
        return str(round(value/1000,1))+"K"

    def relabel_yticks(axis):
        # Keep every third tick, then label the remaining ticks.
        axis.set_yticks(axis.get_yticks()[::3])
        axis.set_yticklabels([tick_text(tick) for tick in axis.get_yticks()], fontsize=ticklabelfontsize)

    fig, ax = plt.subplots(2,5,figsize=(50,10), sharex=True)

    # Top row: size of the unit.
    size_panels = [
        ("Tokens", 'tokens_n'),
        ("Unique Tokens", 'tokens_unique'),
        ("Items above Section Level", 'items_n'),
        ("Items on Section Level", 'seqitems_n'),
        ("Items below Section Level", 'subseqitems_n'),
    ]
    for column, (title, attr) in enumerate(size_panels):
        axis = ax[0, column]
        sdf.plot.line(x='year', y=attr, ax=axis, **line_style)
        axis.set_ylabel("", fontsize=16)
        axis.set_title(title, fontsize=ticklabelfontsize-4)
        axis.set_ylim(0, np.ceil(axis.get_ylim()[-1])+0.05*sdf[attr].max())
        axis.set_xlim(1996,2021)
        if attr in token_columns:
            axis.set_ylim(0, shared_upper_limit(token_columns))
        relabel_yticks(axis)

    # Bottom row: connectivity of the unit.
    degree_panels = [
        ("Self-Loops", 'self_loops_n'),
        ("Weighted Out-Degree", 'reliance_n'),
        ("Weighted In-Degree", 'responsibility_n'),
        ("Binary Out-Degree", 'reliance_diversity_n'),
        ("Binary In-Degree", 'responsibility_diversity_n'),
    ]
    for column, (title, attr) in enumerate(degree_panels):
        axis = ax[1, column]
        sdf.plot.line(x='year', y=attr, ax=axis, **line_style)
        axis.set_ylabel("", fontsize=16)
        axis.set_title(title, fontsize=ticklabelfontsize-4)
        if attr in diversity_columns:
            upper = shared_upper_limit(diversity_columns)
        elif attr in degree_columns:
            upper = shared_upper_limit(degree_columns)
        else:
            upper = np.ceil(axis.get_ylim()[-1])
        axis.set_ylim(0, upper)
        axis.set_xlabel('', fontsize=ticklabelfontsize)
        axis.set_xticks(years[::7])
        axis.set_xticklabels(axis.get_xticks(), rotation=0, fontsize=ticklabelfontsize)
        relabel_yticks(axis)

    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
        plt.close()

In [7]:
def generate_ego_nodes_from_queries(queries, country, years):
    """Find, per query and year, the key of the first node matching the query.

    ``queries`` is a sequence of ``(abbr, query)`` pairs, where ``query`` is a
    pandas query string evaluated against the node table of each year.  For
    ``country == 'us'`` rows with a missing ``heading`` are skipped when the
    table is read; for any other country rows with a missing ``abbr_1`` are
    skipped.  Returns ``{abbr: {year: key}}``; years without a match are
    left out.
    """
    crossreference_path = get_crossreference_path(country)
    nodefiles, edgefiles = get_node_and_edge_files(crossreference_path, years)

    matches = {abbr: {} for (abbr, q) in queries}
    for year in years:
        print("Generating ego nodes for year", year, end='\r')
        node_file = f"{crossreference_path}/{nodefiles[years.index(year)]}"
        # First pass: find the CSV lines to skip (+1 because of the header line).
        if country == 'us':
            missing = pd.read_csv(node_file, low_memory=True, usecols=['heading']).query("@pd.isna(heading)")
            wanted_columns = ['key','law_name', 'heading']
        else: # 'de'
            missing = pd.read_csv(node_file, low_memory=True, usecols=['abbr_1']).query("@pd.isna(abbr_1)")
            wanted_columns = ['key','abbr_1', 'abbr_2', 'type']
        indices_to_skip = missing.index.values + 1
        # Second pass: load only the needed columns of the remaining rows.
        nodes = pd.read_csv(node_file, low_memory=True, usecols=wanted_columns,
                            skiprows=indices_to_skip).set_index("key")
        for abbr, q in queries:
            matches[abbr][year] = nodes.query(q).index.values

    # Keep the first matching key per year and drop years without any match.
    return {
        abbr: {year: keys[0] for year, keys in per_year.items() if keys.size > 0}
        for abbr, per_year in matches.items()
    }

### Network graphics for selected laws (this takes approximately five minutes)

In [8]:
# Display names for the central (ego) node of each drawing, keyed by the
# abbreviation used in the queries below; other abbreviations are upper-cased.
node_names = {
    'dodd-frank':'Dodd-Frank',
    'glb':'Gramm-Leach-Bliley',
    'kredwg':'KredWG',
    'börsg':'BörsG',
    'wphg':'WpHG'
}

# NOTE: `country` is a module-level loop variable; draw_graph looks up a global
# of that name when it is asked to draw labels.
for country in COUNTRIES:
    crossreference_path = get_crossreference_path(country)
    preprocessed_graph_path = get_preprocessed_graph_path(country)
    preprocessed_graph_files = get_preprocessed_graph_files(preprocessed_graph_path, YEARS)
    nodefiles, _ = get_node_and_edge_files(crossreference_path, YEARS)
    # Each query is (abbreviation, pandas query string selecting the ego node).
    # NOTE(review): for a country other than 'us' or 'de', `queries` keeps its
    # previous value (or is undefined on the first iteration).
    if country == 'us':
        queries = [
            ("dodd-frank","law_name.str.contains('TITLE 12') and heading.str.contains('CHAPTER 53')"),
            ("glb","law_name.str.contains('TITLE 12') and heading.str.contains('CHAPTER 16')"),
        ]
    elif country == 'de':
        queries = [
            (abbr,f"(abbr_1.str.lower() == '{abbr}' or abbr_2.str.lower() == '{abbr}') and type == 'document'")
            for abbr in ["wphg","kredwg","börsg"]
        ]
    ego_nodes = generate_ego_nodes_from_queries(queries, country, YEARS)
    
    for abbr,indiv_ego_nodes in ego_nodes.items():
        for year, ego_node in indiv_ego_nodes.items():
            print("Plotting", year, ego_node, end='\r')
            # Files are looked up by the position of the year in YEARS.
            nf = nodefiles[YEARS.index(year)]
            # First pass over the node file: CSV lines of seqitem/subseqitem rows
            # are skipped in the second read (+1 accounts for the header line).
            indices_to_skip = pd.read_csv(f"{crossreference_path}/{nf}", low_memory=True, usecols=['type']
                                         ).query("type in ['seqitem','subseqitem']").index.values + 1
            nodes = pd.read_csv(f"{crossreference_path}/{nf}", low_memory=True, 
                                usecols=['key','law_name', 'heading', 'abbr_1', 'document_type', 'type', 'tokens_n'], 
                                skiprows=indices_to_skip).set_index("key")
            
            # NOTE(review): nx.read_gpickle was removed in networkx 3.0 and
            # unpickles the file, so only trusted graph files should be loaded.
            qG = nx.read_gpickle(f"{preprocessed_graph_path}/{preprocessed_graph_files[YEARS.index(year)]}")
            # One row per (u, v, key) edge; after the count, 'weight' holds the
            # number of parallel edges between u and v.
            qG_weighted_edges = pd.DataFrame(qG.edges(keys=True), columns=['u','v','weight']).groupby(['u','v']).count().reset_index()
            
            # threshold=0.0 makes the cutoff the minimum weight, so no edge is dropped.
            responsibility_graph = create_responsibility_graph(ego_node, qG_weighted_edges, threshold=0.0)
            reliance_graph = create_reliance_graph(ego_node, qG_weighted_edges, threshold=0.0)
            
            # `labels` is not passed, so draw_graph skips its label-drawing branch.
            draw_graph(reliance_graph, nodes, ego_node, graph_type='reliance', node_name=node_names.get(abbr,abbr.upper()), 
                       save_path=f"../graphics/evolution/reliance-{country}-{abbr}-{year}.pdf")
            draw_graph(responsibility_graph, nodes, ego_node, graph_type='responsibility', node_name=node_names.get(abbr,abbr.upper()), 
                       save_path=f"../graphics/evolution/responsibility-{country}-{abbr}-{year}.pdf")

Plotting 2019 BJNR135100007_BörsG-2007_99999999_000001

### Case studies for summaries of individual profiles

In [9]:
# Paths of all CSV files found in the chapter-profile results directory.
profile_path = "../results/chapter-profiles"
chapter_profiles = [f"{profile_path}/{f}" for f in list_dir(profile_path,".csv")]

In [10]:
# US profile tables, keyed by the integer before the first "-" of the file
# name (used as the year below, e.g. us_dfs[2019]).
us_profiles = [p for p in chapter_profiles if p.endswith("us.csv")]
us_dfs = {int(p.split("/")[-1].split("-")[0]):pd.read_csv(p) for p in us_profiles}

In [11]:
# German profile tables, keyed by the integer before the first "-" of the
# file name (used as the year below, e.g. de_dfs[2019]).
de_profiles = [p for p in chapter_profiles if p.endswith("de.csv")]
de_dfs = {int(p.split("/")[-1].split("-")[0]):pd.read_csv(p) for p in de_profiles}

### Case study for the United States

In [12]:
# Map every regulation title present in the 2019 table to the set of its headings.
chapters = {}
for title in set(us_dfs[2019].query("document_type == 'regulation'").law_name.values):
    chapters[title] = set(us_dfs[2019].query("document_type == 'regulation' and law_name == @title").heading.values)

# Build one (name, query) pair per regulation chapter.  The name is
# "<last word of the title>cfr<last word of the heading before the em dash>".
# The inner collection gets its own name so that it no longer rebinds the
# `chapters` dict while that dict is being iterated.
reg_queries = []
for title, title_chapters in chapters.items():
    for chapter in title_chapters:
        title_name = title.split()[-1]
        chapter_number = chapter.split("—")[0].split()[-1]
        reg_queries.append((f"{title_name}cfr{chapter_number}",f'law_name == "{title}" and heading.str.startswith("{chapter}") and tokens_n > 0'))
reg_queries.sort(key=lambda tup:tup[0])

In [13]:
# Collected (abbreviation, profile-over-time table) pairs for the US case study.
us_case_study_dfs = []
# Statute chapters to profile: (short name, pandas query on a yearly profile table).
queries = [
    ("dodd-frank","law_name.str.contains('TITLE 12') and heading.str.contains('CHAPTER 53')"),
    ("glb","law_name.str.contains('TITLE 12') and heading.str.contains('CHAPTER 16')"),
    ("obamacare","law_name.str.contains('TITLE 42') and heading.str.contains('CHAPTER 157')"),
]
# Profile the statutes above plus the regulation queries named 17cfrI and 17cfrII.
for abbr,query in queries + [x for x in reg_queries if x[0] in ['17cfrI','17cfrII']]:
    # One (initially empty) row per year in YEARS; columns are taken from the 1998 table.
    sdf = pd.DataFrame(index=YEARS, columns=us_dfs[1998].columns)
    for idx,df in us_dfs.items():
        qf = df.query(query)
        if len(qf.index) > 0:
            # Only the first matching row of that year is kept.
            sdf.loc[idx] = qf.iloc[0]
    # Units that never matched in any year are left out.
    if not sdf.key.isna().all():
        # Turn the year index into a regular 'year' column (plot_profile plots x='year').
        sdf = sdf.reset_index().rename(dict(index='year'),axis=1)
        us_case_study_dfs.append((abbr,sdf))

In [14]:
# `country` is only used to build the output file name here.
country = 'us'
# Save one profile figure per US case-study unit.
for abbr, sdf in us_case_study_dfs:
    plot_profile(sdf, YEARS, f"../graphics/evolution/{country}-{abbr}.pdf")

### Case study for Germany

In [15]:
# Two supplementary tables; both are expected to have an 'abbreviation' column.
de_capm = pd.read_csv("../supplements/KapmR2019content.csv")
de_bank = pd.read_csv("../supplements/BankR2020content.csv")
# Sorted, lower-cased abbreviations from both tables (rows with any missing value dropped).
# NOTE(review): drop_duplicates() runs after set_index('abbreviation'), so it
# compares the remaining columns rather than the abbreviation itself — confirm
# that this is the intended de-duplication.
de_abbreviations = [x.lower() for x in list(pd.concat([de_capm,de_bank]).dropna(
).set_index('abbreviation').drop_duplicates().sort_index().index.values)]

In [16]:
# Add an 'abbr' column to every German table (in place): the second
# "_"-separated part of the key, cut at its first "-", lower-cased.
for df in de_dfs.values():
    df['abbr'] = [x.split("_")[1].split("-")[0].lower() for x in df.key]

In [17]:
# Abbreviations from the supplements that have no row in the 2019 table.
set(de_abbreviations) - set(de_dfs[2019].query("abbr in @de_abbreviations").abbr.values)

{'anlentg', 'depotg', 'finarisikov', 'prospv', 'wechselg', 'wpav'}

In [18]:
# Collected (abbreviation, profile-over-time table) pairs for the German case study.
de_case_study_dfs = []
for abbr in de_abbreviations:
    # NOTE: abbr_file is assigned but not used in this cell.
    abbr_file = abbr # + "-Buch-5" # when treating German laws with books, remember to also change the iloc from 0 to something else
    # One (initially empty) row per year in YEARS; columns are taken from the 1998 table.
    sdf = pd.DataFrame(index=YEARS, columns=de_dfs[1998].columns)
    for idx,df in de_dfs.items():
        qf = df.query("abbr == @abbr")
        if len(qf.index) > 0:
            # Only the first matching row of that year is kept.
            sdf.loc[idx] = qf.iloc[0]
    # Laws that never matched in any year are left out.
    if not sdf.key.isna().all():
        # Turn the year index into a regular 'year' column (plot_profile plots x='year').
        sdf = sdf.reset_index().rename(dict(index='year'),axis=1)
        de_case_study_dfs.append((abbr,sdf))

In [19]:
# `country` is only used to build the output file name here.
country = 'de'
# Save one profile figure per German case-study law.
for abbr, sdf in de_case_study_dfs:
    plot_profile(sdf, YEARS, f"../graphics/evolution/{country}-{abbr}.pdf")

### The end.