# 2019 USC - Long Item Analysis

In [None]:
import networkx as nx
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import powerlaw
import numpy as np
import regex
import natsort
import json
from lxml import etree
import os
import re
import glob
from collections import defaultdict

from quantlaw.utils.networkx import load_graph_from_csv_files

%run longitem_support.py

In [None]:
%matplotlib inline

### Load Data

In [None]:
# Load the 2019 snapshot of the detailed cross-reference graph.
G = load_graph_from_csv_files("../../legal-networks-data/us/4_crossreference_graph/detailed/", "2019")

# find out the chapter and title of each node
# NOTE(review): quotient_graph_with_merge comes from outside this notebook (longitem_support.py);
# its second return value is stored per node below, so it is presumably a
# node -> containing chapter/title node mapping - confirm against the helper.
quotient_G, quotient_nodes_mapping = quotient_graph_with_merge(G, self_loops=False, heading_regex=chapter_regex)
nx.set_node_attributes(G, quotient_nodes_mapping, 'chapter_mapping')
quotient_G, quotient_nodes_mapping = quotient_graph_with_merge(G, self_loops=False, heading_regex=title_regex)
nx.set_node_attributes(G, quotient_nodes_mapping, 'title_mapping')

# mark all chapters and titles as such
# (every node whose heading matches chapter_regex gets type='chapter')
attrib_update =  {x: 'chapter' 
                      for x,y in G.nodes(data=True)
                          if 'heading' in y
                          and chapter_regex.match(y['heading'])
                  }
nx.set_node_attributes(G, values = attrib_update, name='type')

# titles are written second, so a heading matching both regexes ends up as type='title'
attrib_update =  {x: 'title'
                      for x,y in G.nodes(data=True)
                          if 'heading' in y
                          and title_regex.match(y['heading'])
                 }
nx.set_node_attributes(G, values = attrib_update, name='type')

### Utility Functions for Plotting / Printing

In [None]:
def get_data_for_groups(G, group_mapping, item_type, grouped_by, use_heading = True):
    """
    Build a DataFrame with one row per item: the item, the label of its group, and its token count.

    Parameters
    ----------
    G
        Graph whose ``G.nodes[key]`` yields the attribute dict of a node. Items need a
        ``'tokens_n'`` attribute; groups need ``'heading'`` when ``use_heading`` is set.
    group_mapping
        Mapping from a group node to the iterable of item nodes belonging to that group.
    item_type
        Name of the output column holding the item node.
    grouped_by
        Name of the output column holding the group label.
    use_heading
        If True, the label is ``normalize_heading`` applied to the part of the group's
        heading before the first ``'-'``; otherwise the group key itself is used.

    Returns
    -------
    pd.DataFrame
        Columns ``[item_type, grouped_by, 'Tokens']``, one row per (group, item) pair.
    """
    rows = []
    for group, members in group_mapping.items():
        members = list(members)
        if not members:
            # An empty group contributes no rows, and its heading is never looked up.
            continue

        # The label depends on the group only, so it is computed once per group
        # instead of once per item.
        if use_heading:
            label = normalize_heading(G.nodes[group]['heading'].split('-', 1)[0])
        else:
            label = group

        rows.extend([item, label, G.nodes[item]['tokens_n']] for item in members)

    return pd.DataFrame(rows, columns=[item_type, grouped_by, 'Tokens'])

In [None]:
def plot_group_data(grouped_by, group_df, sample_size = None, plot_func=sns.swarmplot, figsize=(15,8)):
    """
    Draw the 'Tokens' column of `group_df` per group, using `plot_func` (e.g., swarmplot or boxplot).

    If `sample_size` is given and smaller than the number of rows, only a random sample of
    that many rows is drawn. Groups are placed on the x axis in natural sort order.
    """
    n_rows = group_df.shape[0]
    if sample_size is not None and sample_size < n_rows:
        group_df = group_df.sample(n=sample_size)

    ordered_df = group_df.sort_values(grouped_by, key=natsort.natsort_keygen())

    plt.figure(figsize=figsize)
    sns.set_theme(style="whitegrid")

    plot_kwargs = dict(x=grouped_by, y="Tokens", data=ordered_df)
    if plot_func not in (sns.boxplot, sns.boxenplot):
        # the boxplot variants don't support the size argument
        plot_kwargs["size"] = 2
    plot_func(**plot_kwargs)

In [None]:
def build_ranking_df(compare_level, group_df, G):
    """
    Builds a pd.DataFrame that is sorted by Token count (descending), and resolves foreign keys
    to information like chapter titles.

    Parameters
    ----------
    compare_level
        Name of the column in `group_df` that holds the node key of each item.
    group_df
        Frame with at least the columns `compare_level` and 'Tokens'. If it already has a
        'chapter' and/or 'title' column, those are kept as they are.
    G
        Graph; `G.nodes(data=True)[key]` must yield the node attributes ('law_name',
        'heading', and 'chapter_mapping' / 'title_mapping' when those columns have to be derived).

    Returns
    -------
    pd.DataFrame
        Columns `[compare_level, 'Tokens', 'title', 'chapter', 'law_name', 'heading']`
        followed by any further columns that `group_df` brought along.

    Raises
    ------
    ValueError
        If `group_df` has no 'Tokens' column (from ``list.remove``).
    """
    node_attrs = G.nodes(data=True)

    ranked = group_df.copy().sort_values(['Tokens'], ascending=False)
    extra_columns = list(ranked.columns)

    # All lookups are keyed by the item's node key. Mapping over this single column
    # (instead of a row-wise DataFrame.apply) also works for an empty frame, where
    # apply(axis=1) hands back a DataFrame that cannot be assigned to one column.
    item_keys = ranked[compare_level]

    def container_heading(mapping_attr):
        """Normalized heading prefix of the chapter/title node each item is mapped to."""
        def resolve(key):
            container = node_attrs[key][mapping_attr]
            return normalize_heading(node_attrs[container]['heading'].split('-', 1)[0])
        return item_keys.map(resolve)

    for level in ('chapter', 'title'):
        if level in extra_columns:
            # already supplied by the caller: keep it and place it in the fixed column block
            extra_columns.remove(level)
        else:
            ranked[level] = container_heading(f'{level}_mapping')

    extra_columns.remove('Tokens')
    if compare_level in extra_columns:
        extra_columns.remove(compare_level)

    ranked['law_name'] = item_keys.map(lambda key: node_attrs[key]['law_name'])
    ranked['heading'] = item_keys.map(lambda key: node_attrs[key]['heading'])

    fixed_columns = [compare_level, 'Tokens', 'title', 'chapter', 'law_name', 'heading']
    return ranked.reindex(columns=fixed_columns + extra_columns)

In [None]:
def build_relative_ranking_df(ranking_df, reference=None, limit_within_group=None, global_sort=False):
    """
    Builds up relative ranked data frame (Table 6 of paper), i.e., adds `RelTokens` column to a ranking_df.

    `RelTokens` of an item is its token count divided by the token count of the next
    shorter item (the divisor is clamped to at least 1). The shortest item of each
    comparison set has no predecessor and gets NaN.

    Parameters
    ----------
    ranking_df
        Frame with a 'Tokens' column (and a `reference` column if `reference` is given).
        It is not modified; a copy is returned.
    reference
        If None, every item is compared to the globally next shorter item. Otherwise
        the comparison happens within each group of the `reference` column.
    limit_within_group
        Only used together with `reference`: keep at most this many rows per group
        (those with the largest `RelTokens`). The index is reset when this is applied.
    global_sort
        Only used together with `reference`: if True the result is sorted by
        `RelTokens` across all groups, otherwise by (`reference`, `RelTokens`).
        Both descending.

    Returns
    -------
    pd.DataFrame
        Copy of `ranking_df` with the additional `RelTokens` column. Without a
        reference the result is sorted by 'Tokens', descending.
    """
    def ratio_to_next_shorter(tokens):
        # `tokens` has to be sorted ascending. Only the numeric token Series is rolled
        # over (never the whole frame, which also holds string columns), and the divisor
        # is clamped to 1 so that zero-token items cannot cause a division by zero.
        return tokens.rolling(2).apply(lambda pair: pair[1] / max(pair[0], 1), raw=True)

    _df = ranking_df.copy()

    if reference is None:
        _df = _df.sort_values(['Tokens'])
        _df["RelTokens"] = ratio_to_next_shorter(_df["Tokens"])
        return _df.sort_values(['Tokens'], ascending=False)

    # transform() aligns the per-group ratios back to the original index of _df
    tokens_by_group = _df.sort_values([reference, 'Tokens']).groupby(reference)["Tokens"]
    _df["RelTokens"] = tokens_by_group.transform(ratio_to_next_shorter)

    def keep_top_of_each_group(frame):
        return frame.groupby(reference).head(limit_within_group).reset_index(drop=True)

    if global_sort:
        if limit_within_group is not None:
            _df = _df.sort_values([reference, 'RelTokens'], ascending=False)
            _df = keep_top_of_each_group(_df)
        _df = _df.sort_values(['RelTokens'], ascending=False)
    else:
        _df = _df.sort_values([reference, 'RelTokens'], ascending=False)
        if limit_within_group is not None:
            _df = keep_top_of_each_group(_df)

    return _df


In [None]:
# Lazily filled module-level cache: title number -> node key of that title.
title_no_mapping = {}

def belongs_to_usc_title_filter(G, node, title_no):
    """
    Tell whether `node` is the node of title `title_no` itself or is mapped to it
    through its 'title_mapping' attribute.

    The title node is searched for in `G` on first use of a title number and then
    remembered in `title_no_mapping`. A ValueError is raised if the search does not
    yield exactly one non-CFR node.
    """
    if title_no not in title_no_mapping:
        heading_prefix = f'Title {title_no}-'
        candidates = list({
            key
            for key, attrs in G.nodes(data=True)
            if not key.startswith('cfr')
            and 'heading' in attrs
            and normalize_heading(attrs['heading']).startswith(heading_prefix)
        })
        if len(candidates) != 1:
            raise ValueError(f'Did not find title uniquely in G: Possible titles = {candidates}')

        title_no_mapping[title_no] = candidates[0]

    # from here on the title node is known: either we are that node, or our mapping points to it
    title_node = title_no_mapping[title_no]
    if node == title_node:
        return True
    return G.nodes(data=True)[node]['title_mapping'] == title_node

def usc_only_filter(G, node):
    """
    Returns true unless the 'key' attribute of `node` starts with 'cfr'.
    """
    node_key = G.nodes(data=True)[node]['key']
    is_cfr = node_key.startswith('cfr')
    return not is_cfr

## Analysis 1: Distribution of Item Length

### Example 1: Analyze Seqitems of Title 20

In [None]:
COMPARE_LEVEL = "seqitem"
GROUP_BY = "chapter"
TITLE = 20

# we want to compare the chapters of the USC title selected by TITLE (title 20), based on the seqitems of the chapters
group_by = group_items_by(G,
                          COMPARE_LEVEL,
                          GROUP_BY,
                          group_by_filter = lambda x: belongs_to_usc_title_filter(G, x, TITLE)
                         )

# one row per seqitem (labelled with its chapter heading), then the token ranking built from it
group_df = get_data_for_groups(G, group_by, COMPARE_LEVEL, GROUP_BY, use_heading=True)
ranking_df = build_ranking_df(COMPARE_LEVEL, group_df, G)

In [None]:
# Longest Seqitems of Title 20 (ranking_df is sorted by token count, descending)
ranking_df.head(10)

In [None]:
# Distribution of Seqitems, grouped by chapter of Title 20
# (if there are more than 20000 rows, a random sample of 20000 is drawn)
plot_group_data(GROUP_BY, group_df, sample_size = 20000, plot_func=sns.stripplot) 

### Example 2: Analyze all USC Seqitems

In [None]:
COMPARE_LEVEL = "seqitem"
GROUP_BY = "title"

# we filter out CFR nodes (nodes whose 'key' starts with 'cfr')
group_by = group_items_by(G,
                          COMPARE_LEVEL,
                          GROUP_BY,
                          group_by_filter = lambda x: usc_only_filter(G, x)
                         )

# one row per seqitem (labelled with its title heading), then the token ranking built from it
group_df = get_data_for_groups(G, group_by, COMPARE_LEVEL, GROUP_BY, use_heading=True)
ranking_df = build_ranking_df(COMPARE_LEVEL, group_df, G)

In [None]:
# Relatively longest seqitems with reference to the next longest seqitem _within the same Title_.
# Only seqitems with more than 150 tokens are shown, to filter out short pairs like 20 vs 2 tokens.
relative_df = build_relative_ranking_df(ranking_df, reference=GROUP_BY, global_sort=True)
relative_df.loc[relative_df["Tokens"] > 150].head(10)

In [None]:
# Absolute longest seqitems, with information on the relative length to the next longest seqitem _over all titles_
# (global_sort is only evaluated when a reference is given, so it has no effect in this call)
result_df = build_relative_ranking_df(ranking_df, global_sort=True) # because there is no reference given, this function just sorts globally by token length and adds the RelTokens column
result_df.head(10)

In [None]:
# Distribution of Seqitems, grouped by all titles
# (if there are more than 20000 rows, a random sample of 20000 is drawn); the figure is saved as PDF
plot_group_data(GROUP_BY, group_df, sample_size = 20000, plot_func=sns.stripplot) 
plt.savefig('../writing/figures/seqitems_group_by_title.pdf')

In [None]:
# Log-Log-Plot of USC Seqitems
# Seqitems without tokens are excluded before the PDF is plotted.
tokens = group_df.loc[(group_df['Tokens'] > 0)]['Tokens'].to_numpy()
# NOTE(review): despite its name, `fig` is used like a matplotlib Axes here (set_xlabel) - confirm against powerlaw.plot_pdf
fig = powerlaw.plot_pdf(tokens)
fig.set_xlabel("# Tokens")

### Example 3: Analyze all USC Chapters

In [None]:
COMPARE_LEVEL = "chapter"
GROUP_BY = "title"

# we filter out CFR nodes (nodes whose 'key' starts with 'cfr')
group_by = group_items_by(G,
                          COMPARE_LEVEL,
                          GROUP_BY,
                          group_by_filter = lambda x: usc_only_filter(G, x)
                         )

# one row per chapter (labelled with its title heading), then the token ranking built from it
group_df = get_data_for_groups(G, group_by, COMPARE_LEVEL, GROUP_BY, use_heading=True)
ranking_df = build_ranking_df(COMPARE_LEVEL, group_df, G)

In [None]:
# Longest chapters over all USC titles (ranking_df is sorted by token count, descending)
ranking_df.head(10)

In [None]:
# Distribution of chapter lengths, grouped by all titles
# (if there are more than 20000 rows, a random sample of 20000 is drawn)
plot_group_data(GROUP_BY, group_df, sample_size = 20000, plot_func=sns.stripplot) 

### Example 4: Icicle Zoom Plot

### Icicle Function

In [None]:
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection

def draw_icicle(df, last_page, levelheight, facecolor, figsize=(9,6), savepath=None, extensions=('svg', 'pdf', 'png'), with_toplevel=True, extra_y_gap = 0):
    """
    Draw an icicle plot in which every row of `df` becomes one rectangle, stacked by level.

    Rows with a non-zero 'zoomed' value are drawn in red and connected by two red lines
    to the left and right end of the level below, indicating that the next level shows
    a zoomed-in view of that element.

    Parameters
    ----------
    df
        Frame with the columns 'level', 'tokens', 'start_token' and 'zoomed'.
    last_page
        Right end of the plot, in tokens (scaled down by 1000 internally, like all token values).
    levelheight
        Height of the rectangles of one level.
    facecolor
        Fill colour of rectangles that are not zoomed.
    figsize
        Figure size. NOTE: it is written to ``plt.rcParams['figure.figsize']`` and therefore
        also becomes the default for figures created afterwards.
    savepath
        If not None, the figure is saved to ``f'{savepath}.{ext}'`` for every entry of `extensions`.
    extensions
        File extensions to save the figure with.
    with_toplevel
        If False, rows of level 0 are not drawn and all other levels move up by one level height.
    extra_y_gap
        Additional vertical gap between two consecutive levels.
    """
    token_scale_factor = 1000 # in our terms, pages are tokens - scale down by 1000

    last_page = last_page / token_scale_factor

    # initialize plot
    plt.rcParams['figure.figsize'] = figsize
    max_level = max(df.level)
    # The stack is as tall as its levels: use levelheight here (not a fixed 10),
    # so that other level heights are not cut off at the bottom of the plot.
    y_upper = levelheight*(max_level+1) + max_level*extra_y_gap
    fig, ax = plt.subplots(1)

    first_level = 0 if with_toplevel else 1
    level_gap = 0.05 + extra_y_gap

    def level_bottom(level):
        """y coordinate of the lower edge of the rectangles of `level`."""
        levels_from_top = level + 1 if with_toplevel else level
        return y_upper - levelheight*levels_from_top - level*level_gap

    for _, row in df.iterrows():
        level = row['level']
        if level < first_level:
            continue

        width = row['tokens'] / token_scale_factor
        page = row['start_token'] / token_scale_factor
        is_zoomed = row['zoomed'] != 0

        # elements without tokens still get a thin marker, shifted slightly to the left
        _x = page if width > 0 else page-0.5
        _y = level_bottom(level)
        _width = width-0.2 if width > 0 else 0.3
        ax.add_patch(Rectangle(xy=(_x,_y),
                               width=_width, height=levelheight, facecolor=('red' if is_zoomed else facecolor),
                               edgecolor='k', linewidth=0.1))

        if is_zoomed:
            # connect both lower corners with the outer top corners of the next level
            next_level_top = level_bottom(level+1) + levelheight
            ax.plot([_x, 0], [_y, next_level_top], color='red', linestyle='-', linewidth=1)
            ax.plot([_x + _width, last_page], [_y, next_level_top], color='red', linestyle='-', linewidth=1)

    # The x range is set unconditionally: patches do not trigger autoscaling, so a plot
    # without any zoomed element would otherwise keep the default axis limits.
    plt.xlim(0,last_page+2)
    plt.ylim(0,y_upper+2)
    plt.axis('off')
    plt.tight_layout()

    if savepath is not None:
        for ext in extensions:
            plt.savefig(f'{savepath}.{ext}', dpi=1200)

### Data Preparation

In [None]:
# after executing this cell, the `ici_data` list object contains all information necessary to plot a zoomed icicle plot
chosen_title = "Title 15"
chosen_chap = "Chapter 14A"

# Initialize variables
ici_data = [] # rows of: key, heading, level, tokens, start_token, zoomed
# (node key, title number) of every non-CFR node of type 'title'
titles = [(x, int(normalize_heading(y['heading'].split('-',1)[0]).split(' ')[1])) for x,y in G.nodes(data=True) if 'type' in y and y['type'] == 'title' and not y['key'].startswith('cfr')]
sorted_titles = sorted(titles, key=lambda tup: tup[1])
count = 0 # not read anywhere in this cell
curr_title_token = 1 # running start position (in tokens) of the current title
title_idx = 0 # only incremented below, not read anywhere in this cell
important_chapter_rows = [] # chapters of the chosen title, plus the seqitems of the chosen chapter

# NOTE(review): the comprehensions inside the loops rescan all nodes of G for every title and
# for every chapter; pre-grouping the nodes by 'title_mapping' / 'chapter_mapping' would avoid that.
for title_key, title_no in sorted_titles:
    chapter_rows = []
    # (node key, chapter number as string) of all chapters mapped to this title, in natural order
    chapters = [(x, normalize_heading(y['heading'].split('-',1)[0]).split(' ')[1]) for x,y in G.nodes(data=True) if 'type' in y and y['type'] == 'chapter' and y['title_mapping'] == title_key]
    sorted_chapters = natsort.natsorted(chapters, key=lambda tup: tup[1])
    curr_chapter_token = curr_title_token
    
    for chapter_key, chapter_no in sorted_chapters:
        # seqitems mapped to this chapter, ordered naturally by the second space-separated part of their heading
        seqitems = [(x, y['heading'].split(' ')[1]) for x,y in G.nodes(data=True) if 'type' in y and y['type'] == 'seqitem' and y['chapter_mapping'] == chapter_key]
        sorted_items = natsort.natsorted(seqitems, key=lambda tup: tup[1])
        seqitem_rows = []
        
        # seqitems (level 2) are laid out one after another, starting where the chapter starts
        curr_seq_token = curr_chapter_token
        for seq, _ in sorted_items:
            seqitem_rows.append([seq, G.nodes(data=True)[seq]['heading'], 2, G.nodes(data=True)[seq]['tokens_n'], curr_seq_token, 0])
            curr_seq_token = curr_seq_token + G.nodes(data=True)[seq]['tokens_n']
        
        # a chapter (level 1) is as long as the sum of its seqitems
        tokens = np.array([row[3] for row in seqitem_rows])
        # the flag only depends on the chapter heading, so it is also set for equally named chapters of other titles
        zoomed = 1 if normalize_heading(G.nodes(data=True)[chapter_key]['heading'].split('-')[0]) == chosen_chap else 0
        chapter_row = [chapter_key, G.nodes(data=True)[chapter_key]['heading'], 1, np.sum(tokens), curr_chapter_token, zoomed]
        chapter_rows.append(chapter_row)
        # only chapters of the chosen title are kept for plotting
        if normalize_heading(G.nodes(data=True)[title_key]['heading'].split('-')[0]) == chosen_title:
            important_chapter_rows.append(chapter_row)
            
        chapter_rows = chapter_rows + seqitem_rows
        
        # only the seqitems of the chosen chapter within the chosen title are kept for plotting
        if normalize_heading(G.nodes(data=True)[chapter_key]['heading'].split('-')[0]) == chosen_chap and normalize_heading(G.nodes(data=True)[title_key]['heading'].split('-')[0]) == chosen_title:
            important_chapter_rows = important_chapter_rows + seqitem_rows
                
        curr_chapter_token = curr_chapter_token + np.sum(tokens)
    
    # a title (level 0) is as long as the sum of its chapter rows (row[2] == 1)
    zoomed = 1 if normalize_heading(G.nodes(data=True)[title_key]['heading'].split('-')[0]) == chosen_title else 0
    tokens = np.array([row[3] for row in chapter_rows if row[2] == 1])
    ici_data.append([title_key, G.nodes(data=True)[title_key]['heading'], 0, np.sum(tokens), curr_title_token, zoomed])
        
    curr_title_token = curr_title_token + np.sum(tokens)
    title_idx = title_idx + 1
    
# all title rows first, followed by the rows kept for the zoomed levels
ici_data = ici_data + important_chapter_rows

In [None]:
# now, we need to normalize and rescale our data
ici_data_df = pd.DataFrame(data=ici_data, columns=['key', 'heading', 'level', 'tokens', 'start_token', 'zoomed'])
# total width of level 0 = end position of the row with the largest start_token
# NOTE(review): the maximum is taken over the rows of all levels, before anything is shifted -
# presumably this is meant to be the last title row; confirm.
max_token_row = ici_data_df.loc[ici_data_df['start_token'].idxmax()]
level0_tokens = max_token_row['start_token'] + max_token_row['tokens']

# first, move level1 to the left (so that its first element starts at 0)
level1_starttoken = ici_data_df.loc[ici_data_df.loc[ici_data_df['level'] == 1]['start_token'].idxmin()]['start_token']
ici_data_df.loc[ici_data_df['level'] == 1, 'start_token'] = ici_data_df.loc[ici_data_df['level'] == 1, 'start_token'].subtract(level1_starttoken)
# now, rescale (stretch level 1 so that it spans the same width as level 0)
max_level1_token_row = ici_data_df.loc[ici_data_df.loc[ici_data_df['level'] == 1]['start_token'].idxmax()]
level1_tokens = max_level1_token_row['start_token'] + max_level1_token_row['tokens']
level1_factor = level0_tokens / level1_tokens
ici_data_df.loc[ici_data_df['level'] == 1, 'tokens'] = ici_data_df.loc[ici_data_df['level'] == 1, 'tokens'].mul(level1_factor)
ici_data_df.loc[ici_data_df['level'] == 1, 'start_token'] = ici_data_df.loc[ici_data_df['level'] == 1, 'start_token'].mul(level1_factor)

# shift level 2 (same procedure as for level 1)
level2_starttoken = ici_data_df.loc[ici_data_df.loc[ici_data_df['level'] == 2]['start_token'].idxmin()]['start_token']
ici_data_df.loc[ici_data_df['level'] == 2, 'start_token'] = ici_data_df.loc[ici_data_df['level'] == 2, 'start_token'].subtract(level2_starttoken)
# rescale
max_level2_token_row = ici_data_df.loc[ici_data_df.loc[ici_data_df['level'] == 2]['start_token'].idxmax()]
level2_tokens = max_level2_token_row['start_token'] + max_level2_token_row['tokens']
level2_factor = level0_tokens / level2_tokens
ici_data_df.loc[ici_data_df['level'] == 2, 'tokens'] = ici_data_df.loc[ici_data_df['level'] == 2, 'tokens'].mul(level2_factor)
ici_data_df.loc[ici_data_df['level'] == 2, 'start_token'] = ici_data_df.loc[ici_data_df['level'] == 2, 'start_token'].mul(level2_factor)

In [None]:
# Draw titles, the chapters of the chosen title and the seqitems of the chosen chapter
# as three levels (level height 10, extra gap of 5 between levels) and save the figure as PDF.
draw_icicle(ici_data_df,
         curr_title_token + 1,
         10,
         'silver',
         figsize=(20,5),
         savepath='../writing/figures/longitem_icicles',
         extensions=['pdf'],
         extra_y_gap=5)

## Analysis 2: Flagging Long Elements

In [None]:
def find_long_elements(G, element_type, absolute_threshold=10000, relative_threshold=1.5, relative_reference_group="title", relative_min_tokens=50, usc_only=True):
    """
    Flag elements of `element_type` that are long in absolute terms, in relative terms, or both.

    An element is absolutely long if it has more than `absolute_threshold` tokens. It is
    relatively long if it has more than `relative_min_tokens` tokens and is more than
    `relative_threshold` times as long as the next shorter element of the same
    `relative_reference_group`.

    Parameters
    ----------
    G
        Graph with the node attributes used by the helper functions of this notebook
        ('tokens_n', 'key', 'heading', 'law_name' and the '<group>_mapping' attributes).
    element_type
        Node type to analyse, e.g. "seqitem".
    absolute_threshold
        Token threshold of the absolute check; None disables this check.
    relative_threshold
        Ratio threshold of the relative check; None disables this check.
    relative_reference_group
        Group within which relative lengths are compared; None disables the relative check.
    relative_min_tokens
        Minimum token count (exclusive) for an element to be flagged as relatively long.
    usc_only
        If True, nodes whose 'key' starts with 'cfr' are ignored.

    Returns
    -------
    pd.DataFrame
        Columns "VertexID", "Tokens", "RelTokens", "Absolute_or_Relative_or_Both" and
        "ReferenceGroup" (empty string if the relative check is disabled). Rows flagged
        "Both" come first, then "Absolute", then "Relative". "RelTokens" is NaN for an
        element that does not appear in the relative ranking.
    """
    mapping = f"{relative_reference_group}_mapping"
    use_relative = relative_threshold is not None and relative_reference_group is not None

    def keep_node(node):
        # A named function avoids the precedence trap of `lambda x: a if c else lambda x: b`,
        # where the conditional becomes part of the first lambda's body.
        return usc_only_filter(G, node) if usc_only else True

    # 0. Build up relative_df to have statistics even for only absolute flags
    group_by = group_items_by(G,
                              element_type,
                              relative_reference_group,
                              group_by_filter = keep_node
                             )
    group_df = get_data_for_groups(G, group_by, element_type, relative_reference_group, use_heading=True)
    ranking_df = build_ranking_df(element_type, group_df, G)
    relative_df = build_relative_ranking_df(ranking_df, reference=relative_reference_group, global_sort=True)

    # 1. Find all absolutely long elements
    absolute_long_elements = []
    if absolute_threshold is not None:
        absolute_long_elements = [
            x for x, y in G.nodes(data=True)
            if is_node_of_type(y, element_type)
            and y["tokens_n"] > absolute_threshold
            and (not y["key"].startswith('cfr') or not usc_only)
        ]

    # 2. Find all relative long elements
    relative_long_elements = []
    if use_relative:
        is_relative_long = (relative_df["Tokens"] > relative_min_tokens) & (relative_df["RelTokens"] > relative_threshold)
        relative_long_elements = relative_df.loc[is_relative_long][element_type].tolist()

    # 3. Build pd.DataFrame (Vertex_ID, Tokens, RelTokens, Absolute_or_Relative_or_Both, ReferenceGroup)
    # Sets give constant-time membership tests while the lists keep the row order stable.
    absolute_set = set(absolute_long_elements)
    relative_set = set(relative_long_elements)
    shared_long_elements = [e for e in absolute_long_elements if e in relative_set]
    absolute_only_elements = [e for e in absolute_long_elements if e not in relative_set]
    relative_only_elements = [e for e in relative_long_elements if e not in absolute_set]

    # One lookup table instead of filtering relative_df once per flagged element;
    # as before, the first row of an element wins.
    rel_tokens_by_element = (
        relative_df.drop_duplicates(subset=element_type)
                   .set_index(element_type)["RelTokens"]
                   .to_dict()
    )
    node_attrs = G.nodes(data=True)

    def rows_for(elements, flag):
        return [
            [
                e,
                node_attrs[e]["tokens_n"],
                rel_tokens_by_element.get(e, float("nan")),
                flag,
                node_attrs[e][mapping] if use_relative else "",
            ]
            for e in elements
        ]

    data = (
        rows_for(shared_long_elements, "Both")
        + rows_for(absolute_only_elements, "Absolute")
        + rows_for(relative_only_elements, "Relative")
    )

    return pd.DataFrame(data=data, columns=["VertexID", "Tokens", "RelTokens", "Absolute_or_Relative_or_Both", "ReferenceGroup"])

In [None]:
# Flag long seqitems with the default thresholds and show the first 100 rows of the result
find_long_elements(G, "seqitem").head(100)