In [None]:
import networkx as nx
import json
import pandas as pd
from glob import glob
import re
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker
from collections import Counter, OrderedDict
import numpy as np
import os

In [None]:
%matplotlib inline

In [None]:
# Global figure styling: seaborn grid theme first, then the matplotlib
# font/legend settings applied on top of it in a single update.
sns.set_style('whitegrid')
plt.rcParams.update({
    "font.family": "serif",
    "font.serif": "Times New Roman",
    "font.size": 24,
    "mathtext.fontset": "stix",
    "legend.labelspacing": 0.1,
})

In [None]:
def splitted_plot(*dfs):
    """Draw the given frames as line charts on two stacked panels sharing the x-axis.

    The figure always has two rows; frames are paired with panels in order,
    so at most the first two frames are drawn. One color cycle is shared by
    both panels, so colors continue from the first panel into the second.
    Legend labels are looked up in the module-level ``all_patterns`` mapping
    (column name -> display label), which must exist when this is called.

    Args:
        *dfs: data frames whose index supplies the x values and whose
            columns are each drawn as one line.

    Returns:
        The ``(fig, axs)`` pair created by ``plt.subplots``.
    """
    fig, axs = plt.subplots(2,1, sharex=True, figsize=(9,6))

    color_cycle = plt.rcParams['axes.prop_cycle']()
    for df_n, ax_n in zip(dfs, axs):
        for col in df_n:
            ax_n.plot(df_n.index, df_n[col], label=col, **next(color_cycle), marker='o', lw=2, markeredgecolor='w')

        if len(df_n.columns) == 0:
            # Nothing was drawn on this panel: leave ticks and legend untouched.
            continue

        # Ticks and legend depend only on the finished panel, so they are set
        # once here instead of being rebuilt after every plotted column.
        ax_n.set_xticks(range(1998,2020,3))
        handles, labels = ax_n.get_legend_handles_labels()
        ax_n.legend(handles, [all_patterns[p] for p in labels], loc='center left', bbox_to_anchor=(1.0, 0.5))
    return fig, axs

## Get size of US Code per year as reference size

In [None]:
# Read the nested token-count mapping from disk.
with open('../data/corpus_size_token_us.json') as f:
    data = json.load(f)

# JSON object keys are always strings: cast both nesting levels to int.
data = {
    int(outer_key): {int(inner_key): count for inner_key, count in inner.items()}
    for outer_key, inner in data.items()
}
# Outer keys become the columns, inner keys the (sorted) index.
token_size_df = pd.DataFrame(data, dtype="Int64").sort_index()

## Create dataframe

This section can be skipped if `../data/pattern_abs.csv` has already been generated.

In [None]:
def get_match_count(filepath):
    """Count the rows of a match CSV, grouped by the first two characters of ``key``.

    Args:
        filepath: path of a CSV file that has a ``key`` column.

    Returns:
        A plain dict mapping ``int(key[:2])`` to the number of rows with that
        prefix. If the file does not exist, a notice is printed and an empty
        dict is returned.
    """
    try:
        keys = pd.read_csv(filepath).key
    except FileNotFoundError:
        print('File not found', filepath)
        return dict()
    return dict(Counter(int(key[:2]) for key in keys))

In [None]:
years = range(1998, 2020)  # 1998 through 2019 inclusive

In [None]:
# Pattern key -> display label. ``patterns`` is the subset used for the
# heatmap and the top-title table further down in this notebook.
patterns = OrderedDict([
    ('and_or', 'and...or|or...and'),
    ('not_and_or', 'not...(and|or)'),
    ('no_and_or', 'no...(and|or)'),
    ('unless', '(and|or)...unless'),
    ('but_not', '(and|or)...but not'),
])

# ``all_patterns`` extends the subset above, keeping insertion order.
all_patterns = OrderedDict(patterns)
all_patterns.update([
    ('and_slash_or', 'and/or'),  # excluded from figure
    ('except', '(and|or)...except'),
    ('notwithstanding', 'notwithstanding...(and|or)'),
    ('or_both', 'or...or both'),  # excluded from figure
    ('or_or', 'or...or'),
    ('and_and', 'and...and'),
])

In [None]:
def _yearly_counts(pattern_key):
    """Return {year (as str): match counts} for one pattern, over all ``years``."""
    return {
        str(year): get_match_count(f'../data/patterns/{pattern_key}_pattern_{year}.csv')
        for year in years
    }


# pattern key -> year -> counts, as returned by get_match_count
data_dict = {pattern_key: _yearly_counts(pattern_key) for pattern_key in all_patterns}

# One frame per pattern, with one column per year
df_abs_data = {
    pattern_key: pd.DataFrame(counts)
    for pattern_key, counts in data_dict.items()
}

# Put the per-pattern frames side by side under a (pattern, year) column index
# and save the result so that the cells above need not be re-run.
df_abs = pd.concat(
    list(df_abs_data.values()),
    axis=1,
    keys=list(df_abs_data.keys()),
)
df_abs.to_csv('../data/pattern_abs.csv')

### Work with saved data

In [None]:
# Reload the saved counts; cells that are missing in the CSV become 0.
df_abs = pd.read_csv(
    '../data/pattern_abs.csv',
    index_col=0,
    header=[0, 1],
    dtype="Int64",
).fillna(0)

# Rebuild the column index with the second level (the year) cast to int.
df_abs.columns = pd.MultiIndex.from_tuples(
    [(pattern_key, int(year)) for pattern_key, year in df_abs.columns],
    names=df_abs.columns.names,
)
df_abs.head()

In [None]:
# Column totals in long form: one row per (level_0, level_1) column pair ...
totals_long = df_abs.sum().reset_index()
# ... pivoted so that level_1 (the year) is the index and level_0 (the pattern) the columns.
df_abs_sum = totals_long.pivot(index='level_1', columns='level_0', values=0)
df_abs_sum.head()

In [None]:
# Repeat the token-size frame once per pattern so that its columns line up
# with the (pattern, year) columns of df_abs.
token_size_per_pattern = pd.concat(
    [token_size_df] * len(all_patterns),
    axis=1,
    keys=all_patterns.keys(),
)
# Occurrences per 1000 tokens
df_rel = (df_abs / token_size_per_pattern) * 1000
df_rel.head(5)

In [None]:
# Column sums of the token-size frame, broadcast to every pattern column
tokens_per_year = token_size_df.sum()
tokens_per_pattern = pd.DataFrame({col: tokens_per_year for col in df_abs_sum.columns})
# Occurrences per 1000 tokens
df_rel_sum = (df_abs_sum / tokens_per_pattern) * 1000
df_rel_sum.head()

## Heatmap

In [None]:
# Heatmap of the 2019 pattern frequencies per title, for the patterns in ``patterns``.
#
# Select with a list of keys: ``patterns`` is an OrderedDict, and passing a dict
# as an indexer was deprecated in pandas 1.5 and raises TypeError from pandas 2.0.
df_heatmap = df_abs[list(patterns)].copy()
# Swap the column levels so that the year level comes first, then keep 2019 only
df_heatmap.columns = df_heatmap.columns.swaplevel()
df_heatmap = df_heatmap[2019]
df_heatmap = df_heatmap.T

# Frequency of pattern / token
df_heatmap = df_heatmap / token_size_df[2019]

#  Normalize over patterns: divide each pattern by its own maximum
df_heatmap = df_heatmap.T
df_heatmap = df_heatmap / df_heatmap.max()
df_heatmap = df_heatmap.T

# Replace the pattern keys by their display labels
df_heatmap.index = [patterns[i] for i in df_heatmap.index]

# The heatmap is drawn from a plain float frame
df_heatmap = df_heatmap.astype('float')
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(df_heatmap, ax=ax)
ax.set_xlabel('Title')
# plt.savefig(
#     f'../graphics/operator_patterns_heatmap_us_2019_titles.pdf', 
#     bbox_inches='tight'
# )

# Frequency of each pattern per title, normalized by each pattern's maximum (row)

# Line Plots

In [None]:
# One figure per (absolute | relative) x pattern x (all rows | top 10 rows).
# Only the top-10 figures stay open; the full ones are closed right away.
y_labels = {'abs': 'Occurrences', 'rel': 'Occurrences/1000 tokens'}

for abs_rel, df_abs_rel in (('abs', df_abs), ('rel', df_rel)):
    for pattern in all_patterns:
        df_pattern = df_abs_rel[pattern]
        # Ten rows with the largest value in the year of the frame's last column
        df_top = df_pattern.sort_values(
            df_abs_rel.columns[-1][-1], ascending=False
        )[:10]
        for df, top in ((df_pattern, False), (df_top, True)):
            fig, ax = plt.subplots(figsize=(9,6))
            df_sorted = df.sort_values(int(years[-1]), ascending=False)
            df_sorted.T.plot(
                ax=ax, marker='.', title=all_patterns[pattern], alpha=.6
            )
            ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), ncol=1 if top else 4)
            ax.set_xticks(range(1998,2020,3))
            ax.set_xlabel('Year')
            ax.set_ylabel(y_labels[abs_rel])
#             fig.savefig(
#                 f'../graphics/operator_patterns_{abs_rel}{"-top" if top else ""}_us_{pattern}.pdf', 
#                 bbox_inches='tight'
#             )

            if not top:
                plt.close(fig)

In [None]:
# Slightly looser legend rows for the figures below
plt.rcParams.update({"legend.labelspacing": 0.2})

In [None]:
# Absolute yearly totals of every pattern; columns ordered by their last-year value
fig, ax = plt.subplots(figsize=(9,6))
df_abs_sum_sorted = df_abs_sum.sort_values(int(years[-1]), axis=1, ascending=False)
df_abs_sum_sorted.plot(ax=ax, marker='.')
ax.set(xlabel='Year', ylabel='Occurrences')
ax.set_xticks(range(1998,2020,3))
# Replace the column keys in the legend by the display labels
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [all_patterns[p] for p in labels], loc='center left', bbox_to_anchor=(1.0, 0.5))
# plt.savefig('../graphics/operator_patterns_abs_us.pdf', bbox_inches='tight')

In [None]:
# Patterns drawn in the "selected" figures and in the second panel of the
# split figures; 'and_slash_or' and 'or_both' are deliberately left out.
selected_patterns = [
    'but_not',
    'except',
    'no_and_or',
    'notwithstanding',
    'unless',
    # 'and_slash_or',
    # 'or_both',
]

In [None]:
# Base font size, and the increment added on top of it for axis captions
fontsize, extra = 24, 4

In [None]:
# Two-panel figure of the absolute totals: four patterns on top, the
# selected patterns below; columns ordered by their last-year value.
last_year = int(years[-1])
df1 = df_abs_sum[['and_and', 'or_or', 'and_or', 'not_and_or']].sort_values(last_year, axis=1, ascending=False)
df2 = df_abs_sum[selected_patterns].sort_values(last_year, axis=1, ascending=False)
fig, ax = splitted_plot(df1, df2)

# Shared y caption, positioned relative to the upper panel
ax[0].annotate("Number of Occurrences", (-0.14,-0.11), fontsize=fontsize+extra, rotation=90, xycoords="axes fraction", ha='center', va='center')
ax[1].set_xlabel('Year', fontsize=fontsize+4)

# Upper panel: ticks every 10 K, labelled in thousands
upper_ticks = range(20000,60001,10000)
ax[0].set_yticks(upper_ticks)
ax[0].set_yticklabels([f"{x//1000} K" for x in upper_ticks])
ax[0].set_ylim(16000,64000)

# Lower panel: ticks every 1 K, labelled in thousands
lower_ticks = range(0,5001,1000)
ax[1].set_yticks(lower_ticks)
ax[1].set_yticklabels([f"{x//1000} K" for x in lower_ticks])
ax[1].set_ylim(-500,5500)

ax[0].set_xlim(1997.5,2019.5)
plt.savefig('../writing/figures/operator_patterns_abs_us_splitted.pdf', bbox_inches='tight', transparent=True)

In [None]:
# Absolute yearly totals of the selected patterns only
fig, ax = plt.subplots(figsize=(9,6))
df_abs_selected = df_abs_sum[selected_patterns].sort_values(
    int(years[-1]), axis=1, ascending=False
)
df_abs_selected.plot(ax=ax, marker='.')
# Replace the column keys in the legend by the display labels
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [all_patterns[p] for p in labels], loc='center left', bbox_to_anchor=(1.0, 0.5))
ax.set_ylabel('Occurrences')
ax.set_xlabel('Year')
ax.set_xticks(range(1998,2020,3))
# plt.savefig('../graphics/operator_patterns_abs_us_selected.pdf', bbox_inches='tight')

In [None]:
# Relative yearly totals (per 1000 tokens) of every pattern
fig, ax = plt.subplots(figsize=(9,6))
df_rel_sum_sorted = df_rel_sum.sort_values(int(years[-1]), axis=1, ascending=False)
df_rel_sum_sorted.plot(ax=ax, marker='.')
# Replace the column keys in the legend by the display labels
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [all_patterns[p] for p in labels], loc='center left', bbox_to_anchor=(1.0, 0.5))
ax.set_ylabel('Occurrences/1000 tokens')
ax.set_xlabel('Year')
ax.set_xticks(range(1998,2020,3))
# plt.savefig('../graphics/operator_patterns_rel_us.pdf', bbox_inches='tight')

In [None]:
# Two-panel figure of the relative totals: four patterns on top, the
# selected patterns below; columns ordered by their last-year value.
last_year = int(years[-1])
df1 = df_rel_sum[['and_and', 'or_or', 'and_or', 'not_and_or']].sort_values(last_year, axis=1, ascending=False)
df2 = df_rel_sum[selected_patterns].sort_values(last_year, axis=1, ascending=False)
fig, ax = splitted_plot(df1, df2)

# Shared y caption, positioned relative to the upper panel
ax[0].annotate(f"Occurrences per {1000:n} Tokens", (-0.14,-0.11), fontsize=fontsize+extra, rotation=90, xycoords="axes fraction", ha='center', va='center')
ax[1].set_xlabel('Year', fontsize=fontsize+4)

# Lower panel: ticks in steps of 0.1
ax[1].set_yticks(np.arange(0,0.31,0.1))
ax[1].set_ylim(-0.025,0.325)

# Upper panel: limits first, then ticks in steps of 0.5
ax[0].set_ylim(1.35,3.15)
ax[0].set_yticks(np.arange(1.5,3.1,0.5))

ax[0].set_xlim(1997.5,2019.5)
plt.savefig('../writing/figures/operator_patterns_rel_us_splitted.pdf', bbox_inches='tight', transparent=True)

In [None]:
# Relative yearly totals (per 1000 tokens) of the selected patterns only
fig, ax = plt.subplots(figsize=(9,6))
df_rel_selected = df_rel_sum[selected_patterns].sort_values(
    int(years[-1]), axis=1, ascending=False
)
df_rel_selected.plot(ax=ax, marker='.')
# Replace the column keys in the legend by the display labels
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [all_patterns[p] for p in labels], loc='center left', bbox_to_anchor=(1.0, 0.5))
ax.set_ylabel('Occurrences/1000 tokens')
ax.set_xlabel('Year')
ax.set_xticks(range(1998,2020,3))
# plt.savefig('../graphics/operator_patterns_rel_us_selected.pdf', bbox_inches='tight')

# Stats

In [None]:
# Per-pattern ratio of the 2019 total to the 1998 total (absolute counts)
abs_stats = df_abs_sum.loc[2019].div(df_abs_sum.loc[1998])
abs_stats

In [None]:
# Summary statistics of the per-pattern 2019/1998 ratios held in abs_stats
abs_stats.describe()

In [None]:
# Per-pattern ratio of the 2019 value to the 1998 value (per 1000 tokens)
rel_stats = df_rel_sum.loc[2019].div(df_rel_sum.loc[1998])
rel_stats

In [None]:
# Summary statistics of the per-pattern 2019/1998 ratios held in rel_stats
rel_stats.describe()

# Table

In [None]:
# For every figure pattern: the five rows with the highest 2019 relative
# frequency, as a small (Title, Freq.) frame.
top_dfs = []
for pattern in patterns:
    freq_2019 = df_rel[pattern][2019]
    top = freq_2019.sort_values(ascending=False).head(5)
    df = pd.DataFrame(top, dtype=float).reset_index()
    df.columns = ['Title', 'Freq.']
    top_dfs.append(df)

# Side by side, one labelled column group per pattern, rounded for display
top_df = pd.concat(top_dfs, axis=1, keys=list(patterns.values())).round(2)

In [None]:
# Export the top-title table as LaTeX, in chunks of ten columns
# (currently a single chunk).
with open('../data/operator_patterns_rel_us_top_title.tex', 'w') as f:
    for i in range(1):
        latex = top_df[top_df.columns[i*10:(i+1)*10]].to_latex(None, index=False)
        # Replace "\toprule" plus the first non-blank token after it by a
        # \multicolumn header wrapping that token.
        # The replacement is a raw string: the previous non-raw literal held
        # "\g", an invalid escape sequence that triggers a SyntaxWarning on
        # Python 3.12+. The runtime value of the string is unchanged.
        latex = re.sub(r'\\toprule\n(\S+)', r'\\multicolumn{2}{l}{\g<1>}', latex)
        f.write(latex)
        f.write('\n\n')