In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "whitegrid" style for every figure drawn in this notebook.
sns.set_style(style="whitegrid")

In [None]:
# Execute the helper script inside this notebook's namespace.
# NOTE(review): later cells call load_json() and get_files() without importing
# them anywhere in this view, so they are presumably defined by this script --
# confirm against smell_helpers.py.
%run smell_helpers.py

In [None]:
# Nested lookup used below as corpus_sizes[year][title]. Both levels of keys
# are converted to int after loading so that integer years/titles can be used
# directly as keys.
corpus_sizes = {}
for year_key, sizes_by_title in load_json("../data/corpus_size_token_us.json").items():
    corpus_sizes[int(year_key)] = {
        int(title_key): size for title_key, size in sizes_by_title.items()
    }
corpus_sizes[2019].keys()

In [None]:
# Load the per-title, per-year results and derive the helper columns:
#   title_str       -- the title as a string (used as hue/style key when plotting)
#   compression_alt -- (1 - compression) expressed in percent
#   n_tokens        -- corpus size looked up by (year, title)
df = (
    pd.read_csv("../data/dupex_mf-10000_results_all-years.csv", index_col=0)
    .assign(
        title_str=lambda frame: frame.title.map(str),
        compression_alt=lambda frame: frame.compression.map(lambda c: (1 - c) * 100),
    )
    .query("year >= 1998")
    .copy()
)
df["n_tokens"] = [corpus_sizes[yr][ttl] for ttl, yr in zip(df.title, df.year)]
# Keep only rows that have a non-empty corpus.
df = df.query("n_tokens > 0 and year >= 1998").copy()

In [None]:
# Per-title summary of how much `compression` varies over the years in `df`.
#
# Resulting frame (indexed by title):
#   min_year       -- year in which the title's lowest compression value occurs
#   max_year       -- year in which the title's highest compression value occurs
#   c_delta_abs    -- highest minus lowest compression value, rounded to 4 decimals
#   select_delta   -- True when c_delta_abs reaches LARGE_DELTA_THRESHOLD
#   select_extreme -- True when every value of the title lies below EXTREME_LOW
#                     or every value lies above EXTREME_HIGH
#
# The rows are collected in plain lists and the frame is built once at the end,
# instead of enlarging an empty DataFrame row by row, so the columns get proper
# numeric/bool dtypes.
LARGE_DELTA_THRESHOLD = 0.02  # LARGE DELTAS
EXTREME_LOW, EXTREME_HIGH = 0.75, 0.85  # EXTREME COMPRESSIBILITY VALUES IN GENERAL

cdelta_columns = ["min_year", "max_year", "c_delta_abs", "select_delta", "select_extreme"]
cdelta_titles = []
cdelta_records = []
# sort=False keeps the titles in order of first appearance, i.e. the same
# order df.title.unique() yields.
for title, df_partial in df.groupby("title", sort=False):
    compression = df_partial.compression
    min_c = compression.min()
    max_c = compression.max()
    min_year = df_partial.at[compression.idxmin(), "year"]
    max_year = df_partial.at[compression.idxmax(), "year"]
    c_delta_abs = abs(round(max_c - min_c, 4))
    cdelta_titles.append(title)
    cdelta_records.append((
        min_year,
        max_year,
        c_delta_abs,
        bool(c_delta_abs >= LARGE_DELTA_THRESHOLD),
        bool((compression < EXTREME_LOW).all() or (compression > EXTREME_HIGH).all()),
    ))

df_cdeltas = pd.DataFrame(cdelta_records, index=cdelta_titles, columns=cdelta_columns)
df_cdeltas.sort_values("c_delta_abs", ascending=False)

In [None]:
# Titles whose select_delta flag is set in df_cdeltas.
selected_titles = df_cdeltas.loc[df_cdeltas["select_delta"].astype(bool)].index
len(selected_titles)

In [None]:
# Titles whose select_extreme flag is set in df_cdeltas.
extreme_titles = df_cdeltas.loc[df_cdeltas["select_extreme"].astype(bool)].index
len(extreme_titles)

In [None]:
# Two-panel figure of compression_alt (in percent) per year.
#   Left panel : the titles in `selected_titles`, one colored line each.
#   Right panel: all remaining titles in black, with the titles that are in
#                `extreme_titles` (and not already in `selected_titles`)
#                drawn in color on top.
# The figure is written to ../writing/figures/dp-compression.pdf.
fontsize = 36
plt.rcParams["font.family"] = "serif"
plt.rcParams["font.serif"] = "Times New Roman"
plt.rcParams["mathtext.fontset"] = "stix"

# Query each group of rows once and reuse it for both the data and the
# marker count, so the two can never disagree.
df_selected = df.query("title in @selected_titles")
df_background = df.query("title not in @selected_titles and title not in @extreme_titles")
df_extreme = df.query("title not in @selected_titles and title in @extreme_titles")

# Explicit color lists, used instead of the named "tab20" / "tab10_r" palettes.
# NOTE(review): these lists hold 11 and 7 colors; confirm that this matches the
# number of titles in each group.
selected_palette = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#b15928']
extreme_palette = ['#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02','#a6761d']

fig, ax = plt.subplots(1,2,figsize=(24,9))
sns.lineplot(data=df_selected,
             x="year", y="compression_alt", hue="title_str", ax=ax[0], palette=selected_palette,
             markers=["o"]*len(df_selected.title_str.unique()),
             dashes=False, lw=2,
             style="title_str")
sns.lineplot(data=df_background,
             x="year", y="compression_alt", hue="title_str", ax=ax[1],
             palette={t:"k" for t in df.title_str.unique()}, legend=False)
sns.lineplot(data=df_extreme,
             x="year", y="compression_alt", hue="title_str", ax=ax[1], palette=extreme_palette,
             markers=["o"]*len(df_extreme.title_str.unique()),
             dashes=False, lw=2,
             style="title_str")
ax[0].legend(title="Title", ncol=3, fontsize=fontsize-8, title_fontsize=fontsize-4, labelspacing=0.2, loc="upper left")
ax[1].legend(title="Title", ncol=2, fontsize=fontsize-8, title_fontsize=fontsize-4, labelspacing=0.2)

# Tick positions are the same for both panels, so compute them once.
yticks = list(range(0, 41, 5))
xticks = range(1998,2020,3)
# A separate loop name keeps `ax` bound to the array of both axes; reusing
# `ax` as the loop variable would leave it pointing at a single Axes after
# the cell has run.
for panel in ax:
    panel.set_ylim(-2.5,42.5)
    panel.set_yticks(yticks)
    # Raw f-string: "\%" is not a valid Python escape sequence, but the
    # backslash must reach mathtext so that it renders a literal percent sign.
    panel.set_yticklabels([rf"${x}\%$" for x in yticks], fontsize=fontsize)
    panel.set_xticks(xticks)
    panel.set_xticklabels(xticks, fontsize=fontsize)
    panel.set_xlabel("Year", fontsize=fontsize)
    panel.set_ylabel("Compression", fontsize=fontsize)
    panel.set_xlim(1998-0.75,2019+0.75)
plt.tight_layout()
plt.savefig("../writing/figures/dp-compression.pdf", transparent=True)

### Table Numbers

In [None]:
import regex as re
from collections import Counter

In [None]:
# Regular expressions, keyed by a short label, that are searched for in the
# tokens of each title's sequence further below.
# They are written as raw strings so that regex escapes such as "\w" are passed
# to the regex engine unchanged instead of being treated as (invalid) Python
# string escape sequences.
patterns = {
    "fpt": r"for_(?:the_)?purposes_of_this_\w+",
    "ear": r"except_as_(?:defined|provided)_(?:by|in)_[{]reference[}]",
    "aba": r"there_(?:are|is)_authorized_to_be_appropriated",
    "mbn": r"as_may_be_necessary_to_carry_out_the",
    "ttm": r"the_term_[{]term[}]_means",
    "s1": r"bureau_of_citizenship_and_immigration_services",
    "s2": r"natural_disasters_,_acts_of_terrorism_,_",
    "s3": r"the_committee_on_homeland_security_(?:and_governmental_affairs_of_the_senate|of_the_house_of_representatives)",
    "s4": r"cyber_threat_indicators_(?:and|or)_defensive_measures",
    "s5": r"weapons_of_mass_destruction_information",
}

In [None]:
# For every 2019 result file, count how often each distinct token of the
# file's "current_sequence" matches each pattern.
# results[pattern_key][title_number] -> {matching token: number of occurrences}
path = "../dupex_mf-10000_results"
files = get_files(path, "_2019.json")
results = {pattern_key: {} for pattern_key in patterns}
for file in files:
    print(file)
    seq = load_json(f"{path}/{file}")["current_sequence"]
    # The first two characters of the file name are parsed as the title number.
    title_number = int(file[:2])
    for pattern_key, pattern in patterns.items():
        hits = Counter(token for token in seq if re.search(pattern, token))
        results[pattern_key][title_number] = dict(hits)
        if hits:
            # total number of matching tokens, then number of distinct ones
            print(file, pattern_key, sum(hits.values()), len(hits))

In [None]:
# One column per pattern key, one row per title number; each cell holds that
# title's {token: count} dict for the pattern.
df_generic_patterns = pd.DataFrame(data=results)

In [None]:
# Long form: one row per (pattern, cell) pair of df_generic_patterns.
df_generic_patterns_melted = pd.melt(
    df_generic_patterns,
    var_name="pattern",
    value_name="instances",
)
df_generic_patterns_melted

In [None]:
# Flatten the nested cells of df_generic_patterns into
# [title, pattern, matching token, count] records.
rows = [
    [title_number, pattern_key, matched_token, n_hits]
    for pattern_key in df_generic_patterns.columns
    for title_number, token_counts in df_generic_patterns[pattern_key].items()
    for matched_token, n_hits in token_counts.items()
]

In [None]:
# Tidy frame of pattern matches. count_relative divides each absolute count by
# the title's 2019 corpus size expressed in thousands of tokens.
long_df_generic_patterns = pd.DataFrame(
    rows, columns=["title","pattern","instance","count_absolute"]
).assign(
    count_relative=lambda frame: [
        hits / (corpus_sizes[2019][title_number] / 1000)
        for title_number, hits in zip(frame.title, frame.count_absolute)
    ]
)

In [None]:
# Total absolute and relative counts per (title, pattern), largest relative
# count first.
# numeric_only=True restricts the sum to the two count columns; without it the
# string column "instance" is also aggregated, which on recent pandas versions
# concatenates all matched tokens of a group into one long string.
long_df_generic_patterns_grouped = (
    long_df_generic_patterns
    .groupby(["title","pattern"])
    .sum(numeric_only=True)
    .reset_index()
    .sort_values("count_relative", ascending=False)
)
long_df_generic_patterns_grouped

In [None]:
# Pattern keys that occur in the grouped counts.
long_df_generic_patterns_grouped["pattern"].unique()

In [None]:
# Grouped counts for the "ttm" pattern only.
long_df_generic_patterns_grouped[long_df_generic_patterns_grouped["pattern"] == "ttm"]