In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set_style("whitegrid")

In [None]:
# Executes smell_helpers.py inside the notebook namespace. The helpers called below
# (load_json, get_files) are not defined in this notebook, so they presumably come from here.
%run smell_helpers.py

In [None]:
# Corpus sizes as a nested mapping {year: {title: value}}; both key levels are converted
# to int so they can be looked up with numeric years and title numbers.
corpus_sizes = {
    int(year_key): {int(title_key): size for title_key, size in per_title.items()}
    for year_key, per_title in load_json("../data/corpus_size_token_us.json").items()
}
corpus_sizes[2019].keys()

In [None]:
# Load every count table in ../ner_counts, keyed by file name without the ".csv" extension.
# Only *.csv entries are read, so stray files or folders in that directory cannot break the
# load or end up under a truncated key; sorting makes the key order independent of the
# (arbitrary) order in which os.listdir returns entries.
df_dict = {
    fname[:-4]: pd.read_csv(f"../ner_counts/{fname}")
    for fname in sorted(os.listdir("../ner_counts"))
    if fname.endswith(".csv")
}

In [None]:
# Names of the loaded count tables (file names without extension); they are used as the
# entity type in the next cell.
df_dict.keys()

In [None]:
# Add grouping columns to every count table (in place).
point_types = ("monthday", "date", "year")   # collapsed into "time_point"
period_types = ("month", "day", "period")    # collapsed into "time_period"
for k, df in df_dict.items():
    # title: first "_"-separated part of the file name, minus its last character
    df["title"] = df.file.map(lambda name: name.split("_")[0][:-1])
    # year: last "_"-separated part of the file name, minus its 4-character suffix
    df["year"] = df.file.map(lambda name: int(name.split("_")[-1][:-4]))
    # ner_type: the table name, except that the time-related tables are merged into two groups
    if k in point_types:
        df["ner_type"] = "time_point"
    elif k in period_types:
        df["ner_type"] = "time_period"
    else:
        df["ner_type"] = k

In [None]:
# Stack all per-type tables into one long frame (the original row indices are kept as they are).
df_concat = pd.concat([*df_dict.values()])

In [None]:
# How many (non-unique) mentions of NERs by type do we find in 2019?
# Only the numeric "counts" column is summed; summing the whole frame would also try to
# aggregate the string columns (file, text, title).
df_concat.query("year == 2019").groupby(["year", "ner_type"])["counts"].sum().to_frame()

In [None]:
# How many UNIQUE mentions of NERs by type do we find in 2019?
# (count() reports the number of non-null rows per column in each group.)
# Interestingly, the Office seems to have started using terms only in 2013?
(
    df_concat
    .query("year == 2019")
    .groupby(["year", "ner_type"])
    .count()
)

In [None]:
# Money expressions ranked by the number of rows in which they occur; top 50 shown.
(
    df_concat
    .query("ner_type == 'money'")
    .groupby("text")
    .count()
    .reset_index()
    .sort_values("counts", ascending=False)
    .head(50)
)

In [None]:
# Per-year tables of entity mentions per 1000 tokens (rows: titles, columns: entity types).
plot_dfs = {}
for year in corpus_sizes.keys():
    # Mention counts per title and entity type; rawreference, enum and term rows are excluded.
    # "counts" is selected before summing so the string columns stay out of the aggregation.
    df_concat_t = (
        df_concat
        .query("year == @year and ner_type != 'rawreference' and ner_type != 'enum' and ner_type != 'term'")
        .groupby(["title", "ner_type"])["counts"].sum()
        .reset_index()
        .pivot(index="title", columns="ner_type", values="counts")
        .fillna(0)
        .astype(int)
    )
    # Size of each title in thousands of tokens, in the same row order as df_concat_t.
    tokens_n = pd.DataFrame([corpus_sizes[year][int(x)]/1000 for x in df_concat_t.index], index=df_concat_t.index, columns=["tokens_n"])
    df_concat_t_tokennorm = df_concat_t.div(tokens_n.tokens_n, axis=0) # number of occurrences per 1000 tokens
    # Alternative normalisations (share within a title / share within an entity type);
    # only the token-normalised table is stored below.
    df_concat_t_normed_rows = df_concat_t.div(df_concat_t.sum(axis=1), axis=0)
    df_concat_t_normed_cols = (df_concat_t / df_concat_t.sum(axis=0))
    plot_dfs[year] = df_concat_t_tokennorm.copy()

In [None]:
# Inspect the per-1000-token table for 1998.
plot_dfs[1998]

In [None]:
# 2019 and 1998 rates side by side: one row per title, columns named "<ner_type><year>".
# The outer join keeps titles present in only one year; their gaps are filled with -1.
two_dfs = (
    plot_dfs[2019]
    .reset_index()
    .merge(plot_dfs[1998].reset_index(), on="title", suffixes=["2019", "1998"], how="outer")
    .fillna(-1)
    .set_index('title')
)

In [None]:
# Convert the title labels in the index to int.
two_dfs.index = two_dfs.index.map(int)

In [None]:
# Inspect the combined 2019/1998 table.
two_dfs

In [None]:
# Column names in alphabetical order; the heatmap below uses this order for its rows.
sorted(two_dfs.columns)

In [None]:
# Heatmap of entity mentions per 1000 tokens: one column per title, two rows (1998, 2019)
# per entity type. Saved as a PDF figure.
fontsize = 34
extra = 6
plt.rcParams["font.family"] = "serif"
plt.rcParams["font.serif"] = "Times New Roman"
plt.rcParams["mathtext.fontset"] = "stix"

# Rows in alphabetical column order, which groups the two years of each entity type together.
heat_data = two_dfs[sorted(two_dfs.columns)].T
# Negative cells are the -1 placeholders for missing values: they are hidden in the plot and
# must not take part in the colour-scale quantile either.
missing = heat_data < 0
color_cap = np.quantile(heat_data.values[~missing.values], 0.99)

# Second, narrow axis holds the colour bar.
fig, ax = plt.subplots(1, 2, figsize=(36.5, 12), gridspec_kw={'width_ratios': [36, 0.5]})
sns.heatmap(heat_data, robust=False, cmap='viridis', vmin=0, vmax=color_cap,
            cbar=True, mask=missing, ax=ax[0], cbar_ax=ax[1]
           )
# Separator lines after every second row, i.e. between entity types.
ax[0].hlines([2, 4, 6], *ax[0].get_xlim(), color='k', lw=2)
ax[0].set_xticklabels(ax[0].get_xticklabels(), fontsize=fontsize)
ax[0].set_xlabel("Title", fontsize=fontsize+extra)
# Row tick labels show only the year (the last four characters of the column name).
ax[0].set_yticklabels([int(col[-4:]) for col in heat_data.index], fontsize=fontsize, va="center")
ax[0].set_ylabel("")
ax[1].tick_params(labelsize=fontsize)
# Group labels left of the year ticks; positions assume four groups of two rows each.
ax[0].annotate("Data Type", (-0.045, 0.5), fontsize=fontsize+extra, rotation=90, xycoords="axes fraction", ha='center', va='center')
ax[0].annotate("Time Point", (-0.025, 0.125), fontsize=fontsize, rotation=90, xycoords="axes fraction", ha='center', va='center')
ax[0].annotate("Time Period", (-0.025, 0.375), fontsize=fontsize, rotation=90, xycoords="axes fraction", ha='center', va='center')
ax[0].annotate("Percentage", (-0.025, 0.625), fontsize=fontsize, rotation=90, xycoords="axes fraction", ha='center', va='center')
ax[0].annotate("Money", (-0.025, 0.875), fontsize=fontsize, rotation=90, xycoords="axes fraction", ha='center', va='center')
plt.tight_layout()
plt.savefig("../writing/figures/named_entities_per_thousand_tokens.pdf", transparent=True, bbox_inches='tight')

In [None]:
# Titles ranked by their 2019 percentage rate, columns in alphabetical order.
(
    two_dfs
    .loc[:, sorted(two_dfs.columns)]
    .sort_values("percentage2019", ascending=False)
)

In [None]:
from collections import Counter

In [None]:
# Regular expressions used to pick tokens out of each title's token sequence.
# The chamber-specific patterns match "committee" and the chamber name in either order.
patterns = dict(
    senate_committee="committee.*senate|senate_.*committee",
    house_committee="committee.*house|house_.*committee",
    committee="committee",
    budget="budget",
)

In [None]:
# For every 2019 result file, count the tokens of its sequence that match each pattern.
# results[pattern_name][title_number] -> {matching token: number of occurrences}
path = "../dupex_mf-10000_results"
files = get_files(path, "_2019.json")
results = {name: {} for name in patterns}
for file in files:
    print(file)
    title_no = int(file[:2])  # first two characters of the file name
    data = load_json(f"{path}/{file}")
    seq = data["current_sequence"]
    for k, p in patterns.items():
        res = [tok for tok in seq if re.search(p, tok)]
        token_counts = dict(Counter(res))
        results[k][title_no] = token_counts
        if res:
            # pattern name, total matches, distinct matching tokens
            print(k, len(res), len(token_counts))

In [None]:
# One table per chamber: rows are titles, columns are the matching tokens, cells are
# occurrence counts (0 where a token does not occur in a title).
df_senate, df_house = (
    pd.DataFrame(results[chamber_key]).T.fillna(0).astype(int)
    for chamber_key in ("senate_committee", "house_committee")
)

In [None]:
# Tokens containing any of these substrings are skipped when senate committee names are extracted.
excluded = [
    'committee_of_the_senate',
    'committees_of_the_senate',
    'house_of_representatives',
    'and_the_committee',
]
# Same for the house committee names.
excluded_house = [
    'senate',
    '_and_the_committee',
]

In [None]:
# Distinct senate committee names found in the matching tokens.
senate_committees = set()
for token in df_senate.columns.values:
    if any(marker in token for marker in excluded):
        continue
    if not re.search("committee.*?senate", token):
        continue
    # First committee mention in the token, including an optional "select" prefix.
    mention = re.search("(?:select.)?committee.*?senate", token).group(0)
    # Keep what follows the first "_on_" and precedes the first "_of_the_".
    senate_committees.add(mention.split("_on_", 1)[-1].split("_of_the_", 1)[0])

In [None]:
# Distinct house committee names found in the matching tokens.
house_committees = set()
for token in df_house.columns.values:
    if any(marker in token for marker in excluded_house):
        continue
    mention_match = re.search("committee.*?house_of_representatives", token)
    if not mention_match:
        continue
    # Keep what follows the first "_on_" and precedes the first "_of_the_".
    house_committees.add(mention_match.group(0).split("_on_", 1)[-1].split("_of_the_", 1)[0])

In [None]:
# Number of extracted house committee names and the names in alphabetical order.
len(house_committees), sorted(house_committees)

All house committees extracted above are currently active, except:
- we see a "united states" variant again for natural resources (where we also see the senate variant)
- education and the workforce (renamed 2019, but previously held this name until 1883)
- house oversight (renamed, now oversight and reform?)
- oversight and government reform (renamed, 116th congress?)
- public works and transportation (renamed, 1994?)
- science and technology (renamed, 112th congress)
- resources (renamed, now natural resources?)

we are missing (of the currently active committees):
- budget
- ethics
- rules
- veterans' affairs

In [None]:
# Number of extracted senate committee names and the names in alphabetical order.
len(senate_committees), sorted(senate_committees)

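All senate committees extracted above are currently active, except:
- committee_on_energy_and_natural_resources_of_the_united_states_senate (a "united states" variant of energy and natural resources; already merged with the regular name above, because the name is cut at the first `_of_the_`)
- committee_on_governmental_affairs_of_the_senate (renamed)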

we are missing (of the currently active committees):
- budget committee
- special committee on aging
- joint committees
- select committee on ethics

In [None]:
# Number of distinct senate committee names.
len(senate_committees)

### now with the filtered committees

In [None]:
# One regex per extracted committee name, matching "on_<name>_of_the_[united_states_]senate"
# (or "...house_of_representatives"). The names come from corpus tokens, so they are escaped
# to make sure any regex metacharacter in a name is matched literally.
senate_patterns = {x: f"on_{re.escape(x)}_of_the_(?:united_states_)?senate" for x in sorted(senate_committees)}
house_patterns = {x: f"on_{re.escape(x)}_of_the_(?:united_states_)?house_of_representatives" for x in sorted(house_committees)}

In [None]:
# For every 2019 result file, count the tokens that match each committee-specific pattern.
# <chamber>_results[committee][title_number] -> {matching token: number of occurrences}
path = "../dupex_mf-10000_results"
files = get_files(path, "_2019.json")
senate_results = {name: {} for name in senate_patterns}
house_results = {name: {} for name in house_patterns}

# (log prefix, patterns, result store) per chamber; senate is processed before house.
chambers = (
    ("s", senate_patterns, senate_results),
    ("h", house_patterns, house_results),
)
for file in files:
    print(file)
    title_no = int(file[:2])  # first two characters of the file name
    data = load_json(f"{path}/{file}")
    seq = data["current_sequence"]
    for prefix, chamber_patterns, chamber_results in chambers:
        for k, p in chamber_patterns.items():
            res = [tok for tok in seq if re.search(p, tok)]
            token_counts = dict(Counter(res))
            chamber_results[k][title_no] = token_counts
            if res:
                # chamber, committee, total matches, distinct matching tokens
                print(prefix, k, len(res), len(token_counts))

In [None]:
# Total number of matching tokens per title (rows) and senate committee (columns).
# The totals are computed from the nested result dicts before the frame is built, which
# avoids DataFrame.applymap (deprecated in recent pandas releases) and dict-valued cells.
df_senate_results = pd.DataFrame(
    {
        committee: {title_no: sum(token_counts.values()) for title_no, token_counts in per_title.items()}
        for committee, per_title in senate_results.items()
    }
).astype(float)
# Prefix the columns so senate and house committees stay distinguishable after concatenation.
df_senate_results.columns = [f"s_{x}" for x in df_senate_results.columns]
df_senate_results

In [None]:
# Total number of matching tokens per title (rows) and house committee (columns).
# The totals are computed from the nested result dicts before the frame is built, which
# avoids DataFrame.applymap (deprecated in recent pandas releases) and dict-valued cells.
df_house_results = pd.DataFrame(
    {
        committee: {title_no: sum(token_counts.values()) for title_no, token_counts in per_title.items()}
        for committee, per_title in house_results.items()
    }
).astype(float)
# Prefix the columns so senate and house committees stay distinguishable after concatenation.
df_house_results.columns = [f"h_{x}" for x in df_house_results.columns]
df_house_results

In [None]:
# Senate and house committee columns side by side, with the row index labelled "title".
df_results_concat = (
    pd.concat([df_senate_results, df_house_results], axis="columns")
    .rename_axis(index='title')
)

In [None]:
# Inspect the combined committee counts.
df_results_concat

In [None]:
# Normalise the committee mention counts to mentions per 1000 tokens of each title.
# Title sizes are looked up by title number for 2019 (the counts come from the "_2019.json"
# files), so every row is divided by its own title's size. This does not depend on the row
# order of another table or on a variable left over from an earlier loop.
tokens_2019_thousands = pd.Series(
    [corpus_sizes[2019][int(title_no)] / 1000 for title_no in df_results_concat.index],
    index=df_results_concat.index,
)
df_results_concat_normed = df_results_concat.div(tokens_2019_thousands, axis=0)
# Keep only titles in which at least one committee is mentioned.
df_results_concat_normed_selected = df_results_concat_normed[df_results_concat_normed.max(axis=1) > 0]

In [None]:
# Unclustered heatmap of the normalised committee mentions (rows: committees, columns: titles);
# the colour scale is capped at the 99% quantile of all values.
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(
    df_results_concat_normed_selected.T,
    vmin=0,
    vmax=np.quantile(df_results_concat_normed_selected.values, 0.99),
    cmap='viridis',
)

In [None]:
# Committee labels (as they appear on the clustermap's y axis) that refer to outdated
# committee names; the plotting cell sets these labels in italics.
# NOTE(review): the markdown notes on the house committees also list
# "oversight and government reform" as renamed, but it is not included here - confirm
# whether it should be added.
outdated = [
    "H education and the workforce",  # was listed twice; the duplicate entry is removed
    "H banking, finance and urban affairs",
    "H house oversight",
    "H public works and transportation",
    "H science and technology",
    "H resources",
    "S governmental affairs"
]

In [None]:
# Clustered heatmap of the normalised committee mentions (rows: committees, columns: titles),
# with rows and columns both clustered. Saved as a PDF figure.
g = sns.clustermap(
    df_results_concat_normed_selected.T,
    method='average',
    metric='correlation',
    vmin=0,
    vmax=np.quantile(df_results_concat_normed_selected.values, 0.99),
    figsize=(24, 20),
    cmap='viridis',
    dendrogram_ratio=0.125,
    cbar_kws=dict(shrink=0.5),
    row_cluster=True,
    col_cluster=True,
)
heatmap_ax = g.ax_heatmap
heatmap_ax.set_xlabel("Title", fontsize=fontsize+extra)
heatmap_ax.set_ylabel("Committee", fontsize=fontsize+extra)
# Readable row labels: underscores become spaces, " ," becomes ",", first letter upper-cased.
pretty_labels = [
    tick.get_text().replace("_", " ").replace(" ,", ",").capitalize()
    for tick in heatmap_ax.get_yticklabels()
]
heatmap_ax.set_yticklabels(pretty_labels, fontsize=fontsize-4)
heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), fontsize=fontsize)
# Fixed colour-bar ticks from 0 in steps of 0.02, printed with two decimals.
cbar_ticks = np.arange(0,0.15,0.02)
g.ax_cbar.set_yticks(cbar_ticks)
g.ax_cbar.set_yticklabels([f"{round(x,2):.2f}" for x in cbar_ticks], fontsize=fontsize-8)
# Thicker dendrogram lines on both sides.
for dendrogram_ax in (g.ax_row_dendrogram, g.ax_col_dendrogram):
    for a in dendrogram_ax.collections:
        a.set_linewidth(2)
# Labels listed in `outdated` are set in italics.
for l in heatmap_ax.get_yticklabels():
    if l.get_text() in outdated:
        l.set_fontstyle("italic")
# Grid lines every 5 rows and every 3 columns.
heatmap_ax.hlines(range(5,41,5), *heatmap_ax.get_xlim(), color='k', lw=2)
heatmap_ax.vlines(range(3,31,3), *heatmap_ax.get_ylim(), color='k', lw=2)
g.savefig("../writing/figures/committees_per_thousand_tokens.pdf", transparent=True)

In [None]:
# Total number of extracted committee names across both chambers.
len(house_committees) + len(senate_committees)

In [None]:
# Number of columns (titles) in the clustered heatmap.
len(g.dendrogram_col.reordered_ind)

In [None]:
# Inspect the set of extracted senate committee names.
senate_committees