### Training a corpus-wide RMN with TF-IDF embeddings

In [72]:
import os
import sys
import pandas as pd

In [73]:
# Make the project's script directories importable (presumably where the
# `document`, `constant`, `helper`, `rmn`, ... modules imported below live).
# Entries already on sys.path are skipped, so re-running this cell does not
# keep appending duplicates.
sys.path.extend(
    [
        script_dir
        for script_dir in (
            "/home/rocassius/w266_final/scripts/assembly",
            "/home/rocassius/w266_final/scripts/modeling",
        )
        if script_dir not in sys.path
    ]
)

In [74]:
from document import load_documents
from constant import DOC_PRAYER_PATH, MIN_SESSION, MAX_SESSION, DOC_ALL_PATH
from subject import subject_keywords

# Every session number from MIN_SESSION through MAX_SESSION, inclusive.
sessions = [*range(MIN_SESSION, MAX_SESSION + 1)]

In [75]:
from helper import *
from rmn import *
from rmn_data_generator import RMN_DataGenerator
from rmn_analyzer import RMN_Analyzer

In [5]:
# load embedding tools
prayer_tools_path = "/home/rocassius/gen-data/tools/prayer_tools"
metadata_dict = load_pickled_object(os.path.join(prayer_tools_path, "metadata_dict"))
tokenizer_dict = load_pickled_object(os.path.join(prayer_tools_path, "tokenizer_dict"))
embedding_matrix = load_pickled_object(os.path.join(prayer_tools_path, "idf_embedding_matrix"))
global_embedding_matrix = load_pickled_object(os.path.join(prayer_tools_path, "embedding_matrix_wg"))
global_tokenizer_dict = load_pickled_object(os.path.join(prayer_tools_path, "tokenizer_dict_wg"))

In [6]:
# Load the documents stored under DOC_PRAYER_PATH for [66] — presumably a list
# of session numbers: the `session` column of the frames displayed later in
# this notebook is 66 throughout.
docs_df = load_documents([66], DOC_PRAYER_PATH)

In [7]:
# Draw a fixed-size random subset of the loaded documents for the analysis.
# The seed is pinned so that restarting the kernel selects the same rows;
# an unseeded sample draws different documents on every execution, which makes
# every metric reported further down irreproducible.
data_df = docs_df.sample(n=2347, random_state=42)

In [8]:
# (rows, columns) of the full frame before sampling.
docs_df.shape

(35475, 10)

In [9]:
# Directory handed to rmn.load_rmn() when the saved model is restored.
# NOTE(review): absolute, user-specific path — the notebook only runs on this machine layout.
local_models_path = "/home/rocassius/gen-data/models"

In [10]:
# Restore the saved "SuaveRanger" model, then point its inference attributes at
# the "_wg" embedding matrix and tokenizer loaded earlier. The attributes are
# set after load_rmn(), embedding matrix first, tokenizer second.
rmn = RigidRMN()
rmn.load_rmn("SuaveRanger", local_models_path)
rmn.infer_embedding_matrix, rmn.infer_tokenizer_dict = (
    global_embedding_matrix,
    global_tokenizer_dict,
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [15]:
# Pair the restored model with the sampled documents.
# In the saved outputs analyzer.df holds the same rows as data_df but is
# numbered from 0, so rows are addressed by position rather than by the labels
# inherited from docs_df.
analyzer = RMN_Analyzer(rmn, data_df)

In [16]:
# Presumably computes the per-document topic predictions that later cells read
# from analyzer.topic_preds — confirm in RMN_Analyzer.
analyzer.predict_topics()



In [77]:
# List the 6 nearest words for every topic. Judging by the saved output, the
# call prints the words per topic and also returns (word, score) lists, so the
# same neighbours appear twice: once printed, once as the echoed return value.
analyzer.rmn.inspect_topics(which_topics='all', k_neighbors=6)

  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))
  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))



Topic 0
['necessary', 'adequate', 'appropriate', 'useful', 'properly', 'efficiently']

Topic 1
['end', 'next', 'strike', 'ends', 'week', 'planned']

Topic 2
['territories', 'lands', 'granted', 'abolished', 'legally', 'privileges']

Topic 3
['father', 'friend', 'mother', 'wife', 'brother', 'son']

Topic 4
['companies', 'company', 'business', 'software', 'manufacturers', 'industry']

Topic 5
['drink', 'amount', 'liquid', 'amounts', 'concoction', 'soda']

Topic 6
['countries', 'asia', 'nations', 'europe', 'africa', 'trade']

Topic 7
['religion', 'religious', 'freedom', 'beliefs', 'equality', 'ideals']

Topic 8
['e', 'k', 'w', 'text', 'article', 'note']

Topic 9
['downtown', 'city', 'located', 'avenue', 'railroad', 'road']

Topic 10
['importance', 'challenges', 'understand', 'focus', 'situation', 'perspective']

Topic 11
['navy', 'naval', 'command', 'fleet', 'corps', 'squadron']

Topic 12
['coastal', 'forests', 'lakes', 'wetlands', 'area', 'river']

Topic 13
['reconstruction', 'rebuilding

[[('necessary', 0.686),
  ('adequate', 0.677),
  ('appropriate', 0.666),
  ('useful', 0.664),
  ('properly', 0.658),
  ('efficiently', 0.647)],
 [('end', 0.608),
  ('next', 0.593),
  ('strike', 0.576),
  ('ends', 0.574),
  ('week', 0.572),
  ('planned', 0.562)],
 [('territories', 0.663),
  ('lands', 0.645),
  ('granted', 0.636),
  ('abolished', 0.633),
  ('legally', 0.614),
  ('privileges', 0.607)],
 [('father', 0.928),
  ('friend', 0.901),
  ('mother', 0.882),
  ('wife', 0.881),
  ('brother', 0.879),
  ('son', 0.875)],
 [('companies', 0.823),
  ('company', 0.787),
  ('business', 0.784),
  ('software', 0.779),
  ('manufacturers', 0.766),
  ('industry', 0.762)],
 [('drink', 0.599),
  ('amount', 0.574),
  ('liquid', 0.571),
  ('amounts', 0.566),
  ('concoction', 0.566),
  ('soda', 0.566)],
 [('countries', 0.851),
  ('asia', 0.798),
  ('nations', 0.79),
  ('europe', 0.748),
  ('africa', 0.744),
  ('trade', 0.743)],
 [('religion', 0.81),
  ('religious', 0.783),
  ('freedom', 0.773),
  ('be

In [13]:
# nn = rmn.inspect_topics([1,2,3])

In [17]:
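# Manual NaN injection, apparently used to exercise the analyzer's NaN handling
# (kept commented out). The seven assignments touch six distinct rows
# (0, 2, 944, 900, 200, 245), which is consistent with the n_nan_preds() result
# of 6 and the NaN at analyzer.topic_preds[0][2] shown in later outputs.
# analyzer.topic_preds[0,2] = np.nan
# analyzer.topic_preds[2,44] = np.nan
# analyzer.topic_preds[944,44] = np.nan
# analyzer.topic_preds[944,1] = np.nan
# analyzer.topic_preds[900,1] = np.nan
# analyzer.topic_preds[200,5] = np.nan
# analyzer.topic_preds[245,5] = np.nan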

In [20]:
# Summary metrics restricted to the documents whose subject is 'immigration'.
analyzer.analyze_subset(conditions={'subject': 'immigration'}, n=5000)

{'n_records': 45,
 'n_records_R': 26,
 'n_records_D': 19,
 'n_nan_preds_R': 0,
 'n_nan_preds_D': 0,
 'js': {'mean': 0.9027244240909584,
  'lower': 0.8997593822268837,
  'upper': 0.9056894659550327},
 'js_R': {'mean': 0.8949276773157079,
  'lower': 0.8917689967334704,
  'upper': 0.8980863578979444},
 'js_D': {'mean': 0.9083369769069605,
  'lower': 0.9052334035716194,
  'upper': 0.9114405502423015},
 'js_RD': {'mean': 0.9048001430166008,
  'lower': 0.901856220239465,
  'upper': 0.9077440657937367},
 'hh': {'mean': 0.40965795516967773,
  'lower': 0.40224158510687436,
  'upper': 0.4170743405329934},
 'hh_R': {'mean': 0.38056713342666626,
  'lower': 0.3732381199484541,
  'upper': 0.38789615000133976},
 'hh_D': {'mean': 0.4344913959503174,
  'lower': 0.4273369889340688,
  'upper': 0.44164586669585215},
 'hh_topic_use': 0.032669757,
 'hh_topic_use_R': 0.033592302,
 'hh_topic_use_D': 0.054120168,
 'js_topic_use': 0.15917184728493192}

In [22]:
# Empty conditions select every document in the sample.
# NOTE(review): n looks like the number of sampled comparisons — the saved
# lower/upper bounds narrow as n grows (100 -> 200 -> 5000); confirm in RMN_Analyzer.
analyzer.analyze_subset(conditions={}, n=5000)

{'n_records': 2347,
 'n_records_R': 1359,
 'n_records_D': 986,
 'n_nan_preds_R': 0,
 'n_nan_preds_D': 0,
 'js': {'mean': 0.8957717776919176,
  'lower': 0.8925751218947856,
  'upper': 0.8989684334890496},
 'js_R': {'mean': 0.8999332239026008,
  'lower': 0.896963153267926,
  'upper': 0.9029032945372751},
 'js_D': {'mean': 0.9028203100053394,
  'lower': 0.8999136297951069,
  'upper': 0.9057269902155722},
 'js_RD': {'mean': 0.9001306863795672,
  'lower': 0.8971534524220804,
  'upper': 0.9031079203370541},
 'hh': {'mean': 0.3677561581134796,
  'lower': 0.3608768026818241,
  'upper': 0.3746355109568035},
 'hh_R': {'mean': 0.36135047674179077,
  'lower': 0.3545306169262916,
  'upper': 0.3681703463086098},
 'hh_D': {'mean': 0.36098992824554443,
  'lower': 0.35425571793748495,
  'upper': 0.36772412918226366},
 'hh_topic_use': 0.023394933,
 'hh_topic_use_R': 0.02350846,
 'hh_topic_use_D': 0.0235127,
 'js_topic_use': 0.020763270013108733}

In [23]:
# Mean entropy over the documents whose subject is 'trade' (presumably the
# entropy of each document's topic distribution — confirm in RMN_Analyzer).
analyzer.mean_entropy({'subject': 'trade'})

2.4610295

In [94]:
# Presumably the number of documents whose prediction vector contains a NaN.
analyzer.n_nan_preds()

6

In [95]:
# Per-topic usage weights over the whole sample; the saved output is a float32
# Series indexed by topic id and ordered from most to least used.
analyzer.topic_use()

14    0.037360
8     0.035398
5     0.033360
44    0.031464
0     0.030891
34    0.030410
15    0.029673
45    0.029430
2     0.029329
1     0.028507
11    0.028051
25    0.026124
33    0.024791
28    0.024385
12    0.024194
24    0.023645
38    0.023498
46    0.022405
31    0.021437
49    0.021356
37    0.021292
26    0.021249
9     0.021246
17    0.020038
3     0.019613
39    0.019430
22    0.018316
32    0.017907
40    0.017642
4     0.017435
20    0.017127
6     0.017115
36    0.016969
7     0.016921
47    0.016913
35    0.015669
13    0.015399
18    0.015156
21    0.014893
30    0.014650
10    0.011828
29    0.011460
41    0.010965
42    0.010087
48    0.009989
27    0.009341
23    0.008044
16    0.007311
19    0.007285
43    0.003004
dtype: float32

In [96]:
# Integer count per topic id — presumably the number of documents for which
# that topic ranks first; confirm in RMN_Analyzer.
analyzer.first_topic_counts()

14    98
8     90
1     87
44    84
45    83
5     78
0     77
2     73
34    72
15    71
11    69
25    67
33    65
12    59
31    58
37    57
24    56
38    56
49    56
9     54
46    50
26    48
28    46
6     42
17    42
20    41
3     40
35    40
7     40
39    39
22    39
32    38
40    37
13    36
4     35
47    35
30    34
21    33
18    29
36    28
41    26
10    24
42    22
48    22
29    21
19    13
27    12
16    11
23    10
43     4
dtype: int64

In [97]:
# One row per document with five topic ids each (see the saved output);
# presumably the highest-weighted topics in rank order — TODO confirm.
analyzer.primary_topics()

array([[20, 45, 48, 26, 35],
       [42, 44, 20, 22,  4],
       [37, 10, 17, 39, 14],
       ...,
       [44, 20, 45, 38,  8],
       [39,  8, 38,  1,  5],
       [ 2, 14, 40, 12, 19]])

In [98]:
# Same whole-sample summary as before, with a much smaller n for comparison.
analyzer.analyze_subset(conditions={}, n=100)

{'n_records': 2347,
 'n_records_R': 1382,
 'n_records_D': 959,
 'n_nan_preds_R': 3,
 'n_nan_preds_D': 3,
 'js': {'mean': 0.8863957659417478,
  'lower': 0.863518849703897,
  'upper': 0.9092726821795984},
 'js_R': {'mean': 0.8949470355028366,
  'lower': 0.869312252791087,
  'upper': 0.920581818214586},
 'js_D': {'mean': 0.8947676850644782,
  'lower': 0.8680333250166014,
  'upper': 0.9215020451123551},
 'js_RD': {'mean': 0.9064493207901105,
  'lower': 0.8866138726161371,
  'upper': 0.9262847689640844},
 'hh': {'mean': 0.3773353099822998,
  'lower': 0.32984055783772437,
  'upper': 0.424829997306824},
 'hh_R': {'mean': 0.37966644763946533,
  'lower': 0.3324582539427007,
  'upper': 0.4268746207726275},
 'hh_D': {'mean': 0.3546164333820343,
  'lower': 0.308238170541423,
  'upper': 0.4009946578521555},
 'hh_topic_use': 0.023090007,
 'hh_topic_use_R': 0.023104973,
 'hh_topic_use_D': 0.023600442,
 'js_topic_use': 0.02396601826013351}

In [99]:
# Raw prediction vector for row 0; in the saved output one entry (position 2) is NaN.
analyzer.topic_preds[0]

array([2.5700021e-03, 1.8807802e-04,           nan, 1.2839150e-02,
       5.9409621e-03, 2.7256053e-02, 1.1624908e-02, 4.0412087e-02,
       6.9517833e-03, 2.0715038e-03, 2.9260023e-03, 6.0561683e-04,
       9.1330469e-02, 3.5805947e-03, 1.3342206e-01, 2.2432106e-02,
       4.0908293e-03, 3.9539553e-02, 5.4975979e-02, 5.9212290e-02,
       7.0578512e-03, 4.7504166e-03, 4.8512153e-02, 4.8274137e-03,
       3.9465006e-02, 1.1614815e-02, 1.6407501e-02, 1.1009588e-03,
       3.1605072e-03, 1.9031172e-03, 2.1471735e-02, 7.2166225e-04,
       1.1253923e-02, 1.9795235e-02, 2.4883274e-02, 4.2066994e-04,
       1.0909189e-03, 1.2548159e-02, 9.9336830e-05, 7.8000868e-04,
       1.1094594e-01, 1.0729488e-03, 5.9383465e-03, 1.7232537e-04,
       1.9048652e-02, 4.9032527e-03, 1.3484785e-02, 1.6078722e-02,
       2.8263815e-02, 1.7701630e-02], dtype=float32)

In [100]:
# Prediction vector for row 1, rounded to three decimals for readability.
analyzer.topic_preds[1].round(3)

array([0.003, 0.101, 0.008, 0.002, 0.   , 0.066, 0.   , 0.   , 0.254,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.008, 0.002, 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.001, 0.   , 0.   , 0.003, 0.001, 0.   ,
       0.002, 0.042, 0.   , 0.   , 0.   , 0.001, 0.016, 0.   , 0.   ,
       0.05 , 0.   , 0.11 , 0.322, 0.002, 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.001, 0.001, 0.   , 0.001], dtype=float32)

In [101]:
# Spot check of compute_JS on two hand-picked index lists of four rows each.
# NOTE(review): with so few rows the saved lower/upper bounds are very wide.
analyzer.compute_JS(index_A=[1, 0, 2, 55], index_B=[0, 1, 2, 3])

{'mean': 0.245307112418126,
 'lower': -0.5353696011568905,
 'upper': 1.0259838259931424}

In [102]:
# In the saved outputs this equals the 'js_topic_use' entry returned by
# analyze_subset(conditions={}).
analyzer.topic_use_RD_js()

0.02396601826013351

In [103]:
# Whole-sample summary again, this time with n=200.
analyzer.analyze_subset(conditions={}, n=200)

{'n_records': 2347,
 'n_records_R': 1382,
 'n_records_D': 959,
 'n_nan_preds_R': 3,
 'n_nan_preds_D': 3,
 'js': {'mean': 0.8974244483532509,
  'lower': 0.8821201834503916,
  'upper': 0.9127287132561104},
 'js_R': {'mean': 0.8945284970134967,
  'lower': 0.8789208578950092,
  'upper': 0.9101361361319842},
 'js_D': {'mean': 0.8986361644720359,
  'lower': 0.8837740604689276,
  'upper': 0.9134982684751437},
 'js_RD': {'mean': 0.8976757319342795,
  'lower': 0.8822714814192671,
  'upper': 0.9130799824492912},
 'hh': {'mean': 0.3572431206703186,
  'lower': 0.32602880961325076,
  'upper': 0.3884574844029912},
 'hh_R': {'mean': 0.36144551634788513,
  'lower': 0.32829305081360816,
  'upper': 0.3945980301050154},
 'hh_D': {'mean': 0.36123907566070557,
  'lower': 0.32841593191950025,
  'upper': 0.3940622190293819},
 'hh_topic_use': 0.023090007,
 'hh_topic_use_R': 0.023104973,
 'hh_topic_use_D': 0.023600442,
 'js_topic_use': 0.02396601826013351}

In [105]:
# Re-check of the NaN count after the analyses above (unchanged in the saved output).
analyzer.n_nan_preds()

6

Testing the RMN

In [106]:
# Display the sampled frame; it keeps the row labels inherited from docs_df.
data_df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
31709,66072621,KIRBY,WILLIAM,S,AR,M,D,clearly tile states of the union could not exi...,tax,66
24750,66075331,BRANDEGEE,FRANK,S,CT,M,R,to the unemployed in our country for years i ...,immigration,66
18135,66083070,GARNER,JOHN,H,TX,M,D,known in the american congress i that it is de...,foreign,66
17577,66079540,DUNBAR,JAMES,H,IN,M,R,present time are matters which the house put i...,foreign,66
21443,66073781,TOWNSEND,CHARLES,S,MI,M,R,president i offer the following resolution whi...,foreign,66
...,...,...,...,...,...,...,...,...,...,...
17678,66091060,CONNALLY,THOMAS,H,TX,M,D,did the court of clains find ihe amount ickey ...,foreign,66
27234,66075011,STERLING,THOMAS,S,SD,M,R,ratify the treaty of peace they have utterly f...,labor,66
18050,66075690,GOOD,JAMES,H,IA,M,R,discuss anything except the situation that has...,foreign,66
12495,66077040,GRAHAM,WILLIAM,H,IL,M,R,prices by bearing down entirely on the cost of...,economy,66


In [107]:
# The analyzer's copy of the data: in the saved outputs it has the same rows as
# data_df, but the index starts at 0.
analyzer.df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
0,66072621,KIRBY,WILLIAM,S,AR,M,D,clearly tile states of the union could not exi...,tax,66
1,66075331,BRANDEGEE,FRANK,S,CT,M,R,to the unemployed in our country for years i ...,immigration,66
2,66083070,GARNER,JOHN,H,TX,M,D,known in the american congress i that it is de...,foreign,66
3,66079540,DUNBAR,JAMES,H,IN,M,R,present time are matters which the house put i...,foreign,66
4,66073781,TOWNSEND,CHARLES,S,MI,M,R,president i offer the following resolution whi...,foreign,66
...,...,...,...,...,...,...,...,...,...,...
2342,66091060,CONNALLY,THOMAS,H,TX,M,D,did the court of clains find ihe amount ickey ...,foreign,66
2343,66075011,STERLING,THOMAS,S,SD,M,R,ratify the treaty of peace they have utterly f...,labor,66
2344,66075690,GOOD,JAMES,H,IA,M,R,discuss anything except the situation that has...,foreign,66
2345,66077040,GRAHAM,WILLIAM,H,IL,M,R,prices by bearing down entirely on the cost of...,economy,66


In [100]:
# Row of analyzer.df to spot-check in the following cells.
i = 1800

In [101]:
# A list indexer ([[i]]) returns a one-row DataFrame rather than a Series.
analyzer.df.loc[[i]]

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
1800,66083031,FLETCHER,DUNCAN,S,FL,M,D,congress so faras appropriations are pay thede...,defense,66


In [102]:
# Full, untruncated text of the selected document.
analyzer.df["document"].values[i]

'congress so faras appropriations are pay thedeficit brought about by the operation of therailroads concerned the burden of debt left by the warand the demandi by the united states railroad administration the amount for increased appropriations occasioned make it neces required depends'

In [103]:
# Show the nearest words for the topic ids listed for row i by primary_topics().
# k_neighbors is not passed here; the saved output lists 10 words per topic.
analyzer.rmn.inspect_topics(analyzer.primary_topics()[i])


Topic 42
['inflation', 'growth', 'rise', 'decline', 'rising', 'increases', 'slowing', 'rates', 'rate', 'slowdown']

Topic 45
['salary', 'payment', 'pay', 'paid', 'fee', 'fees', 'salaries', 'payments', 'compensation', 'minimum']

Topic 20
['million', 'percent', 'billion', 'dollars', 'compared', 'total', 'estimated', 'average', 'cents', 'cent']

Topic 26
['loans', 'assets', 'funds', 'investments', 'debt', 'mortgage', 'bonds', 'credit', 'securities', 'asset']

Topic 48
['spending', 'cuts', 'deficit', 'fiscal', 'taxes', 'expenditures', 'deficits', 'stimulus', 'trillion', 'expenditure']


[[('inflation', 0.851),
  ('growth', 0.846),
  ('rise', 0.844),
  ('decline', 0.84),
  ('rising', 0.83),
  ('increases', 0.822),
  ('slowing', 0.82),
  ('rates', 0.82),
  ('rate', 0.803),
  ('slowdown', 0.802)],
 [('salary', 0.839),
  ('payment', 0.812),
  ('pay', 0.798),
  ('paid', 0.777),
  ('fee', 0.775),
  ('fees', 0.77),
  ('salaries', 0.766),
  ('payments', 0.762),
  ('compensation', 0.751),
  ('minimum', 0.736)],
 [('million', 0.857),
  ('percent', 0.856),
  ('billion', 0.854),
  ('dollars', 0.831),
  ('compared', 0.829),
  ('total', 0.784),
  ('estimated', 0.754),
  ('average', 0.745),
  ('cents', 0.745),
  ('cent', 0.74)],
 [('loans', 0.815),
  ('assets', 0.812),
  ('funds', 0.81),
  ('investments', 0.795),
  ('debt', 0.786),
  ('mortgage', 0.785),
  ('bonds', 0.783),
  ('credit', 0.782),
  ('securities', 0.77),
  ('asset', 0.766)],
 [('spending', 0.833),
  ('cuts', 0.763),
  ('deficit', 0.751),
  ('fiscal', 0.737),
  ('taxes', 0.733),
  ('expenditures', 0.724),
  ('deficits',