### Training a corpus-wide RMN with tfidf embeddings

In [1]:
import os
import sys
import pandas as pd

In [2]:
sys.path.append("/home/rocassius/w266_final/scripts/assembly")
sys.path.append("/home/rocassius/w266_final/scripts/modeling")

In [3]:
from document import load_documents
from constant import DOC_PRAYER_PATH, MIN_SESSION, MAX_SESSION, DOC_ALL_PATH
from subject import subject_keywords

sessions = list(range(MIN_SESSION, MAX_SESSION+1))

In [4]:
from helper import *
from rmn import *
from rmn_data_generator import RMN_DataGenerator
from rmn_analyzer import RMN_Analyzer

In [5]:
# load embedding tools
prayer_tools_path = "/home/rocassius/gen-data/tools/prayer_tools"
metadata_dict = load_pickled_object(os.path.join(prayer_tools_path, "metadata_dict"))
tokenizer_dict = load_pickled_object(os.path.join(prayer_tools_path, "tokenizer_dict"))
embedding_matrix = load_pickled_object(os.path.join(prayer_tools_path, "idf_embedding_matrix"))
global_embedding_matrix = load_pickled_object(os.path.join(prayer_tools_path, "embedding_matrix_wg"))
global_tokenizer_dict = load_pickled_object(os.path.join(prayer_tools_path, "tokenizer_dict_wg"))

In [6]:
docs_df = load_documents([66], DOC_PRAYER_PATH)

In [7]:
data_df = docs_df.sample(2347)

In [8]:
docs_df.shape

(35475, 10)

In [9]:
local_models_path = "/home/rocassius/gen-data/models"

In [10]:
rmn = RigidRMN()
rmn.load_rmn("SuaveRanger", local_models_path)
rmn.infer_embedding_matrix = global_embedding_matrix
rmn.infer_tokenizer_dict = global_tokenizer_dict

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [11]:
analyzer = RMN_Analyzer(rmn, data_df)

In [12]:
analyzer.predict_topics()



In [13]:
analyzer.rmn.inspect_topics(which_topics='all', k_neighbors=2)

  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))
  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))



Topic 0
['necessary', 'adequate']

Topic 1
['day', 'end']

Topic 2
['territories', 'lands']

Topic 3
['father', 'friend']

Topic 4
['companies', 'company']


KeyboardInterrupt: 

In [14]:
# nn = rmn.inspect_topics([1,2,3])

In [15]:
# analyzer.topic_preds[0,2] = np.nan
# analyzer.topic_preds[2,44] = np.nan
# analyzer.topic_preds[944,44] = np.nan
# analyzer.topic_preds[944,1] = np.nan
# analyzer.topic_preds[900,1] = np.nan
# analyzer.topic_preds[200,5] = np.nan
# analyzer.topic_preds[245,5] = np.nan

In [16]:
analyzer.analyze_subset(conditions={'subject':'immigration'}, n=5000)

{'n_records': 46,
 'n_records_R': 29,
 'n_records_D': 17,
 'n_nan_preds_R': 0,
 'n_nan_preds_D': 0,
 'js': {'mean': 0.9103171210741057,
  'lower': 0.9076598773427386,
  'upper': 0.9129743648054728},
 'js_R': {'mean': 0.9091955517222737,
  'lower': 0.9065130713595239,
  'upper': 0.9118780320850232},
 'js_D': {'mean': 0.9026465195395105,
  'lower': 0.900038849589323,
  'upper': 0.9052541894896977},
 'js_RD': {'mean': 0.9101964590035202,
  'lower': 0.9074541035339271,
  'upper': 0.9129388144731133},
 'hh': {'mean': 0.3904586434364319,
  'lower': 0.38386951549452236,
  'upper': 0.3970477613915832},
 'hh_R': {'mean': 0.3818866014480591,
  'lower': 0.37540282216230775,
  'upper': 0.38837035552402593},
 'hh_D': {'mean': 0.3936111629009247,
  'lower': 0.38677758055102596,
  'upper': 0.400444721403005},
 'hh_topic_use': 0.029113077,
 'hh_topic_use_R': 0.032633252,
 'hh_topic_use_D': 0.04609549,
 'js_topic_use': 0.1438401815281331}

In [22]:
analyzer.analyze_subset(conditions={}, n=5000)

{'n_records': 2347,
 'n_records_R': 1359,
 'n_records_D': 986,
 'n_nan_preds_R': 0,
 'n_nan_preds_D': 0,
 'js': {'mean': 0.8957717776919176,
  'lower': 0.8925751218947856,
  'upper': 0.8989684334890496},
 'js_R': {'mean': 0.8999332239026008,
  'lower': 0.896963153267926,
  'upper': 0.9029032945372751},
 'js_D': {'mean': 0.9028203100053394,
  'lower': 0.8999136297951069,
  'upper': 0.9057269902155722},
 'js_RD': {'mean': 0.9001306863795672,
  'lower': 0.8971534524220804,
  'upper': 0.9031079203370541},
 'hh': {'mean': 0.3677561581134796,
  'lower': 0.3608768026818241,
  'upper': 0.3746355109568035},
 'hh_R': {'mean': 0.36135047674179077,
  'lower': 0.3545306169262916,
  'upper': 0.3681703463086098},
 'hh_D': {'mean': 0.36098992824554443,
  'lower': 0.35425571793748495,
  'upper': 0.36772412918226366},
 'hh_topic_use': 0.023394933,
 'hh_topic_use_R': 0.02350846,
 'hh_topic_use_D': 0.0235127,
 'js_topic_use': 0.020763270013108733}

In [23]:
analyzer.mean_entropy({'subject':'trade'})

2.4610295

In [94]:
analyzer.n_nan_preds()

6

In [95]:
analyzer.topic_use()

14    0.037360
8     0.035398
5     0.033360
44    0.031464
0     0.030891
34    0.030410
15    0.029673
45    0.029430
2     0.029329
1     0.028507
11    0.028051
25    0.026124
33    0.024791
28    0.024385
12    0.024194
24    0.023645
38    0.023498
46    0.022405
31    0.021437
49    0.021356
37    0.021292
26    0.021249
9     0.021246
17    0.020038
3     0.019613
39    0.019430
22    0.018316
32    0.017907
40    0.017642
4     0.017435
20    0.017127
6     0.017115
36    0.016969
7     0.016921
47    0.016913
35    0.015669
13    0.015399
18    0.015156
21    0.014893
30    0.014650
10    0.011828
29    0.011460
41    0.010965
42    0.010087
48    0.009989
27    0.009341
23    0.008044
16    0.007311
19    0.007285
43    0.003004
dtype: float32

In [96]:
analyzer.first_topic_counts()

14    98
8     90
1     87
44    84
45    83
5     78
0     77
2     73
34    72
15    71
11    69
25    67
33    65
12    59
31    58
37    57
24    56
38    56
49    56
9     54
46    50
26    48
28    46
6     42
17    42
20    41
3     40
35    40
7     40
39    39
22    39
32    38
40    37
13    36
4     35
47    35
30    34
21    33
18    29
36    28
41    26
10    24
42    22
48    22
29    21
19    13
27    12
16    11
23    10
43     4
dtype: int64

In [97]:
analyzer.primary_topics()

array([[20, 45, 48, 26, 35],
       [42, 44, 20, 22,  4],
       [37, 10, 17, 39, 14],
       ...,
       [44, 20, 45, 38,  8],
       [39,  8, 38,  1,  5],
       [ 2, 14, 40, 12, 19]])

In [98]:
analyzer.analyze_subset(conditions={}, n =100)

{'n_records': 2347,
 'n_records_R': 1382,
 'n_records_D': 959,
 'n_nan_preds_R': 3,
 'n_nan_preds_D': 3,
 'js': {'mean': 0.8863957659417478,
  'lower': 0.863518849703897,
  'upper': 0.9092726821795984},
 'js_R': {'mean': 0.8949470355028366,
  'lower': 0.869312252791087,
  'upper': 0.920581818214586},
 'js_D': {'mean': 0.8947676850644782,
  'lower': 0.8680333250166014,
  'upper': 0.9215020451123551},
 'js_RD': {'mean': 0.9064493207901105,
  'lower': 0.8866138726161371,
  'upper': 0.9262847689640844},
 'hh': {'mean': 0.3773353099822998,
  'lower': 0.32984055783772437,
  'upper': 0.424829997306824},
 'hh_R': {'mean': 0.37966644763946533,
  'lower': 0.3324582539427007,
  'upper': 0.4268746207726275},
 'hh_D': {'mean': 0.3546164333820343,
  'lower': 0.308238170541423,
  'upper': 0.4009946578521555},
 'hh_topic_use': 0.023090007,
 'hh_topic_use_R': 0.023104973,
 'hh_topic_use_D': 0.023600442,
 'js_topic_use': 0.02396601826013351}

In [99]:
analyzer.topic_preds[0]

array([2.5700021e-03, 1.8807802e-04,           nan, 1.2839150e-02,
       5.9409621e-03, 2.7256053e-02, 1.1624908e-02, 4.0412087e-02,
       6.9517833e-03, 2.0715038e-03, 2.9260023e-03, 6.0561683e-04,
       9.1330469e-02, 3.5805947e-03, 1.3342206e-01, 2.2432106e-02,
       4.0908293e-03, 3.9539553e-02, 5.4975979e-02, 5.9212290e-02,
       7.0578512e-03, 4.7504166e-03, 4.8512153e-02, 4.8274137e-03,
       3.9465006e-02, 1.1614815e-02, 1.6407501e-02, 1.1009588e-03,
       3.1605072e-03, 1.9031172e-03, 2.1471735e-02, 7.2166225e-04,
       1.1253923e-02, 1.9795235e-02, 2.4883274e-02, 4.2066994e-04,
       1.0909189e-03, 1.2548159e-02, 9.9336830e-05, 7.8000868e-04,
       1.1094594e-01, 1.0729488e-03, 5.9383465e-03, 1.7232537e-04,
       1.9048652e-02, 4.9032527e-03, 1.3484785e-02, 1.6078722e-02,
       2.8263815e-02, 1.7701630e-02], dtype=float32)

In [100]:
analyzer.topic_preds[1].round(3)

array([0.003, 0.101, 0.008, 0.002, 0.   , 0.066, 0.   , 0.   , 0.254,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.008, 0.002, 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.001, 0.   , 0.   , 0.003, 0.001, 0.   ,
       0.002, 0.042, 0.   , 0.   , 0.   , 0.001, 0.016, 0.   , 0.   ,
       0.05 , 0.   , 0.11 , 0.322, 0.002, 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.001, 0.001, 0.   , 0.001], dtype=float32)

In [101]:
analyzer.compute_JS(index_A = [1, 0, 2, 55], index_B = [0, 1, 2, 3])

{'mean': 0.245307112418126,
 'lower': -0.5353696011568905,
 'upper': 1.0259838259931424}

In [102]:
analyzer.topic_use_RD_js()

0.02396601826013351

In [103]:
analyzer.analyze_subset(conditions={}, n=200)

{'n_records': 2347,
 'n_records_R': 1382,
 'n_records_D': 959,
 'n_nan_preds_R': 3,
 'n_nan_preds_D': 3,
 'js': {'mean': 0.8974244483532509,
  'lower': 0.8821201834503916,
  'upper': 0.9127287132561104},
 'js_R': {'mean': 0.8945284970134967,
  'lower': 0.8789208578950092,
  'upper': 0.9101361361319842},
 'js_D': {'mean': 0.8986361644720359,
  'lower': 0.8837740604689276,
  'upper': 0.9134982684751437},
 'js_RD': {'mean': 0.8976757319342795,
  'lower': 0.8822714814192671,
  'upper': 0.9130799824492912},
 'hh': {'mean': 0.3572431206703186,
  'lower': 0.32602880961325076,
  'upper': 0.3884574844029912},
 'hh_R': {'mean': 0.36144551634788513,
  'lower': 0.32829305081360816,
  'upper': 0.3945980301050154},
 'hh_D': {'mean': 0.36123907566070557,
  'lower': 0.32841593191950025,
  'upper': 0.3940622190293819},
 'hh_topic_use': 0.023090007,
 'hh_topic_use_R': 0.023104973,
 'hh_topic_use_D': 0.023600442,
 'js_topic_use': 0.02396601826013351}

In [105]:
analyzer.n_nan_preds()

6

Testing the RMN

In [106]:
data_df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
31709,66072621,KIRBY,WILLIAM,S,AR,M,D,clearly tile states of the union could not exi...,tax,66
24750,66075331,BRANDEGEE,FRANK,S,CT,M,R,to the unemployed in our country for years i ...,immigration,66
18135,66083070,GARNER,JOHN,H,TX,M,D,known in the american congress i that it is de...,foreign,66
17577,66079540,DUNBAR,JAMES,H,IN,M,R,present time are matters which the house put i...,foreign,66
21443,66073781,TOWNSEND,CHARLES,S,MI,M,R,president i offer the following resolution whi...,foreign,66
...,...,...,...,...,...,...,...,...,...,...
17678,66091060,CONNALLY,THOMAS,H,TX,M,D,did the court of clains find ihe amount ickey ...,foreign,66
27234,66075011,STERLING,THOMAS,S,SD,M,R,ratify the treaty of peace they have utterly f...,labor,66
18050,66075690,GOOD,JAMES,H,IA,M,R,discuss anything except the situation that has...,foreign,66
12495,66077040,GRAHAM,WILLIAM,H,IL,M,R,prices by bearing down entirely on the cost of...,economy,66


In [107]:
analyzer.df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
0,66072621,KIRBY,WILLIAM,S,AR,M,D,clearly tile states of the union could not exi...,tax,66
1,66075331,BRANDEGEE,FRANK,S,CT,M,R,to the unemployed in our country for years i ...,immigration,66
2,66083070,GARNER,JOHN,H,TX,M,D,known in the american congress i that it is de...,foreign,66
3,66079540,DUNBAR,JAMES,H,IN,M,R,present time are matters which the house put i...,foreign,66
4,66073781,TOWNSEND,CHARLES,S,MI,M,R,president i offer the following resolution whi...,foreign,66
...,...,...,...,...,...,...,...,...,...,...
2342,66091060,CONNALLY,THOMAS,H,TX,M,D,did the court of clains find ihe amount ickey ...,foreign,66
2343,66075011,STERLING,THOMAS,S,SD,M,R,ratify the treaty of peace they have utterly f...,labor,66
2344,66075690,GOOD,JAMES,H,IA,M,R,discuss anything except the situation that has...,foreign,66
2345,66077040,GRAHAM,WILLIAM,H,IL,M,R,prices by bearing down entirely on the cost of...,economy,66


In [17]:
i = 180

In [18]:
analyzer.df.loc[[i]]

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
180,66074370,GARD,WARREN,H,OH,M,D,president the bill under discussion seeks to p...,defense,66


In [19]:
analyzer.df.document.values[i]

'president the bill under discussion seeks to perpetuate one of the great government boards organized for wartime emergency it seeks to single out one great american industry to place restrictions on it and to give power to a cor poration which for more'

In [24]:
analyzer.primary_topics()

array([[21, 32, 15, 17, 23],
       [30, 11, 45, 17, 49],
       [37, 17, 10, 34, 14],
       ...,
       [20, 22, 44,  4,  9],
       [37, 17, 14, 21, 46],
       [ 5,  4, 28, 26, 44]])

In [27]:
a = [[1,2,3], [4,9,7],[8,0,5]]

In [29]:
a

[[1, 2, 3], [4, 9, 7], [8, 0, 5]]

In [37]:
np.flip(a, axis=-1)

array([[3, 2, 1],
       [7, 9, 4],
       [5, 0, 8]])

In [20]:
analyzer.rmn.inspect_topics(analyzer.primary_topics()[i])

  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))
  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))



Topic 34
['violate', 'applicable', 'precedent', 'violation', 'applies', 'limitation', 'laws', 'regard', 'unfair', 'certain']

Topic 21
['vote', 'votes', 'democrats', 'candidates', 'election', 'voters', 'candidate', 'democratic', 'republican', 'republicans']

Topic 25
['constitutional', 'amended', 'approved', 'court', 'decree', 'injunction', 'upheld', 'amendment', 'supreme', 'amendments']

Topic 29
['implementing', 'implement', 'implementation', 'blueprint', 'framework', 'proposals', 'compromise', 'agreement', 'accord', 'timetable']

Topic 37
['know', 'maybe', 'everybody', 'really', 'think', 'else', "'m", 'nobody', 'anybody', 'thing']


[[('violate', 0.711),
  ('applicable', 0.711),
  ('precedent', 0.71),
  ('violation', 0.709),
  ('applies', 0.704),
  ('limitation', 0.701),
  ('laws', 0.693),
  ('regard', 0.689),
  ('unfair', 0.686),
  ('certain', 0.685)],
 [('vote', 0.895),
  ('votes', 0.868),
  ('democrats', 0.86),
  ('candidates', 0.852),
  ('election', 0.847),
  ('voters', 0.84),
  ('candidate', 0.836),
  ('democratic', 0.833),
  ('republican', 0.812),
  ('republicans', 0.812)],
 [('constitutional', 0.765),
  ('amended', 0.759),
  ('approved', 0.758),
  ('court', 0.743),
  ('decree', 0.739),
  ('injunction', 0.738),
  ('upheld', 0.734),
  ('amendment', 0.731),
  ('supreme', 0.729),
  ('amendments', 0.725)],
 [('implementing', 0.807),
  ('implement', 0.805),
  ('implementation', 0.795),
  ('blueprint', 0.777),
  ('framework', 0.754),
  ('proposals', 0.742),
  ('compromise', 0.742),
  ('agreement', 0.733),
  ('accord', 0.726),
  ('timetable', 0.715)],
 [('know', 0.904),
  ('maybe', 0.903),
  ('everybody', 0.901),
 

In [106]:
row = analyzer.df.iloc[[1]]

In [126]:
row['document'] = ['covid 19 is a disease that spreads very easily obviously but we dont understand exactly how threfore we mustall behave as if we have it and we are trying to prevent passing it']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [127]:
row

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
1,66079101,SMOOT,REED,S,UT,M,R,covid 19 is a disease that spreads very easily...,abortion,66


In [128]:
row.document.values


array(['covid 19 is a disease that spreads very easily obviously but we dont understand exactly how threfore we mustall behave as if we have it and we are trying to prevent passing it'],
      dtype=object)

In [129]:
analyzer2 = RMN_Analyzer(rmn, row)

In [130]:
analyzer2.predict_topics()

In [131]:
analyzer2.primary_topics()

array([[37, 19, 17, 30, 10]])

In [132]:
analyzer2.rmn.inspect_topics(analyzer2.primary_topics()[0])


Topic 37
['know', 'maybe', 'everybody', 'really', 'think', 'else', 'nobody', 'anybody', 'thing', 'guess']

Topic 19
['cancer', 'disease', 'patients', 'diseases', 'infection', 'diabetes', 'treatment', 'illness', 'hiv', 'treatments']

Topic 17
['try', 'able', 'let', 'want', 'give', 'take', 'willing', 'make', 'intend', 'decide']

Topic 30
['families', 'parents', 'children', 'youngsters', 'kids', 'mothers', 'living', 'elderly', 'teenagers', 'unemployed']

Topic 10
['importance', 'challenges', 'understand', 'focus', 'situation', 'perspective', 'fundamental', 'understanding', 'important', 'progress']


[[('know', 0.904),
  ('maybe', 0.903),
  ('everybody', 0.902),
  ('really', 0.901),
  ('think', 0.901),
  ('else', 0.89),
  ('nobody', 0.884),
  ('anybody', 0.882),
  ('thing', 0.88),
  ('guess', 0.876)],
 [('cancer', 0.878),
  ('disease', 0.87),
  ('patients', 0.844),
  ('diseases', 0.832),
  ('infection', 0.82),
  ('diabetes', 0.812),
  ('treatment', 0.802),
  ('illness', 0.794),
  ('hiv', 0.781),
  ('treatments', 0.777)],
 [('try', 0.857),
  ('able', 0.802),
  ('let', 0.801),
  ('want', 0.801),
  ('give', 0.797),
  ('take', 0.791),
  ('willing', 0.769),
  ('make', 0.765),
  ('intend', 0.76),
  ('decide', 0.76)],
 [('families', 0.812),
  ('parents', 0.773),
  ('children', 0.77),
  ('youngsters', 0.766),
  ('kids', 0.748),
  ('mothers', 0.736),
  ('living', 0.73),
  ('elderly', 0.721),
  ('teenagers', 0.714),
  ('unemployed', 0.7)],
 [('importance', 0.764),
  ('challenges', 0.747),
  ('understand', 0.728),
  ('focus', 0.725),
  ('situation', 0.724),
  ('perspective', 0.721),
  ('funda