# RMN Baseline

Training an RMN on documents from Congresses 105–111.

In [1]:
import os
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# Move into the assembly scripts directory; presumably this is what lets the
# project-local modules imported in the next cell resolve — TODO confirm.
# NOTE(review): the path is relative, so the result depends on the directory
# the kernel started in, and re-running this cell moves further up the tree.
os.chdir("../../../scripts/assembly/")

In [3]:
from preprocess import *
from document import *
from subject import subject_keywords
from constant import SPEECHES, SPEAKER_MAP, HB_PATH, EMBEDDINGS, DOC_PATH, DOCUMENT

In [4]:
# Switch to the sibling modeling directory before importing token_mapping.
# This is a relative chdir, so it depends on the directory set by the earlier cell.
os.chdir("../modeling")
# NOTE(review): the star import hides which names token_mapping contributes.
from token_mapping import *

In [5]:
# The "doc-sample/" sub-directory of DOC_PATH; passed to load_documents below.
DOC_SAMPLE_PATH = os.path.join(DOC_PATH, "doc-sample/")

In [58]:
# Load documents from DOC_SAMPLE_PATH, passing the keys of subject_keywords
# (presumably the subject names — confirm against load_documents).
docs_df = load_documents(subject_keywords.keys(), DOC_SAMPLE_PATH)

In [60]:
# Display the loaded frame (pandas elides the middle rows of a long frame).
docs_df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,congress,subject
12793,105115670,HOEKSTRA,PETER,H,MI,M,R,effort need learn process need share learning ...,105,foreign
106939,110117200,WOOLSEY,LYNN,H,CA,F,D,live fear afraid go market send children schoo...,110,labor
88259,109115190,SHAYS,CHRISTOPHER,H,CT,M,R,title x kept inflation since additional incre...,109,labor
63461,107112870,CALLAHAN,H.,H,AL,M,R,watching american people think dumb cannot see...,107,government
87748,111118101,BROWN,SHERROD,S,OH,M,D,school college graduates men women returning s...,111,health
...,...,...,...,...,...,...,...,...,...,...
147953,111121510,THOMPSON,MIKE,H,CA,M,D,later opened private practice served first com...,111,labor
65435,107119030,CONYERS,JOHN,H,MI,M,D,greatest unpopular group moment happens subjec...,107,government
38588,106116380,FILNER,BOB,H,CA,M,D,united states democracy exercised right floor ...,106,defense
62860,110113851,SMITH,GORDON,S,OR,M,R,father braxton says shouted attackers ran four...,110,justice


In [61]:
# Down-sample the corpus to at most N documents.
# - random_state pins the draw, so the sample (and everything derived from it)
#   is the same after a kernel restart.
# - min() avoids the ValueError DataFrame.sample raises when asked for more
#   rows than the frame holds (sampling is without replacement by default).
N = 100000
SAMPLE_SEED = 565  # same value used to seed NumPy before the model is built
docs_df = docs_df.sample(n=min(N, len(docs_df)), random_state=SAMPLE_SEED)

In [62]:
# Inspect column dtypes; every column (including congress) is loaded as object.
docs_df.dtypes

speakerid    object
lastname     object
firstname    object
chamber      object
state        object
gender       object
party        object
document     object
congress     object
subject      object
dtype: object

In [63]:
# Metadata columns of docs_df that are handed to the tokenizer/metadata
# builders below. The free-text "document" column is deliberately not listed.
feature_columns = [
    "speakerid",
    "chamber",
    "state",
    "gender",
    "party",
    "congress",
    "subject",
]

In [64]:
# Tokenize the frame. Later cells read token_dict[<column>]['tokenized'] for
# each feature column, plus token_dict['document']['tokenized'] and
# token_dict['document']['token_index'] for the text itself.
token_dict = build_tokenizer_dict(docs_df, feature_columns)

In [65]:
# Build the per-feature metadata description that RMN.build_model consumes;
# its keys are the feature column names, as the displayed output shows.
metadata_dict = build_metadata_dict(docs_df, feature_columns)
metadata_dict.keys()

dict_keys(['speakerid', 'chamber', 'state', 'gender', 'party', 'congress', 'subject'])

In [18]:
# Make the modeling scripts directory the working directory, then import the
# embedding helpers and the RMN class from it.
os.chdir("../modeling")
# NOTE(review): this is not the last expression in the cell, so the listing is
# computed but never displayed.
os.listdir(os.getcwd())

from embeddings import *
# from orthoganlity_constraint import Orthoganal
from rmn import RMN

In [66]:
import pickle

# Pickled embedding index that the embedding matrix is built from.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# from a trusted location.
embedding_file = "/home/rocassius/gen-data/tools/embbedding_index_50d"

# The context manager closes the handle even if unpickling raises.
with open(embedding_file, "rb") as f:
    embeddings_index = pickle.load(f)

# Build the matrix for the document vocabulary and store it as float16.
embeddings_matrix = build_embedding_matrix(token_dict['document']['token_index'], embeddings_index).astype('float16')

In [67]:
# Average of the span embeddings: look up the embedding row for every token id
# of each document, then take the mean over axis 1 to get one vector per document.
span_token_ids = token_dict['document']['tokenized']
Vst_train = embeddings_matrix[span_token_ids].mean(axis=1)

In [68]:
# Sanity check: one averaged embedding vector per sampled document.
Vst_train.shape

(100000, 50)

In [69]:
# Confirm the averaged embeddings are a plain NumPy array.
type(Vst_train)

numpy.ndarray

In [70]:
# Model inputs: the averaged span embedding first, followed by one array of
# tokenized values per metadata feature, in metadata_dict key order.
metadata_inputs = [np.array(token_dict[name]['tokenized']) for name in metadata_dict]
inputs = [Vst_train, *metadata_inputs]

In [71]:
import tensorflow as tf

# Seed NumPy and TensorFlow before the model is built. The NumPy seed alone
# does not control TensorFlow's own random ops, so without the second call the
# run is not repeatable.
# NOTE(review): the compat.v1 spelling is used because it exists in both late
# TF 1.x and TF 2.x — confirm against the installed version. GPU kernels may
# still introduce nondeterminism.
SEED = 565
np.random.seed(SEED)
tf.compat.v1.set_random_seed(SEED)

# Instantiate the RMN, build its Keras model from the metadata description,
# and print the layer summary.
rmn = RMN()
rmn.build_model(metadata_dict)
rmn.model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
speakerid.Input (InputLayer)    [(None, 1)]          0                                            
__________________________________________________________________________________________________
chamber.Input (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
state.Input (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender.Input (InputLayer)       [(None, 1)]          0                                            
____________________________________________________________________________________________

In [72]:
# Train the model. The averaged span embedding is both the first input and
# the target (y), so the network learns to reproduce it.
rmn.model.fit(
    x=inputs,
    y=Vst_train,
    batch_size=10,
    epochs=5,
)

Train on 100000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f3e41eb1a10>

In [88]:
# Model output for the training inputs (the array repr is displayed as-is).
rmn.model.predict(inputs)

array([[-2.4160594e-03, -1.4025476e-02, -7.4551947e-02, ...,
        -1.7215486e-03, -2.7226191e-03, -6.8040267e-02],
       [-1.7546168e-01, -6.0355365e-03, -9.1219349e-03, ...,
         1.1009477e-02, -4.8256375e-02,  4.9715247e-02],
       [ 1.1811171e-01,  2.2775386e-01,  1.0932796e-03, ...,
        -7.1774743e-02, -3.3051699e-02,  4.6352688e-02],
       ...,
       [-5.1692128e-02, -2.4251862e-01,  3.9374605e-03, ...,
        -2.8007929e-03, -1.7776385e-02,  1.4029663e-02],
       [-1.6459972e-02,  6.4563848e-02,  2.7467124e-04, ...,
        -1.2140328e-02,  2.4235472e-03,  1.1764930e-01],
       [ 1.1380946e-02,  1.1761386e-02, -1.0027994e-02, ...,
         2.9696906e-01,  1.4413607e-01,  2.2544074e-01]], dtype=float32)

In [73]:
#from keras.models import load_model
from tensorflow.keras.models import load_model
# NOTE(review): absolute, user-specific path; consider a configurable model directory.
rmn.model.save('/home/rocassius/gen-data/models/baseline_rmn.h5')  # writes the model to the HDF5 file 'baseline_rmn.h5'


In [83]:
# Write the model architecture as JSON next to the saved weights file.
# The context manager guarantees the handle is flushed and closed even if
# to_json() or write() raises.
with open('/home/rocassius/gen-data/models/baseline_rmn_arch.txt', "w") as arch_file:
    arch_file.write(rmn.model.to_json())

In [None]:
# Display the underlying Keras model object held by the RMN wrapper.
rmn.model

In [74]:
from tensorflow.keras.models import load_model, model_from_json

In [86]:
# Rebuild the model from the saved JSON architecture, then load the trained
# weights from the HDF5 file written earlier. The context manager closes the
# architecture file even if model_from_json raises.
with open('/home/rocassius/gen-data/models/baseline_rmn_arch.txt', "r") as arch_file:
    m = model_from_json(arch_file.read())
m.load_weights('/home/rocassius/gen-data/models/baseline_rmn.h5')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


[<tf.Tensor 'Span.Input_5:0' shape=(?, 50) dtype=float32>,
 <tf.Tensor 'speakerid.Input_5:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'chamber.Input_1:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'state.Input_1:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'gender.Input_1:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'party.Input_1:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'congress.Input_1:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'subject.Input_1:0' shape=(?, 1) dtype=float32>]

In [95]:
import tensorflow as tf
import tensorflow.keras.backend as K

from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Reshape
from embeddings import EMBEDDING_DIM

In [97]:
# Sub-model that shares the RMN's inputs but stops at the layer named 'Wd',
# so predict() returns that layer's output instead of the final output.
layer_name = 'Wd'
wd_layer = rmn.model.get_layer(layer_name)
intermediate_layer_model = tf.keras.Model(
    inputs=rmn.model.input,
    outputs=wd_layer.output,
)


In [99]:
# Run the training inputs through the sub-model to get the 'Wd' layer output
# for every document.
dts = intermediate_layer_model.predict(inputs)

In [100]:
# One row per input document; the second dimension is the width of the 'Wd'
# layer output.
dts.shape

(100000, 20)