# RMN Baseline

Training an RMN on speeches from congressional sessions 105–111.

In [1]:
import os
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
os.chdir("../../../scripts/assembly/")

In [3]:
from preprocess import *
from document import *
from subject import subject_keywords
from constant import SPEECHES, SPEAKER_MAP, HB_PATH, EMBEDDINGS, DOC_PATH, DOCUMENT

In [4]:
os.chdir("../modeling")
from token_mapping import ohe_attributes, build_tokenizer_dict, build_metadata_dict

In [5]:
DOC_SAMPLE_PATH = os.path.join(DOC_PATH, "doc-sample/")
os.path.join(DOC_SAMPLE_PATH, DOCUMENT % "health")

'gs://rwc1/gen-data/doc/doc-sample/documents_health.txt'

In [6]:
docs_df = load_documents(subject_keywords.keys(), DOC_SAMPLE_PATH).sample(100)

In [7]:
docs_df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,congress,subject
73656,108120961.0,REID,HARRY,S,NV,M,D,selection shared responsibility president sena...,108,trade
131007,109117690.0,MCCARTHY,CAROLYN,H,NY,F,D,feel safe know terrorists strike one paying at...,109,foreign
24283,108118701.0,BOXER,BARBARA,S,CA,F,D,challenges face billion tons carbon dioxide ca...,108,women
71398,107119920.0,JACKSON LEE,SHEILA,H,TX,F,D,approach nations energy needs one bipartisan c...,107,foreign
123798,110120540.0,MILLER,CANDICE,H,MI,F,R,throughout career supporter workers rights bar...,110,labor
...,...,...,...,...,...,...,...,...,...,...
140393,109117680.0,MATHESON,JIM,H,UT,M,D,opportunity share remarks hr patient navigato...,109,defense
5046,105114080.0,DELAY,THOMAS,H,TX,M,R,governments relying sound science resisting me...,105,economy
145908,109116800.0,FRANK,BARNEY,H,MA,M,D,mars happen amendment defeated well voted way ...,109,foreign
121038,108113851.0,SMITH,GORDON,S,OR,M,R,bravery heroism polish citizens revolted bruta...,108,foreign


In [8]:
feature_columns = docs_df.columns.drop('document')

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def ohe_attributes(subject_df):
    """Append one-hot (dummy) columns for every non-document column.

    Parameters
    ----------
    subject_df : pandas.DataFrame
        Must contain the columns 'document', 'speakerid' and 'congress'.
        Every column other than 'document' is one-hot-encoded.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by the dummy columns of each attribute,
        in attribute order. Dummy columns are named after the attribute
        values (no prefix).

    Notes
    -----
    'speakerid' and 'congress' are cast to ``str`` on the frame that was
    passed in, so the caller's frame is modified as a side effect. This is
    kept from the original implementation.
    """
    # every column except the raw text is treated as a categorical attribute
    attributes = subject_df.columns.drop('document')

    # set id-like attributes to string so their dummy columns get string names
    subject_df['speakerid'] = subject_df['speakerid'].astype(str)
    subject_df['congress'] = subject_df['congress'].astype(str)

    # Encode each attribute from the original columns and concatenate once.
    # Concatenating inside the loop re-copied the growing frame on every
    # iteration and read attributes from a frame that already contained
    # dummy columns, which breaks when a dummy name equals an attribute name.
    dummies = [pd.get_dummies(subject_df[col]) for col in attributes]

    return pd.concat([subject_df] + dummies, axis = 1)


def build_metadata_dict(feature_columns, subject_df):
    """Collect the one-hot-encoded inputs for each requested metadata column.

    For every name in ``feature_columns`` the distinct values of that column
    are used as column labels into ``subject_df``; the selected columns are
    returned as an array together with their count.

    Returns a dict of the form
    ``{column: {'input': <2-D array>, 'input_dim': <number of selected columns>}}``.
    """

    def _one_hot_entry(column):
        # the distinct values of the attribute double as the labels of its dummy columns
        levels = subject_df[column].unique()
        one_hot = subject_df[levels].values
        return {'input': one_hot, 'input_dim': one_hot.shape[1]}

    return {column: _one_hot_entry(column) for column in feature_columns}




In [10]:
subject_df = ohe_attributes(docs_df)
subject_df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,congress,subject,...,guns,health,immigration,justice,labor,minorities,money,tax,trade,women
73656,108120961.0,REID,HARRY,S,NV,M,D,selection shared responsibility president sena...,108,trade,...,0,0,0,0,0,0,0,0,1,0
131007,109117690.0,MCCARTHY,CAROLYN,H,NY,F,D,feel safe know terrorists strike one paying at...,109,foreign,...,0,0,0,0,0,0,0,0,0,0
24283,108118701.0,BOXER,BARBARA,S,CA,F,D,challenges face billion tons carbon dioxide ca...,108,women,...,0,0,0,0,0,0,0,0,0,1
71398,107119920.0,JACKSON LEE,SHEILA,H,TX,F,D,approach nations energy needs one bipartisan c...,107,foreign,...,0,0,0,0,0,0,0,0,0,0
123798,110120540.0,MILLER,CANDICE,H,MI,F,R,throughout career supporter workers rights bar...,110,labor,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140393,109117680.0,MATHESON,JIM,H,UT,M,D,opportunity share remarks hr patient navigato...,109,defense,...,0,0,0,0,0,0,0,0,0,0
5046,105114080.0,DELAY,THOMAS,H,TX,M,R,governments relying sound science resisting me...,105,economy,...,0,0,0,0,0,0,0,0,0,0
145908,109116800.0,FRANK,BARNEY,H,MA,M,D,mars happen amendment defeated well voted way ...,109,foreign,...,0,0,0,0,0,0,0,0,0,0
121038,108113851.0,SMITH,GORDON,S,OR,M,R,bravery heroism polish citizens revolted bruta...,108,foreign,...,0,0,0,0,0,0,0,0,0,0


In [213]:
def build_tokenizer_dict(subject_df, max_len = None):
    """Fit a Keras tokenizer on the documents and return the encoded training data.

    Parameters
    ----------
    subject_df : pandas.DataFrame
        Frame with a 'document' column holding the text of each document.
    max_len : int, optional
        Length every encoded document is padded (or truncated) to.
        Defaults to the module-level ``MAX_SPAN_LENGTH``.

    Returns
    -------
    dict
        ``{'speech': {'tokenizer', 'train', 'train_padded', 'word_index'}}``
        where 'train' is the list of integer sequences and 'train_padded'
        is the same data padded with trailing zeros to ``max_len``.
    """
    if max_len is None:
        # looked up at call time, so the constant only has to exist when the
        # function is called, not when it is defined
        max_len = MAX_SPAN_LENGTH

    documents = subject_df['document'].values

    # building tokenizer, word indices, and train data
    speech_tokenizer = Tokenizer()
    speech_tokenizer.fit_on_texts(documents)
    speeches_word_index = speech_tokenizer.word_index
    speeches_train = speech_tokenizer.texts_to_sequences(documents)

    # zeros are appended after the tokens; the truncation side is left at the
    # pad_sequences default
    speeches_train_padded = pad_sequences(speeches_train, maxlen=max_len, padding="post")

    tokenizers = {}
    tokenizers['speech'] = {'tokenizer': speech_tokenizer,
                            'train': speeches_train,
                            'train_padded': speeches_train_padded,
                            'word_index': speeches_word_index}

    return tokenizers

In [214]:
token_dict = build_tokenizer_dict(subject_df)

In [215]:
speeches_word_index = token_dict['speech']['word_index']
vocab_size = len(speeches_word_index)
vocab_size

1636

In [216]:
speeches_train = token_dict['speech']['train']
len(speeches_train)

100

In [217]:
token_dict['speech']['tokenizer']

<keras_preprocessing.text.Tokenizer at 0x7faeecf09b90>

In [218]:
token_dict['speech']['tokenizer'].texts_to_sequences(["hello my names is rowan"])

[[978]]

In [219]:
speeches_train_padded = token_dict['speech']['train_padded']
speeches_train_padded

array([[ 594,  595,  116, ...,    0,    0,    0],
       [ 173,  307,   12, ...,    0,    0,    0],
       [ 616,  120,   13, ...,    0,    0,    0],
       ...,
       [ 588,  244,  319, ...,    0,    0,    0],
       [1609, 1610,  590, ...,    0,    0,    0],
       [   5,  109, 1628, ...,    0,    0,    0]], dtype=int32)

In [197]:
subject_df['speakerid'].astype(float)

73656     108120961.0
131007    109117690.0
24283     108118701.0
71398     107119920.0
123798    110120540.0
             ...     
140393    109117680.0
5046      105114080.0
145908    109116800.0
121038    108113851.0
65475     111117870.0
Name: speakerid, Length: 100, dtype: float64

In [198]:
speaker_tokenizer = Tokenizer()
speaker_tokenizer.fit_on_texts(subject_df['speakerid'].astype(float))
speaker_train = speaker_tokenizer.texts_to_sequences(subject_df['speakerid'])

AttributeError: 'float' object has no attribute 'lower'

In [199]:
speaker_train = np.array(speaker)

NameError: name 'speaker' is not defined

In [158]:
subject_df['speakerid']

73656     108120961.0
131007    109117690.0
24283     108118701.0
71398     107119920.0
123798    110120540.0
             ...     
140393    109117680.0
5046      105114080.0
145908    109116800.0
121038    108113851.0
65475     111117870.0
Name: speakerid, Length: 100, dtype: object

In [161]:
#speaker_train

In [162]:
# speaker_tokenizer.word_index

In [16]:
# Imports
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Masking

In [17]:
os.chdir("../modeling")
os.listdir(os.getcwd())

['embeddings.py',
 'rmn.py',
 'token_mapping.py',
 '__init__.py',
 '__pycache__',
 'orthoganlity_constraint.py']

In [19]:
# run this cell two or three times for some reason
os.chdir("../modeling")
from embeddings import *
from orthoganlity_constraint import Orthoganal
from rmn import RMN

In [20]:
# build embedding matrix # takes about 5 minutes
embeddings_index = fetch_embeddings()
embeddings_matrix = build_embedding_matrix(speeches_word_index, embeddings_index)

In [21]:
embeddings_matrix.shape[0]

1637

In [263]:
embeddings_matrix[speeches_train_padded[0]].mean(axis=1)

array([-0.01791566,  0.08316332, -0.01519588, -0.1933912 , -0.19751326,
        0.01797121,  0.20103093, -0.14794472, -0.01518396, -0.12063314,
        0.01714208,  0.00101226, -0.21665704,  0.05622975,  0.10482712,
        0.09804558,  0.07224398,  0.11483998, -0.10565284, -0.11654144,
       -0.06322335,  0.11483998, -0.06322335, -0.16331345, -0.16331345,
       -0.06347832,  0.05172918,  0.04884605, -0.05167143, -0.16331345,
        0.05172918,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

In [22]:
# average of span embeddings (mean over the padded token axis)
Vst_train = embeddings_matrix[speeches_train_padded].mean(axis=1)
Vst_train.shape

(100, 50)

In [22]:
type(embeddings_index['the'][0])

float

In [23]:
cols = ['speakerid', 'chamber', 'state', 'gender', 'party', 'congress', 'subject']
metadata_dict = build_metadata_dict(["speakerid"], subject_df)
metadata_dict.keys()

dict_keys(['speakerid'])

In [31]:
print(2)

2


In [165]:
from gensim.corpora import Dictionary

In [166]:
docs_df["speakerid"]

73656     108120961.0
131007    109117690.0
24283     108118701.0
71398     107119920.0
123798    110120540.0
             ...     
140393    109117680.0
5046      105114080.0
145908    109116800.0
121038    108113851.0
65475     111117870.0
Name: speakerid, Length: 100, dtype: object

In [167]:
speaker_dct = Dictionary([docs_df["speakerid"]])

In [200]:
speaker_train = [[speaker_dct.token2id[s]] for s in docs_df["speakerid"]]

In [201]:
speaker_train

[[58],
 [66],
 [56],
 [42],
 [85],
 [74],
 [79],
 [77],
 [57],
 [18],
 [53],
 [23],
 [69],
 [86],
 [62],
 [28],
 [91],
 [64],
 [22],
 [51],
 [43],
 [30],
 [14],
 [70],
 [16],
 [2],
 [83],
 [89],
 [25],
 [44],
 [72],
 [3],
 [95],
 [88],
 [1],
 [60],
 [94],
 [37],
 [7],
 [50],
 [31],
 [87],
 [24],
 [87],
 [82],
 [35],
 [55],
 [40],
 [19],
 [39],
 [13],
 [36],
 [45],
 [48],
 [5],
 [9],
 [33],
 [21],
 [61],
 [0],
 [80],
 [49],
 [17],
 [59],
 [6],
 [81],
 [68],
 [52],
 [54],
 [92],
 [97],
 [15],
 [32],
 [8],
 [76],
 [10],
 [12],
 [46],
 [75],
 [73],
 [27],
 [20],
 [11],
 [61],
 [96],
 [84],
 [26],
 [93],
 [29],
 [67],
 [34],
 [71],
 [41],
 [38],
 [78],
 [65],
 [4],
 [63],
 [47],
 [90]]

In [27]:
metadata_dict['speakerid']['input'].shape

(100, 98)

In [37]:
docs_df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,congress,subject
728,105117660.0,KINGSTON,JACK,H,GA,M,R,want make sure understand talking child child ...,105,tax
51361,109114430.0,RYUN,JIM,H,KS,M,R,uninsured united states number grown since ei...,109,health
26040,105112580.0,LIVINGSTON,ROBERT,H,LA,M,R,fact appreciate comments talking bill bill bit...,105,foreign
42768,106113210.0,MORELLA,CONSTANCE,H,MD,F,R,impaired health antisocial behavior floor ofte...,106,government
3190,107118111.0,CANTWELL,MARIA,S,WA,F,D,commitment legislation together also appreciat...,107,mail
...,...,...,...,...,...,...,...,...,...,...
90678,108119920.0,JACKSON LEE,SHEILA,H,TX,F,D,human intelligence problem intelligence system...,108,trade
31359,106112451.0,FITZGERALD,PETER,S,IL,M,R,something million project back million million...,106,tax
133626,111113981.0,BOND,CHRISTOPHER,S,MO,M,R,expected get something else working found expe...,111,labor
33413,110117710.0,MCKEON,HOWARD,H,CA,M,R,entitlement spending expense lowand middleinco...,110,crime


In [289]:
embeddings_matrix.shape

(1637, 50)

In [353]:
import tensorflow as tf
import tensorflow.keras.backend as K

from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Reshape
from embeddings import EMBEDDING_DIM


# constants
MAX_SPAN_LENGTH = 50
NUM_TOPICS = 20
OPTIMIZER = 'adam'


class RMN(object):
    """Builds and compiles the Keras model used for the RMN baseline.

    The model takes a span embedding plus one integer id per metadata column,
    embeds the ids, concatenates everything, and passes it through
    Dense("Wh", relu) -> Dense("Wd", softmax) -> Dense("R", linear) to
    reconstruct a vector of size ``embedding_dim``.
    """

    def __init__(
        self,
        embedding_dim = EMBEDDING_DIM,
        num_topics = NUM_TOPICS):
        """Store the layer sizes; the model itself is created by build_model.

        embedding_dim -- size of the span / metadata embeddings and of the output
        num_topics    -- number of units of the softmax "Wd" layer
        """
        self.embedding_dim = embedding_dim
        self.num_topics = num_topics
        self.model = None  # set by build_model

    def model_loss(self, rt_layer, embedding_layer, lamb = 1.0):
        """Return a Keras loss: hinge loss plus a weighted orthogonality penalty.

        rt_layer        -- tensor whose transpose R is used in the penalty
                           ||R R^T - I|| (I is embedding_dim x embedding_dim)
        embedding_layer -- currently unused; kept for interface compatibility
        lamb            -- weight of the orthogonality penalty
        """
        # NOTE(review): build_model passes the output tensor of the "R" layer
        # here, so the penalty is computed on that tensor rather than on the
        # layer's kernel -- confirm this is the intended penalty target.
        R = K.transpose(rt_layer)

        def custom_loss(y_true, y_pred):
            hinge_loss = tf.keras.losses.hinge(y_true, y_pred)

            # square root of the summed squared entries of (R R^T - I)
            RR_t = K.dot(R, K.transpose(R))
            Id_mat = K.eye(self.embedding_dim)
            orth_penalty = K.sqrt(K.sum(K.square(RR_t - Id_mat)))

            return hinge_loss + lamb*orth_penalty

        return custom_loss

    def build_model(self, metadata_dict):
        """Create, compile, store and return the model.

        metadata_dict -- mapping of metadata column name to a dict that
                         provides 'input_dim' (vocabulary size of that
                         column's Embedding layer).

        The model inputs are ordered as: the span embedding first, followed
        by one ``(1,)``-shaped id input per key of ``metadata_dict`` in
        iteration order.
        """
        # The span input is concatenated directly with the metadata
        # embeddings, so it is sized by the embedding dimension. It was
        # previously sized with MAX_SPAN_LENGTH, which only worked because
        # the two values were equal.
        vt = Input(shape=(self.embedding_dim,), name='Span.Input')

        input_layers = [vt]
        embedding_layers = [vt]
        for col in metadata_dict.keys():
            # embedding layer for this metadata column
            input_layer = Input(shape=(1,), name= col + '.Input')
            embedding_init = Embedding(
                input_dim = metadata_dict[col]['input_dim'],
                output_dim = self.embedding_dim,
                input_length = 1)(input_layer)

            # drop the length-1 sequence axis: (batch, 1, dim) -> (batch, dim)
            embedding_layer = Reshape((self.embedding_dim,), name=col + '.Embed.Layer')(embedding_init)

            input_layers.append(input_layer)
            embedding_layers.append(embedding_layer)

        # Concatenate the span embedding and the metadata embeddings along the
        # feature axis. axis=0 stacked them along the batch axis, which made
        # the output batch larger than the target batch.
        _ht = tf.keras.layers.Concatenate(axis=-1, name = 'Concat.Layer')(embedding_layers)

        # dense layer
        ht = Dense(units = self.embedding_dim,
                   input_shape = (_ht.shape[1], ),
                   activation = "relu", name = "Wh")(_ht)

        # dense layer with softmax activation, (where previous states will eventually be inserted)
        dt = Dense(units = self.num_topics,
                   input_shape = (self.embedding_dim, ),
                   activation = "softmax", name = "Wd")(ht)

        # reconstruction layer
        rt = Dense(units = self.embedding_dim,
                   input_shape = (self.num_topics, ),
                   activation = "linear",
                   # kernel_regularizer = Orthoganal(),
                   name = "R")(dt)

        model = tf.keras.Model(inputs=input_layers, outputs=rt)
        # The second argument of model_loss is unused; pass the span input
        # instead of the previously undefined name `st_embedded`.
        model.compile(optimizer = OPTIMIZER, loss = self.model_loss(rt, vt))

        self.model = model
        return model

In [354]:
np.random.seed(565)
rmn = RMN()
rmn.build_model(metadata_dict)
rmn.model.summary()

Model: "model_46"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
speakerid.Input (InputLayer)    [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_131 (Embedding)       (None, 1, 50)        4900        speakerid.Input[0][0]            
__________________________________________________________________________________________________
Span.Input (InputLayer)         [(None, 50)]         0                                            
__________________________________________________________________________________________________
speakerid.Embed.Layer (Reshape) (None, 50)           0           embedding_131[0][0]              
___________________________________________________________________________________________

In [355]:
# Inputs follow the order used by RMN.build_model: span embeddings first,
# then the speaker ids as an (n_samples, 1) integer array.
inputs = [Vst_train, np.array(speaker_train)]
# Train the model held by `rmn`; a bare `model` is not defined by any cell above.
rmn.model.fit(x = inputs, y = Vst_train, batch_size = 1, epochs = 10)

Train on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7faf10013b10>

In [339]:
E  = K.transpose(st_embedded)

In [341]:
E[2::]

ValueError: Cannot evaluate tensor using `eval()`: No default session is registered. Use `with sess.as_default()` or pass an explicit session to `eval(session=sess)`

In [337]:
kvar = K.variable(np.array([13,5,6]), dtype='int8')

In [332]:
kvar

<tf.Variable 'Variable_4:0' shape=(3,) dtype=float32>

In [322]:
K.transpose(rt)

<tf.Tensor 'transpose_24:0' shape=(50, ?) dtype=float32>

In [321]:
embeddings_matrix.shape

(1637, 50)