## Training an RMN

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import os

In [2]:
os.chdir("../../../scripts/assembly")
from session_speaker_assembly import *
from preprocess import *
from document import *
from constant import SPEECHES, SPEAKER_MAP, HB_PATH, EMBEDDINGS

In [358]:
session = 111
speak_map_cols = ['speakerid','chamber','state','gender','party']

speaker_map_df = pd.read_csv(os.path.join(HB_PATH,SPEAKER_MAP % session), sep = '|')[speak_map_cols]
speaker_map_df = speaker_map_df.groupby('speakerid').last().reset_index()
speaker_map_df

Unnamed: 0,speakerid,chamber,state,gender,party
0,111113931,S,IN,M,D
1,111113951,S,UT,M,R
2,111113981,S,MO,M,R
3,111114011,S,KS,M,R
4,111114021,S,KY,M,R
...,...,...,...,...,...
552,111121840,H,NV,M,R
553,111121930,H,IL,M,D
554,111121940,H,FL,M,D
555,111121950,H,AZ,F,D


In [359]:
subject_df = subject_docs(session = session, path = HB_PATH, subject = "health", min_len_tokens=100)
subject_df.head()

Unnamed: 0,speakerid,speech
0,111118060.0,pay their bills and keep their homes. small bu...
1,111120160.0,honest and fair prosperity for the many. not j...
2,111121410.0,rarely has our great Nation faced such grave c...
3,111120961.0,together. With the middle class struggling to ...
4,111114091.0,amount of pride in noting that in each of thes...


In [600]:
# megre speech and speaker metadata
session_df = subject_df.merge(speaker_map_df, how = 'inner', on = 'speakerid')

# ensure proper merge
assert(subject_df.shape[0]==session_df.shape[0])
assert(subject_df.shape[1] + len(speak_map_cols) - 1 == session_df.shape[1])

In [631]:
# subset data for prelim building
size = session_df.shape[0]
sample_df = session_df.iloc[:size,:]

# one-hot-encode speaker metadata
for col in speak_map_cols:
    sample_df = pd.concat([sample_df,pd.get_dummies(sample_df[col])], axis = 1)
    
sample_df

Unnamed: 0,speakerid,speech,chamber,state,gender,party,111113931.0,111113951.0,111113981.0,111114011.0,...,VT,WA,WI,WV,WY,F,M,D,I,R
0,111118060.0,pay their bills and keep their homes. small bu...,H,OH,M,R,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,111118060.0,of is contained in the bill. And we also belie...,H,OH,M,R,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,111118060.0,The bill is supposed to be about creating jobs...,H,OH,M,R,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,111118060.0,administration and enforcement team consisting...,H,OH,M,R,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,111118060.0,President and our Democrat colleagues here in ...,H,OH,M,R,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13297,111119610.0,Asset Relief Program. TARP. repeals TARP. repe...,H,GA,M,R,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
13298,111119610.0,on here how it was created. and it indicates n...,H,GA,M,R,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
13299,111119610.0,You wonder why this component would be in a he...,H,GA,M,R,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
13300,111119610.0,party. But. instead. we have proposed positive...,H,GA,M,R,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [632]:
sample_speakers = sample_df['speakerid'].unique()
print('speaker count:', len(sample_speakers))

speaker count: 536


There are a total of 535 Members of Congress. 100 serve in the U.S. Senate and 435 serve in the U.S. House of Representatives. A length of 50 suggests that nearly everyone commented on "health" (in a speech of more than 50 words) at some point.

In [633]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [634]:
tokenizer = Tokenizer()

In [635]:
tokenizer.fit_on_texts(session_df["speech"].values)

In [636]:
tokenizer.word_index

{'the': 1,
 'to': 2,
 'and': 3,
 'health': 4,
 'of': 5,
 'care': 6,
 'that': 7,
 'in': 8,
 'a': 9,
 'is': 10,
 'we': 11,
 'for': 12,
 'this': 13,
 'i': 14,
 'have': 15,
 'it': 16,
 'are': 17,
 'on': 18,
 'our': 19,
 'bill': 20,
 'as': 21,
 'they': 22,
 'with': 23,
 'will': 24,
 'their': 25,
 'about': 26,
 'not': 27,
 'be': 28,
 'you': 29,
 'people': 30,
 'reform': 31,
 'from': 32,
 'insurance': 33,
 'has': 34,
 'by': 35,
 'all': 36,
 'mr': 37,
 'but': 38,
 'who': 39,
 'what': 40,
 'my': 41,
 'was': 42,
 'going': 43,
 'do': 44,
 'speaker': 45,
 'would': 46,
 'an': 47,
 'so': 48,
 'more': 49,
 'at': 50,
 'president': 51,
 'or': 52,
 'there': 53,
 'one': 54,
 'american': 55,
 'been': 56,
 'if': 57,
 'when': 58,
 'americans': 59,
 'which': 60,
 'government': 61,
 'system': 62,
 'now': 63,
 'he': 64,
 'can': 65,
 'because': 66,
 'want': 67,
 'these': 68,
 'know': 69,
 'over': 70,
 'its': 71,
 'country': 72,
 'were': 73,
 'many': 74,
 'today': 75,
 'out': 76,
 'new': 77,
 'us': 78,
 'need': 

In [637]:
word_index = tokenizer.word_index
vocab_size = len(word_index)
vocab_size

17985

In [640]:
x_train = tokenizer.texts_to_sequences(sample_df['speech'].values)
len(x_train)

13302

In [641]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [642]:
max_len = WINDOW_DEFAULT + 1
x_train_padded = pad_sequences(x_train, maxlen=max_len, padding="post")

In [643]:
x_train_padded

array([[ 155,   25,  297, ...,    0,    0,    0],
       [   5,   10, 2508, ...,    0,    0,    0],
       [   1,   20,   10, ...,    0,    0,    0],
       ...,
       [  29, 1615,  183, ...,    0,    0,    0],
       [ 601,   38,  595, ...,    0,    0,    0],
       [ 444,    5,  131, ...,    0,    0,    0]], dtype=int32)

I think that the sentences need to be in integer-tokenized form.

From Iyyer et el.

"Each input to the RMN is a tuple that contains identifiers for a book and two character, as well as the spans corresponding to their relationship: $(b, c_1, c_2, S_{c_1,c_2})$. Given one such input, our objective is to reconstruct $S_(c_1,c_2)$ using a linear combination of relationship descriptors from R as shown in Figure 2; we now describe this process formally."


### Needs for Baseline goal

Let...
* $s_{v_t}$ be the $t_{th}$ span of text in the span set $S_{c_1,c_2}$
* $v_{s_t}$ be the vector that results from taking the element-wise average of the word vectors in $s_{v_t}$
* $C$ be the set metadata embeddings
* $m_{t,c}$ be the metadata embeddings vector for metadata $c$ with 
* $d$ be the dimension of the embedding
* $k$ be the number of decsriptors


Compute Sequence: Given $s_{v_t}$, do the following steps:
1. compute avg speech vector, $v_{s_t}$,
    * $v_{s_t} \in \mathbb{R}^{d}$
2. concat avg span and metadate embeddings
    * $ m_{t,c} \in \mathbb{R}^{d}$
    * [$v_{s_t}; m_{t,1};...; m_{t,|C|}$]
2. compute hidden state with Relu activation: 
    * $h_t =  relu \space (W_h \cdot [v_{s_t}; m_{t,1};...; m_{t,|C|}])$
    * $W_h \in \mathbb{R}^{d \times (d + d|C|)}$ 
    * $h_t \in  \mathbb{R}^{d}$
3. get distribution over topics using another hidden layer: 
    * $d_t = softmax \space (W_d \cdot h_t)$
    * $W_d \in  \mathbb{R}^{k \times d}$
    * $d_t \in  \mathbb{R}^{k}$
    * $d_{t,i} \in (0,1) \space \forall i$ 
4. recompose original sentence using the distribution over descriptors and the descriptor matrix:
    * $r_t = R^Td_t$
    * $R^T \in \mathbb{R}^{d \times k}$
    * $r_t \in \mathbb{R}^{d}$
5. score distance between $r_t$ and $v_{s_t}$
    * $distance = dist(r_t, v_{s_t})$
    
    
#### Notes on implementing it with keras
Every step that uses a matrix multiplication above can be implemented in keras using a dense layer, formatted like this:
* `h = keras.layers.Dense(units = a, input_shape = (b, ), activation= "the_activation")(prev_layer)`
    * This will make the dense layer use a weight matrix $W \in \mathbb{R}^{a \times b}$, and activation "`the_activation`"

In [644]:
# Imports
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Masking

The GloVe embeddings are on a local VM, and are not yet in `gs://rwc1/embeddings/`. Attemtps to access embeddings from the gcloud bucket had bugs. You can find the embeddings used [here](https://nlp.stanford.edu/projects/glove/), which are the Wikipedia + Gigaword 5 trained embeddings with 6 billion tokens.

In [647]:
k = 20
GLOVE_DIMS = [50, 100, 200, 300]
EMBEDDING_DIM = GLOVE_DIMS[0]

embeddings_index = {}
glove = open('../../../glove/glove.6B.%dd.txt' % EMBEDDING_DIM)
for line in glove:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except Exception as e:
        print(values[1:])
        raise
        
    embeddings_index[word] = coefs
glove.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [648]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [649]:
# average of spane embeddings
Vst_train = embedding_matrix[x_train_padded].mean(axis=1)
Vst_train.shape

(13302, 50)

In [665]:
# speaker metadata inputs

metadata_dict = {}

for col in speak_map_cols:
    df = sample_df[sample_df[col].unique()].values
    dim = df.shape[1]
    metadata_dict[col] = {'input': df, 'input_dim': dim}

metadata_dict.keys()

dict_keys(['speakerid', 'chamber', 'state', 'gender', 'party'])

In [681]:
# input avg span embeddings
vt = Input(shape=(EMBEDDING_DIM,), name='Avg.Span.Embed.Input')

# masking layer to account for padding
masking_layer = Masking(mask_value=0.0, input_shape = (EMBEDDING_DIM,), name = "Mask")(vt)

## initializing speaker metadata embeddings

input_layers = [vt]
embedding_layers = [masking_layer]
for col in speak_map_cols:
    input_layer = Input(shape=(metadata_dict[col]['input_dim'],), name= col + '.Embed.Input')
    embedding_init = (Dense(units = EMBEDDING_DIM,
                            kernel_initializer = 'glorot_normal',
                            input_shape = (metadata_dict[col]['input_dim'], ),
                            activation = "linear",
                            name = 'W_' + col)(input_layer))
    input_layers.append(input_layer)
    embedding_layers.append(embedding_init)

# concat speaker metadata embeddings
_ht = tf.keras.layers.concatenate(embedding_layers, axis=1, name = 'Concat.Layer')

# dense layer
ht = Dense(units = EMBEDDING_DIM, input_shape = (_ht.shape[1], ), activation = "relu", name = "Wh")(_ht)

# dense layer with softmax activation, (where previous states will eventually be inserted) 
dt = Dense(units = k, input_shape = (EMBEDDING_DIM, ), activation = "softmax", name = "Wd")(ht)

# reconstruction layer
rt = Dense(units = EMBEDDING_DIM, input_shape = (k, ), activation = "linear", name = "R")(dt)

In [682]:
#compile model
model = tf.keras.Model(inputs=input_layers, outputs=rt)
model.compile(optimizer = 'adam', loss="hinge")

In [683]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Avg.Span.Embed.Input (InputLaye (None, 50)           0                                            
__________________________________________________________________________________________________
speakerid.Embed.Input (InputLay (None, 536)          0                                            
__________________________________________________________________________________________________
chamber.Embed.Input (InputLayer (None, 2)            0                                            
__________________________________________________________________________________________________
state.Embed.Input (InputLayer)  (None, 56)           0                                            
__________________________________________________________________________________________________
gender.Emb

In [680]:
inputs = [Vst_train]
for key in metadata_dict.keys():
    inputs.append(metadata_dict[key]['input'])

model.fit(x=inputs, y=Vst_train, batch_size=25, epochs = 3)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fa98fa59250>

In [630]:
# for l in model.layers:
#     print(l)
#     print(50*"=")
#     print("input shape", l.input_shape)
#     print("output shape", l.output_shape)

In [628]:
model.predict(inputs)

array([[ 0.47553602,  0.41967615,  0.40148813, ...,  0.5893099 ,
        -0.39425093,  0.29394513],
       [ 0.47567827,  0.41992933,  0.40180397, ...,  0.5896522 ,
        -0.39440036,  0.29401407],
       [ 0.47560775,  0.4197976 ,  0.4016418 , ...,  0.5894731 ,
        -0.39432645,  0.29397035],
       ...,
       [ 0.47676247,  0.42206305,  0.4044199 , ...,  0.5924783 ,
        -0.3954901 ,  0.29482332],
       [ 0.47675496,  0.42203835,  0.40439522, ...,  0.59245217,
        -0.3954876 ,  0.29480982],
       [ 0.47676533,  0.42207053,  0.40442747, ...,  0.5924863 ,
        -0.3954913 ,  0.29482657]], dtype=float32)

In [629]:
Vst_train

array([[ 0.07116484,  0.00912573,  0.06147658, ...,  0.00436425,
        -0.02306813, -0.00891188],
       [ 0.0666854 ,  0.01963167, -0.00729108, ...,  0.03347551,
         0.00260285,  0.03218179],
       [ 0.05064938,  0.00156331,  0.00330125, ...,  0.03080652,
        -0.0103821 ,  0.021787  ],
       ...,
       [ 0.06979422,  0.00540201, -0.0045938 , ...,  0.01659401,
        -0.01673172, -0.02367188],
       [ 0.04147102,  0.011088  ,  0.00466978, ...,  0.00896139,
        -0.01126754,  0.00458058],
       [ 0.05942265,  0.01376962,  0.01775837, ...,  0.01814727,
        -0.01915259,  0.0160123 ]])