## Training a an RMN

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import os

In [2]:
os.chdir("../../../scripts/assembly")
from session_speaker_assembly import *
from preprocess import *
from document import *
from constant import SPEECHES, SPEAKER_MAP

In [3]:
df = subject_docs(session = 111, subject = "health", min_length=100)

In [4]:
df

Unnamed: 0,speech_id,speech,speakerid
12,1110000013,"[today, begin, new, congress, great, time, cha...",111118060.0
15,1110000016,"[retirement, savings, homes, facing, foreclosu...",111120160.0
44,1110000045,"[statement, yielding, time, present, opening, ...",111121410.0
69,1110000070,"[back, floor, forthwith, vote, send, senate, s...",111121410.0
199,1110000200,"[troops, fighting, two, wars, overseas, togeth...",111120961.0
...,...,...,...
179127,1110179128,"[shined, aftermath, doubt, one, difficult, day...",111117690.0
179130,1110179131,"[madam, speaker, rise, strong, support, james,...",111120130.0
179131,1110179132,"[thing, support, heroes, number, agreement, to...",111120740.0
179160,1110179161,"[government, discrimination, retaliation, fede...",111120190.0


In [5]:
speaker_speeches = df.groupby("speakerid")

In [6]:
speaker_keys = list(speaker_speeches.groups.keys())

In [7]:
speaker_keys[:10]

[111113931.0,
 111113951.0,
 111113981.0,
 111114011.0,
 111114021.0,
 111114091.0,
 111114101.0,
 111114121.0,
 111114171.0,
 111114321.0]

In [8]:
len(speaker_keys)

530

There are a total of 535 Members of Congress. 100 serve in the U.S. Senate and 435 serve in the U.S. House of Representatives. A length of 50 suggests that nearly everyone commented on "health" (in a speech of more than 50 words) at some point.

In [67]:
from keras.preprocessing.text import Tokenizer

In [111]:
tokenizer = Tokenizer()

In [112]:
tokenizer.fit_on_texts(df["speech"].values)

In [113]:
tokenizer.word_index

{'health': 1,
 'number': 2,
 'care': 3,
 'bill': 4,
 'people': 5,
 'mr': 6,
 'would': 7,
 'president': 8,
 'insurance': 9,
 'going': 10,
 'reform': 11,
 'one': 12,
 'speaker': 13,
 'us': 14,
 'american': 15,
 'today': 16,
 'government': 17,
 'want': 18,
 'americans': 19,
 'know': 20,
 'many': 21,
 'country': 22,
 'time': 23,
 'legislation': 24,
 'new': 25,
 'years': 26,
 'also': 27,
 'get': 28,
 'think': 29,
 'senator': 30,
 'need': 31,
 'act': 32,
 'congress': 33,
 'system': 34,
 'year': 35,
 'percent': 36,
 'work': 37,
 'support': 38,
 'jobs': 39,
 'make': 40,
 'important': 41,
 'public': 42,
 'medicare': 43,
 'like': 44,
 'well': 45,
 'million': 46,
 'national': 47,
 'every': 48,
 'states': 49,
 'families': 50,
 'program': 51,
 'dont': 52,
 'costs': 53,
 'say': 54,
 'billion': 55,
 'education': 56,
 'house': 57,
 'senate': 58,
 'services': 59,
 'state': 60,
 'said': 61,
 'last': 62,
 'small': 63,
 'federal': 64,
 'cost': 65,
 'children': 66,
 'committee': 67,
 'take': 68,
 'help': 6

In [115]:
vocab_size = len(tokenizer.word_index)
vocab_size

23838

In [118]:
x_train = tokenizer.texts_to_sequences(speaker_speeches.get_group(speaker_keys[0]).speech.values)

In [119]:
x_train

[[334,
  7805,
  7662,
  3065,
  5281,
  466,
  426,
  1488,
  548,
  232,
  14,
  2719,
  479,
  15710,
  5957,
  606,
  4776,
  625,
  7939,
  4575,
  334,
  888,
  606,
  15711,
  276,
  1,
  276,
  11232,
  5161,
  259,
  638,
  1015,
  1425,
  961,
  293,
  2,
  5492,
  21,
  6522,
  158,
  741,
  7023,
  534,
  15712,
  11233,
  5492,
  692,
  186,
  3717,
  7026],
 [125,
  153,
  14,
  981,
  103,
  73,
  30,
  2218,
  5036,
  4465,
  223,
  99,
  273,
  20,
  694,
  549,
  557,
  273,
  45,
  30,
  577,
  2814,
  77,
  904,
  2093,
  1,
  3,
  461,
  99,
  16,
  266,
  624,
  84,
  632,
  1189,
  1962,
  1524,
  2,
  143,
  1757,
  59,
  67,
  1820,
  85,
  38,
  549,
  30,
  577,
  2926,
  259],
 [89,
  8,
  266,
  616,
  19947,
  275,
  226,
  1892,
  5457,
  332,
  208,
  2229,
  3623,
  810,
  63,
  94,
  142,
  1892,
  142,
  22,
  18,
  11,
  1,
  3,
  34,
  20,
  293,
  375,
  63,
  94,
  160,
  25,
  39,
  974,
  3,
  116,
  706,
  31,
  3,
  1,
  9,
  53,
  905,
  94,


In [120]:
from keras.preprocessing.sequence import pad_sequences

In [125]:
max_len = WINDOW_DEFAULT + 1
x_train_padded = pad_sequences(x_train, maxlen=max_len, padding="post")

In [126]:
x_train_padded

array([[  334,  7805,  7662,  3065,  5281,   466,   426,  1488,   548,
          232,    14,  2719,   479, 15710,  5957,   606,  4776,   625,
         7939,  4575,   334,   888,   606, 15711,   276,     1,   276,
        11232,  5161,   259,   638,  1015,  1425,   961,   293,     2,
         5492,    21,  6522,   158,   741,  7023,   534, 15712, 11233,
         5492,   692,   186,  3717,  7026,     0],
       [  125,   153,    14,   981,   103,    73,    30,  2218,  5036,
         4465,   223,    99,   273,    20,   694,   549,   557,   273,
           45,    30,   577,  2814,    77,   904,  2093,     1,     3,
          461,    99,    16,   266,   624,    84,   632,  1189,  1962,
         1524,     2,   143,  1757,    59,    67,  1820,    85,    38,
          549,    30,   577,  2926,   259,     0],
       [   89,     8,   266,   616, 19947,   275,   226,  1892,  5457,
          332,   208,  2229,  3623,   810,    63,    94,   142,  1892,
          142,    22,    18,    11,     1,    

I think that the sentences need to be in integer-tokenized form.

From Iyyer et el.

"Each input to the RMN is a tuple that contains identifiers for a book and two character, as well as the spans corresponding to their relationship: $(b, c_1, c_2, S_{c_1,c_2})$. Given one such input, our objective istoreconstruct S_(c1,c2) using alinear combination of relationship descriptors from R as shown in Figure 2; we now describe this process formally."


### Needs for Baseline goal

Let...
* $s_{v_t}$ be the $t_{th}$ span of text in the span set $S_{c_1,c_2}$
* $v_{s_t}$ be the vector that results from taking the element-wise average of the word vectors in $s_{v_t}$
* $d$ be the dimension of the embedding
* $k$ be the number of decsriptors


Compute Sequence: Given $s_{v_t}$, do the following steps:
1. compute avg speech vector, $v_{s_t}$,
    * $v_{s_t} \in \mathbb{R}^{d}$
2. compute hidden state with Relu activation: 
    * $h_t =  relu \space (W_h \cdot v_{s_t})$
    * $W_h \in \mathbb{R}^{d \times d}$ 
    * $h_t \in  \mathbb{R}^{d}$
3. get distribution over topics using another hidden layer: 
    * $d_t = softmax \space (W_d \cdot h_t)$
    * $W_d \in  \mathbb{R}^{k \times d}$
    * $d_t \in  \mathbb{R}^{k}$
    * $d_{t,i} \in (0,1) \space \forall i$ 
4. recompose original sentence using the distribution over descriptors and the descriptor matrix:
    * $r_t = R^Td_t$
    * $R^T \in \mathbb{R}^{d \times k}$
    * $r_t \in \mathbb{R}^{d}$
5. score distance between $r_t$ and $v_{s_t}$
    * $distance = dist(r_t, v_{s_t})$
    
    
#### Notes on implementing it with keras
Every step that uses a matrix multiplication above can be implemented in keras using a dense layer, formatted like this:
* `h = keras.layers.Dense(units = a, input_shape = (b, ), activation= "the_activation")(prev_layer)`
    * This will make the dense layer use a weight matrix $W \in \mathbb{R}^{a \times b}$, and activation "`the_activation`"

In [127]:
# Imports
import keras
import tensorflow as tf
from keras.layers import Embedding, Dense, Lambda

In [128]:
d = 100
k = 20

In [166]:
wordids = keras.layers.Input(shape=(max_len,))

# Embed the wordids.
e = keras.layers.Embedding(input_dim=vocab_size, 
                           output_dim=d, 
                           input_length=max_len)(wordids)

# Take elementwise average over vectors
a = keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1))(e)

# dense layer
ht = keras.layers.Dense(units = d, input_shape = (d, ), activation = "relu")(a)

# dense layer with softmax activation, (where previous states will eventually be inserted) 
dt = keras.layers.Dense(units = k, input_shape = (d, ), activation = "softmax")(ht)

# reconstruction layer
rt = keras.layers.Dense(units = d, input_shape = (k, ), activation = "linear")(dt)

# rt = keras.layers.Dense(units = d, input_shape = (k, ), activation = "linear")(a)

In [167]:
print(rt)

Tensor("dense_34/BiasAdd:0", shape=(?, 100), dtype=float32)


In [168]:
model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 51)                0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 51, 100)           2383800   
_________________________________________________________________
lambda_9 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_26 (Dense)             (None, 20)                2020      
_________________________________________________________________
dense_27 (Dense)             (None, 100)               2100      
Total params: 2,398,020
Trainable params: 2,398,020
Non-trainable params: 0
_________________________________________________

In [177]:
#compile model
model = keras.Model(inputs=wordids, outputs=rt)
model.compile(optimizer = 'adam', loss="categorical_crossentropy")

In [180]:
model.fit(x=x_train_padded, y=x_train_padded, batch_size=1)

ValueError: Error when checking target: expected dense_34 to have shape (100,) but got array with shape (51,)

In [161]:
for l in model.layers:
    print(l)
    print(50*"=")
    print("input shape", l.input_shape)
    print("output shape", l.output_shape)

<keras.engine.input_layer.InputLayer object at 0x7fd9dc31af90>
input shape (None, 51)
output shape (None, 51)
<keras.layers.embeddings.Embedding object at 0x7fd9dc31add0>
input shape (None, 51)
output shape (None, 51, 100)
<keras.layers.core.Lambda object at 0x7fd9dc208d90>
input shape (None, 51, 100)
output shape (None, 100)
<keras.layers.core.Dense object at 0x7fd9dc31acd0>
input shape (None, 100)
output shape (None, 100)
<keras.layers.core.Dense object at 0x7fd9dc31ad50>
input shape (None, 100)
output shape (None, 20)
<keras.layers.core.Dense object at 0x7fd9dc371a10>
input shape (None, 20)
output shape (None, 100)


In [173]:
from keras.models import Sequential
from keras.layers import Flatten, Dropout

In [176]:
model = Sequential()
model.add(Flatten(input_shape=(4,)))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='sigmoid'))

ValueError: Input 0 is incompatible with layer flatten_2: expected min_ndim=3, found ndim=2

In [None]:
mo