## Training an RMN

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import os

In [2]:
os.chdir("../../../scripts/assembly")
from session_speaker_assembly import *
from preprocess import *
from document import *
from constant import SPEECHES, SPEAKER_MAP, HB_PATH, EMBEDDINGS

In [3]:
session = 111
speak_map_cols = ['speakerid','chamber','state','gender']

speaker_map_df = pd.read_csv(os.path.join(HB_PATH,SPEAKER_MAP % session), sep = '|')[speak_map_cols]
speaker_map_df = speaker_map_df.groupby('speakerid').last().reset_index()
speaker_map_df

Unnamed: 0,speakerid,chamber,state,gender
0,111113931,S,IN,M
1,111113951,S,UT,M
2,111113981,S,MO,M
3,111114011,S,KS,M
4,111114021,S,KY,M
...,...,...,...,...
552,111121840,H,NV,M
553,111121930,H,IL,M
554,111121940,H,FL,M
555,111121950,H,AZ,F


In [4]:
subject_df = subject_docs(session = session,
                          speech_path = HB_PATH,
                          min_tokens=MIN_TOKENS,
                          span_finder=make_span_finder("health", WINDOW))
subject_df.head()

Unnamed: 0,speakerid,party,speech,congress
0,111120160.0,D,honest and fair prosperity for the many. not j...,111
1,111121410.0,D,put Americans back to work by investing in job...,111
2,111116790.0,R,on this. and no one chose to yield to me at al...,111
3,111120961.0,D,together. With the middle class struggling to ...,111
4,111119891.0,R,for all. He did it in a way where Atlanta was ...,111


In [5]:
# megre speech and speaker metadata
session_df = subject_df.merge(speaker_map_df, how = 'inner', on = 'speakerid')

# ensure proper merge
assert(subject_df.shape[0]==session_df.shape[0])
assert(subject_df.shape[1] + len(speak_map_cols) - 1 == session_df.shape[1])

In [6]:
session_df.head()

Unnamed: 0,speakerid,party,speech,congress,chamber,state,gender
0,111120160.0,D,honest and fair prosperity for the many. not j...,111,H,CT,M
1,111120160.0,D,Congressman STARK. and many others for their t...,111,H,CT,M
2,111120160.0,D,modify the terms of mortgage loans. we will gi...,111,H,CT,M
3,111120160.0,D,the Nations wealthiest 1 percent. not the baro...,111,H,CT,M
4,111120160.0,D,struggles for equality. as well as political a...,111,H,CT,M


In [7]:
# subset data for prelim building
size = session_df.shape[0]
sample_df = session_df.iloc[:size,:]

sample_df['speakerid'] = sample_df['speakerid'].astype(str)

# one-hot-encode speaker metadata
for col in speak_map_cols:
    sample_df = pd.concat([sample_df,pd.get_dummies(sample_df[col])], axis = 1)
    

sample_df

Unnamed: 0,speakerid,party,speech,congress,chamber,state,gender,111113931.0,111113951.0,111113981.0,...,UT,VA,VI,VT,WA,WI,WV,WY,F,M
0,111120160.0,D,honest and fair prosperity for the many. not j...,111,H,CT,M,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,111120160.0,D,Congressman STARK. and many others for their t...,111,H,CT,M,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,111120160.0,D,modify the terms of mortgage loans. we will gi...,111,H,CT,M,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,111120160.0,D,the Nations wealthiest 1 percent. not the baro...,111,H,CT,M,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,111120160.0,D,struggles for equality. as well as political a...,111,H,CT,M,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14315,111119610.0,R,You wonder why this component would be in a he...,111,H,GA,M,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14316,111119610.0,R,not defend this 2.000page spaghetti plate here...,111,H,GA,M,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14317,111119610.0,R,party. But. instead. we have proposed positive...,111,H,GA,M,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14318,111119610.0,R,around him. skimming the top of his right boot...,111,H,GA,M,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
sample_speakers = sample_df['speakerid'].unique()
print('speaker count:', len(sample_speakers))

speaker count: 536


There are a total of 535 Members of Congress. 100 serve in the U.S. Senate and 435 serve in the U.S. House of Representatives. A length of 50 suggests that nearly everyone commented on "health" (in a speech of more than 50 words) at some point.

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [10]:
# building tokenizers, word indecies, and train data

speech_tokenizer = Tokenizer()
speech_tokenizer.fit_on_texts(sample_df['speech'].values)
speeches_word_index = speech_tokenizer.word_index

tokenizers = {}
tokenizers['speech'] = {'tokenizer': speech_tokenizer,
                        'train': speech_tokenizer.texts_to_sequences(sample_df['speech'].values),
                        'word_index': speeches_word_index}

for col in speak_map_cols:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sample_df[col].values)
    tokenizers[col] = {}
    tokenizers[col]['train'] = tokenizer.texts_to_sequences(sample_df[col].values)
    tokenizers[col]['word_index'] = tokenizer.word_index
    tokenizers[col]['tokenizer'] = tokenizer

In [11]:
vocab_size = len(speeches_word_index)
vocab_size

20409

In [12]:
speeches_train = tokenizers['speech']['train']
len(speeches_train)

14320

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
max_len = WINDOW + 1
speeches_train_padded = pad_sequences(speeches_train, maxlen=max_len, padding="post")

In [15]:
speeches_train_padded

array([[ 2340,     3,   843, ...,     0,     0,     0],
       [  809,  3298,     3, ...,     0,     0,     0],
       [ 3899,     1,   582, ...,     0,     0,     0],
       ...,
       [  589,    39,   520, ...,     0,     0,     0],
       [  328,   364, 20407, ...,     0,     0,     0],
       [  622,     4,   146, ...,     0,     0,     0]], dtype=int32)

I think that the sentences need to be in integer-tokenized form.

From Iyyer et el.

"Each input to the RMN is a tuple that contains identifiers for a book and two character, as well as the spans corresponding to their relationship: $(b, c_1, c_2, S_{c_1,c_2})$. Given one such input, our objective is to reconstruct $S_(c_1,c_2)$ using a linear combination of relationship descriptors from R as shown in Figure 2; we now describe this process formally."


### Needs for Baseline goal

Let...
* $s_{v_t}$ be the $t_{th}$ span of text in the span set $S_{c_1,c_2}$
* $v_{s_t}$ be the vector that results from taking the element-wise average of the word vectors in $s_{v_t}$
* $C$ be the set metadata embeddings
* $m_{t,c}$ be the metadata embeddings vector for metadata $c$ with 
* $d$ be the dimension of the embedding
* $k$ be the number of decsriptors


Compute Sequence: Given $s_{v_t}$, do the following steps:
1. compute avg speech vector, $v_{s_t}$,
    * $v_{s_t} \in \mathbb{R}^{d}$
2. concat avg span and metadate embeddings
    * $ m_{t,c} \in \mathbb{R}^{d}$
    * [$v_{s_t}; m_{t,1};...; m_{t,|C|}$]
2. compute hidden state with Relu activation: 
    * $h_t =  relu \space (W_h \cdot [v_{s_t}; m_{t,1};...; m_{t,|C|}])$
    * $W_h \in \mathbb{R}^{d \times (d + d|C|)}$ 
    * $h_t \in  \mathbb{R}^{d}$
3. get distribution over topics using another hidden layer: 
    * $d_t = softmax \space (W_d \cdot h_t)$
    * $W_d \in  \mathbb{R}^{k \times d}$
    * $d_t \in  \mathbb{R}^{k}$
    * $d_{t,i} \in (0,1) \space \forall i$ 
4. recompose original sentence using the distribution over descriptors and the descriptor matrix:
    * $r_t = R^Td_t$
    * $R^T \in \mathbb{R}^{d \times k}$
    * $r_t \in \mathbb{R}^{d}$
5. score distance between $r_t$ and $v_{s_t}$
    * $distance = dist(r_t, v_{s_t})$
    
    
#### Notes on implementing it with keras
Every step that uses a matrix multiplication above can be implemented in keras using a dense layer, formatted like this:
* `h = keras.layers.Dense(units = a, input_shape = (b, ), activation= "the_activation")(prev_layer)`
    * This will make the dense layer use a weight matrix $W \in \mathbb{R}^{a \times b}$, and activation "`the_activation`"

In [16]:
# Imports
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Masking

The GloVe embeddings are on a local VM, and are not yet in `gs://rwc1/embeddings/`. Attemtps to access embeddings from the gcloud bucket had bugs. You can find the embeddings used [here](https://nlp.stanford.edu/projects/glove/), which are the Wikipedia + Gigaword 5 trained embeddings with 6 billion tokens.

In [17]:
k = 20
GLOVE_DIMS = [50, 100, 200, 300]
EMBEDDING_DIM = GLOVE_DIMS[0]

embeddings_index = {}
glove = open('../../../glove/glove.6B.%dd.txt' % EMBEDDING_DIM)
for line in glove:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except Exception as e:
        print(values[1:])
        raise
        
    embeddings_index[word] = coefs
glove.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [18]:
embedding_matrix = np.zeros((len(speeches_word_index) + 1, EMBEDDING_DIM))
for word, i in speeches_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [19]:
# average of spane embeddings
Vst_train = embedding_matrix[speeches_train_padded].mean(axis=1)
Vst_train.shape

(14320, 50)

In [20]:
# one-hot-encoded speaker metadata inputs

metadata_dict = {}

for col in speak_map_cols:
    df = sample_df[sample_df[col].unique()].values
    dim = df.shape[1]
    metadata_dict[col] = {'input': df, 'input_dim': dim}

metadata_dict.keys()

dict_keys(['speakerid', 'chamber', 'state', 'gender'])

In [263]:
from tensorflow.keras.constraints import Constraint

class Orthoganal(Constraint):
    """Constrains the weight matrix of a tensor's
    hidden units to be orthogonal during optimization.
    
    # Args ---
        axis: axis along which orthognality condition
        is applied. Defualt of None applies to column
        orthogonality."""
    
    def __init__(self, axis = 1, lamb = 1.0):
        self.axis = axis
        self.lamb = lamb

    def __call__(self, weight_mat):        
        return self.orthoganalize(weight_mat)
        
    def orthoganalize(self, weight_mat):
        if self.axis == 1:
            weight_mat = K.transpose(weight_mat)
            
        RR_t = K.dot(K.transpose(weight_mat), weight_mat)
        Id_mat = K.eye(int(RR_t.shape[0]))
        return self.lamb*K.sqrt(K.sum(K.square(RR_t - Id_mat)))

In [74]:
def model_loss(self, ):
    
    loss_func = tensorflow.keras.losses.hinge
    
    
    def custom_loss(y_true, y_pred):
        

In [264]:
# input avg span embeddings
vt = Input(shape=(EMBEDDING_DIM,), name='Avg.Span.Embed.Input')

# masking layer to account for padding
# masking_layer = Masking(mask_value=0.0, input_shape = (EMBEDDING_DIM,), name = "Mask")(vt)

## initializing speaker metadata embeddings

input_layers = [vt]
embedding_layers = [vt]
for col in speak_map_cols:
    
    # one-hot-encoded
    input_layer = Input(shape=(metadata_dict[col]['input_dim'],), name= col + '.Embed.Input')
    embedding_init = (Dense(units = EMBEDDING_DIM,
                            kernel_initializer = 'glorot_normal',
                            input_shape = (metadata_dict[col]['input_dim'], ),
                            activation = "linear",
                            name = 'C_' + col)(input_layer))
    
    # keras embedding layers
#     input_layer = (Embedding(output_dim = EMBEDDING_DIM,
#                              embeddings_initializer = 'glorot_normal',
#                             input_dim = (metadata_dict[col]['input_dim'], ),
#                             name = 'C_' + col))
    
    input_layers.append(input_layer)
    embedding_layers.append(embedding_init)

# concat speaker metadata embeddings
_ht = tf.keras.layers.Concatenate(axis=1, name = 'Concat.Layer')(embedding_layers)

# dense layer
ht = Dense(units = EMBEDDING_DIM, input_shape = (_ht.shape[1], ), activation = "relu", name = "Wh")(_ht)

# dense layer with softmax activation, (where previous states will eventually be inserted) 
dt = Dense(units = k, input_shape = (EMBEDDING_DIM, ), activation = "softmax", name = "Wd")(ht)

# reconstruction layer
rt = Dense(units = EMBEDDING_DIM,
           input_shape = (k, ),
           activation = "linear",
           kernel_regularizer = Orthoganal(),
           name = "R")(dt)

In [265]:
# compile model
model = tf.keras.Model(inputs=input_layers, outputs=rt)
model.compile(optimizer = 'adam', loss="hinge")

NameError: name 'lamb' is not defined

In [258]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
speakerid.Embed.Input (InputLay (None, 536)          0                                            
__________________________________________________________________________________________________
chamber.Embed.Input (InputLayer (None, 2)            0                                            
__________________________________________________________________________________________________
state.Embed.Input (InputLayer)  (None, 56)           0                                            
__________________________________________________________________________________________________
gender.Embed.Input (InputLayer) (None, 2)            0                                            
__________________________________________________________________________________________________
Avg.Span.E

In [254]:
inputs = [Vst_train]
for key in metadata_dict.keys():
    inputs.append(metadata_dict[key]['input'])

model.fit(x=inputs, y=Vst_train, batch_size=50, epochs = 5)

Epoch 1/5


FailedPreconditionError: Attempting to use uninitialized value loss_22/kernel/Regularizer/Variable
	 [[{{node loss_22/kernel/Regularizer/Variable/read}}]]

In [246]:
R = np.transpose(model.get_layer('R').get_weights()[0])
R.shape

(50, 20)

In [247]:
np.linalg.matrix_rank(R)

20

In [237]:
R_ = np.dot(R,np.transpose(R))
ones_R = np.ones_like(R_)
(np.dot(R_,np.transpose(R_)) - ones_R).sum()

-2469.27

In [None]:
model.predict(inputs)

In [None]:
Vst_train

- What is the file drawer problem? Why is the file drawer problem important from the perspective of a firm trying to learn about the effectiveness of an intervention from peer reviewed research?
- One response to the file drawer problem is to say, if there are multiple findings that point in the same direction, the effect is  "real." What is the logic of this claim? How does p-hacking subvert this logic?
- What is the pcurve? What is it meant to demonstrate (Figure   1). What is the key comparison to make based on Figure 1?