In [1]:
import os

# Working directory for the rest of the notebook.
# NOTE(review): absolute, machine-specific path -- presumably needed so the
# project-local imports in the following cells resolve; confirm.
MODELING_DIR = "/home/rocassius/w266_final/scripts/modeling"
os.chdir(MODELING_DIR)

In [5]:
import os
import numpy as np
import pandas as pd
from scipy import stats

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Reshape

from embeddings import EMBEDDING_DIM

In [6]:
from preprocess import *
from document import *
from subject import subject_keywords
from constant import SPEECHES, SPEAKER_MAP, HB_PATH, EMBEDDINGS, DOC_PATH, DOCUMENT

In [26]:
# os.chdir("../modeling")
from token_mapping import *
from embeddings import *

In [8]:
# Load the documents for every subject key from the "doc-sample/" folder.
DOC_SAMPLE_PATH = os.path.join(DOC_PATH, "doc-sample/")
docs_df = load_documents(subject_keywords.keys(), DOC_SAMPLE_PATH)

# Keep a small random subset for experimentation. The seed is fixed so that a
# Restart & Run All reproduces the same sample (and the same downstream fit).
N = 100
SEED = 42
docs_df = docs_df.sample(N, random_state=SEED)

In [16]:
# Metadata columns of the document frame; each one gets its own tokenizer
# (build_metadata_dict) and its own embedding input (RMN.build_model).
feature_columns = [
    "speakerid", "chamber", "state", "gender",
    "party", "congress", "subject",
]

In [17]:
# NOTE(review): this call passes (docs_df, feature_columns), and the result is
# later indexed as token_dict['document']['token_index']. Neither matches the
# build_tokenizer_dict definitions further down in this notebook, so this
# presumably resolves to the version star-imported from token_mapping -- confirm,
# and beware that re-running this cell after those definitions changes its meaning.
token_dict = build_tokenizer_dict(docs_df, feature_columns)

In [18]:
# Build the per-column metadata lookup and display which columns it covers.
metadata_dict = build_metadata_dict(docs_df, feature_columns)
metadata_dict.keys()

dict_keys(['speakerid', 'chamber', 'state', 'gender', 'party', 'congress', 'subject'])

In [37]:
def build_embedding_matrix(word_index, embeddings_index):
    """Assemble a matrix of pretrained vectors aligned with a token index.

    Args:
        word_index: mapping of token -> integer row index. The matrix gets
            ``len(word_index) + 1`` rows, so indices up to ``len(word_index)``
            are addressable.
        embeddings_index: mapping of token -> embedding vector; every vector
            is expected to have the same length.

    Returns:
        A ``float16`` numpy array of shape ``(len(word_index) + 1, dim)``.
        Row ``i`` holds the vector of the token whose index is ``i``; rows for
        tokens that are absent from ``embeddings_index`` (and any row that no
        token maps to) are all zeros.

    Raises:
        ValueError: if ``embeddings_index`` is empty, since the embedding
            dimension cannot be determined.
    """
    if not embeddings_index:
        raise ValueError("embeddings_index is empty; cannot infer the embedding dimension")

    # Infer the dimension from any stored vector instead of relying on the
    # token 'the' being part of the vocabulary.
    embedding_dim = len(next(iter(embeddings_index.values())))
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)

        # words not found in the embedding index keep their all-zeros row
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # stored in half precision: smaller in memory, at the cost of precision
    return embedding_matrix.astype('float16')

In [38]:
import pickle

# NOTE(review): absolute, machine-specific path. pickle.load can execute
# arbitrary code, so only point this at a file you trust.
embedding_file = "/home/rocassius/gen-data/tools/embbedding_index_50d"

# The context manager closes the file even if unpickling raises.
with open(embedding_file, "rb") as f:
    embeddings_index = pickle.load(f)

# Align the pretrained vectors with the document token index.
embeddings_matrix = build_embedding_matrix(token_dict['document']['token_index'], embeddings_index)

In [39]:
# Same metadata column list as the earlier feature_columns cell, re-declared.
feature_columns = [
    "speakerid", "chamber", "state", "gender",
    "party", "congress", "subject",
]


dict_keys(['speakerid', 'chamber', 'state', 'gender', 'party', 'congress', 'subject'])

In [None]:
def build_tokenizer_dict(subject_df):
    """Fit a tokenizer on the 'speech' column and bundle it with the encoded speeches.

    NOTE(review): a later cell in this notebook defines another
    ``build_tokenizer_dict`` with a different signature and return schema;
    whichever definition runs last is the one in effect.

    Args:
        subject_df: frame with a 'speech' column of raw texts.

    Returns:
        ``{'speech': {...}}`` where the inner dict holds
        'tokenizer' (the fitted Tokenizer), 'train' (token-id sequences),
        'train_padded' (the sequences post-padded to ``WINDOW + 1``) and
        'word_index' (the tokenizer's token -> id mapping).
    """
    max_len = WINDOW + 1
    speeches = subject_df['speech'].values

    # building tokenizer, word index, and train data
    speech_tokenizer = Tokenizer()
    speech_tokenizer.fit_on_texts(speeches)
    speeches_word_index = speech_tokenizer.word_index
    speeches_train = speech_tokenizer.texts_to_sequences(speeches)
    speeches_train_padded = pad_sequences(speeches_train, maxlen=max_len, padding="post")

    tokenizers = {}
    tokenizers['speech'] = {'tokenizer': speech_tokenizer,
                            'train': speeches_train,
                            'train_padded': speeches_train_padded,
                            'word_index': speeches_word_index}

    return tokenizers

In [73]:
from functools import partial

In [None]:
# NOTE(review): stray bare expression in a never-executed cell. No `pad` is
# defined in the visible notebook, so this presumably raises NameError unless
# one of the star imports provides it -- candidate for deletion.
pad

In [142]:
def build_metadata_dict(document_df, metadata_columns):
    """Fit one tokenizer per metadata column and collect the lookups.

    Args:
        document_df: frame containing every column named in metadata_columns.
        metadata_columns: iterable of column names to tokenize.

    Returns:
        Dict keyed by column name. Each value is a dict with
        'tokenizer' (the fitted Tokenizer), 'tokenize' (its
        texts_to_sequences method), 'token_index' (its word_index) and
        'input_dim' (the number of entries in that word_index).
    """

    def _entry_for(values):
        # fit a fresh tokenizer on the values of a single column
        fitted = Tokenizer()
        fitted.fit_on_texts(values)
        vocab = fitted.word_index
        return {
            'tokenizer': fitted,
            'tokenize': fitted.texts_to_sequences,
            'token_index': vocab,
            'input_dim': len(vocab)}

    return {name: _entry_for(document_df[name]) for name in metadata_columns}

In [234]:
from functools import partial

In [244]:
def tokenize_pad(documents, tokenizer, max_span_len):
    """Convert documents to token-id sequences and post-pad them to max_span_len.

    Args:
        documents: texts handed to ``tokenizer.texts_to_sequences``.
        tokenizer: object exposing ``texts_to_sequences``.
        max_span_len: ``maxlen`` passed to ``pad_sequences``.

    Returns:
        Whatever ``pad_sequences`` returns for the tokenized documents.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(documents),
        maxlen=max_span_len,
        padding="post")

In [245]:
def build_tokenizer_dict(document_df, max_span_len = MIN_TOKENS):
    """Fit a tokenizer on the 'document' column and package it for the RMN.

    Args:
        document_df: frame with a 'document' column of raw texts.
        max_span_len: padded span length baked into the 'tokenize_pad' helper.

    Returns:
        Dict with 'tokenizer' (the fitted Tokenizer), 'tokenize_pad'
        (tokenize_pad with the tokenizer and span length pre-bound, so it
        only takes the documents), 'word_index' (token -> id mapping) and
        'max_span_length' (the span length used).
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(document_df['document'])

    return {
        'tokenizer': fitted,
        'tokenize_pad': partial(tokenize_pad,
                                tokenizer=fitted,
                                max_span_len=max_span_len),
        'word_index': fitted.word_index,
        'max_span_length': max_span_len}
    

In [246]:
# Rebuild the metadata lookup for the sampled documents (overwrites the
# metadata_dict created in the earlier cell).
metadata_dict = build_metadata_dict(docs_df, feature_columns)

In [247]:
# Document tokenizer bundle, using the default span length (MIN_TOKENS).
tokenizer_dict = build_tokenizer_dict(docs_df)

In [334]:
# Name pattern and file names for a saved RMN.
# The directory pattern is called RMN_DIR (not RMN) so that it cannot rebind
# the name of the RMN class defined in this notebook.
RMN_DIR = "rmn_%s_dir"
MOD = "model"
ARCH = "architecture"
ATTR = "attributes"

In [325]:

# constants
NUM_TOPICS = 20
OPTIMIZER = 'adam'


class RMN(object):
    """Topic model that reconstructs averaged span embeddings.

    The network takes the averaged word vector of a document span plus one
    integer id per metadata column. Each id is embedded, everything is
    concatenated and passed through a relu layer ("Wh"), a softmax layer over
    ``num_topics`` ("Wd") and a linear reconstruction layer ("R") whose
    output has ``embedding_dim`` units.

    The caller is expected to set ``embedding_matrix``, ``tokenizer_dict`` and
    ``metadata_dict`` before calling :meth:`build_model`.
    """

    # On-disk layout used by save_rmn.
    DIR_PATTERN = "rmn_%s_dir"
    MODEL_FILE = "model"
    ARCH_FILE = "architecture"
    ATTR_FILE = "attributes"

    def __init__(self):

        # width of the softmax topic layer
        self.num_topics = NUM_TOPICS

        # 2-D array of word vectors, indexed by token id
        self.embedding_matrix = None
        # dict with at least 'tokenize_pad' (callable: documents -> padded ids)
        self.tokenizer_dict = None
        # dict: column -> {'tokenize': callable, 'input_dim': int, ...}
        self.metadata_dict = None

        # compiled keras model / model exposing the topic layer
        self.model = None
        self.topic_model = None


    @property
    def embedding_dim(self):
        """Width of the word vectors (second axis of ``embedding_matrix``)."""
        return self.embedding_matrix.shape[1]


    def model_loss(self, layer, lamb = 1.0):
        """Custom loss function to encourage
        orthogonality of dictionary matrix R.

        Args:
            layer: 2-D weight tensor/variable; its transpose is used as R.
                ``build_model`` passes the kernel of the "R" Dense layer.
            lamb: weight of the orthogonality penalty.

        Returns:
            A ``loss(y_true, y_pred)`` callable: hinge loss plus
            ``lamb * ||R R^T - I||`` (Frobenius norm), with ``I`` of size
            ``embedding_dim``.
        """

        def custom_loss(y_true, y_pred):

            hinge_loss = tf.keras.losses.hinge(y_true, y_pred)

            # computed inside the loss so the penalty always tracks the
            # current value of the weights rather than a snapshot
            R = K.transpose(layer)
            RR_t = K.dot(R, K.transpose(R))
            Id_mat = K.eye(self.embedding_dim)

            orth_penalty = K.sqrt(K.sum(K.square(RR_t - Id_mat)))

            return hinge_loss + lamb*orth_penalty

        return custom_loss


    def build_model(self):
        """Construct and compile the RMN model architecture.

        Reads ``embedding_matrix`` (for the embedding width) and
        ``metadata_dict`` (one embedding input per key, in key order) and
        stores the compiled model in ``self.model``.
        """
        # averaged span vector input: prep_y yields one vector of width
        # embedding_dim per document, so that is the input width
        vt = Input(shape=(self.embedding_dim, ), name='Span.Input')

        input_layers = [vt]
        embedding_layers = [vt]

        for col in self.metadata_dict.keys():

            input_layer = Input(shape=(1,), name= col + '.Input')

            # embedding layer for col (+1 leaves room for index 0)
            embedding_init = Embedding(
                input_dim = self.metadata_dict[col]['input_dim'] + 1,
                output_dim = self.embedding_dim,
                input_length = 1)(input_layer)

            # reshape (1, embedding_dim) -> (embedding_dim, )
            embedding_layer = Reshape((self.embedding_dim, ), name=col + '.Embed.Layer')(embedding_init)

            input_layers.append(input_layer)
            embedding_layers.append(embedding_layer)

        # concat span vector and speaker metadata embeddings
        _ht = tf.keras.layers.Concatenate(axis=1, name = 'Concat.Layer')(embedding_layers)

        # dense layer
        ht = Dense(units = self.embedding_dim,
                   input_shape = (_ht.shape[1], ),
                   activation = "relu", name = "Wh")(_ht)

        # dense layer with softmax activation, (where previous states will eventually be inserted)
        dt = Dense(units = self.num_topics,
                   input_shape = (self.embedding_dim, ),
                   activation = "softmax", name = "Wd")(ht)

        # reconstruction layer; keep the layer object so its weights can be
        # handed to the loss
        r_layer = Dense(units = self.embedding_dim,
                        input_shape = (self.num_topics, ),
                        activation = "linear",
                        # kernel_regularizer = Orthoganal(),
                        name = "R")
        rt = r_layer(dt)

        # compile; the penalty is applied to the weights of "R" (its kernel),
        # not to the layer's output activations
        model = tf.keras.Model(inputs=input_layers, outputs=rt)
        model.compile(optimizer = OPTIMIZER, loss = self.model_loss(r_layer.kernel))

        self.model = model


    def build_topic_model(self, topic_layer = "Wd"):
        """Construct a model whose output is the topic distribution layer.

        Args:
            topic_layer: name of the layer in ``self.model`` to expose.
        """
        topic_model = tf.keras.Model(
            inputs = self.model.input,
            outputs = self.model.get_layer(topic_layer).output)

        self.topic_model = topic_model


    def prep_y(self, y):
        """Returns the average of the vectors in each span of text.

        ``y`` is tokenized and padded with ``tokenizer_dict['tokenize_pad']``;
        the resulting ids index ``embedding_matrix`` and the vectors are
        averaged over the span axis.
        """
        padded_spans = self.tokenizer_dict['tokenize_pad'](y)
        vector_spans = self.embedding_matrix[padded_spans].mean(axis=1)

        return vector_spans


    def prep_metadata(self, df):
        """Preps metadata for training or prediction.

        Returns one array per key of ``self.metadata_dict`` (same order as
        the inputs created in ``build_model``), each produced by that
        column's 'tokenize' callable applied to ``df[col]``.
        """
        # use this instance's lookup, not a notebook-level global
        metadata_ids = [np.array(self.metadata_dict[col]['tokenize'](df[col]))
                        for col in self.metadata_dict.keys()]

        return metadata_ids


    def prep_inputs(self, df):
        """Preps the full model input list for training or prediction.

        Returns the averaged span vectors of ``df['document']`` followed by
        the metadata id arrays.
        """
        vector_spans = self.prep_y(df['document'])
        metadata_ids = self.prep_metadata(df)
        inputs = [vector_spans] + metadata_ids

        return inputs


    def predict_topics(self, df):
        """Predicts the topic distributions for a df
        """

        # ensure the topic model has been built
        if self.topic_model is None:
            self.build_topic_model()

        topic_preds = self.topic_model.predict(x=self.prep_inputs(df))

        return topic_preds


    def save_rmn(self, name, save_path):
        """Save the RMN under ``<save_path>/rmn_<name>_dir``.

        NOTE(review): the original method had no body; this layout is a
        proposal based on the "rmn_%s_dir" / "model" / "architecture" /
        "attributes" names used in this notebook -- confirm before relying
        on it.

        Always pickles the attributes (num_topics, metadata_dict,
        tokenizer_dict, embedding_matrix). If a model has been built, also
        writes its architecture as JSON and its weights.

        Args:
            name: identifier inserted into the directory name.
            save_path: existing or creatable parent directory.

        Returns:
            The path of the directory that was written.
        """
        import os
        import pickle

        model_dir = os.path.join(save_path, self.DIR_PATTERN % name)
        os.makedirs(model_dir, exist_ok=True)

        attributes = {
            'num_topics': self.num_topics,
            'metadata_dict': self.metadata_dict,
            'tokenizer_dict': self.tokenizer_dict,
            'embedding_matrix': self.embedding_matrix}

        with open(os.path.join(model_dir, self.ATTR_FILE), 'wb') as f:
            pickle.dump(attributes, f)

        if self.model is not None:
            with open(os.path.join(model_dir, self.ARCH_FILE), 'w') as f:
                f.write(self.model.to_json())
            self.model.save_weights(os.path.join(model_dir, self.MODEL_FILE))

        return model_dir

In [326]:
# self.num_topics = None
# self.embedding_matrix = None
# self.embedding_dim = None

# self.tokenizer_dict = None
# self.metadata_dict = None
# self.model = None

In [327]:
# Assemble an RMN by hand: override the default topic count and attach the
# metadata lookup, tokenizer bundle and embedding matrix built earlier.
rmn = RMN()
rmn.num_topics = 40
rmn.metadata_dict = metadata_dict
rmn.tokenizer_dict = tokenizer_dict
rmn.embedding_matrix = embeddings_matrix

# build_model reads the attributes set above, so it has to come last.
rmn.build_model()

In [328]:
# Model inputs: averaged span vectors first, then one id array per metadata column.
inputs = rmn.prep_inputs(docs_df)

In [329]:
# Train the model to reconstruct the averaged span vectors: inputs[0] is both
# the first input and the target.
rmn.model.fit(x=inputs, y = inputs[0], batch_size = 1, epochs = 5)

Train on 100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f4b20b14510>

In [330]:
# Derive the model that outputs the "Wd" softmax (topic) layer.
rmn.build_topic_model()

In [331]:
# Short alias for the topic model built in the previous cell.
tm = rmn.topic_model

In [333]:
# Topic distribution for every sampled document (one row per document).
tm.predict(inputs)

array([[1.12728667e-05, 7.61330057e-06, 6.69647590e-04, ...,
        1.71813335e-05, 1.78559560e-06, 4.75757133e-06],
       [1.86095385e-05, 1.47195879e-05, 9.28174879e-04, ...,
        3.22538181e-05, 4.34476215e-06, 1.09852326e-05],
       [1.45056529e-05, 1.07052783e-05, 8.27712880e-04, ...,
        2.23721418e-05, 2.69438419e-06, 7.47895729e-06],
       ...,
       [1.53639830e-05, 1.09285975e-05, 8.41916946e-04, ...,
        2.48461438e-05, 2.39745236e-06, 7.06023366e-06],
       [1.40847606e-05, 9.70476322e-06, 6.71438698e-04, ...,
        2.51738729e-05, 2.33795140e-06, 6.18180275e-06],
       [2.75614293e-05, 2.11138104e-05, 1.19606068e-03, ...,
        3.66882196e-05, 5.07829600e-06, 1.18210473e-05]], dtype=float32)

In [291]:
# Everything needed to rebuild the RMN besides the trained weights.
# The last key was misspelled 'emebedding_matrix'; any loader written against
# the old spelling must be updated accordingly.
model_dict = {
    'num_topics': rmn.num_topics,
    'metadata_dict': rmn.metadata_dict,
    'tokenizer_dict': rmn.tokenizer_dict,
    'embedding_matrix': rmn.embedding_matrix
}

In [252]:
# Switch to the models directory; the pickle cells below write relative to it.
# NOTE(review): absolute, machine-specific path.
os.chdir("/home/rocassius/gen-data/models")

In [253]:
# Inspect the stored 'tokenize' callable (a bound texts_to_sequences method).
rmn.metadata_dict['speakerid']['tokenize']

<bound method Tokenizer.texts_to_sequences of <keras_preprocessing.text.Tokenizer object at 0x7f4b22495c10>>

In [292]:
# Pickle the model attributes into the current working directory.
# The context manager closes the file even if pickling raises.
filename = 'rmn_model_dict'
with open(filename, 'wb') as outfile:
    pickle.dump(model_dict, outfile)

In [None]:
# NOTE(review): repeats the previous (executed) cell verbatim; this copy was
# never run and is a candidate for deletion.
# The context manager closes the file even if pickling raises.
filename = 'rmn_model_dict'
with open(filename, 'wb') as outfile:
    pickle.dump(model_dict, outfile)

In [None]:
# Display the tokenizer bundle (never-executed inspection cell).
tokenizer_dict