In [355]:
import os
# NOTE(review): hard-coded absolute path to one user's checkout; the local
# imports in the following cells presumably resolve relative to it — confirm.
os.chdir("/home/rocassius/w266_final/scripts/modeling")

In [5]:
import os
import numpy as np
import pandas as pd
from scipy import stats

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Reshape

from embeddings import EMBEDDING_DIM

In [6]:
from preprocess import *
from document import *
from subject import subject_keywords
from constant import SPEECHES, SPEAKER_MAP, HB_PATH, EMBEDDINGS, DOC_PATH, DOCUMENT

In [26]:
# os.chdir("../modeling")
from token_mapping import *
from embeddings import *

In [8]:
DOC_SAMPLE_PATH = os.path.join(DOC_PATH, "doc-sample/")
N = 100
SEED = 42  # fixed so the sampled subset is the same on every run

# Load and subsample in a single assignment: re-running this cell always
# starts from the freshly loaded frame instead of re-sampling an already
# sampled `docs_df`.
docs_df = load_documents(subject_keywords.keys(), DOC_SAMPLE_PATH).sample(N, random_state=SEED)

In [16]:
# Metadata columns of docs_df that are handed to the dict builders in the
# cells below.
feature_columns = [
 'speakerid',
 'chamber',
 'state',
 'gender',
 'party',
 'congress',
 'subject']

In [17]:
# NOTE(review): feature_columns is passed as the second argument here, while the
# build_tokenizer_dict defined later in this notebook takes max_span_len in that
# position. Presumably this cell ran against a version brought in by
# `from token_mapping import *` whose result is keyed by column (a later cell
# reads token_dict['document']['token_index']) — confirm.
token_dict = build_tokenizer_dict(docs_df, feature_columns)

In [18]:
# Build the per-column metadata dictionary; the bare expression displays its
# keys (one per entry of feature_columns, as the output below shows).
metadata_dict = build_metadata_dict(docs_df, feature_columns)
metadata_dict.keys()

dict_keys(['speakerid', 'chamber', 'state', 'gender', 'party', 'congress', 'subject'])

In [38]:
import pickle
embedding_file = "/home/rocassius/gen-data/tools/embbedding_index_50d"

# NOTE: pickle.load can execute arbitrary code; only load an index file that
# comes from a trusted source.
# The context manager closes the handle even if unpickling raises.
with open(embedding_file, "rb") as f:
    embeddings_index = pickle.load(f)

embeddings_matrix = build_embedding_matrix(token_dict['document']['token_index'], embeddings_index)#.astype('float16')

In [39]:
# Same column list as the feature_columns cell earlier in the notebook,
# repeated here unchanged.
feature_columns = [
 'speakerid',
 'chamber',
 'state',
 'gender',
 'party',
 'congress',
 'subject']


dict_keys(['speakerid', 'chamber', 'state', 'gender', 'party', 'congress', 'subject'])

In [73]:
from functools import partial

In [None]:
# NOTE(review): leftover scratch cell that was never executed; `pad` is not
# assigned in any visible cell.
pad

In [142]:
def build_metadata_dict(document_df, metadata_columns):
    """Fit a separate Tokenizer on each metadata column.

    Returns a dict keyed by column name. Each value is a dict holding the
    fitted tokenizer ('tokenizer'), its texts_to_sequences method
    ('tokenize'), its word index ('token_index') and the number of entries
    in that index ('input_dim').
    """

    def _describe(column):
        # a fresh tokenizer, fitted on this column's values only
        fitted = Tokenizer()
        fitted.fit_on_texts(document_df[column])
        vocab = fitted.word_index

        return {
            'tokenizer': fitted,
            'tokenize': fitted.texts_to_sequences,
            'token_index': vocab,
            'input_dim': len(vocab)}

    return {column: _describe(column) for column in metadata_columns}

In [234]:
from functools import partial

In [244]:
def tokenize_pad(documents, tokenizer, max_span_len):
    """Convert documents to token-id sequences and pad them.

    The sequences produced by tokenizer.texts_to_sequences are passed to
    pad_sequences with maxlen=max_span_len and padding applied at the end
    ("post"); the padded result is returned.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(documents),
        maxlen=max_span_len,
        padding="post")

In [346]:
def build_tokenizer_dict(document_df, max_span_len = MIN_TOKENS):
    """Fit a Tokenizer on the 'document' column and bundle it for reuse.

    Returns a dict with the fitted tokenizer ('tokenizer'), a callable that
    tokenizes and pads documents to max_span_len ('tokenize_pad'), the
    tokenizer's word index ('word_index') and the span length itself
    ('max_span_length').
    """
    doc_tokenizer = Tokenizer()
    doc_tokenizer.fit_on_texts(document_df['document'])

    return dict(
        tokenizer=doc_tokenizer,
        # tokenize_pad with the tokenizer and span length already bound
        tokenize_pad=partial(tokenize_pad, tokenizer=doc_tokenizer, max_span_len=max_span_len),
        word_index=doc_tokenizer.word_index,
        max_span_length=max_span_len)
    

In [347]:
# Rebuild the metadata dictionary with the build_metadata_dict defined in this notebook.
metadata_dict = build_metadata_dict(docs_df, feature_columns)

In [348]:
# max_span_len is left at its default (MIN_TOKENS).
tokenizer_dict = build_tokenizer_dict(docs_df)

In [457]:
from tensorflow.keras.models import load_model, model_from_json

# constants
MAX_SPAN_LENGTH = 50
NUM_TOPICS = 20

OPTIMIZER = 'adam'
BATCH_SIZE = 50
EPOCHS = 5

RMN_TAG = "rmn_%s"
MODEL = "model.h5"
ARCH = "architecture"
ATTR = "attributes"


class RMN(object):
    """
    Class for constructing a Relationship Modeling Network

    embedding_matrix, tokenizer_dict and metadata_dict start out as None and
    must be assigned (or restored with load_rmn) before build_model, fit or
    predict_topics is called.
    """

    def __init__(self):

        # model attributes
        self.num_topics = NUM_TOPICS
        self.embedding_matrix = None
        self.tokenizer_dict = None
        self.metadata_dict = None

        # models
        self.model = None
        self.topic_model = None


    @property
    def embedding_dim(self):
        """Width of the embedding vectors (second axis of embedding_matrix)."""
        return self.embedding_matrix.shape[1]


    def model_loss(self, layer, lamb = 1.0):
        """Custom loss function to encourage
        orthogonality of dictionary matrix R.

        Args:
            layer: tensor the orthogonality penalty is computed from
            lamb: weight of the penalty relative to the hinge loss

        Returns:
            a loss function of (y_true, y_pred) suitable for model.compile
        """
        # NOTE(review): build_model passes the *output* tensor of the "R" layer
        # here, not that layer's kernel, so the penalty below is computed on
        # activations rather than on the weight matrix the docstring names.
        # Left unchanged because a fix would alter training — confirm intent.
        R = K.transpose(layer)

        def custom_loss(y_true, y_pred):

            hinge_loss = tf.keras.losses.hinge(y_true, y_pred)

            RR_t = K.dot(R, K.transpose(R))
            Id_mat = K.eye(self.embedding_dim)

            # Frobenius norm of (R R^T - I)
            orth_penalty = K.sqrt(K.sum(K.square(RR_t - Id_mat)))

            return hinge_loss + lamb*orth_penalty

        return custom_loss


    def build_model(self):
        """Construct the RMN model architecture

        Stores the compiled model on self.model and discards any topic model
        derived from a previously built model.
        """
        # Document span input. prep_inputs feeds the *averaged* span embedding
        # (one vector of width embedding_dim per document), so the input width
        # is the embedding dimension, not the padded span length.
        vt = Input(shape=(self.embedding_dim, ), name='Span.Input')

        input_layers = [vt]
        embedding_layers = [vt]

        for col in self.metadata_dict.keys():

            input_layer = Input(shape=(1,), name= col + '.Input')

            # embedding layer for col; one extra row beyond the size of the
            # column's token index
            embedding_init = Embedding(
                input_dim = self.metadata_dict[col]['input_dim'] + 1,
                output_dim = self.embedding_dim,
                input_length = 1)(input_layer)

            # reshape (1, embedding_dim) -> (embedding_dim, )
            embedding_layer = Reshape((self.embedding_dim, ), name=col + '.Embed.Layer')(embedding_init)

            input_layers.append(input_layer)
            embedding_layers.append(embedding_layer)

        # concat the span vector with the speaker metadata embeddings
        _ht = tf.keras.layers.Concatenate(axis=1, name = 'Concat.Layer')(embedding_layers)

        # dense layer
        ht = Dense(units = self.embedding_dim,
                   input_shape = (_ht.shape[1], ),
                   activation = "relu", name = "Wh")(_ht)

        # dense layer with softmax activation, (where previous states will eventually be inserted)
        dt = Dense(units = self.num_topics,
                   input_shape = (self.embedding_dim, ),
                   activation = "softmax", name = "Wd")(ht)

        # reconstruction layer
        rt = Dense(units = self.embedding_dim,
                   input_shape = (self.num_topics, ),
                   activation = "linear",
                   # kernel_regularizer = Orthoganal(),
                   name = "R")(dt)

        # compile
        model = tf.keras.Model(inputs=input_layers, outputs=rt)
        model.compile(optimizer = OPTIMIZER, loss = self.model_loss(rt))

        self.model = model
        # a topic model built from an earlier self.model is now stale
        self.topic_model = None


    def build_topic_model(self, topic_layer = "Wd"):
        """Construct model whose output is the topic distribution layer

        Args:
            topic_layer: name of the layer of self.model to expose as output
        """
        topic_model = tf.keras.Model(
            inputs = self.model.input,
            outputs = self.model.get_layer(topic_layer).output)

        self.topic_model = topic_model


    def prep_y(self, y):
        """Returns the average of the vectors in each span of text

        Args:
            y: documents accepted by tokenizer_dict['tokenize_pad']

        Returns:
            array with one averaged embedding vector per document
        """
        padded_spans = self.tokenizer_dict['tokenize_pad'](y)
        # look up an embedding row per token id, then average over the span axis
        vector_spans = self.embedding_matrix[padded_spans].mean(axis=1)

        return vector_spans


    def prep_metadata(self, df):
        """Preps metadata for training or prediction

        Returns:
            list with one array of token ids per metadata column, in the
            order of self.metadata_dict's keys
        """
        # iterate this instance's own dictionary (not a notebook global) so
        # the method also works on a fresh kernel or a loaded model
        metadata_ids = [np.array(self.metadata_dict[col]['tokenize'](df[col]))
                        for col in self.metadata_dict.keys()]

        return metadata_ids


    def prep_inputs(self, df):
        """Preps the full list of model inputs for training or prediction

        Returns:
            [averaged span vectors] followed by one id array per metadata column
        """
        vector_spans = self.prep_y(df['document'])
        metadata_ids = self.prep_metadata(df)
        inputs = [vector_spans] + metadata_ids

        return inputs


    def predict_topics(self, df):
        """Predicts the topic distributions for a df
        """

        # ensure the topic model has been built
        if self.topic_model is None:
            self.build_topic_model()

        topic_preds = self.topic_model.predict(x=self.prep_inputs(df))

        return topic_preds


    def fit(self, df, batch_size = BATCH_SIZE, epochs = EPOCHS):
        """Trains self.model to reconstruct the averaged span vectors of df

        Args:
            df: frame with a 'document' column and the metadata columns
            batch_size: passed through to model.fit
            epochs: passed through to model.fit
        """
        inputs = self.prep_inputs(df)
        # the target is the averaged span vector, which prep_inputs already
        # computed as the first input; reuse it instead of recomputing
        y_true = inputs[0]

        self.model.fit(x = inputs,
                       y = y_true,
                       batch_size = batch_size,
                       epochs = epochs)


    def save_rmn(self, name, save_path):
        """
        Save the model's weights, architecture and attributes

        Everything is written into the directory <save_path>/rmn_<name>.
        """

        # assemble attribute dictionary
        # NOTE: the misspelled key 'emedding_matrix' is kept on purpose;
        # load_rmn reads the same key, so renaming it would break loading of
        # attribute files that were already saved.
        attribute_dict = {
            'num_topics': self.num_topics,
            'emedding_matrix': self.embedding_matrix,
            'tokenizer_dict': self.tokenizer_dict,
            'metadata_dict': self.metadata_dict}

        # make directory for model; tolerate an existing one so the save can
        # be repeated (e.g. when the notebook is re-run)
        model_path = os.path.join(save_path, RMN_TAG % name)
        os.makedirs(model_path, exist_ok=True)

        # save the model; load_rmn reads the weights back from this file
        self.model.save(os.path.join(model_path, MODEL))

        # save model architecture
        pickle_object(self.model.to_json(), os.path.join(model_path, ARCH))

        # save model attributes
        pickle_object(attribute_dict, os.path.join(model_path, ATTR))


    def load_rmn(self, name, save_path):
        """
        Load the model, weights, architecture and attributes from a saved model

        NOTE: the architecture and attribute files are unpickled; only load
        from directories you trust.
        """

        # locate directory of the saved model
        model_path = os.path.join(save_path, RMN_TAG % name)

        # Load architecture and weights
        self.model = model_from_json(load_pickled_object(os.path.join(model_path, ARCH)))
        self.model.load_weights(os.path.join(model_path, MODEL))

        # a topic model built from a previous self.model is now stale
        self.topic_model = None

        # load attributes
        attributes_dict = load_pickled_object(os.path.join(model_path, ATTR))

        # update attributes
        self.num_topics = attributes_dict['num_topics']
        self.embedding_matrix = attributes_dict['emedding_matrix']
        self.tokenizer_dict = attributes_dict['tokenizer_dict']
        self.metadata_dict = attributes_dict['metadata_dict']
        
        

In [458]:

# model = model_from_json(open(modelFile).read())
# model.load_weights(os.path.join(os.path.dirname(modelFile), 'model_weights.h5'))


# self.num_topics = None
# self.embedding_matrix = None
# self.embedding_dim = None

# self.tokenizer_dict = None
# self.metadata_dict = None
# self.model = None

In [459]:
# Configure an RMN by assigning its attributes directly, then build the Keras model.
rmn = RMN()
rmn.num_topics = 40  # overrides the NUM_TOPICS default set in __init__
rmn.metadata_dict = metadata_dict
rmn.tokenizer_dict = tokenizer_dict
rmn.embedding_matrix = embeddings_matrix

rmn.build_model()

In [460]:
# List of model inputs: the averaged span vectors first, then one id array per metadata column.
inputs = rmn.prep_inputs(docs_df)

In [462]:
# `y` was not defined in any cell of this notebook. The RMN is trained to
# reconstruct the averaged span vectors, so build the targets the same way
# RMN.fit does.
y = rmn.prep_y(docs_df['document'])
rmn.model.fit(x=inputs, y = y, batch_size = 5, epochs = 5)

Train on 100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f4b084a4990>

In [464]:
# Writes the saved model, its pickled JSON architecture and its pickled attributes
# into <save_path>/rmn_albertiso (directory name built from RMN_TAG % name).
rmn.save_rmn(save_path = "/home/rocassius/gen-data/models", 
             name = "albertiso")

In [466]:
# Fresh, unconfigured instance; it is populated by load_rmn in the next cell.
rmn_2 = RMN()

In [467]:
# Restore model, weights and attributes from the directory written by save_rmn above.
rmn_2.load_rmn(name="albertiso", save_path = "/home/rocassius/gen-data/models")

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [469]:
# Topic distributions from the reloaded model (output of the "Wd" softmax layer).
rmn_2.predict_topics(docs_df)

array([[7.1652193e-04, 1.7910470e-01, 9.8269163e-03, ..., 2.5755103e-04,
        7.9114419e-05, 1.5086564e-03],
       [8.8874716e-03, 4.4744827e-02, 3.8583890e-02, ..., 1.8278451e-03,
        1.3055729e-03, 5.9237620e-03],
       [7.0628920e-03, 4.6495214e-02, 3.6701795e-02, ..., 1.2913044e-03,
        1.1500251e-03, 4.5245788e-03],
       ...,
       [9.9604228e-04, 1.3856687e-01, 1.3478625e-02, ..., 2.4042113e-04,
        1.0308698e-04, 1.5409698e-03],
       [4.9326029e-03, 5.3708877e-02, 2.9198263e-02, ..., 7.0617168e-04,
        5.6763762e-04, 2.7723091e-03],
       [1.8442660e-03, 1.4638846e-01, 1.5711518e-02, ..., 5.8922678e-04,
        1.8944810e-04, 2.6625546e-03]], dtype=float32)

In [389]:
def pickle_object(x, file_name):
    """
    Helper function for pickling an object

    Args:
        x: the object to serialize
        file_name: path of the file to write (overwritten if it exists)
    """
    # the context manager closes the handle even if pickle.dump raises
    with open(file_name, "wb") as outfile:
        pickle.dump(x, outfile)


In [396]:
def load_pickled_object(file_name):
    """
    Helper function for loading a pickled object

    NOTE: unpickling can execute arbitrary code; only use this on files
    from a trusted source.

    Args:
        file_name: path of the pickle file to read

    Returns:
        the unpickled object
    """
    # the context manager closes the handle even if pickle.load raises
    with open(file_name, "rb") as infile:
        return pickle.load(infile)

In [397]:
# NOTE(review): `save_path` is not assigned in any visible cell; the value shown
# below came from leftover kernel state.
save_path 

'/home/rocassius/gen-data/models'

In [377]:
from tensorflow.keras.models import load_model, model_from_json

In [252]:
# NOTE(review): hard-coded absolute path; the relative 'rmn_model_dict' file
# name used in the cells below presumably resolves against this directory — confirm.
os.chdir("/home/rocassius/gen-data/models")

In [379]:
# Placeholder payload that the pickling cells below write to disk.
model_dict = "hello there"

In [385]:
# NOTE(review): `filename` is assigned in a cell that appears below this one,
# so this cell only works with out-of-order execution.
pickle_object(model_dict, filename)

In [380]:
filename = 'rmn_model_dict'
# the context manager closes the handle even if pickle.dump raises
with open(filename, 'wb') as outfile:
    pickle.dump(model_dict, outfile)

In [None]:
filename = 'rmn_model_dict'
# the context manager closes the handle even if pickle.dump raises
with open(filename, 'wb') as outfile:
    pickle.dump(model_dict, outfile)

In [None]:
# Bare expression: displays the tokenizer dictionary (cell was never executed).
tokenizer_dict