# Training an RMN on all session documents

In [2]:
import os
import sys
import pandas as pd

In [273]:
sys.path.append("/home/rocassius/w266_final/scripts/assembly")
sys.path.append("/home/rocassius/w266_final/scripts/modeling")

In [4]:
from document import load_documents
from constant import DOC_ALL_PATH

In [5]:
from helper import load_pickled_object
from rmn import RMN
from rmn_data_generator import RMN_DataGenerator

In [6]:
# load embedding tools
local_tools_path = '/home/rocassius/gen-data/tools'
tokenizer_dict = load_pickled_object(os.path.join(local_tools_path, "global_tokenizer_dict"))
metadata_dict = load_pickled_object(os.path.join(local_tools_path, "global_metadata_dict"))
embedding_matrix = load_pickled_object(os.path.join(local_tools_path, "global_embedding_matrix_50d"))

In [7]:
# load documents
docs_df = load_documents([111], DOC_ALL_PATH)

In [233]:
from subject import subject_keywords
from constant import DOC_SAMPLE_PATH
DOC = 'documents_%s.txt'


In [242]:
docs_df = pd.concat([pd.read_csv(os.path.join(DOC_SAMPLE_PATH, DOC % s), sep = "|") 
                     for s in list(subject_keywords.keys())[1:]])

In [243]:
docs_df.rename(columns={'congress':'session'}, inplace=True)

In [244]:
docs_df.dtypes

speakerid    float64
lastname      object
firstname     object
chamber       object
state         object
gender        object
party         object
document      object
session        int64
subject       object
dtype: object

In [245]:
# correct types
docs_df['speakerid'] = docs_df['speakerid'].astype(int).astype(str)
docs_df['session'] = docs_df['session'].astype(str)

In [246]:
docs_df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,session,subject
0,105111840,GINGRICH,NEWTON,H,GA,M,R,lyndon johnsons former advisor jimmy carters s...,105,alcohol
1,105110921,ASHCROFT,JOHN,S,MO,M,R,defective products placing burden responsible ...,105,alcohol
2,105114091,DODD,CHRISTOPHER,S,CT,M,D,supervision devastating longterm impact one st...,105,alcohol
3,105111431,MOYNIHAN,DANIEL,S,NY,M,D,protracted series negotiations ensued able upd...,105,alcohol
4,105116401,INOUYE,DANIEL,S,HI,M,D,days hospitalization emergency room visits vi...,105,alcohol
...,...,...,...,...,...,...,...,...,...,...
163292,111121290,SCOTT,ROBERT,H,VA,M,D,time consume first want well introducing impor...,111,trade
163293,111121290,SCOTT,ROBERT,H,VA,M,D,provisions false claims act order effectively ...,111,trade
163294,111121290,SCOTT,ROBERT,H,VA,M,D,consensus worked together bipartisan basis sen...,111,trade
163295,111121290,SCOTT,ROBERT,H,VA,M,D,yielding along colleague would like briefly di...,111,trade


In [247]:
d = docs_df.sample(50000)

In [276]:
from token_mapping import *

In [275]:
import os
import pickle

import tensorflow as tf
import tensorflow.keras.backend as K
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences


#os.chdir("../assembly/")
from constant import EMBEDDINGS


# constants
GLOVE_DIMS = [50, 100, 200, 300]
EMBEDDING_DIM = GLOVE_DIMS[0]
GLOVE_PATH = os.path.join(EMBEDDINGS, 'glove6B/glove.6B.%dd.csv' % EMBEDDING_DIM)


def fetch_embeddings(embeddings_dim = EMBEDDING_DIM):
    
    path = os.path.join(EMBEDDINGS, 'glove6B/glove.6B.%dd.csv' % EMBEDDING_DIM)
    
    embeddings_index = pd.read_csv(path).to_dict(orient = 'list')
    
    return embeddings_index



def build_embedding_matrix(word_index, embeddings_index, stopwords=[]):
    
    # get the embedding dimension
    embedding_dim = len(embeddings_index['the'])
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and word not in stopwords:
            # words not found in embedding index and stopwords will be all-zeros.
            embedding_matrix[i] = embedding_vector 
            
    embedding_matrix = embedding_matrix.astype('float16')
    
    return embedding_matrix

In [267]:
tokenizer_dict = build_tokenizer_dict(docs_df, max_span_len=50)

In [294]:
meta_cols = ['speakerid', 'chamber', 'state', 'gender','party', 'session', 'subject']

In [295]:
metadata_dict = build_metadata_dict(docs_df, metadata_columns=meta_cols)

In [277]:
embedding_index = fetch_embeddings()

In [278]:
embedding_matrix = build_embedding_matrix(tokenizer_dict['word_index'], embedding_index)

In [440]:
#==================#
#=*= RMN Module =*=#
#==================#

# RMN Class for training Relationship Modeling Networks 

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import numpy as np

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Masking, Reshape, Concatenate
from tensorflow.keras.models import load_model, model_from_json
from tensorflow.keras.regularizers import Regularizer

from rmn_data_generator import RMN_DataGenerator
from helper import pickle_object, load_pickled_object
from vector_math import find_nn_cos

# constants
MAX_SPAN_LENGTH = 50
NUM_TOPICS = 20
LAMBDA = 1.0
GAMMA = 1.0

# hyperparameters
OPTIMIZER = 'adam'
BATCH_SIZE = 50
EPOCHS = 5

# saving tags
RMN_TAG = "rmn_%s"
MODEL = "model.h5"
ATTR = "attributes"

# attribute keys
N_TOP_KEY = 'num_topics'
LAMB_KEY  = 'lambda'
EMBED_KEY = 'emedding_matrix'
TOKEN_KEY = 'tokenizer_dict'
META_KEY  = 'metadata_dict'


class RMN(object):
    """
    Class for constructing a Relationship Modeling Network
    """
    
    def __init__(self):
        
        # model parameters
        self.num_topics = NUM_TOPICS
        self.lamb = LAMBDA
        self.gamma = GAMMA
        
        # model attrbiutes
        self.embedding_matrix = None
        self.tokenizer_dict = None
        self.metadata_dict = None
        
        # models 
        self.model = None
        self.topic_model = None
        
    
    @property
    def embedding_dim(self):
        return self.embedding_matrix.shape[1]
    
    @property
    def topic_matrix(self):
        """Return the topic matrix associated with the rmn"""
        # dim = [num_topics, embedding_dim]
        return self.model.get_layer('Wd').get_weights()[0].T
    
    @property
    def tuned_embedding_matrix(self):
        """Return the current embedding matrix of the rmn"""
        return rmn.model.get_layer('Span.Embedding').get_weights()[0]
        
    
    
    def model_loss(self):
        """Hinge loss function.
        """
        # def sample_spans()
        
        def custom_loss(y_true, y_pred):
            
            # hinge_loss
            y_true_normalized = K.l2_normalize(y_true, axis=-1)
            y_pred_normalized = K.l2_normalize(y_pred, axis=-1)
            dot_product = K.sum(y_true_normalized * y_pred_normalized, axis=-1)
            hinge_loss = K.mean(K.maximum(0., 1. - dot_product))

            return hinge_loss 

        return custom_loss
    
    
    def build_model(self, embedding_trainable=False):
        """Connstruct the RMN model architecture
        """
        # Span Input
        span_input = Input(shape=(self.tokenizer_dict['max_span_length'],), 
                           name='Span.Input')
        span_embedding = Embedding(input_dim=len(self.tokenizer_dict['word_index']) + 1, 
                                   output_dim=self.embedding_dim, 
                                   weights=[self.embedding_matrix],
                                   input_length=self.tokenizer_dict['max_span_length'],
                                   trainable=embedding_trainable, 
                                   name = 'Span.Embedding')(span_input)
        
        # Take elementwise average over vectors
        span_avg = Lambda(lambda x: K.mean(x, axis=1), name = "Span.Avg.Layer")(span_embedding)

        input_layers = [span_input]
        embedding_layers = [span_avg]
        
        for col in self.metadata_dict.keys():
            
            input_layer = Input(shape=(1,), name= col + '.Input')
            
            # embedding layer for col
            embedding_init = Embedding(
                input_dim = self.metadata_dict[col]['input_dim'] + 1, 
                output_dim = self.embedding_dim,
                input_length = 1)(input_layer)
            
            # reshape
            embedding_layer = Reshape((self.embedding_dim, ), name=col + '.Embed.Layer')(embedding_init)
            
            input_layers.append(input_layer)
            embedding_layers.append(embedding_layer)

        # concatenate span vector with metadata embeddings
        _ht = Concatenate(axis=1, name = 'Concat.Layer')(embedding_layers)

        # dense layer
        ht = Dense(units = self.embedding_dim, 
                   input_shape = (_ht.shape[1], ), 
                   activation = "relu", name = "Wh")(_ht)

        # dense layer whose output is a probability distribution
        dt = Dense(units = self.num_topics, 
                   input_shape = (self.embedding_dim, ), 
                   activation = "softmax", 
                   activity_regularizer = Purity(self.gamma),
                   name = "Wd")(ht)

        # reconstruction layer
        rt = Dense(units = self.embedding_dim,
                   input_shape = (self.num_topics, ),
                   activation = "linear",
                   kernel_regularizer = Orthogonality(self.lamb),
                   name = "R")(dt)

        # compile
        model = tf.keras.Model(inputs=input_layers, outputs=rt)
        model.compile(optimizer = OPTIMIZER, loss = self.model_loss())
        self.model = model
        
        # build associated topic model
        self.build_topic_model()
        
        
    def build_topic_model(self, topic_layer = "Wd"):
        """Contruct model whose output is the topic distribution layer
        """
        topic_model = tf.keras.Model(
            inputs = self.model.input,
            outputs = self.model.get_layer(topic_layer).output)
        
        self.topic_model = topic_model
          
    
    def prep_spans(self, documents):
        """Returns the lists of word ids associated with the text
        """
        return self.tokenizer_dict['tokenize_pad'](documents)
    
    
    def prep_metadata(self, df):
        """Preps metadata for training or prediction
        """
        metadata_x = [np.array(self.metadata_dict[col]['tokenize'](df[col]))
                      for col in self.metadata_dict.keys()]

        return metadata_x
        
    
    def prep_X(self, df, for_training=False):
        """Preps metadata and spans for training or prediction
        """
        spans_y = self.prep_spans(df['document'])
        metadata_x = self.prep_metadata(df)
        X = [spans_y] + metadata_x
        
        if for_training:
            y = self.embedding_matrix[spans_y].mean(axis=1)
            return X, y
        else:
            return X

    def predict_y(self, df, use_generator=True):
        """Predicts the rmn outputs for a df
        """
        # ensure the topic model has been built
        if self.topic_model is None:
            self.build_topic_model()
        
        if use_generator:
            return self.predict_with_generator(df, self.model)
        else:
            return self.predict_(df, self.model)
    
    
    def predict_topics(self, df, use_generator=True):
        """Predicts the topic distributions for a df
        """        
        # ensure the topic model has been built
        if self.topic_model is None:
            self.build_topic_model()
        
        if use_generator:
            return self.predict_with_generator(df, self.topic_model)
        else:
            return self.predict_(df, self.topic_model)

        
    def predict_(self, df, model):
        """Makes a predictions for a df with a model
        """
        return model.predict(x=self.prep_X(df))
        
    
    def predict_with_generator(self, df, model):
        """Predict topic distributions with a generator
        """
        # Make sure data is not empty
        assert not df.empty

        # Calculate good batch size, 
        batch_size = max(1, min(10000, df.shape[0] // 10))
        n_batches = df.shape[0] // batch_size

        if n_batches < 2: 
            return self.predict_(df, model)
        else:
            # calculate remainder batch size
            r = df.shape[0] % batch_size
            if r == 0:
                g_index = df.index[:-batch_size]
                r_index = df.index[-batch_size:]
            else:
                g_index = df.index[:-r]
                r_index = df.index[-r:]

            # Make generator
            g = RMN_DataGenerator(self, df.loc[g_index], batch_size=batch_size, shuffle=False)

            # Predict on remainder batch
            r_pred = self.predict_(df.loc[r_index], model)
            # predict on generated batches
            g_pred = model.predict_generator(g, use_multiprocessing=True, workers=10, verbose=1)

            assert r_pred.shape[1] == g_pred.shape[1]
            pred = np.vstack([g_pred, r_pred])

            return pred
        
    
    def save_rmn(self, name, save_path):
        """
        Save the model's weights, architecture and attributes
        """
        # assemble attribute dictionary
        attribute_dict = {
            N_TOP_KEY:  self.num_topics,
            LAMB_KEY:   self.lamb,
            EMBED_KEY:  self.embedding_matrix,
            TOKEN_KEY:  self.tokenizer_dict,
            META_KEY:   self.metadata_dict}
        
        # make directory for model
        model_path = os.path.join(save_path, RMN_TAG % name)
        os.mkdir(model_path)
        
        # save model weights
        self.model.save_weights(os.path.join(model_path, MODEL))
        
        # save model attributes
        pickle_object(attribute_dict, os.path.join(model_path, ATTR))
        
        
    def load_rmn(self, name, save_path):
        """
        Load the model, weights, architecture and attributes from a saved model
        """
        # make directory for model
        model_path = os.path.join(save_path, RMN_TAG % name)
        
        # load attributes
        attributes_dict = load_pickled_object(os.path.join(model_path, ATTR))
        
        # update attributes
        self.num_topics       = attributes_dict[N_TOP_KEY]
        self.lamb             = attributes_dict[LAMB_KEY]
        self.embedding_matrix = attributes_dict[EMBED_KEY]
        self.tokenizer_dict   = attributes_dict[TOKEN_KEY]
        self.metadata_dict    = attributes_dict[META_KEY]
        
        # construct identical model architecture
        self.build_model()
        
        # Load weights
        self.model.load_weights(os.path.join(model_path, MODEL))
        
        # build associated topic model
        self.build_topic_model()
        
    
    def inspect_topics(self, k_neighbors=10, tuned_embedding=False):
        """
        Ouput the nearest neighbors of every topic vector in
        the model's topic layer
        """
        if tuned_embedding:
            E = self.tuned_embedding_matrix # dim = [num_words, embedding_dim]
        else:
            E = self.embedding_matrix
        Wd = self.topic_matrix    # dim = [num_topics, embedding_dim]
        
        for i in range(Wd.shape[0]):
            
            neighbors, sim = find_nn_cos(Wd[i], E, k_neighbors)
            words = [self.tokenizer_dict['tokenizer'].index_word[v] for v in neighbors]
            
            print(20*"=" +"\n")
            print("Topic", i)
            print(words)
    
    
# Orthogonality Regularizer #

class Orthogonality(Regularizer):
    """
    Regularizer for penalizing non-orthogonal components of a weight matrix.
    
    Args:
    - lamb: (Float) regularization penalty weight
    """

    def __init__(self, lamb = 1.):
        self.lamb = lamb

    def __call__(self, R):
        """Returns a component dependence penalty for matrix R
        """
        RRT = K.dot(R, K.transpose(R))
        I = K.eye(int(RRT.shape[0]))
        penalty = self.lamb * K.sqrt(K.sum(K.square(RRT - I)))
        
        return penalty
    
    
# Topic Purity Regularizer #

class Purity(Regularizer):
    """Regularizer for penalizing highly impure probability distributions
    """
    def __init__(self, gamma = 1.):
        self.gamma = gamma

    def __call__(self, p):
        """Returns the avergage shannon entropy of the distribution(s) p
        """
        # impurity = K.constant(7) - K.mean(K.sum(p*-K.log(p)/K.log(K.constant(2)), axis=-1))
        impurity = K.mean(K.sum(p*-K.log(p)/K.log(K.constant(2)), axis=-1)) + K.
        penalty = self.gamma * impurity
        
        return penalty

In [441]:
rmn = RMN()
rmn.embedding_matrix = embedding_matrix
rmn.tokenizer_dict = tokenizer_dict
rmn.metadata_dict = metadata_dict
rmn.num_topics = 25
rmn.lamb = 0.1
rmn.gamma = 0.1
rmn.build_model(embedding_trainable=True)

In [442]:
#rmn.model.summary()

In [443]:
# X, y = rmn.prep_X(d, for_training=True)

In [444]:
X[0].shape

(50000, 50)

In [445]:
y.shape

(50000, 50)

In [446]:
rmn.model.fit(X, y, epochs=2, batch_size=500)

Train on 50000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fe66b98ca90>

In [447]:
rmn.inspect_topics()

  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))
  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))



Topic 0
['nrc', 'objection', 'travers', 'delaet', 'lundberg', 'dietetic', 'pathologist', 'screwtape', 'reprimand', 'kellenberger']

Topic 1
['circa', 'tarzan', 'fifties', 'sixties', 'orbison', 'thirties', 'bengali', 'orwell', 'sacagawea', 'everest']

Topic 2
['nscc', 'rrc', 'ucu', 'altmeyer', 'naag', 'iosco', 'sdb', 'emm', 'praetorian', 'inf']

Topic 3
['buttered', 'defecates', 'cellophane', 'hicksville', 'slathered', 'haberdasher', 'seedy', 'twinkies', 'hostess', 'overstuffed']

Topic 4
['sdlp', 'strife', 'burundi', 'slivka', 'displacing', 'bottlenecks', 'mostar', 'bloodshed', 'spiralling', 'meller']

Topic 5
['cotchett', 'salvatori', 'zeidler', 'hussman', 'spohr', 'laymon', 'lichenstein', 'zollo', 'harriott', 'elitch']

Topic 6
['sided', 'presses', 'mallet', 'andouille', 'sterner', 'ergonomically', 'liipfert', 'sausage', 'grater', 'pcr']

Topic 7
['walford', 'saltzman', 'porur', 'rowlands', 'ifill', 'gatton', 'feshbach', 'rmi', 'wanaque', 'doucet']

Topic 8
['willin', 'shap', 'fadel

In [374]:
from rmn_analyzer import RMN_Analyzer

In [417]:
analyzer = RMN_Analyzer(rmn, d)


In [418]:
analyzer.group_js({'subject': 'minorities'}, n=200)



{'mean': 0.09029715511581263,
 'lower': 0.08724137165459755,
 'upper': 0.09335293857702773}

In [419]:
def shannon_entropy(p):
    p = np.asarray(p)
    return np.sum(p*-np.log2(p), axis=-1)

In [420]:
shannon_entropy(analyzer.topic_preds[77])

4.5836267

In [421]:
shannon_entropy(analyzer.topic_preds[1000])

4.5617456

In [422]:
np.max(analyzer.topic_preds[10000])

0.057870764

In [423]:
np.argmax(analyzer.topic_preds, axis=-1)

array([23,  2, 23, ...,  2, 15,  0])

In [424]:
s = 'crime'

In [425]:
analyzer.group_js({'subject': s, 'party': 'R'}, n=200)

{'mean': 0.08887185003763831,
 'lower': 0.08536517650600806,
 'upper': 0.0923785235692686}

In [426]:
analyzer.group_js({'subject': s, 'party': 'D'}, n=200)

{'mean': 0.08277032549908123,
 'lower': 0.07963555455652582,
 'upper': 0.08590509644163666}

In [427]:
analyzer.inter_party_js({'subject': s}, n=200)

{'mean': 0.08559621174848,
 'lower': 0.08240782150303054,
 'upper': 0.08878460199392947}