# RMN Training

Training an RMN on sessions 105 - 111

In [1]:
import os
import pandas as pd

In [2]:
os.chdir("../../../scripts/assembly/")

In [3]:
from document import *
from subject import subject_keywords
from constant import DOC_PROPER_PATH, DOCUMENT, SESSIONS

In [4]:
docs_df = load_documents([111], DOC_PROPER_PATH)

In [5]:
docs_df.head()

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
0,111117170,TOWNS,EDOLPHUS,H,NY,M,D,wouldnt we ban it i know the from has said may...,alcohol,111
1,111119891,ISAKSON,JOHN,S,GA,M,R,acts of terrorism we took the blind sheikthe f...,alcohol,111
2,111117461,ROCKEFELLER,JOHN,S,WV,M,D,buddy program and has been a lunch buddy for ...,alcohol,111
3,111118860,CARTER,JOHN,H,TX,M,R,assessment of the zero budgetary impact of thi...,alcohol,111
4,111118701,BOXER,BARBARA,S,CA,F,D,school curriculum and the challenges they see ...,alcohol,111


In [6]:
docs_df.dtypes

speakerid    object
lastname     object
firstname    object
chamber      object
state        object
gender       object
party        object
document     object
subject      object
session      object
dtype: object

In [7]:
docs_df.shape

(166133, 10)

In [9]:
###### run several times
os.chdir("../modeling")
from token_mapping import *
from embeddings import *
from helper import load_pickled_object
from rmn import RMN

In [12]:
feature_columns = [
 'speakerid',
 'chamber',
 'state',
 'gender',
 'party',
 'session',
 'subject']

In [13]:
# make tokenizer and metadata dicts
tokenizer_dict = build_tokenizer_dict(docs_df, max_span_len=)

In [15]:
tokenizer_dict.keys()

dict_keys(['tokenizer', 'tokenize_pad', 'word_index', 'max_span_length'])

In [16]:
metadata_dict = build_metadata_dict(docs_df, feature_columns)
metadata_dict.keys()

dict_keys(['speakerid', 'chamber', 'state', 'gender', 'party', 'session', 'subject'])

In [17]:
# Get embeddings
embedding_file = "/home/rocassius/gen-data/tools/embbedding_index_50d"
embeddings_index = load_pickled_object(embedding_file)
embeddings_matrix = build_embedding_matrix(tokenizer_dict['word_index'], embeddings_index).astype('float16')

In [18]:
embeddings_matrix.shape

(94696, 50)

# RMN Training

In [208]:
from tensorflow.keras.regularizers import Regularizer

class Orthogonality(Regularizer):
    """Regularizer for discouraging non-orthogonal components.
    
    # Arguments
        lamb: Float; regularization penalty weight
    """

    def __init__(self, lamb = 1.):
        self.lamb = lamb

    def __call__(self, R):
        RRT = K.dot(R, K.transpose(R))
        I = K.eye(int(RRT.shape[0]))
        penalty = self.lamb * K.sqrt(K.sum(K.square(RRT - I)))
        
        return penalty


In [211]:
#==================#
#=*= RMN Module =*=#
#==================#

# RMN Class for training Relationship Modeling Networks 

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import numpy as np

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Masking, Reshape
from tensorflow.keras.models import load_model, model_from_json

from helper import pickle_object, load_pickled_object
from vector_math import find_nn_cos

# constants
MAX_SPAN_LENGTH = 50
NUM_TOPICS = 20
LAMBDA = 5.0

OPTIMIZER = 'adam'
BATCH_SIZE = 50
EPOCHS = 5

RMN_TAG = "rmn_%s"
MODEL = "model.h5"
ARCH = "architecture"
ATTR = "attributes"


class RMN(object):
    """
    Class for constructing a Relationship Modeling Network
    """
    
    def __init__(self):
        
        # model parameters
        self.num_topics = NUM_TOPICS
        self.lamb = LAMBDA
        
        # model attrbiutes
        self.embedding_matrix = None
        self.tokenizer_dict = None
        self.metadata_dict = None
        
        # models 
        self.model = None
        self.topic_model = None
        
    
    @property
    def embedding_dim(self):
        return self.embedding_matrix.shape[1]
    
    
    def model_loss(self):
        """Hinge loss function.
        """
        def custom_loss(y_true, y_pred):
            # hinge_loss
            y_true_normalized = K.l2_normalize(y_true, axis=-1)
            y_pred_normalized = K.l2_normalize(y_pred, axis=-1)
            dot_product = K.sum(y_true_normalized * y_pred_normalized, axis=-1)
            hinge_loss = K.mean(K.maximum(0., 1. - dot_product))

            return hinge_loss 

        return custom_loss
    
    
    def build_model(self):
        """Connstruct the RMN model architecture
        """
        # document span input
        vt = Input(shape=(self.embedding_dim, ), name='Span.Input')
    
        input_layers = [vt]
        embedding_layers = [vt]
        
        for col in self.metadata_dict.keys():
            
            input_layer = Input(shape=(1,), name= col + '.Input')
            
            # embedding layer for col
            embedding_init = Embedding(
                input_dim = self.metadata_dict[col]['input_dim'] + 1, 
                output_dim = self.embedding_dim,
                input_length = 1)(input_layer)
            
            # reshape
            embedding_layer = Reshape((self.embedding_dim, ), name=col + '.Embed.Layer')(embedding_init)
            
            input_layers.append(input_layer)
            embedding_layers.append(embedding_layer)

        # concat speaker metadata embeddings
        _ht = tf.keras.layers.Concatenate(axis=1, name = 'Concat.Layer')(embedding_layers)

        # dense layer
        ht = Dense(units = self.embedding_dim, 
                   input_shape = (_ht.shape[1], ), 
                   activation = "relu", name = "Wh")(_ht)

        # dense layer with softmax activation, (where previous states will eventually be inserted) 
        dt = Dense(units = self.num_topics, 
                   input_shape = (self.embedding_dim, ), 
                   activation = "softmax", name = "Wd")(ht)

        # reconstruction layer
        rt = Dense(units = self.embedding_dim,
                   input_shape = (self.num_topics, ),
                   activation = "linear",
                   kernel_regularizer = Orthogonality(self.lamb),
                   name = "R")(dt)

        # compile
        model = tf.keras.Model(inputs=input_layers, outputs=rt)
        model.compile(optimizer = OPTIMIZER, loss = self.model_loss())

        self.model = model
    
    
    def build_topic_model(self, topic_layer = "Wd"):
        """Contruct model whose output is the topic distribution layer
        """
        topic_model = tf.keras.Model(
            inputs = self.model.input,
            outputs = self.model.get_layer(topic_layer).output)
        
        self.topic_model = topic_model
    
    def prep_y(self, y):
        """Returns the average of the vectors in each span of text
        """
        padded_spans = self.tokenizer_dict['tokenize_pad'](y)
        vector_spans = self.embedding_matrix[padded_spans].mean(axis=1)
        
        return vector_spans
    
    
    def prep_metadata(self, df):
        """Preps metadata for training or prediction
        """
        metadata_ids = [np.array(self.metadata_dict[col]['tokenize'](df[col]))
                        for col in self.metadata_dict.keys()]

        return metadata_ids
        
    
    def prep_inputs(self, df):
        """Preps metadata for training or prediction
        """
        vector_spans = self.prep_y(df['document'])
        metadata_ids = self.prep_metadata(df)
        inputs = [vector_spans] + metadata_ids
        
        return inputs
    
    
    def predict_topics(self, df):
        """Predicts the topic distributions for a df
        """
        
        # ensure the topic model has been built
        if self.topic_model is None:
            self.build_topic_model()
            
        topic_preds = self.topic_model.predict(x=self.prep_inputs(df))
        
        return topic_preds
    
    
    def fit(self, df, batch_size = BATCH_SIZE, epochs = EPOCHS):
        
        inputs = self.prep_inputs(df)
        y_true = self.prep_y(df['document'])
        
        self.model.fit(x = inputs, 
                       y = y_true, 
                       batch_size = batch_size, 
                       epochs = epochs)

    
    def save_rmn(self, name, save_path):
        """
        Save the model's weights, architecture and attributes
        """
        
        # assemble attribute dictionary
        attribute_dict = {
            'num_topics': self.num_topics,
            'emedding_matrix': self.embedding_matrix,
            'tokenizer_dict': self.tokenizer_dict,
            'metadata_dict': self.metadata_dict}
        
        # make directory for model
        model_path = os.path.join(save_path, RMN_TAG % name)
        os.mkdir(model_path)
        
        # save model weights
        self.model.save(os.path.join(model_path, MODEL))
        
        # save model architecture
        pickle_object(self.model.to_json(), os.path.join(model_path, ARCH))
        
        # save model attributes
        pickle_object(attribute_dict, os.path.join(model_path, ATTR))
        
        
    def load_rmn(self, name, save_path):
        """
        Load the model, weights, architecture and attributes from a saved model
        """
        
        # make directory for model
        model_path = os.path.join(save_path, RMN_TAG % name)
        
        # Load architecture and weights
        self.model = model_from_json(load_pickled_object(os.path.join(model_path, ARCH)))
        self.model.load_weights(os.path.join(model_path, MODEL))
        
        # load attributes
        attributes_dict = load_pickled_object(os.path.join(model_path, ATTR))
        
        # update attributes
        self.num_topics = attributes_dict['num_topics']
        self.embedding_matrix = attributes_dict['emedding_matrix']
        self.tokenizer_dict = attributes_dict['tokenizer_dict']
        self.metadata_dict = attributes_dict['metadata_dict']
       
    
    def inspect_topics(self, k_neighbors=10):
        """
        Ouput the nearest neighbors of every topic vector in
        the model's topic layer
        """
    
        # get embedding matrix, dim = [num_words, embedding_dim]
        E = self.embedding_matrix
        
        # get topic matrix, dim = [num_topics, embedding_dim]
        Wd = self.model.get_layer('Wd').get_weights()[0].T
        
        for i in range(Wd.shape[0]):
            
            neighbors, sim = find_nn_cos(Wd[i], E, k_neighbors)
            words = [self.tokenizer_dict['tokenizer'].index_word[v] for v in neighbors]
            
            print(20*"=" +"\n")
            print("Topic", i)
            print(words)

In [212]:
rmn = RMN()
rmn.embedding_matrix = embeddings_matrix
rmn.tokenizer_dict = tokenizer_dict
rmn.metadata_dict = metadata_dict
# num descriptors = 4 times number of subjects
rmn.num_topics = 100
rmn.lamb = 1.
rmn.build_model()

In [213]:
# inputs = rmn.prep_inputs(docs_df)
# y = inputs[0]

In [214]:
y.shape

(166133, 50)

In [217]:
rmn.model.fit(x=inputs, 
              y=y, 
              epochs = 10, 
              batch_size = 200, 
              use_multiprocessing=True)

Train on 166133 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fda49f5c890>

In [218]:
rmn.inspect_topics(k_neighbors = 5)


Topic 0
['amnesty', 'opm', 'zimbabwean', 'repatriate', 'zimbabwe']

Topic 1
['excerpts', 'speech', 'eulogy', 'fanfare', 'editorial']

Topic 2
['opennet', 'sentinels', 'blacklists', 'nro', 'conspirator']

Topic 3
['tl', 'tor', 'defines', 'incidence', 'punctuation']

Topic 4
['dams', 'moratorium', 'reef', 'chernobyl', 'cod']

Topic 5
['avionics', 'shigeru', 'atypical', 'bergquist', 'lute']

Topic 6
['thetford', 'dredge', 'plating', 'berms', 'sealer']

Topic 7
['een', 'planeta', 'zine', 'eady', 'pres']

Topic 8
['disgruntled', 'sinhalese', 'beleaguered', 'minority', 'suing']

Topic 9
['gut', 'nagging', 'arthritic', 'gnawing', 'eats']

Topic 10
['hondas', 'vanlandingham', 'musket', 'slipshod', 'taxiing']

Topic 11
['roundtables', 'brazile', 'greentech', 'dovetailed', 'marni']

Topic 12
['middleclass', 'dignitary', 'noemi', 'biracial', 'officeholder']

Topic 13
['perpetrate', 'misbranding', 'meritless', 'lilo', 'messrs']

Topic 14
['reynosa', 'salina', 'walder', 'hite', 'mcallen']

Topic 1


Topic 93
['tabulating', 'chairmanships', 'retakes', 'iaf', 'mustering']

Topic 94
['piqued', 'gushing', 'koontz', 'genuineness', 'aquifer']

Topic 95
['seasonally', 'depreciated', 'franc', 'logged', 'overvalued']

Topic 96
['combatants', 'possess', 'legally', 'consume', 'personas']

Topic 97
['organizes', 'departmental', 'secretarial', 'prek', 'curricular']

Topic 98
['redacted', 'vinyl', 'processed', 'keys', 'notices']

Topic 99
['zyed', 'hasbrouck', 'maries', 'noninterest', 'goring']


In [166]:
R = K.transpose(rmn.model.get_layer("Wd").get_weights()[0])
R

<tf.Tensor 'transpose_29:0' shape=(100, 50) dtype=float32>

In [167]:
R = K.constant(rmn.model.get_layer("R").get_weights()[0])
R

<tf.Tensor 'Const_36:0' shape=(100, 50) dtype=float32>

In [141]:
# orthogonality penalty
RR_t = K.dot(R, K.transpose(R))
Id_mat = K.eye(rmn.num_topics)
orth_penalty = K.sqrt(K.sum(K.square(RR_t - Id_mat)))


In [142]:
K.get_value(orth_penalty)

58.2372

In [147]:
K.get_value(K.sum(K.square(RR_t - Id_mat)))

3391.5718

In [144]:
K.get_value(K.norm(RR_t - Id_mat))

AttributeError: module 'tensorflow.python.keras.api._v1.keras.backend' has no attribute 'norm'

In [136]:
RR_t

<tf.Tensor 'MatMul_7:0' shape=(100, 100) dtype=float32>

In [137]:
Id_mat

<tf.Variable 'Variable_3:0' shape=(100, 100) dtype=float32>

In [138]:
RR_t - Id_mat

<tf.Tensor 'sub_12:0' shape=(100, 100) dtype=float32>