In [5]:
import os 
import sys
import pandas as pd


In [6]:
sys.path.append("/home/rocassius/w266_final/scripts/assembly")
sys.path.append("/home/rocassius/w266_final/scripts/modeling")

In [7]:
from document import load_documents
from constant import DOC_PRAYER_PATH, MIN_SESSION, MAX_SESSION, DOC_ALL_PATH
from subject import subject_keywords

sessions = list(range(MIN_SESSION, MAX_SESSION+1))

In [8]:
from helper import load_pickled_object
from rmn import *
from rmn_analyzer import RMN_Analyzer
from rmn_data_generator import RMN_DataGenerator

In [9]:
# load embedding tools
prayer_tools_path = "/home/rocassius/gen-data/tools/prayer_tools"
metadata_dict = load_pickled_object(os.path.join(prayer_tools_path, "metadata_dict"))
tokenizer_dict = load_pickled_object(os.path.join(prayer_tools_path, "tokenizer_dict"))
embedding_matrix = load_pickled_object(os.path.join(prayer_tools_path, "idf_embedding_matrix"))
global_embedding_matrix = load_pickled_object(os.path.join(prayer_tools_path, "embedding_matrix_wg"))
global_tokenizer_dict = load_pickled_object(os.path.join(prayer_tools_path, "tokenizer_dict_wg"))

In [10]:
df = load_documents([111], DOC_PRAYER_PATH)

In [349]:
#==================#
#=*= RMN Module =*=#
#==================#

# RMN Class for training Relationship Modeling Networks 

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import numpy as np

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Masking, Reshape, Concatenate
from tensorflow.keras.models import load_model, model_from_json
from tensorflow.keras.regularizers import Regularizer
from tensorflow.keras.optimizers import Adam

from rmn_data_generator import RMN_DataGenerator
from helper import pickle_object, load_pickled_object
from vector_math import find_nn_cos

# constants
MAX_SPAN_LENGTH = 50
NUM_TOPICS = 20
LAMBDA = 1.0
GAMMA = 1.0

# hyperparameters
OPTIMIZER = 'adam'
BATCH_SIZE = 50
EPOCHS = 5

# saving tags
RMN_TAG = "rmn_%s"
MODEL = "model.h5"
ATTR = "attributes"

# attribute keys
N_TOP_KEY = 'num_topics'
EMBED_KEY = 'emedding_matrix'
TOKEN_KEY = 'tokenizer_dict'
META_KEY  = 'metadata_dict'
DIM_KEY = 'meta_embedding_dim'


class RMN(object):
    """
    Class for constructing a Relationship Modeling Network
    """
    
    def __init__(self):
        
        # model attrbiutes
        self.num_topics = NUM_TOPICS
        self.embedding_matrix = None
        self.meta_embedding_dim = None
        self.tokenizer_dict = None
        self.metadata_dict = None
        
        # inference attributes
        self.infer_embedding_matrix = None
        self.infer_tokenizer_dict = None
        
        # models 
        self.model = None
        self.topic_model = None
        
    
    @property
    def embedding_dim(self):
        return self.embedding_matrix.shape[1]
    
    @property
    def topic_matrix(self):
        """Return the topic matrix associated with the rmn"""
        # dim = [num_topics, embedding_dim]
        return self.model.get_layer('Wd').get_weights()[0].T
    
    @property
    def tuned_embedding_matrix(self):
        """Return the current embedding matrix of the rmn"""
        return rmn.model.get_layer('Span.Embedding').get_weights()[0]
    
    def model_loss(self):
        """Hinge loss function.
        """
        def custom_loss(y_true, y_pred):
            
            # hinge_loss
            y_true_normalized = K.l2_normalize(y_true, axis=-1)
            y_pred_normalized = K.l2_normalize(y_pred, axis=-1)
            dot_product = K.sum(y_true_normalized * y_pred_normalized, axis=-1)
            hinge_loss = K.mean(K.maximum(0., 1. - dot_product))

            return hinge_loss 

        return custom_loss
    
    
    def build_model(self, embedding_trainable=False, bias_reconstruct=True,
                    gamma = 1., theta = 1., omega = 1., lamb = 1., word_dropout = 0.5):
        """Connstruct the RMN model architecture
        """
        # Span Input
        span_input = Input(shape=(self.tokenizer_dict['max_span_length'],), 
                           name='Span.Input')
        span_embedding = Embedding(input_dim=len(self.tokenizer_dict['word_index']) + 1, 
                                   output_dim=self.embedding_dim, 
                                   weights=[self.embedding_matrix],
                                   input_length=self.tokenizer_dict['max_span_length'],
                                   trainable=embedding_trainable, 
                                   name = 'Span.Embedding')(span_input)
        
        # Mask for randomly dropping words
        dropout_mask = K.stack(
            [K.random_binomial((span_embedding.shape[1],), p=word_dropout)]*span_embedding.shape[2], axis=1)
        # Average over the remaining words
        span_avg = Lambda(lambda x: K.mean(x * K.expand_dims(dropout_mask, axis=0), axis=1), 
                          name = "Span.Avg.Layer")(span_embedding)

        input_layers = [span_input]
        embedding_layers = [span_avg]
        
        for col in self.metadata_dict.keys():
            input_layer = Input(shape=(1,), name= col + '.Input')
            
            # embedding layer for col
            embedding_init = Embedding(
                input_dim = self.metadata_dict[col]['input_dim'] + 1, 
                output_dim = self.meta_embedding_dim,
                input_length = 1)(input_layer)
            
            # reshape
            embedding_layer = Reshape((self.meta_embedding_dim, ), name=col + '.Embed.Layer')(embedding_init)
            
            input_layers.append(input_layer)
            embedding_layers.append(embedding_layer)

        # concatenate span vector with metadata embeddings
        _ht = Concatenate(axis=1, name = 'Concat.Layer')(embedding_layers)

        # dense layer
        ht = Dense(units = self.embedding_dim, 
                   input_shape = (_ht.shape[1], ), 
                   activation = "relu", name = "Wh")(_ht)

        # dense layer whose output is a probability distribution
        dt = Dense(units = self.num_topics, 
                   input_shape = (self.embedding_dim, ), 
                   activation = "softmax",
                   activity_regularizer = Purity(gamma, theta, omega),
                   name = "Wd")(ht)

        # reconstruction layer
        rt = Dense(units = self.embedding_dim,
                   input_shape = (self.num_topics, ),
                   activation = "linear",
                   use_bias = bias_reconstruct,
                   kernel_regularizer = Orthogonality(lamb),
                   name = "R")(dt)

        # compile
        model = tf.keras.Model(inputs=input_layers, outputs=rt)
        #model.compile(optimizer = OPTIMIZER, loss='mean_squared_error')
        model.compile(optimizer = OPTIMIZER, loss = self.model_loss())
        self.model = model
        
        # build associated topic model
        self.build_topic_model()
        
    
    def set_topic_vectors(self, words):
        """Set the topic vectors with vectors corresponding to the given words
        """
        # get the word ids
        word_ids = self.tokenizer_dict['tokenize_pad'](words)[:,0]
        
        # replicate associated weights up to num_topics
        weights = np.tile(self.embedding_matrix[word_ids], 
                          (-(self.num_topics // -len(words)),1))[:self.num_topics]
        
        # set weights layer weights
        r = self.model.get_layer("R")
        if len(r.get_weights()) == 1:
            r.set_weights([weights])
        else:
            r.set_weights([weights, r.get_weights()[1]])
        
        
    def build_topic_model(self, topic_layer = "Wd"):
        """Contruct model whose output is the topic distribution layer
        """
        topic_model = tf.keras.Model(
            inputs = self.model.input,
            outputs = self.model.get_layer(topic_layer).output)
        
        self.topic_model = topic_model
          
    
    def prep_spans(self, documents):
        """Returns the lists of word ids associated with the text
        """
        return self.tokenizer_dict['tokenize_pad'](documents)
    
    
    def prep_metadata(self, df):
        """Preps metadata for training or prediction
        """
        metadata_x = [np.array(self.metadata_dict[col]['tokenize'](df[col]))
                      for col in self.metadata_dict.keys()]

        return metadata_x
        
    
    def prep_X(self, df, for_training=False):
        """Preps metadata and spans for training or prediction
        """
        spans_y = self.prep_spans(df['document'])
        metadata_x = self.prep_metadata(df)
        X = [spans_y] + metadata_x
        
        if for_training:
            y = self.embedding_matrix[spans_y].mean(axis=1)
            return X, y
        else:
            return X

    def predict_y(self, df, use_generator=True):
        """Predicts the rmn outputs for a df
        """
        # ensure the topic model has been built
        if self.topic_model is None:
            self.build_topic_model()
        
        if use_generator:
            return self.predict_with_generator(df, self.model)
        else:
            return self.predict_(df, self.model)
    
    
    def predict_topics(self, df, use_generator=True):
        """Predicts the topic distributions for a df
        """        
        # ensure the topic model has been built
        if self.topic_model is None:
            self.build_topic_model()
        
        if use_generator:
            return self.predict_with_generator(df, self.topic_model)
        else:
            return self.predict_(df, self.topic_model)

        
    def predict_(self, df, model):
        """Makes a predictions for a df with a model
        """
        return model.predict(x=self.prep_X(df))
        
    
    def predict_with_generator(self, df, model):
        """Predict topic distributions with a generator
        """
        # Make sure data is not empty
        assert not df.empty

        # Calculate good batch size, 
        batch_size = max(1, min(10000, df.shape[0] // 10))
        n_batches = df.shape[0] // batch_size

        if n_batches < 2: 
            return self.predict_(df, model)
        else:
            # calculate remainder batch size
            r = df.shape[0] % batch_size
            if r == 0:
                g_index = df.index[:-batch_size]
                r_index = df.index[-batch_size:]
            else:
                g_index = df.index[:-r]
                r_index = df.index[-r:]

            # Make generator
            g = RMN_DataGenerator(self, df.loc[g_index], batch_size=batch_size, shuffle=False)

            # Predict on remainder batch
            r_pred = self.predict_(df.loc[r_index], model)
            # predict on generated batches
            g_pred = model.predict_generator(g, use_multiprocessing=True, workers=10, verbose=1)

            assert r_pred.shape[1] == g_pred.shape[1]
            pred = np.vstack([g_pred, r_pred])

            return pred
        
    
    def save_rmn(self, name, save_path):
        """
        Save the model's weights, architecture and attributes
        """
        # assemble attribute dictionary
        attribute_dict = {
            N_TOP_KEY:  self.num_topics,
            EMBED_KEY:  self.embedding_matrix,
            TOKEN_KEY:  self.tokenizer_dict,
            META_KEY:   self.metadata_dict, 
            DIM_KEY:    self.meta_embedding_dim}
        
        # make directory for model
        model_path = os.path.join(save_path, RMN_TAG % name)
        os.mkdir(model_path)
        
        # save model weights
        self.model.save_weights(os.path.join(model_path, MODEL))
        
        # save model attributes
        pickle_object(attribute_dict, os.path.join(model_path, ATTR))
        
        
    def load_rmn(self, name, save_path):
        """
        Load the model, weights, architecture and attributes from a saved model
        """
        # make directory for model
        model_path = os.path.join(save_path, RMN_TAG % name)
        
        # load attributes
        attributes_dict = load_pickled_object(os.path.join(model_path, ATTR))
        
        # update attributes
        self.num_topics         = attributes_dict[N_TOP_KEY]
        self.embedding_matrix   = attributes_dict[EMBED_KEY]
        self.tokenizer_dict     = attributes_dict[TOKEN_KEY]
        self.metadata_dict      = attributes_dict[META_KEY]
        self.meta_embedding_dim = attributes_dict[DIM_KEY] 
        
        # construct identical model architecture
        self.build_model()
        
        # Load weights
        self.model.load_weights(os.path.join(model_path, MODEL))
        
        # build associated topic model
        self.build_topic_model()
        
    
    def inspect_topics(self, which_topics='all', k_neighbors=10):
        """
        Ouput the nearest neighbors of every topic vector in
        the model's topic layer
        """
        if which_topics == 'all':
            which_topics = range(self.num_topics) 
        
        if (self.infer_embedding_matrix is None or 
            self.infer_tokenizer_dict is None):
            self.infer_embedding_matrix = self.embedding_matrix
            self.infer_tokenizer_dict = self.tokenizer_dict
        
        E = self.infer_embedding_matrix # dim = [vocab_size, embedding_dim]
        Wd = self.topic_matrix          # dim = [num_topics, embedding_dim]
        
        for i in which_topics:
            # find nearest neighbors to topic
            neighbors, sim = find_nn_cos(Wd[i], E, k_neighbors)
            words = [self.infer_tokenizer_dict['tokenizer'].index_word[v] for v in neighbors]
            print(20*"=" +"\n")
            print("Topic", i)
            print(words)
    
    
# Orthogonality Regularizer #

class Orthogonality(Regularizer):
    """
    Regularizer for penalizing non-orthogonal components of a weight matrix.
    
    Args:
    - lamb: (Float) regularization penalty weight
    """

    def __init__(self, lamb = 1.):
        self.lamb = lamb

    def __call__(self, R):
        """Returns a component dependence penalty for matrix R
        """
        RRT = K.dot(R, K.transpose(R))
        I = K.eye(RRT.shape.as_list()[0])
        penalty = self.lamb * K.sqrt(K.sum(K.square(RRT - I)))
        
        return penalty
    
    
# Topic Purity Regularizer #

class Purity(Regularizer):
    """Regularizer for penalizing highly impure probability distributions
    """
    def __init__(self, gamma = 1., theta = 1., omega = 1., entr=True):
        self.gamma = gamma
        self.theta = theta
        self.omega = omega
        self.entr = entr

    def __call__(self, p):
        """Returns the avergage shannon entropy of the distribution(s) p
        """
        # calculate impurity
        if self.entr:
            impurity = K.sum(p*-K.log(p)/K.log(K.constant(2)), axis=-1)
        else: 
            # Gini impurity    
            impurity = 1.-K.sum(p*(1-p), axis=-1)
        concentration = K.max(p, axis=-1)
        
        # calculate batch similarity
        ppt = K.dot(p, K.transpose(p)) 
        similarity = K.mean(K.sum(ppt) - K.sum(tf.linalg.diag_part(ppt)))
        
        
        penalty = (self.gamma * K.mean(impurity) + 
                   self.theta * K.mean(concentration) + 
                   self.omega * similarity)
        
        return penalty
    
    
    
# RMN child for training on Data Generators #

class RigidRMN(RMN):
    """
    A Derivative of the RMN class for training with a generator
    
    This version does not use an embedding for the span input
    """
    
    
    
    def __init__(self, dropout):
        RMN.__init__(self)
        self.dropout = dropout
    
    def build_model(self, embedding_trainable=False, bias_reconstruct=True,
                    gamma = 1., theta = 1., omega = 1., lamb = 1.):
        """Connstruct the RMN model architecture
        """
        # Span Input
        span_input = Input(shape=(self.embedding_dim,), name='Span.Input')

        input_layers = [span_input]
        embedding_layers = [span_input]
        
        for col in self.metadata_dict.keys():
            input_layer = Input(shape=(1,), name= col + '.Input')
            
            # embedding layer for col
            embedding_init = Embedding(
                input_dim = self.metadata_dict[col]['input_dim'] + 1, 
                output_dim = self.meta_embedding_dim,
                input_length = 1)(input_layer)
            
            # reshape
            embedding_layer = Reshape((self.meta_embedding_dim, ), name=col + '.Embed.Layer')(embedding_init)
            
            input_layers.append(input_layer)
            embedding_layers.append(embedding_layer)

        # concatenate span vector with metadata embeddings
        _ht = Concatenate(axis=1, name = 'Concat.Layer')(embedding_layers)

        # dense layer
        ht = Dense(units = self.embedding_dim, 
                   input_shape = (_ht.shape[1], ), 
                   activation = "relu", name = "Wh")(_ht)

        # dense layer whose output is a probability distribution
        dt = Dense(units = self.num_topics, 
                   input_shape = (self.embedding_dim, ), 
                   activation = "softmax",
                   activity_regularizer = Purity(gamma, theta, omega),
                   name = "Wd")(ht)

        # reconstruction layer
        rt = Dense(units = self.embedding_dim,
                   input_shape = (self.num_topics, ),
                   activation = "linear",
                   use_bias = bias_reconstruct,
                   kernel_regularizer = Orthogonality(lamb),
                   name = "R")(dt)

        # compile
        model = tf.keras.Model(inputs=input_layers, outputs=rt)
        model.compile(optimizer = OPTIMIZER, loss='mean_squared_error')
        #model.compile(optimizer = OPTIMIZER, loss = self.model_loss())
        self.model = model
        
        # build associated topic model
        self.build_topic_model()
        
    def prep_spans(self, documents, for_training):
        """Returns the lists of word ids associated with the text
        """
        spans_y = self.tokenizer_dict['tokenize_pad'](documents)
        if for_training:
            spans_y = spans_y * np.random.binomial(1, self.dropout, spans_y.shape)
            
        y = self.embedding_matrix[spans_y].mean(axis=1)
        #y = y / np.linalg.norm(y, axis=-1)[:, np.newaxis]
        
        return y.astype(np.float16)
        
    
    def prep_X(self, df, for_training=False):
        """Preps metadata and spans for training or prediction
        """
        vectors_y = self.prep_spans(df['document'], for_training)
        metadata_x = self.prep_metadata(df)
        X = [vectors_y] + metadata_x
        
        if for_training:
            return X, X[0]
        else:
            return X
        
        

In [396]:
rmn = RigidRMN(dropout=0.5)
rmn.embedding_matrix = embedding_matrix
rmn.tokenizer_dict = tokenizer_dict
rmn.metadata_dict = metadata_dict
rmn.infer_embedding_matrix = global_embedding_matrix
rmn.infer_tokenizer_dict = global_tokenizer_dict
rmn.meta_embedding_dim = 25
rmn.num_topics = 50
rmn.build_model(gamma=0.1, theta=0., omega=.01, lamb=0.001, bias_reconstruct=False)

In [397]:
data_generator = RMN_DataGenerator(rmn=rmn, data_df=df, batch_size=256)

In [398]:
rmn.model.fit_generator(data_generator, epochs = 5, use_multiprocessing=True, workers=20)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7efbbc7f5090>

In [400]:
#====================#
#=*= RMN Analyzer =*=#
#====================#

# Class for analyzing an RMN

import numpy as np
import pandas as pd
from analysis import *

# variable constants
SUB = 'subject'
SPEAK = 'speakerid'
PARTY = 'party'
SESS = 'session'
# party constants
R = 'R'
D = 'D'
# metric constants
JS = 'js'
HH = 'hh'
N_REC = 'n_records'


class RMN_Analyzer(object):
    """Class for Analyzing an RMN with respect to a dataset
    """
    
    def __init__(self, rmn, df):
        """
        Args:
        - rmn: (RMN) the RMN to be used for analysis
        - df : (DataFrame) the dataframe to analyze
        """
        
        'Initialization'
        self.rmn = rmn
        self.df = df.reset_index(drop=True)
        self.topic_preds = None
        self.y_preds = None
        
    @property
    def index(self):
        return self.df.index
         
        
    def predict_topics(self, use_generator=True):
        """Computes the topic predictions for all observations
        """
        self.topic_preds = self.rmn.predict_topics(self.df, use_generator)
        
    
    def predict_y(self, use_generator=True):
        """Computes the sentence vector predictions for all observations
        """
        self.y_preds = self.rmn.predict_y(self.df, use_generator)
        
        
    def sample_indices(self, indices, n):
        """Returns a SRR of the indices provided
        """
        return np.random.choice(indices, n, replace=True)

    
    def bool_subset(self, col, value):
        """
        Returns a boolean vector for each observation in the
        dataframe indicating whether it meets the col == value condition
        """
        assert col in self.df.columns
        return self.df[col] == value
    
    
    def bool_index(self, conditions):
        """
        Returns a boolean vector for each observation in the
        dataframe indicating whether it meets all conditions
        
        Args:
        - conditions: (dict) dictionary of conditions
        
        Returns: 
        - pandas series of booleans indicating where all 
          of the conditions hold
        """
        # initialize bool index
        bool_index = (pd.Series(True)
                      .repeat(self.index.shape[0])
                      .reset_index(drop=True))
        
        for col, val in conditions.items():
            bool_index = bool_index & self.bool_subset(col, val)
            
        return bool_index
    
    
    def cond_index(self, conditions):
        """Returns indices of records meeting the conditions
        """
        return self.index[self.bool_index(conditions)]
    
    
    def n_records(self, conditions={}):
        """Returns the number of records meetings the conditions
        """
        return len(self.cond_index(conditions))
    
    
    def compute_JS(self, index_A, index_B, base=2):
        """
        Computes the mean pair-wise JS divergence and associated CI
        between indices in index_A and indices in index_B
        """
        p_A = self.topic_preds[index_A]
        p_B = self.topic_preds[index_B]
        js_list = [jensenshannon(p, q, base) for p, q in zip(p_A, p_B)]
        
        return mean_CI(js_list)
        
        
    def compute_HH(self, index):
        """
        Computes the mean HH index and associated CI between
        indices in index_A and indices in index_B
        """
        p = self.topic_preds[index]
        hh_list = [hh_index(q) for q in p]
        
        return mean_CI(hh_list)
          
    
    def inter_party_js(self, conditions, n):
        """
        Returns the estimated inter party JS divergence and a CI.
        
        Computes the inter party JS divergence between 
        Republicans and Democrats on a given subject
        
        Args:
        - subject: (str) subject to examine
        - n      : (int) sample size
        
        Returns: a numpy array of length 3, where
        - 0 is the mean divergence point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # find R and D indicies on the subject
        index_R = self.cond_index({**conditions, **{PARTY: R}})
        index_D = self.cond_index({**conditions, **{PARTY: D}})
        
        # return None if indices are insufficient
        if len(index_R)==0 or len(index_D)==0:
            return None
        
        # sample 
        samp_index_R = self.sample_indices(index_R, n)
        samp_index_D = self.sample_indices(index_D, n)
    
        return self.compute_JS(samp_index_R, samp_index_D)
    
    
    def group_js(self, conditions, n):
        """
        Returns the estimated mean JS divergence and a CI
        
        Estimates the average JS divergence between any two documents of
        a group defined by the conditions. A document by speaker _i_ is 
        never compared to another document by speaker _i_.
        
        
        Args:
        - conditions: (dict) dictionary of conditions
        - n         : (int) sample size
        
        Returns: a numpy array of length 3, where index...
        - 0 is the mean divergence point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # find indicies of party on the subject
        cond_index = self.cond_index(conditions)
        
        # Return none if there are fewer than 2 speakers
        if self.df.loc[cond_index][SPEAK].nunique() < 2:
            return None
        
        # Sample index pairs
        index_AB = []
        while len(index_AB) < n:
            a_b = self.sample_indices(cond_index, n=2)
            # include samples whose speakers are different
            if self.df.loc[a_b][SPEAK].nunique() == 2:
                index_AB.append(a_b)
        
        index_AB = np.asarray(index_AB)
        assert index_AB.shape == (n, 2)
        
        # get indices for each group
        index_A, index_B = index_AB[:,0], index_AB[:,1]
        
        return self.compute_JS(index_A, index_B)
    
    
    def group_hh(self, conditions={}, n=None):
        """
        Returns the estimated mean HH index and a CI
        
        Estimates the average Herfindahl–Hirschman Index 
        of all records meetings the conditons.
        
        Args:
        - subject: (str) subject to examine
        - party  : (str) party of interest
        - n      : (int) sample size
        
        Returns: a numpy array of length 3, where index...
        - 0 is the mean index point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # indicies meeting the conditions
        cond_index = self.cond_index(conditions)
        
        # return None if indices are insufficient
        if len(cond_index)==0:
            return None
        
        if n is None:
            return self.compute_HH(cond_index)
        else:
            samp_index = self.sample_indices(cond_index, n)
            return self.compute_HH(samp_index)
        
        
    def analyze_subset(self, conditions, n):
        """
        Returns a dictionary of analysis metrics for the subset 
        of records defined by the conditions.
        
        Note: It is recommended conditions be on subject
        
        Args:
        - conditions: (dict) dictionary of conditions
        - n         : (int) sample size for estimation of metrics
        
        for the entire dataset and for each subject the following are computed:
        - n_records, n_records_R
        - n_records_D
        - js
        - js_R
        - js_D
        - js_RD
        - hh
        - hh_R
        - hh_D
        
        Returns: a dictionary of metrics
        """
        # R and D added conditions
        conditions_R = {**conditions, **{PARTY: R}}
        conditions_D = {**conditions, **{PARTY: D}}
        
        # annotation tags
        _R = '_' + R
        _D = '_' + D
        _RD = _R + D
        
        metrics = {
            # n record data
            N_REC:    self.n_records(conditions),
            N_REC+_R: self.n_records(conditions_R),
            N_REC+_D: self.n_records(conditions_D),
            # JS divergence data
            JS:     self.group_js(conditions, n),
            JS+_R:  self.group_js(conditions_R, n),
            JS+_D:  self.group_js(conditions_D, n),
            JS+_RD: self.inter_party_js(conditions, n),
            # HH index data
            HH:    self.group_hh(conditions, n),
            HH+_R: self.group_hh(conditions_R, n),
            HH+_D: self.group_hh(conditions_D, n)
        }
        
        return metrics
    
        
    def analyze(self, subjects, n):
        """
        Returns a dictionary of analysis metrics at the subject level
        and at the session level (assuming self.df is the data of a
        single session).
        
        Args:
        - subjects: (array-like) list of subjects
        - n       : (int) sample size for estimation of metrics
        
        Returns: a dictionary of metrics
        """
        # analyze entire session dataset
        dataset_metrics = self.analyze_subset(conditions={}, n=n)
        
        # analyze by subject
        subject_metrics = {}
        for s in subjects:
            subject_metrics[s] = self.analyze_subset({SUB: s}, n)
        
        metrics = {'dataset' : dataset_metrics, 
                   'subjects': subject_metrics}
        
        return metrics
    
    
    def shannon_entropy(self):
        """Returns the Shannon Entropy of every topic prediction
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        return shannon_entropy(self.topic_preds)
    
    def average_topic_max(self, conditions={}):
        """Return the average of the maximum probability assigned to a topic
        """
        cond_index = self.cond_index(conditions)
        avg_topic_max = np.max(self.topic_preds[cond_index], axis=-1).mean().round(3)
        
        return avg_topic_max
    
    
    def first_topic_counts(self, conditions={}):
        """
        Returns a leaderboard of topics and how many times they 
        are the primary topic associated with a document.
        """
        if self.topic_preds is None:
            self.predict_topics()
           
        cond_index = self.cond_index(conditions)
        topic_counts = pd.Series(np.argmax(self.topic_preds[cond_index], axis=-1)).value_counts()
        
        return topic_counts
    
    
    def topic_use(self, conditions={}):
        """
        Returns a leaderboard of topics based on the percentage of 
        total weight given to them in all of the documents
        """
        cond_index = self.cond_index(conditions)
        topic_sums = pd.Series(np.sum(self.topic_preds[cond_index], axis=0))
        topic_use = topic_sums.sort_values(ascending=False) / topic_sums.sum()
        
        return topic_use
    
    
    def primary_topics(self, conditions={}, k=5):
        """Returns top k most prominent topics for documents
        """
        cond_index = self.cond_index(conditions)
        primary_topics = np.flip(np.argsort(self.topic_preds[cond_index]))[:,:k]
        
        return primary_topics
    
    
#     def topic_stats(self):

In [401]:
analyzer = RMN_Analyzer(rmn, df)

In [402]:
analyzer.predict_topics()



In [408]:
analyzer.shannon_entropy().mean().round(3)

2.208

In [403]:
analyzer.average_topic_max()

0.534

In [405]:
analyzer.topic_use()

38    0.046295
24    0.041911
5     0.036861
4     0.035688
11    0.035676
39    0.035258
9     0.034875
47    0.027730
46    0.027399
32    0.027045
43    0.026586
7     0.025356
18    0.024735
34    0.024522
19    0.023531
28    0.023268
42    0.022963
13    0.021544
3     0.019678
1     0.019516
10    0.019314
21    0.018854
35    0.018816
26    0.018396
37    0.017708
12    0.017573
20    0.016800
22    0.016673
8     0.016552
36    0.015538
23    0.015496
49    0.015483
33    0.015425
44    0.014418
29    0.014133
17    0.014035
0     0.013475
40    0.013022
27    0.012656
25    0.012477
48    0.012277
15    0.012216
14    0.011710
2     0.010991
41    0.010702
30    0.009904
45    0.009273
6     0.008881
16    0.008455
31    0.008309
dtype: float32

In [406]:
analyzer.first_topic_counts()


38    4686
24    4150
4     3963
5     3484
9     3468
11    3451
39    3388
43    2850
46    2631
47    2549
32    2524
7     2469
34    2337
18    2190
28    2157
42    2104
19    2074
13    2044
10    1888
21    1777
3     1684
12    1650
8     1643
26    1617
22    1598
1     1585
37    1580
20    1571
35    1557
49    1449
44    1426
36    1406
23    1253
29    1211
40    1204
17    1187
33    1183
2     1122
6     1074
27    1070
0     1053
25    1002
15     986
30     961
48     956
45     773
41     769
14     723
31     705
16     572
dtype: int64

In [407]:
analyzer.rmn.inspect_topics(analyzer.first_topic_counts().index)

  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))
  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))



Topic 38
['shoh', 'bucks', 'siders', 'keh', 'xxxx', 'niner', 'decent', 'rakh', 'eighty', 'ensnares']

Topic 24
['revues', 'musicals', 'minireviews', 'marcoussis', 'mcquarrie', 'isherwood', 'discounts', 'lome', 'seventeen', 'songwriters']

Topic 4
['furled', 'extricate', 'flier', 'floured', 'housekeeping', 'sabbatical', 'cushy', 'advantaged', 'reins', 'entangled']

Topic 5
['meed', 'arcadius', '1347', 'honorius', 'frontpage', 'syrmia', 'malm', 'inds', 'focaccia', 'tigranes']

Topic 9
['algeria', 'iceland', 'kayseri', 'js', 'kemerovo', 'galicia', 'lavezzi', 'kantō', 'siberia', 'hamit']

Topic 11
['equaling', 'cheesecloth', 'maghreb', 'cuga', 'therien', 'hons', 'eccellenza', 'kumaun', 'newcourt', 'dawat']

Topic 39
['jooss', 'thistlethwaite', 'stradlin', 'chawda', 'higelin', 'skempton', 'barkleys', 'bhattacharyya', 'rasi', 'voros']

Topic 43
['muentefering', 'companhia', 'classification', 'shutouts', 'unicellular', 'ilves', 'dismal', 'morganatic', 'chengjiang', 'lowly']

Topic 46
['1965'

In [379]:
analyzer.shannon_entropy().mean()

2.3060486

In [380]:
analyzer.topic_use().round(4)

8     0.0842
9     0.0777
35    0.0549
21    0.0523
46    0.0504
2     0.0461
20    0.0419
7     0.0387
22    0.0385
13    0.0377
14    0.0368
41    0.0308
34    0.0305
19    0.0280
1     0.0249
40    0.0236
48    0.0231
32    0.0217
26    0.0203
11    0.0182
45    0.0165
25    0.0132
39    0.0111
17    0.0110
3     0.0107
0     0.0104
47    0.0100
36    0.0096
10    0.0093
49    0.0086
44    0.0078
33    0.0068
29    0.0064
42    0.0061
37    0.0061
38    0.0061
4     0.0057
27    0.0057
30    0.0056
16    0.0056
43    0.0056
18    0.0053
6     0.0050
5     0.0049
28    0.0048
23    0.0047
24    0.0045
31    0.0044
12    0.0042
15    0.0041
dtype: float32

In [381]:
analyzer.primary_topics()

array([[ 8,  9, 22,  2, 46],
       [ 9,  8, 21, 22, 35],
       [ 8,  9, 22,  7, 35],
       ...,
       [29, 37, 23,  4, 31],
       [21, 17, 32, 41, 35],
       [ 2, 32, 41, 35, 17]])

In [385]:
analyzer.topic_preds[11].round(3).max()

0.347

In [384]:
analyzer.group_js(conditions={}, n=10000)

{'mean': 0.8821866551207175,
 'lower': 0.8796436010095868,
 'upper': 0.8847297092318481}

In [279]:
subject_keywords.keys()

dict_keys(['abortion', 'drug_alcohol', 'budget', 'guns', 'defense', 'economy', 'education', 'foreign', 'health', 'immigration', 'labor', 'minorities', 'women', 'tax', 'trade'])

In [363]:
analyzer.group_js(conditions={'subject':'immigration'}, n=10000)

{'mean': 0.8543544875543024,
 'lower': 0.8515277901812843,
 'upper': 0.8571811849273205}

In [364]:
analyzer.group_js(conditions={'subject':'guns'}, n=10000)

{'mean': 0.8614940578368009,
 'lower': 0.8583278252398969,
 'upper': 0.8646602904337048}

In [365]:
analyzer.group_js(conditions={'subject':'tax'}, n=10000)

{'mean': 0.8201166751350382,
 'lower': 0.8169926580013287,
 'upper': 0.8232406922687476}

In [326]:
analyzer.group_js(conditions={'subject':'women'}, n=10000)

{'mean': 0.8325434899237656,
 'lower': 0.8294430355481792,
 'upper': 0.835643944299352}

In [327]:
analyzer.group_js(conditions={'subject':'education'}, n=10000)

{'mean': 0.837679005868254,
 'lower': 0.834513355527901,
 'upper': 0.8408446562086067}

In [329]:
analyzer.group_js(conditions={'subject':'minorities'}, n=10000)

{'mean': 0.8587723603182558,
 'lower': 0.8560560399141686,
 'upper': 0.8614886807223434}

In [330]:
analyzer.group_js(conditions={'subject':'budget'}, n=10000)

{'mean': 0.8318494436201203,
 'lower': 0.8289183253696087,
 'upper': 0.8347805618706315}

In [331]:
analyzer.group_js(conditions={'subject':'drug_alcohol'}, n=10000)

{'mean': 0.8145034280806976,
 'lower': 0.8113631981805421,
 'upper': 0.8176436579808531}

In [332]:
analyzer.group_js(conditions={'subject':'health'}, n=10000)

{'mean': 0.8238726988337747,
 'lower': 0.820945972597457,
 'upper': 0.8267994250700923}

In [333]:
analyzer.group_js(conditions={'subject':'abortion'}, n=10000)

{'mean': 0.8297398497028997,
 'lower': 0.8269322897794448,
 'upper': 0.8325474096263547}

In [334]:
analyzer.group_js(conditions={'subject':'foreign'}, n=10000)

{'mean': 0.8738162841298852,
 'lower': 0.8713402076086982,
 'upper': 0.8762923606510722}

In [366]:
analyzer.inter_party_js(conditions={'subject':'tax'}, n=10000)

{'mean': 0.8208487595973093,
 'lower': 0.8176917141483929,
 'upper': 0.8240058050462259}

In [367]:
analyzer.group_js(conditions={'party': 'R', 'subject':'tax'}, n=10000)

{'mean': 0.8249732705518298,
 'lower': 0.8218381811337004,
 'upper': 0.8281083599699591}

In [368]:
analyzer.group_js(conditions={'party': 'D', 'subject':'tax'}, n=10000)

{'mean': 0.8169848236010119,
 'lower': 0.8138217310517034,
 'upper': 0.8201479161503203}