In [1]:
import os
import sys
import pandas as pd

In [2]:
# Make the project's assembly and modeling scripts importable from this notebook.
# NOTE(review): these are absolute paths on one machine; adjust when running elsewhere.
for scripts_dir in ("/home/rocassius/w266_final/scripts/assembly",
                    "/home/rocassius/w266_final/scripts/modeling"):
    sys.path.append(scripts_dir)

In [3]:
from document import load_documents
from constant import DOC_PRAYER_PATH
from subject import subject_keywords

In [4]:
from helper import load_pickled_object
from rmn import *
from rmn_data_generator import RMN_DataGenerator
from rmn_analyzer import RMN_Analyzer

In [5]:
# Directory holding the pickled tokenizer / metadata / embedding objects.
prayer_tools_path = "/home/rocassius/gen-data/tools/prayer_tools"

# Unpickle every tool in one pass; the names on the left line up one-to-one
# with the file names on the right, and files are read in that order.
(tokenizer_dict,
 metadata_dict,
 embedding_matrix,
 global_embedding_matrix,
 global_tokenizer_dict) = (
    load_pickled_object(os.path.join(prayer_tools_path, tool_name))
    for tool_name in ("tokenizer_dict",
                      "metadata_dict",
                      "idf_embedding_matrix",
                      "embedding_matrix_wg",
                      "tokenizer_dict_wg"))

In [6]:
# Parent tools directory (the prayer tools live in a sub-directory of it).
tools_path = "/home/rocassius/gen-data/tools/"
# Metadata dictionary unpickled from "global_metadata_dict"; it is the one
# assigned to rmn.metadata_dict when the model is configured.
metadata_dict_global = load_pickled_object(os.path.join(tools_path, "global_metadata_dict"))

In [7]:
# Load the documents found under DOC_PRAYER_PATH for the requested id list.
# NOTE(review): 111 presumably selects a congressional session (the rendered
# frame has session == 111 in every visible row) - confirm against load_documents.
docs_df = load_documents([111], DOC_PRAYER_PATH)

In [8]:
# Display the loaded frame (the rendering shows only the first and last rows).
docs_df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
0,111116451,KOHL,HERBERT,S,WI,M,D,responsibility to protect those who cannot pro...,abortion,111
1,111117170,TOWNS,EDOLPHUS,H,NY,M,D,obama said i thought that was pretty tough and...,abortion,111
2,111115330,BEAN,MELISSA,H,IL,F,D,change the laws of this country of seeking to ...,abortion,111
3,111121930,FOSTER,BILL,H,IL,M,D,good character restraint respect for law and r...,abortion,111
4,111117650,HOLT,RUSH,H,NJ,M,D,this bill contains outofcontrol spending it co...,abortion,111
...,...,...,...,...,...,...,...,...,...,...
92749,111120531,MIKULSKI,BARBARA,S,MD,F,D,of a more accurate food facility registry impr...,trade,111
92750,111120170,LATTA,ROBERT,H,OH,M,R,their lives to save others that day these meda...,trade,111
92751,111116441,HUTCHISON,KAY,S,TX,F,R,investment this year the development assistanc...,trade,111
92752,111120860,POLIS,JARED,H,CO,M,D,economytoo much in the last decade of folks pa...,trade,111


In [225]:
#==================#
#=*= RMN Module =*=#
#==================#

# RMN Class for training Relationship Modeling Networks 

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import numpy as np

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Embedding, Dense, Lambda, Input, Masking, Reshape, Concatenate
from tensorflow.keras.models import load_model, model_from_json
from tensorflow.keras.regularizers import Regularizer
from tensorflow.keras.optimizers import Adam

from rmn_data_generator import RMN_DataGenerator
from helper import pickle_object, load_pickled_object
from vector_math import find_nn_cos

# constants
# NOTE(review): MAX_SPAN_LENGTH and GAMMA are not referenced by the code
# visible in this notebook; the span length is read from tokenizer_dict instead.
MAX_SPAN_LENGTH = 50
# default number of topics assigned in RMN.__init__
NUM_TOPICS = 20
LAMBDA = 1.0
GAMMA = 1.0

# hyperparameters
# OPTIMIZER is passed to model.compile() in RMN.build_model.
# NOTE(review): BATCH_SIZE and EPOCHS are not referenced by the visible code.
OPTIMIZER = 'adam'
BATCH_SIZE = 50
EPOCHS = 5

# saving tags
# RMN_TAG % name is the directory of a saved model; MODEL and ATTR are the
# weights file and the pickled attribute file inside that directory.
RMN_TAG = "rmn_%s"
MODEL = "model.h5"
ATTR = "attributes"

# attribute keys
# Keys of the attribute dictionary written by save_rmn and read by load_rmn.
# 'emedding_matrix' is misspelled, but it is the key stored in pickled
# attribute files, so renaming it would break loading of models saved earlier.
N_TOP_KEY = 'num_topics'
LAMB_KEY  = 'lambda'
EMBED_KEY = 'emedding_matrix'
TOKEN_KEY = 'tokenizer_dict'
META_KEY  = 'metadata_dict'
DIM_KEY = 'meta_embedding_dim'


class RMN(object):
    """
    Class for constructing a Relationship Modeling Network
    """
    
    def __init__(self):
        
        # model attrbiutes
        self.num_topics = NUM_TOPICS
        self.embedding_matrix = None
        self.meta_embedding_dim = None
        self.tokenizer_dict = None
        self.metadata_dict = None
        
        # inference attributes
        self.infer_embedding_matrix = None
        self.infer_tokenizer_dict = None
        
        # models 
        self.model = None
        self.topic_model = None
        
    
    @property
    def embedding_dim(self):
        return self.embedding_matrix.shape[1]
    
    @property
    def topic_matrix(self):
        """Return the topic matrix associated with the rmn"""
        # dim = [num_topics, embedding_dim]
        return self.model.get_layer('Wd').get_weights()[0].T
    
    @property
    def tuned_embedding_matrix(self):
        """Return the current embedding matrix of the rmn"""
        return rmn.model.get_layer('Span.Embedding').get_weights()[0]
    
    def model_loss(self):
        """Hinge loss function.
        """
        def custom_loss(y_true, y_pred):
            
            # hinge_loss
            y_true_normalized = K.l2_normalize(y_true, axis=-1)
            y_pred_normalized = K.l2_normalize(y_pred, axis=-1)
            dot_product = K.sum(y_true_normalized * y_pred_normalized, axis=-1)
            hinge_loss = K.mean(K.maximum(0., 1. - dot_product))

            return hinge_loss 

        return custom_loss
    
    
    def build_model(self, embedding_trainable=False, bias_reconstruct=True,
                    gamma = 1., theta = 1., omega = 1., word_dropout = 0.5):
        """Connstruct the RMN model architecture
        """
        # Span Input
        span_input = Input(shape=(self.tokenizer_dict['max_span_length'],), 
                           name='Span.Input')
        span_embedding = Embedding(input_dim=len(self.tokenizer_dict['word_index']) + 1, 
                                   output_dim=self.embedding_dim, 
                                   weights=[self.embedding_matrix],
                                   input_length=self.tokenizer_dict['max_span_length'],
                                   trainable=embedding_trainable, 
                                   name = 'Span.Embedding')(span_input)
        
        # Mask for randomly dropping words
        dropout_mask = K.stack(
            [K.random_binomial((span_embedding.shape[1],), p=word_dropout)]*span_embedding.shape[2], axis=1)
        # Average over the remaining words
        span_avg = Lambda(lambda x: K.mean(x * K.expand_dims(dropout_mask, axis=0), axis=1), 
                          name = "Span.Avg.Layer")(span_embedding)

        input_layers = [span_input]
        embedding_layers = [span_avg]
        
        for col in self.metadata_dict.keys():
            input_layer = Input(shape=(1,), name= col + '.Input')
            
            # embedding layer for col
            embedding_init = Embedding(
                input_dim = self.metadata_dict[col]['input_dim'] + 1, 
                output_dim = self.meta_embedding_dim,
                input_length = 1)(input_layer)
            
            # reshape
            embedding_layer = Reshape((self.meta_embedding_dim, ), name=col + '.Embed.Layer')(embedding_init)
            
            input_layers.append(input_layer)
            embedding_layers.append(embedding_layer)

        # concatenate span vector with metadata embeddings
        _ht = Concatenate(axis=1, name = 'Concat.Layer')(embedding_layers)

        # dense layer
        ht = Dense(units = self.embedding_dim, 
                   input_shape = (_ht.shape[1], ), 
                   activation = "relu", name = "Wh")(_ht)

        # dense layer whose output is a probability distribution
        dt = Dense(units = self.num_topics, 
                   input_shape = (self.embedding_dim, ), 
                   activation = "softmax",
                   activity_regularizer = Purity(gamma, theta, omega),
                   name = "Wd")(ht)

        # reconstruction layer
        rt = Dense(units = self.embedding_dim,
                   input_shape = (self.num_topics, ),
                   activation = "linear",
                   use_bias = bias_reconstruct,
                   kernel_regularizer = Orthogonality(self.lamb),
                   name = "R")(dt)

        # compile
        model = tf.keras.Model(inputs=input_layers, outputs=rt)
        #model.compile(optimizer = OPTIMIZER, loss='mean_squared_error')
        model.compile(optimizer = OPTIMIZER, loss = self.model_loss())
        self.model = model
        
        # build associated topic model
        self.build_topic_model()
        
    
    def set_topic_vectors(self, words):
        """Set the topic vectors with vectors corresponding to the given words
        """
        # get the word ids
        word_ids = self.tokenizer_dict['tokenize_pad'](words)[:,0]
        
        # replicate associated weights up to num_topics
        weights = np.tile(self.embedding_matrix[word_ids], 
                          (-(self.num_topics // -len(words)),1))[:self.num_topics]
        
        # set weights layer weights
        r = self.model.get_layer("R")
        if len(r.get_weights()) == 1:
            r.set_weights([weights])
        else:
            r.set_weights([weights, r.get_weights()[1]])
        
        
    def build_topic_model(self, topic_layer = "Wd"):
        """Contruct model whose output is the topic distribution layer
        """
        topic_model = tf.keras.Model(
            inputs = self.model.input,
            outputs = self.model.get_layer(topic_layer).output)
        
        self.topic_model = topic_model
          
    
    def prep_spans(self, documents):
        """Returns the lists of word ids associated with the text
        """
        return self.tokenizer_dict['tokenize_pad'](documents)
    
    
    def prep_metadata(self, df):
        """Preps metadata for training or prediction
        """
        metadata_x = [np.array(self.metadata_dict[col]['tokenize'](df[col]))
                      for col in self.metadata_dict.keys()]

        return metadata_x
        
    
    def prep_X(self, df, for_training=False):
        """Preps metadata and spans for training or prediction
        """
        spans_y = self.prep_spans(df['document'])
        metadata_x = self.prep_metadata(df)
        X = [spans_y] + metadata_x
        
        if for_training:
            y = self.embedding_matrix[spans_y].mean(axis=1)
            return X, y
        else:
            return X

    def predict_y(self, df, use_generator=True):
        """Predicts the rmn outputs for a df
        """
        # ensure the topic model has been built
        if self.topic_model is None:
            self.build_topic_model()
        
        if use_generator:
            return self.predict_with_generator(df, self.model)
        else:
            return self.predict_(df, self.model)
    
    
    def predict_topics(self, df, use_generator=True):
        """Predicts the topic distributions for a df
        """        
        # ensure the topic model has been built
        if self.topic_model is None:
            self.build_topic_model()
        
        if use_generator:
            return self.predict_with_generator(df, self.topic_model)
        else:
            return self.predict_(df, self.topic_model)

        
    def predict_(self, df, model):
        """Makes a predictions for a df with a model
        """
        return model.predict(x=self.prep_X(df))
        
    
    def predict_with_generator(self, df, model):
        """Predict topic distributions with a generator
        """
        # Make sure data is not empty
        assert not df.empty

        # Calculate good batch size, 
        batch_size = max(1, min(10000, df.shape[0] // 10))
        n_batches = df.shape[0] // batch_size

        if n_batches < 2: 
            return self.predict_(df, model)
        else:
            # calculate remainder batch size
            r = df.shape[0] % batch_size
            if r == 0:
                g_index = df.index[:-batch_size]
                r_index = df.index[-batch_size:]
            else:
                g_index = df.index[:-r]
                r_index = df.index[-r:]

            # Make generator
            g = RMN_DataGenerator(self, df.loc[g_index], batch_size=batch_size, shuffle=False)

            # Predict on remainder batch
            r_pred = self.predict_(df.loc[r_index], model)
            # predict on generated batches
            g_pred = model.predict_generator(g, use_multiprocessing=True, workers=10, verbose=1)

            assert r_pred.shape[1] == g_pred.shape[1]
            pred = np.vstack([g_pred, r_pred])

            return pred
        
    
    def save_rmn(self, name, save_path):
        """
        Save the model's weights, architecture and attributes
        """
        # assemble attribute dictionary
        attribute_dict = {
            N_TOP_KEY:  self.num_topics,
            LAMB_KEY:   self.lamb,
            EMBED_KEY:  self.embedding_matrix,
            TOKEN_KEY:  self.tokenizer_dict,
            META_KEY:   self.metadata_dict, 
            DIM_KEY:    self.meta_embedding_dim}
        
        # make directory for model
        model_path = os.path.join(save_path, RMN_TAG % name)
        os.mkdir(model_path)
        
        # save model weights
        self.model.save_weights(os.path.join(model_path, MODEL))
        
        # save model attributes
        pickle_object(attribute_dict, os.path.join(model_path, ATTR))
        
        
    def load_rmn(self, name, save_path):
        """
        Load the model, weights, architecture and attributes from a saved model
        """
        # make directory for model
        model_path = os.path.join(save_path, RMN_TAG % name)
        
        # load attributes
        attributes_dict = load_pickled_object(os.path.join(model_path, ATTR))
        
        # update attributes
        self.num_topics         = attributes_dict[N_TOP_KEY]
        self.lamb               = attributes_dict[LAMB_KEY]
        self.embedding_matrix   = attributes_dict[EMBED_KEY]
        self.tokenizer_dict     = attributes_dict[TOKEN_KEY]
        self.metadata_dict      = attributes_dict[META_KEY]
        self.meta_embedding_dim = attributes_dict[DIM_KEY] 
        
        # construct identical model architecture
        self.build_model()
        
        # Load weights
        self.model.load_weights(os.path.join(model_path, MODEL))
        
        # build associated topic model
        self.build_topic_model()
        
    
    def inspect_topics(self, which_topics='all', k_neighbors=10):
        """
        Ouput the nearest neighbors of every topic vector in
        the model's topic layer
        """
        if which_topics == 'all':
            which_topics = range(self.num_topics) 
        
        if (self.infer_embedding_matrix is None or 
            self.infer_tokenizer_dict is None):
            self.infer_embedding_matrix = self.embedding_matrix
            self.infer_tokenizer_dict = self.tokenizer_dict
        
        E = self.infer_embedding_matrix # dim = [vocab_size, embedding_dim]
        Wd = self.topic_matrix          # dim = [num_topics, embedding_dim]
        
        for i in which_topics:
            # find nearest neighbors to topic
            neighbors, sim = find_nn_cos(Wd[i], E, k_neighbors)
            words = [self.infer_tokenizer_dict['tokenizer'].index_word[v] for v in neighbors]
            print(20*"=" +"\n")
            print("Topic", i)
            print(words)
    
    
# Orthogonality Regularizer #

class Orthogonality(Regularizer):
    """
    Regularizer for penalizing non-orthogonal components of a weight matrix.

    The penalty is lamb * ||R R^T - I||_F, where the rows of R are the
    components being compared.

    Args:
    - lamb: (Float) regularization penalty weight
    """

    def __init__(self, lamb = 1.):
        self.lamb = lamb

    def __call__(self, R):
        """Returns a component dependence penalty for matrix R
        """
        # Gram matrix of the rows of R
        RRT = K.dot(R, K.transpose(R))
        # identity with as many rows as R has
        I = K.eye(RRT.shape.as_list()[0])
        # Frobenius norm of the deviation from the identity
        penalty = self.lamb * K.sqrt(K.sum(K.square(RRT - I)))

        return penalty

    def get_config(self):
        """Returns the constructor arguments so that a model using this
        regularizer can be serialized (the base class raises otherwise)
        """
        return {'lamb': float(self.lamb)}
    
    
# Topic Purity Regularizer #

class Purity(Regularizer):
    """Regularizer on a batch of probability distributions.

    The penalty is a weighted sum of three terms computed from p:
    - gamma * mean Shannon entropy (in bits) of the rows of p
    - theta * mean of the largest probability of each row
    - omega * (mean of all entries of p p^T minus the mean of its diagonal)

    Args:
    - gamma: (Float) weight of the entropy (impurity) term
    - theta: (Float) weight of the concentration term
    - omega: (Float) weight of the batch similarity term
    """
    def __init__(self, gamma = 1., theta = 1., omega = 1.):
        self.gamma = gamma
        self.theta = theta
        self.omega = omega

    def __call__(self, p):
        """Returns the weighted sum of the mean shannon entropy, the mean
        concentration and the batch similarity of the distribution(s) p
        """
        # calculate impurity and concentration;
        # clip before taking the log: a probability that underflowed to
        # exactly 0 would otherwise give 0 * -log(0) = nan
        log2_p = K.log(K.clip(p, K.epsilon(), 1.)) / K.log(K.constant(2))
        impurity = K.sum(p * -log2_p, axis=-1)
        concentration = K.max(p, axis=-1)
        # calculate batch similarity: pairwise dot products between rows
        ppt = K.dot(p, K.transpose(p))
        similarity = K.mean(ppt) - K.mean(tf.linalg.diag_part(ppt))

        penalty = (self.gamma * K.mean(impurity) +
                   self.theta * K.mean(concentration) +
                   self.omega * similarity)

        return penalty

    def get_config(self):
        """Returns the constructor arguments so that a model using this
        regularizer can be serialized (the base class raises otherwise)
        """
        return {'gamma': float(self.gamma),
                'theta': float(self.theta),
                'omega': float(self.omega)}
    

In [216]:
# Scratch check of the word-dropout averaging on a toy tensor of shape
# (2, 3, 5): 2 spans, 3 word positions, 5 embedding columns.
x = K.constant([[[1,4,1,35,6], [89,5,7,23,9],[-5,8,78,-55,666]], 
                [[70,13,-1,-89,-6], [-8,-5,7,8,8], [-70,-65,17,9,3]]])
# (a bare expression in the middle of a cell is not displayed)
x

# One 0/1 draw per word position, repeated over the 5 columns and stacked
# twice to match the batch of 2, then averaged over the word axis.
K.get_value(K.mean(
    x*tf.stack([tf.stack([K.random_binomial((x.shape[1],), p=0.5)]*x.shape[2],axis=1)]*2),
    axis=1))

array([[  0.33333334,   1.3333334 ,   0.33333334,  11.666667  ,
          2.        ],
       [ 23.333334  ,   4.3333335 ,  -0.33333334, -29.666666  ,
         -2.        ]], dtype=float32)

In [217]:
# Sizes of the toy tensor x: d embedding columns, m word positions.
# NOTE(review): `d` is rebound to a DataFrame further down the notebook.
d = 5
m = 3

In [218]:
# Same mask built from the named sizes: one 0/1 draw per word position (m),
# repeated across the d embedding columns -> shape (m, d).
dropout_matrix = K.stack([K.random_binomial((m,), p=0.5)] * d , axis=1)
# Broadcast the mask over the batch axis and show which word rows were zeroed.
K.get_value(x*K.expand_dims(dropout_matrix, axis=0))


array([[[  0.,   0.,   0.,   0.,   0.],
        [ 89.,   5.,   7.,  23.,   9.],
        [ -5.,   8.,  78., -55., 666.]],

       [[  0.,   0.,  -0.,  -0.,  -0.],
        [ -8.,  -5.,   7.,   8.,   8.],
        [-70., -65.,  17.,   9.,   3.]]], dtype=float32)

In [227]:
# Configure a fresh RMN with the training embeddings and tokenizer, plus a
# separate embedding matrix / tokenizer that inspect_topics() uses.
rmn = RMN()
rmn.embedding_matrix = embedding_matrix
rmn.tokenizer_dict = tokenizer_dict
rmn.infer_embedding_matrix = global_embedding_matrix
rmn.infer_tokenizer_dict = global_tokenizer_dict
rmn.metadata_dict = metadata_dict_global
# rmn.metadata_dict = {'party': metadata_dict['party']}
rmn.meta_embedding_dim = 25
rmn.num_topics = 100
# weight of the orthogonality penalty, read by build_model()
rmn.lamb = 1.0
# theta=0 and omega=0 switch off the concentration and batch-similarity
# terms of the Purity regularizer, leaving only the entropy term.
rmn.build_model(gamma=.1, theta=0., omega=0.)

In [228]:
# Print the layer-by-layer summary of the compiled model.
rmn.model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Span.Input (InputLayer)         [(None, 40)]         0                                            
__________________________________________________________________________________________________
speakerid.Input (InputLayer)    [(None, 1)]          0                                            
__________________________________________________________________________________________________
chamber.Input (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
state.Input (InputLayer)        [(None, 1)]          0                                            
____________________________________________________________________________________________

In [229]:
#rmn.set_topic_vectors(subject_keywords.keys())
# Seed the kernel of layer "R" with the embeddings of these two words; with
# fewer words than topics, the vectors are tiled until num_topics rows are filled.
rmn.set_topic_vectors(['abortion', 'children'])

In [230]:
# Keep only the documents whose subject is 'abortion'.
# NOTE(review): this rebinds `d`, which earlier held the integer 5.
d = docs_df[docs_df['subject']=='abortion']

In [231]:
# X: span word ids followed by one id array per metadata column;
# y: mean embedding of each span's word ids (the reconstruction target).
X, y = rmn.prep_X(d, for_training=True)
#X, y = rmn.prep_X(docs_df.sample(frac=1), for_training=True)

In [233]:
# Train the reconstruction model on the 'abortion' subset.
rmn.model.fit(X, y, epochs=60, batch_size=20)

Train on 407 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<tensorflow.python.keras.callbacks.History at 0x7f4a5c7264d0>

In [234]:
# Wrap the trained model and the 'abortion' subset in the project's analyzer.
analyzer = RMN_Analyzer(rmn, d)

In [235]:
# Mean of the analyzer's Shannon entropy values.
# NOTE(review): presumably one entropy per document's topic distribution - confirm in RMN_Analyzer.
analyzer.shannon_entropy().mean()



0.91456074

In [236]:
# Topic predictions held by the analyzer, rounded to 3 decimals for display.
analyzer.topic_preds.round(3)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [237]:
# NOTE(review): presumably the highest-weight topic indices per document, strongest first - confirm in RMN_Analyzer.
analyzer.primary_topics()

array([[77, 48, 33, 84, 34],
       [77, 48, 33, 34, 84],
       [77, 48, 33, 34, 84],
       ...,
       [77, 48, 33, 34, 84],
       [77, 48, 33, 84, 34],
       [77, 48, 33, 84, 34]])

In [238]:
# NOTE(review): presumably counts how many documents have each topic as their top topic - confirm in RMN_Analyzer.
analyzer.first_topic_counts()

77    402
48      5
dtype: int64

In [239]:
# Per-topic usage reported by the analyzer, rounded to 3 decimals.
# NOTE(review): presumably the mean topic weight over documents - confirm in RMN_Analyzer.
analyzer.topic_use().round(3)

77    0.66
48    0.34
33    0.00
34    0.00
84    0.00
      ... 
64    0.00
68    0.00
56    0.00
19    0.00
46    0.00
Length: 100, dtype: float32

In [240]:
# Topic prediction row 0, rounded to 3 decimals.
np.round(analyzer.topic_preds[0],3)

array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.352, 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.648, 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   ], dtype=float32)

In [241]:
# Topic prediction row 1, rounded to 3 decimals.
np.round(analyzer.topic_preds[1],3)

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.29, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.71, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  ], dtype=float32)

In [242]:
# Topic prediction row 70, rounded to 3 decimals.
np.round(analyzer.topic_preds[70],3)

array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.327, 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.673, 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   ], dtype=float32)

In [243]:
# Print the nearest-neighbour words of the topics listed in the index of
# first_topic_counts(), which is passed as `which_topics`.
analyzer.rmn.inspect_topics(analyzer.first_topic_counts().index)

  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))
  Ds = np.dot(Wv, v) / (np.linalg.norm(v) * np.linalg.norm(Wv, axis = 1))



Topic 77
['idp', 'entangled', 'serviceman', 'ancestral', 'lineage', 'securely', 'quaternions', 'tethered', 'identity', 'albatross']

Topic 48
['activex', 'maneuver', 'outpost', 'invisibility', 'lapse', 'maneuvering', 'clone', 'paintbrush', 'samurai', 'pcu']


In [1030]:
# Topic usage restricted to subject == 'abortion', rounded to 3 decimals.
# NOTE(review): the saved output lists 10 topics and the execution count is
# out of sequence, while the model above is built with 100 topics - this
# result looks like it comes from an earlier run; re-run to confirm.
np.round(analyzer.topic_use(conditions={'subject': 'abortion'}),3)

7    0.288
1    0.270
9    0.209
6    0.135
4    0.098
5    0.000
2    0.000
8    0.000
0    0.000
3    0.000
dtype: float32

In [1099]:
# Between-party statistic over all documents, with n=10000.
# NOTE(review): presumably a Jensen-Shannon divergence estimated from n sampled pairs, with an interval - confirm in RMN_Analyzer.
analyzer.inter_party_js(conditions={}, n=10000)

{'mean': 0.10666140194929351,
 'lower': 0.10510082393074104,
 'upper': 0.10822197996784592}

In [1100]:
# Within-group statistic for party 'R' (same n as the between-party call).
analyzer.group_js(conditions={'party':'R'}, n=10000)

{'mean': 0.1085427735816004,
 'lower': 0.10697649360462963,
 'upper': 0.11010905355857117}

In [1101]:
# Within-group statistic for party 'D' (same n as the between-party call).
analyzer.group_js(conditions={'party':'D'}, n=10000)

{'mean': 0.1022140048119605,
 'lower': 0.10070788089142411,
 'upper': 0.10372012873249688}

In [287]:
# Subject used to filter the three comparisons that follow.
s = 'labor'

In [288]:
# Within-group statistic for party 'R', restricted to subject s.
analyzer.group_js(conditions={'subject': s, 'party':'R'}, n=10000)

{'mean': 0.1811149685760053,
 'lower': 0.1801685756634546,
 'upper': 0.18206136148855603}

In [289]:
# Within-group statistic for party 'D', restricted to subject s.
analyzer.group_js(conditions={'subject': s, 'party':'D'}, n=10000)

{'mean': 0.18142981918058432,
 'lower': 0.18048422123088034,
 'upper': 0.1823754171302883}

In [290]:
# Between-party statistic restricted to subject s.
analyzer.inter_party_js(conditions={'subject':s}, n=10000)

{'mean': 0.1801105393970354,
 'lower': 0.17916237570626425,
 'upper': 0.18105870308780653}