In [1]:
import os
import pandas as pd

In [2]:
os.chdir("/home/rocassius/w266_final/scripts/assembly")
from constant import DOC_SAMPLE_PATH, DOC_PROPER_PATH
from document import *

In [3]:
df = load_documents(sessions=list(range(104,105)), read_path=DOC_PROPER_PATH)

In [4]:
os.chdir("/home/rocassius/w266_final/scripts/modeling")
from rmn import *
from rmn_data_generator import *

In [5]:
rmn = RMN()
rmn.load_rmn(name="half", save_path = "/home/rocassius/gen-data/models")

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [6]:
df_samp = df.sample(10000)

In [11]:
topics_preds = rmn.predict_topics(df_samp)

In [377]:
dg = RMN_DataGenerator(rmn, df, batch_size=10000)

In [378]:
# make topic predictions
topics_preds = rmn.topic_model.predict_generator(dg, use_multiprocessing=True, workers=10,verbose=1)



In [328]:
1,(1,2)

(1, (1, 2))

In [329]:
REP = "R"
DEM = "D"
PARTY = "party"

import numpy as np
from scipy.special import rel_entr
import statsmodels.stats.api as sms


def jensenshannon(p, q, base=None):
    """
    Returns the JS divergence between two 1-dimensional probability vectors,
    code taken from scipy and modified to fix bug
    """
    p = np.asarray(p)
    q = np.asarray(q)
    p = p / np.sum(p, axis=0)
    q = q / np.sum(q, axis=0)
    m = (p + q) / 2.0
    left = rel_entr(p, m)
    right = rel_entr(q, m)
    js = max(0, np.sum(left, axis=0) + np.sum(right, axis=0))
    if base is not None:
        js /= np.log(base)
    return np.sqrt(js / 2.0)


def mean_CI(x):
    """
    Returns a numpy array of length 3 with indices as follows:
    - 0, the estimated mean
    - 1, the 95% lower CI bound
    - 2, the 95% upper CI bound
    """
    mean = np.mean(x)
    CI = sms.DescrStatsW(x).tconfint_mean()
    mean_ci = np.array((mean,) + CI)
    
    return mean_ci


# Herfindahl–Hirschman Index
def hh_index(p):
    
    p = np.asarray(p)
    p = p / np.sum(p)
    hhi = np.sum(p**2)
    return hhi


In [330]:


SUB_KEY = 'subject'
SPEAKER = 'speakerid'

In [365]:
import numpy as np


class RMN_Analyzer(object):
    """Class for Analyzing an RMN with respect to a dataset
    """
    
    def __init__(self, rmn, df):
        
        self.rmn = rmn
        self.df = df.reset_index(drop=True)
        self.topic_preds = None
        
    @property
    def index(self):
        return self.df.index
         
        
    def predict_topics(self):
        """Computes the topic predictions for all observations
        """
        self.topic_preds = self.rmn.predict_topics(self.df)
        

    def bool_subset(self, col, value):
        """
        Returns a boolean vector for each observation in the
        dataframe indicating whether it meets the conditions
        """
        assert col in self.df.columns
        return self.df[col] == value
    
    
    def sample_indices(self, indices, n):
        """Returns a SRR of the indices provided
        """
        return np.random.choice(indices, n, replace=True)
    
    
    def compute_JS(self, index_A, index_B, base=2):
        """
        Computes the mean pair-wise JS divergence and associated CI
        between indices in index_A and indices in index_B
        """
        p_A = self.topic_preds[index_A]
        p_B = self.topic_preds[index_B]
        js_list = [jensenshannon(p, q, base) for p, q in zip(p_A, p_B)]
        
        return mean_CI(js_list)
        
        
    def compute_HH(self, index):
        """
        Computes the mean HH index and associated CI between
        indices in index_A and indices in index_B
        """
        p = self.topic_preds[index]
        hh_list = [hh_index(q) for q in p]
        
        return mean_CI(hh_list)
          
    
    def intra_party_divergence(self, party, subject, n):
        """
        Computes the intraparty divergence for a given party 
        on a given subject
        
        Args:
        - subject: (str) subject to examine
        - party  : (str) party of interest
        - n      : (int) sample size
        
        Returns: a numpy array of length 3, where index...
        - 0 is the mean divergence point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # find party indicies on the subject
        party_index = self.index[self.bool_subset(PARTY, party) & 
                                 self.bool_subset(SUB_KEY, subject)]
        
        # Return none if there are fewer than 2 speakers
        if self.df.loc[party_index][SPEAKER].nunique() < 2:
            return None
        
        # Sample index pairs
        index_AB = []
        while len(index_AB) < n:
            a_b = self.sample_indices(party_index, n=2)
            # include pairs whose speakers are different
            if self.df.loc[a_b][SPEAKER].nunique() == 2:
                index_AB.append(a_b)
        
        index_AB = np.asarray(index_AB)
        assert index_AB.shape == (n, 2)
        
        # get indices for each group
        index_A, index_B = index_AB[:,0], index_AB[:,1]
        
        return self.compute_JS(index_A, index_B)
    
    
    def inter_party_divergence(self, subject, n):
        """
        Computes the inter party JS divergence between Republicans and Democrats
        on a given subject
        
        Args:
        - subject: (str) subject to examine
        - n      : (int) sample size
        
        Returns: a numpy array of length 3, where
        - 0 is the mean divergence point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # find R and D indicies on the subject
        is_subject = self.bool_subset(SUB_KEY, subject)
        index_R = self.index[self.bool_subset(PARTY, REP) & is_subject]
        index_D = self.index[self.bool_subset(PARTY, DEM) & is_subject]
        
        # return None if indices are insufficient
        if len(index_R)==0 or len(index_D)==0:
            return None
        
        # sample 
        samp_index_R = self.sample_indices(index_R, n)
        samp_index_D = self.sample_indices(index_D, n)
    
        return self.compute_JS(index_R, index_D)
    
    
    def intra_party_hh(self, party, subject, n=None):
        """
        Estimates the average Herfindahl–Hirschman Index of all 
        party members in a party on a given subject
        
        Args:
        - subject: (str) subject to examine
        - party  : (str) party of interest
        - n      : (int) sample size
        
        Returns: a numpy array of length 3, where index...
        - 0 is the mean index point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # find party indicies on the subject
        party_index = self.index[self.bool_subset(PARTY, party) & 
                                 self.bool_subset(SUB_KEY, subject)]
        
        if n is None:
            samp_index = party_index
        else:
            samp_index = self.sample_indices(party_index, n)
            
        return self.compute_HH(samp_index)
        

In [366]:
analyzer = RMN_Analyzer(rmn, df_samp)

In [367]:
# analyzer.predict_topics()
analyzer.topic_preds = topics_preds

In [368]:
analyzer.inter_party_divergence('money', 1000000)

array([0.07592554, 0.07309074, 0.07876035])

In [369]:
analyzer.intra_party_divergence('R', 'money',  1000)

array([0.07296461, 0.07179212, 0.0741371 ])

In [370]:
analyzer.intra_party_hh('R', 'money')

party,  R
subject,  money
Int64Index([ 119,  141,  181,  208,  245,  277,  357,  403,  444,  541,
            ...
            9396, 9457, 9514, 9549, 9583, 9627, 9732, 9740, 9782, 9839],
           dtype='int64', length=180)


array([0.01084666, 0.01080808, 0.01088524])

In [372]:
len(analyzer.intra_party_hh('R', 'money', n= 20))

party,  R
subject,  money
Int64Index([ 119,  141,  181,  208,  245,  277,  357,  403,  444,  541,
            ...
            9396, 9457, 9514, 9549, 9583, 9627, 9732, 9740, 9782, 9839],
           dtype='int64', length=180)


3

In [184]:
df_samp.empty

False

In [240]:
mean_CI([2,3,3,6,6,3,100])

(17.571428571428573, (-16.07591696393205, 51.2187741067892))