In [1]:
import os
import pandas as pd

In [2]:
# Change the working directory before importing the project-local modules
# `constant` and `document` by bare name (presumably they live in this
# directory -- TODO confirm).
# NOTE(review): hard-coded absolute path; not portable to another machine.
os.chdir("/home/rocassius/w266_final/scripts/assembly")
from constant import DOC_SAMPLE_PATH, DOC_PROPER_PATH
# Wildcard import: `load_documents`, used in the next cell, is not defined in
# this notebook and presumably comes from here.
from document import *

In [3]:
# range(104, 105) contains the single value 104, so only session 104 is
# requested, read from DOC_PROPER_PATH.
df = load_documents(sessions=list(range(104,105)), read_path=DOC_PROPER_PATH)

In [4]:
# Change the working directory again before importing the project-local
# modeling modules by bare name.
# NOTE(review): hard-coded absolute path; not portable to another machine.
os.chdir("/home/rocassius/w266_final/scripts/modeling")
# Wildcard imports: `RMN` and `RMN_DataGenerator`, used below, are not defined
# in this notebook and presumably come from these two modules.
from rmn import *
from rmn_data_generator import *

In [5]:
# Create an RMN object and call load_rmn for the model named "half" under
# save_path (presumably restores previously saved weights -- TODO confirm).
# NOTE(review): hard-coded absolute path.
rmn = RMN()
rmn.load_rmn(name="half", save_path = "/home/rocassius/gen-data/models")

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [6]:
# Working sample of 10,000 rows. No random_state is passed, so (assuming df is
# a pandas DataFrame) a different sample is drawn on every run.
df_samp = df.sample(10000)

In [11]:
# Topic predictions for the sampled rows; assigned to analyzer.topic_preds
# further down.
# NOTE(review): this cell's execution count (11) is out of order relative to
# its neighbours (6 and 8).
topics_preds = rmn.predict_topics(df_samp)

In [8]:
# Data generator over the sample with batch_size=5000. In the visible part of
# this notebook it is referenced only by the commented-out predict_generator
# call in the next cell.
dg = RMN_DataGenerator(rmn, df_samp, batch_size=5000)

In [12]:
# Disabled alternative: make topic predictions in batches through the data generator `dg`.
# topics_preds = rmn.topic_model.predict_generator(dg, use_multiprocessing=True, workers=10,verbose=1)

In [24]:
# Values of the party column that select Republican / Democratic rows
# (compared against d[PARTY] in compute_diversity_scores).
REP = "R"
DEM = "D"
# Name of the dataframe column holding the party value.
PARTY = "party"

import numpy as np
from scipy.special import rel_entr

def jensenshannon(p, q, base=None):
    """
    Jensen-Shannon distance between two 1-dimensional probability vectors.

    Adapted from scipy's implementation. The summed relative entropies are
    clamped at zero before the square root is taken (the modification the
    original author made to the scipy code).

    Args:
    p   : (array-like) first vector; rescaled to sum to 1
    q   : (array-like) second vector; rescaled to sum to 1
    base: (number) logarithm base; natural log is used when None

    Returns:
    The square root of the JS divergence between p and q.
    """
    p, q = np.asarray(p), np.asarray(q)

    # normalise both inputs so unnormalised weights are accepted
    p = p / np.sum(p, axis=0)
    q = q / np.sum(q, axis=0)

    # pointwise mixture distribution
    mixture = (p + q) / 2.0

    # relative entropy of each input against the mixture
    kl_p = np.sum(rel_entr(p, mixture), axis=0)
    kl_q = np.sum(rel_entr(q, mixture), axis=0)

    # never hand a negative value to the square root
    js = max(0, kl_p + kl_q)

    if base is not None:
        # convert from nats to the requested base
        js /= np.log(base)

    return np.sqrt(js / 2.0)


def mean_js_div(p1, p2, base=2):
    """
    Average of jensenshannon(p, q, base) over the pairs zip(p1, p2).

    Args:
    p1  : sequence of probability vectors
    p2  : sequence of probability vectors, paired with p1 by position
    base: (number) logarithm base forwarded to jensenshannon

    Returns:
    The mean of the pairwise values. Pairing stops at the shorter of the
    two sequences because zip is used.
    """
    pairwise = []
    for p, q in zip(p1, p2):
        pairwise.append(jensenshannon(p, q, base))

    return np.mean(pairwise)


def expected_div(df1, df2=None):
    """
    Mean JS value between the topic predictions of two groups of rows.

    When only df1 is given it is split into its even-position rows and its
    odd-position rows, and those two halves are compared with each other.
    Predictions come from the module-level `rmn` model.

    Args:
    df1: first group of rows
    df2: second group of rows, or None to split df1 against itself
    """
    if df2 is not None:
        df_A, df_B = df1, df2
    else:
        # alternate rows: positions 0, 2, 4, ... versus 1, 3, 5, ...
        df_A, df_B = df1[::2], df1[1::2]

    preds_A = rmn.predict_topics(df_A)
    preds_B = rmn.predict_topics(df_B)

    return mean_js_div(preds_A, preds_B)
    
    
def compute_diversity_scores(df):
    """
    Compute inter-party and within-party divergence scores for df.

    The whole of df is used. An earlier version drew a blocked sample here
    and then immediately overwrote it with df; that discarded call has been
    removed because it only cost a groupby and consumed random state.

    Args:
    df: dataframe with a PARTY column whose values include REP and DEM

    Returns:
    dict with keys
      'inter_div': expected_div between Republican and Democratic rows
      'rep_div'  : expected_div of the Republican rows against themselves
      'dem_div'  : expected_div of the Democratic rows against themselves
    """
    # identify party members
    rep_df = df[df[PARTY] == REP]
    dem_df = df[df[PARTY] == DEM]

    div = {'inter_div': expected_div(rep_df, dem_df),
           'rep_div':   expected_div(rep_df),
           'dem_div':   expected_div(dem_df)}

    return div


def blocked_sample(df, col, size):
    """
    Draw `size` random rows from every group of `col`, then shuffle.

    Args:
    df  : dataframe to sample from
    col : column whose distinct values define the blocks
    size: (int) number of rows to draw from each block

    Returns:
    The sampled rows of all blocks combined, in random order.
    """
    def draw(block):
        return block.sample(size)

    per_block = df.groupby(col, as_index=False).apply(draw)

    # frac=1 returns every row, in shuffled order
    return per_block.sample(frac=1)


# Herfindahl–Hirschman Index
def hh_index(p):
    """
    Herfindahl-Hirschman Index of a weight vector.

    Args:
    p: (array-like) non-normalised or normalised weights

    Returns:
    Sum of squared shares, where shares are p rescaled to sum to 1.
    """
    shares = np.asarray(p)
    shares = shares / np.sum(shares)

    return np.sum(np.square(shares))


def intra_div(p_list):
    """
    Diversity score derived from the Herfindahl-Hirschman indices of a
    collection of distributions.

    Args:
    p_list: iterable of weight vectors, each accepted by hh_index

    Returns:
    1 - log(mean HHI), with the natural logarithm. The result is nan when
    p_list is empty, because the mean of no values is nan.
    """
    hh_indices = [hh_index(p) for p in p_list]
    # the original referenced a misspelled name here and raised NameError
    div = 1 - np.log(np.mean(hh_indices))

    return div

In [96]:
# NOTE(review): `sub` is not referenced anywhere in the visible notebook.
sub = 'minorities'

# Dataframe column names read by RMN_Analyzer: the subject of a document and
# the id of its speaker.
SUB_KEY = 'subject'
SPEAKER = 'speakerid'

In [26]:
# Inter-party and within-party divergence scores on the sample.
compute_diversity_scores(df_samp)

{'inter_div': 0.07909239252182533,
 'rep_div': 0.0785186946240704,
 'dem_div': 0.07986857147032105}

In [202]:
class RMN_Analyzer(object):
    """
    Divergence analysis on top of an RMN model.

    Attributes:
    rmn        : model object exposing predict_topics(df)
    df         : copy of the input dataframe with a fresh 0..N-1 index
    topic_preds: topic predictions for df, or None until they are computed
                 (or assigned from outside)
    """

    def __init__(self, rmn, df):
        self.rmn = rmn
        # With a fresh 0..N-1 index the row labels can also be used to index
        # topic_preds (presumably a positional array -- TODO confirm).
        self.df = df.reset_index(drop=True)
        self.topic_preds = None

    @property
    def index(self):
        """Row index of the analyzer's dataframe."""
        return self.df.index

    def predict_topics(self):
        """Run the model on every row and store the result in topic_preds."""
        self.topic_preds = self.rmn.predict_topics(self.df)

    def _ensure_topic_preds(self):
        """Compute the topic predictions if they are not available yet."""
        if self.topic_preds is None:
            self.predict_topics()

    def bool_subset(self, col, value):
        """
        Boolean mask over the rows: True where column `col` equals `value`.
        """
        mask = self.df[col] == value
        return mask

    def sample_indices(self, indices, n):
        """Draw n entries from `indices` at random, with replacement."""
        return np.random.choice(indices, n, replace=True)

    def inter_group_divergence(self, index_A, index_B):
        """
        Mean pairwise JS value between the topic predictions selected by
        index_A and those selected by index_B (paired by position).
        """
        preds_A = self.topic_preds[index_A]
        preds_B = self.topic_preds[index_B]
        return mean_js_div(preds_A, preds_B)

    def intra_party_divergence(self, party, subject, n):
        """
        Divergence within one party on one subject.

        Pairs of rows are drawn at random from the party's rows on the
        subject; only pairs whose two rows have different speakers are kept.

        Args:
        party  : (str) party of interest
        subject: (str) subject to examine
        n      : (int) number of row pairs to sample

        Returns:
        The mean pairwise JS value over the n pairs, or None when the
        selected rows contain fewer than 2 distinct speakers.
        """
        self._ensure_topic_preds()

        # rows that belong to the party and are about the subject
        in_party_on_subject = (self.bool_subset(PARTY, party) &
                               self.bool_subset(SUB_KEY, subject))
        candidates = self.index[in_party_on_subject]

        # two different speakers are needed to form even a single pair
        if self.df.loc[candidates][SPEAKER].nunique() < 2:
            return None

        # rejection sampling: redraw until n cross-speaker pairs are collected
        pairs = []
        while len(pairs) < n:
            pair = self.sample_indices(candidates, n=2)
            if self.df.loc[pair][SPEAKER].nunique() == 2:
                pairs.append(pair)

        pairs = np.asarray(pairs)
        assert pairs.shape == (n, 2)

        # first column against second column
        return self.inter_group_divergence(pairs[:, 0], pairs[:, 1])

    def inter_party_divergence(self, subject, n):
        """
        Divergence between Republicans and Democrats on one subject.

        Args:
        subject: (str) subject to examine
        n      : (int) number of rows to sample from each party

        Returns:
        The mean pairwise JS value between n sampled Republican rows and
        n sampled Democratic rows, or None when either party has no rows
        on the subject.
        """
        self._ensure_topic_preds()

        on_subject = self.bool_subset(SUB_KEY, subject)
        rep_rows = self.index[self.bool_subset(PARTY, REP) & on_subject]
        dem_rows = self.index[self.bool_subset(PARTY, DEM) & on_subject]

        # nothing to compare if one side is empty
        if len(rep_rows) == 0 or len(dem_rows) == 0:
            return None

        # sample with replacement, Republicans first
        rep_sample = self.sample_indices(rep_rows, n)
        dem_sample = self.sample_indices(dem_rows, n)

        return self.inter_group_divergence(rep_sample, dem_sample)

    # TODO: intra_party_diversity(self, subject, n) is not implemented yet.
        
        

In [203]:
# Scratch: an empty two-column frame, inspected in the next two cells.
d = pd.DataFrame({'a': [], 'b': []})

In [204]:
# nunique() on an empty column gives 0 (see output below).
d["b"].nunique()

0

In [205]:
# .empty is True for the frame with no rows (see output below).
d.empty

True

In [206]:
# Wrap the loaded model and the sampled rows in an analyzer.
analyzer = RMN_Analyzer(rmn, df_samp)

In [207]:
# Reuse the predictions already computed for df_samp instead of running the
# model again through analyzer.predict_topics().
# analyzer.predict_topics()
analyzer.topic_preds = topics_preds

In [208]:
# Republican vs Democratic divergence on subject 'abortion', n = 10000.
analyzer.inter_party_divergence('abortion', 10000)

0.08154862785029093

In [209]:
# Divergence among Republicans ('R') on subject 'abortion', n = 10000 pairs.
analyzer.intra_party_divergence('R', 'abortion',  10000)

0.07846073805828929

In [117]:
# NOTE(review): `s` is not assigned anywhere in the visible notebook; this
# cell relies on leftover kernel state and would fail on a fresh run.
s

0.07827038726984283

In [184]:
# Scratch check: the sample is not empty (see output below).
df_samp.empty

False