In [1]:
import os
import sys
import pandas as pd

In [2]:
sys.path.append("/home/rocassius/w266_final/scripts/assembly")
sys.path.append("/home/rocassius/w266_final/scripts/modeling")
sys.path

['/home/rocassius/w266_final/notebooks/exploratory/analysis',
 '',
 '/home/rocassius/anaconda3/lib/python3.7/site-packages',
 '/home/rocassius/anaconda3/lib/python37.zip',
 '/home/rocassius/anaconda3/lib/python3.7',
 '/home/rocassius/anaconda3/lib/python3.7/lib-dynload',
 '/home/rocassius/w266_final/scripts',
 '/home/rocassius/anaconda3/lib/python3.7/site-packages/IPython/extensions',
 '/home/rocassius/.ipython',
 '/home/rocassius/w266_final/scripts/assembly',
 '/home/rocassius/w266_final/scripts/modeling']

In [3]:
# os.chdir("/home/rocassius/w266_final/scripts/assembly")
from constant import DOC_SAMPLE_PATH, DOC_PROPER_PATH
from document import *

In [4]:
# os.chdir("/home/rocassius/w266_final/scripts/modeling")
from rmn import *
#from analysis import *
from rmn_analyzer import *
#from rmn_data_generator import *

In [5]:
df_all = load_documents(sessions=list(range(104,105)), read_path=DOC_PROPER_PATH)

In [6]:
rmn = RMN()
rmn.load_rmn(name="full", save_path = "/home/rocassius/gen-data/models")

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


### Analyze the 105th session with an RMN_Analyzer

In [212]:
#====================#
#=*= RMN Analyzer =*=#
#====================#

# Class for analyzing an RMN

import numpy as np
from analysis import *

# constants
SUB_KEY = 'subject'
SPEAKER = 'speakerid'
PARTY = 'party'

# party constants
REP = 'R'
DEM = 'D'


class RMN_Analyzer(object):
    """Class for Analyzing an RMN with respect to a dataset
    """
    
    def __init__(self, rmn, df):
        """
        Args:
        - rmn: (RMN) the RMN to be used for analysis
        - df : (DataFrame) the dataframe to analyze
        """
        
        'Initialization'
        self.rmn = rmn
        self.df = df.reset_index(drop=True)
        self.topic_preds = None
        
    @property
    def index(self):
        return self.df.index
         
        
    def predict_topics(self, use_generator=True):
        """Computes the topic predictions for all observations
        """
        if use_generator:
            self.topic_preds = self.rmn.predict_topics_generator(self.df)
        else:
            self.topic_preds = self.rmn.predict_topics(self.df)
        
        
    def sample_indices(self, indices, n):
        """Returns a SRR of the indices provided
        """
        return np.random.choice(indices, n, replace=True)

    
    def bool_subset(self, col, value):
        """
        Returns a boolean vector for each observation in the
        dataframe indicating whether it meets the col = value condition
        
        """
        assert col in self.df.columns
        return self.df[col] == value
    
    
    def bool_index(self, conditions):
        """
        Returns a boolean vector for each observation in the
        dataframe indicating whether it meets all conditions
        
        Args:
        - conditions: (dict) dictionary of conditions
        
        Returns: 
        - pandas series of booleans indicating where all 
          of the conditions hold
        """
        # initialize bool index
        bool_index = (pd.Series(True)
                      .repeat(self.index.shape[0])
                      .reset_index(drop=True))
        
        for col, val in conditions.items():
            bool_index = bool_index & self.bool_subset(col, val)
            
        return bool_index
    
    
    def cond_index(self, conditions):
        """Returns indices of records meeting the conditions
        """
        return self.index[self.bool_index(conditions)]
    
    
    def n_records(self, conditions={}):
        """Returns the number of records meetings the conditions
        """
        return len(self.cond_index(conditions))
    
    
    def compute_JS(self, index_A, index_B, base=2):
        """
        Computes the mean pair-wise JS divergence and associated CI
        between indices in index_A and indices in index_B
        """
        p_A = self.topic_preds[index_A]
        p_B = self.topic_preds[index_B]
        js_list = [jensenshannon(p, q, base) for p, q in zip(p_A, p_B)]
        
        return mean_CI(js_list)
        
        
    def compute_HH(self, index):
        """
        Computes the mean HH index and associated CI between
        indices in index_A and indices in index_B
        """
        p = self.topic_preds[index]
        hh_list = [hh_index(q) for q in p]
        
        return mean_CI(hh_list)
          
    
    def inter_party_js(self, subject, n):
        """
        Returns the estimated inter party JS divergence and a CI.
        
        Computes the inter party JS divergence between 
        Republicans and Democrats on a given subject
        
        Args:
        - subject: (str) subject to examine
        - n      : (int) sample size
        
        Returns: a numpy array of length 3, where
        - 0 is the mean divergence point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # find R and D indicies on the subject
        index_R = self.cond_index({PARTY: REP, SUB_KEY: subject})
        index_D = self.cond_index({PARTY: DEM, SUB_KEY: subject})
        
        # return None if indices are insufficient
        if len(index_R)==0 or len(index_D)==0:
            return None
        
        # sample 
        samp_index_R = self.sample_indices(index_R, n)
        samp_index_D = self.sample_indices(index_D, n)
    
        return self.compute_JS(index_R, index_D)
    
    
    def group_js(self, conditions, n):
        """
        Returns the estimated mean JS divergence and a CI
        
        Estimates the average JS divergence between any two documents of
        a group defined by the conditions. A document by speaker _i_ is 
        never compared to another document by speaker _i_.
        
        
        Args:
        - conditions: (dict) dictionary of conditions
        - n      : (int) sample size
        
        Returns: a numpy array of length 3, where index...
        - 0 is the mean divergence point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # find indicies of party on the subject
        cond_index = self.cond_index(conditions)
        
        # Return none if there are fewer than 2 speakers
        if self.df.loc[cond_index][SPEAKER].nunique() < 2:
            return None
        
        # Sample index pairs
        index_AB = []
        while len(index_AB) < n:
            a_b = self.sample_indices(cond_index, n=2)
            # include samples whose speakers are different
            if self.df.loc[a_b][SPEAKER].nunique() == 2:
                index_AB.append(a_b)
        
        index_AB = np.asarray(index_AB)
        assert index_AB.shape == (n, 2)
        
        # get indices for each group
        index_A, index_B = index_AB[:,0], index_AB[:,1]
        
        return self.compute_JS(index_A, index_B)
    
    
    def group_hh(self, conditions={}, n=None):
        """
        Returns the estimated mean HH index and a CI
        
        Estimates the average Herfindahl–Hirschman Index 
        of all records meetings the conditons.
        
        Args:
        - subject: (str) subject to examine
        - party  : (str) party of interest
        - n      : (int) sample size
        
        Returns: a numpy array of length 3, where index...
        - 0 is the mean index point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # indicies meeting the conditions
        cond_index = self.cond_index(conditions)
        
        if n is None:
            return self.compute_HH(cond_index)
        else:
            samp_index = self.sample_indices(cond_index, n)
            return self.compute_HH(samp_index)
        
#     def analyze
        
#     def analyze(self, subjects):
#         """
#         Returns a dictionary of analysis.
        
#         for the entire dataset and each subject the following are computed:
#         - n_records
#         - n_records R
#         - n_records D
#         - all JS
#         - RD  JS
#         - RR  JS
#         - DD  JS
#         - all HH
#         - R   HH
#         - D   HH
#         """
        
#         entire = {''}
        
        
        


In [218]:
analyzer = RMN_Analyzer(rmn, df_all)

In [219]:
# preds = analyzer.topic_preds
analyzer.topic_preds = preds

In [220]:
np.sum(analyzer.bool_index(conditions={'party': 'R', 'subject': 'abortion'}))

982

In [221]:
# compute inter-party divergence on abortion
analyzer.inter_party_js(subject = 'abortion', n=100)

{'estimate': 0.11111702782010845,
 'lower': 0.1092181655534295,
 'upper': 0.11301589008678742}

In [222]:
analyzer.group_js({'party': 'D', 'subject': 'abortion'}, n=100)

{'estimate': 0.11439676071103051,
 'lower': 0.1083229381082281,
 'upper': 0.12047058331383298}

In [223]:
analyzer.group_js({'party': 'R', 'subject': 'abortion'}, n=100)

{'estimate': 0.11148811564938771,
 'lower': 0.10534804658928536,
 'upper': 0.11762818470949003}

In [224]:
analyzer.group_hh({'party': 'R', SUB_KEY: 'abortion'}, n=100)

{'estimate': 0.011153317987918854,
 'lower': 0.011041686421582491,
 'upper': 0.01126494972189328}

In [225]:
analyzer.n_records({'party': 'R'})

127236

In [226]:
N = 10000

data_dict = {
    'js_RD': analyzer.inter_party_divergence('health', n=N),
    'js_RR': analyzer.intra_party_divergence('R', 'health',  n=N), 
    'js_DD': analyzer.intra_party_divergence('D', 'health', n=N) 
}

AttributeError: 'RMN_Analyzer' object has no attribute 'inter_party_divergence'

In [33]:
import json 

with open('div_data.txt', 'w') as outfile:
    json.dump(data_dict, outfile)

In [32]:
os.getcwd()


'/home/rocassius/w266_final/scripts/modeling'

In [34]:
with open('div_data.txt') as json_file:
    data = json.load(json_file)

In [35]:
data

{'js_RD': {'estimate': 0.11085805681099464,
  'lower': 0.11000055448822368,
  'upper': 0.1117155591337656},
 'js_RR': {'estimate': 0.11106727010844802,
  'lower': 0.1105057778011546,
  'upper': 0.11162876241574145},
 'js_DD': {'estimate': 0.1105937581579881,
  'lower': 0.11002224760685128,
  'upper': 0.11116526870912488}}