In [43]:
import os
import sys
import pandas as pd
from functools import partial

In [44]:
sys.path.append("/home/rocassius/w266_final/scripts/assembly")
sys.path.append("/home/rocassius/w266_final/scripts/modeling")
sys.path

['/home/rocassius/w266_final/notebooks/exploratory/analysis',
 '',
 '/home/rocassius/anaconda3/lib/python3.7/site-packages',
 '/home/rocassius/anaconda3/lib/python37.zip',
 '/home/rocassius/anaconda3/lib/python3.7',
 '/home/rocassius/anaconda3/lib/python3.7/lib-dynload',
 '/home/rocassius/w266_final/scripts',
 '/home/rocassius/anaconda3/lib/python3.7/site-packages/IPython/extensions',
 '/home/rocassius/.ipython',
 '/home/rocassius/w266_final/scripts/assembly',
 '/home/rocassius/w266_final/scripts/modeling',
 '/home/rocassius/w266_final/scripts/assembly',
 '/home/rocassius/w266_final/scripts/modeling']

In [45]:
# os.chdir("/home/rocassius/w266_final/scripts/assembly")
from constant import DOC_ALL_PATH
from document import *

In [46]:
# os.chdir("/home/rocassius/w266_final/scripts/modeling")
from rmn import *
#from analysis import *
from rmn_analyzer import *
#from rmn_data_generator import *

In [47]:
df_all = load_documents(sessions=[99], read_path=DOC_PROPER_PATH)

In [48]:
# make rmn
rmn_path="/home/rocassius/gen-data/models"
rmn_name="full"
rmn = RMN(); rmn.load_rmn(name=rmn_name, save_path=rmn_path)

In [65]:
#====================#
#=*= RMN Analyzer =*=#
#====================#

# Class for analyzing an RMN

import numpy as np
import pandas as pd
from analysis import *

# variable constants
SUB = 'subject'
SPEAK = 'speakerid'
PARTY = 'party'
SESS = 'session'
# party constants
R = 'R'
D = 'D'
# metric constants
JS = 'js'
HH = 'hh'
N_REC = 'n_records'


class RMN_Analyzer(object):
    """Class for Analyzing an RMN with respect to a dataset
    """
    
    def __init__(self, rmn, df):
        """
        Args:
        - rmn: (RMN) the RMN to be used for analysis
        - df : (DataFrame) the dataframe to analyze
        """
        
        'Initialization'
        self.rmn = rmn
        self.df = df.reset_index(drop=True)
        self.topic_preds = None
        
    @property
    def index(self):
        return self.df.index
         
        
    def predict_topics(self, use_generator=True):
        """Computes the topic predictions for all observations
        """
        if use_generator:
            self.topic_preds = self.rmn.predict_topics_generator(self.df)
        else:
            self.topic_preds = self.rmn.predict_topics(self.df)
        
        
    def sample_indices(self, indices, n):
        """Returns a SRR of the indices provided
        """
        return np.random.choice(indices, n, replace=True)

    
    def bool_subset(self, col, value):
        """
        Returns a boolean vector for each observation in the
        dataframe indicating whether it meets the col == value condition
        """
        assert col in self.df.columns
        return self.df[col] == value
    
    
    def bool_index(self, conditions):
        """
        Returns a boolean vector for each observation in the
        dataframe indicating whether it meets all conditions
        
        Args:
        - conditions: (dict) dictionary of conditions
        
        Returns: 
        - pandas series of booleans indicating where all 
          of the conditions hold
        """
        # initialize bool index
        bool_index = (pd.Series(True)
                      .repeat(self.index.shape[0])
                      .reset_index(drop=True))
        
        for col, val in conditions.items():
            bool_index = bool_index & self.bool_subset(col, val)
            
        return bool_index
    
    
    def cond_index(self, conditions):
        """Returns indices of records meeting the conditions
        """
        return self.index[self.bool_index(conditions)]
    
    
    def n_records(self, conditions={}):
        """Returns the number of records meetings the conditions
        """
        return len(self.cond_index(conditions))
    
    
    def compute_JS(self, index_A, index_B, base=2):
        """
        Computes the mean pair-wise JS divergence and associated CI
        between indices in index_A and indices in index_B
        """
        p_A = self.topic_preds[index_A]
        p_B = self.topic_preds[index_B]
        js_list = [jensenshannon(p, q, base) for p, q in zip(p_A, p_B)]
        
        return mean_CI(js_list)
        
        
    def compute_HH(self, index):
        """
        Computes the mean HH index and associated CI between
        indices in index_A and indices in index_B
        """
        p = self.topic_preds[index]
        hh_list = [hh_index(q) for q in p]
        
        return mean_CI(hh_list)
          
    
    def inter_party_js(self, conditions, n):
        """
        Returns the estimated inter party JS divergence and a CI.
        
        Computes the inter party JS divergence between 
        Republicans and Democrats on a given subject
        
        Args:
        - subject: (str) subject to examine
        - n      : (int) sample size
        
        Returns: a numpy array of length 3, where
        - 0 is the mean divergence point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # find R and D indicies on the subject
        index_R = self.cond_index({**conditions, **{PARTY: R}})
        index_D = self.cond_index({**conditions, **{PARTY: D}})
        
        # return None if indices are insufficient
        if len(index_R)==0 or len(index_D)==0:
            return None
        
        # sample 
        samp_index_R = self.sample_indices(index_R, n)
        samp_index_D = self.sample_indices(index_D, n)
    
        return self.compute_JS(samp_index_R, samp_index_D)
    
    
    def group_js(self, conditions, n):
        """
        Returns the estimated mean JS divergence and a CI
        
        Estimates the average JS divergence between any two documents of
        a group defined by the conditions. A document by speaker _i_ is 
        never compared to another document by speaker _i_.
        
        
        Args:
        - conditions: (dict) dictionary of conditions
        - n         : (int) sample size
        
        Returns: a numpy array of length 3, where index...
        - 0 is the mean divergence point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # find indicies of party on the subject
        cond_index = self.cond_index(conditions)
        
        # Return none if there are fewer than 2 speakers
        if self.df.loc[cond_index][SPEAK].nunique() < 2:
            return None
        
        # Sample index pairs
        index_AB = []
        while len(index_AB) < n:
            a_b = self.sample_indices(cond_index, n=2)
            # include samples whose speakers are different
            if self.df.loc[a_b][SPEAK].nunique() == 2:
                index_AB.append(a_b)
        
        index_AB = np.asarray(index_AB)
        assert index_AB.shape == (n, 2)
        
        # get indices for each group
        index_A, index_B = index_AB[:,0], index_AB[:,1]
        
        return self.compute_JS(index_A, index_B)
    
    
    def group_hh(self, conditions={}, n=None):
        """
        Returns the estimated mean HH index and a CI
        
        Estimates the average Herfindahl–Hirschman Index 
        of all records meetings the conditons.
        
        Args:
        - subject: (str) subject to examine
        - party  : (str) party of interest
        - n      : (int) sample size
        
        Returns: a numpy array of length 3, where index...
        - 0 is the mean index point estimate:
        - 1 is the lower bound of a 95% CI
        - 2 is the upper bound of a 95% CI
        """
        # ensure that the topic predictions exist
        if self.topic_preds is None:
            self.predict_topics()
        
        # indicies meeting the conditions
        cond_index = self.cond_index(conditions)
        
        # return None if indices are insufficient
        if len(cond_index)==0:
            return None
        
        if n is None:
            return self.compute_HH(cond_index)
        else:
            samp_index = self.sample_indices(cond_index, n)
            return self.compute_HH(samp_index)
        
        
    def analyze_subset(self, conditions, n):
        """
        Returns a dictionary of analysis metrics for the subset 
        of records defined by the conditions.
        
        Note: It is recommended conditions be on subject
        
        Args:
        - conditions: (dict) dictionary of conditions
        - n         : (int) sample size for estimation of metrics
        
        for the entire dataset and for each subject the following are computed:
        - n_records, n_records_R
        - n_records_D
        - js
        - js_R
        - js_D
        - js_RD
        - hh
        - hh_R
        - hh_D
        
        Returns: a dictionary of metrics
        """
        # R and D added conditions
        conditions_R = {**conditions, **{PARTY: R}}
        conditions_D = {**conditions, **{PARTY: D}}
        
        # annotation tags
        _R = '_' + R
        _D = '_' + D
        _RD = _R + D
        
        metrics = {
            # n record data
            N_REC:    self.n_records(conditions),
            N_REC+_R: self.n_records(conditions_R),
            N_REC+_D: self.n_records(conditions_D),
            # JS divergence data
            JS:     self.group_js(conditions, n),
            JS+_R:  self.group_js(conditions_R, n),
            JS+_D:  self.group_js(conditions_D, n),
            JS+_RD: self.inter_party_js(conditions, n),
            # HH index data
            HH:    self.group_hh(conditions, n),
            HH+_R: self.group_hh(conditions_R, n),
            HH+_D: self.group_hh(conditions_D, n)
        }
        
        return metrics
    
        
    def analyze(self, subjects, n):
        """
        Returns a dictionary of analysis metrics at the subject level
        and at the session level (assuming self.df is the data of a
        single session).
        
        Args:
        - subjects: (array-like) list of subjects
        - n       : (int) sample size for estimation of metrics
        
        Returns: a dictionary of metrics
        """
        # analyze entire session dataset
        dataset_metrics = self.analyze_subset(conditions={}, n=n)
        
        # analyze by subject
        subject_metrics = {}
        for s in subjects:
            subject_metrics[s] = self.analyze_subset({SUB: s}, n)
        
        metrics = {'dataset' : dataset_metrics, 
                   'subjects': subject_metrics}
        
        return metrics
        

In [66]:
analyzer = RMN_Analyzer(rmn, df_all)

In [58]:
len(analyzer.cond_index({SUB: 'abo'}))

0

In [67]:
d = analyzer.inter_party_js({SUB: 'abo'}, n=100)
print(d)

None


In [40]:
def analyze_session(session, subjects, sample_n, doc_path, rmn):
    
    # read in session
    df = load_documents(sessions=[session], read_path=doc_path)

    # analyze
    analyzer = RMN_Analyzer(rmn, df)
    print("Analyzing Session %s ..." % format(session, '03d'))
    data = analyzer.analyze(subjects, sample_n)
    print("Data Gathered for Session %s. " % format(session, '03d'))

    # add session number
    data.update({SESS: session})
    
    return data

In [41]:
# declare analyzing function
analyze_func = partial(
    analyze_session, 
    subjects=['alcohol', 'abortion'], 
    sample_n=10, 
    doc_path=DOC_ALL_PATH,
    rmn=rmn)

In [42]:
analyze_func(99)

Analyzing Session 099 ...
Data Gathered for Session 099. 


{'dataset': {'n_records': 192754,
  'n_records_R': 100167,
  'n_records_D': 92587,
  'js': {'mean': 0.11685564444715517,
   'lower': 0.10127801081329894,
   'upper': 0.13243327808101138},
  'js_R': {'mean': 0.10365507017511097,
   'lower': 0.09027054627638337,
   'upper': 0.11703959407383857},
  'js_D': {'mean': 0.10713294070018926,
   'lower': 0.08975681611133758,
   'upper': 0.12450906528904095},
  'js_RD': {'mean': 0.10502656474422825,
   'lower': 0.08879243687236564,
   'upper': 0.12126069261609086},
  'hh': {'mean': 0.011203434318304062,
   'lower': 0.010876994928223988,
   'upper': 0.01152987389464865},
  'hh_R': {'mean': 0.011536749079823494,
   'lower': 0.011069369537284603,
   'upper': 0.012004128249833356},
  'hh_D': {'mean': 0.01154312863945961,
   'lower': 0.010917534220612535,
   'upper': 0.012168722313248625}},
 'subjects': {'alcohol': {'n_records': 638,
   'n_records_R': 328,
   'n_records_D': 310,
   'js': {'mean': 0.10636661430588054,
    'lower': 0.08836148725535084,


In [18]:
session = 99
subjects=['alcohol', 'abortion'] 
sample_n=10
doc_path=DOC_ALL_PATH


In [30]:
# read in session
df = df_all

rmn = RMN(); rmn.load_rmn(name=rmn_name, save_path=rmn_path)

# analyze
analyzer = RMN_Analyzer(rmn, df)
print("Analyzing Session %s ..." % session_str)


Analyzing Session 099 ...


In [32]:
analyzer.df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
0,99106241,MATHIAS,CHARLES,S,MD,M,R,for me to these words of thanks for his exempl...,alcohol,99
1,99109651,PRYOR,DAVID,S,AR,M,D,it is increasingly apparent that many of the s...,alcohol,99
2,99112531,HOLLINGS,ERNEST,S,SC,M,D,adopted without objection on the agriculture a...,alcohol,99
3,99106921,PROXMIRE,WILLIAM,S,WI,M,D,has stressed the need to reduce the demand for...,alcohol,99
4,99109801,SIMPSON,ALAN,S,WY,M,R,either because they do not have access to cook...,alcohol,99
...,...,...,...,...,...,...,...,...,...,...
221599,99108391,METZENBAUM,HOWARD,S,OH,M,D,i the chair for yielding we are moving into th...,trade,99
221600,99110251,FORD,WENDELL,S,KY,M,D,president as we debate the defense authorizati...,trade,99
221601,99114550,THOMAS,WILLIAM,H,CA,M,R,set of standards to which all have agreed and ...,trade,99
221602,99117740,MILLER,GEORGE,H,CA,M,D,just loves to beat legislators over the head a...,trade,99


In [33]:
data = analyzer.analyze(subjects, sample_n)
print("Data Gathered for Session %s. " % session_str)

# add session number
data.update({SESS: session})

return data

Data Gathered for Session 099. 


SyntaxError: 'return' outside function (<ipython-input-33-8e6efc88f2f5>, line 7)

In [22]:
analyzer.group_js(conditions={}, n=10)

AssertionError: 

In [23]:
analyzer.df

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session


In [24]:
df_all

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,document,subject,session
0,99106241,MATHIAS,CHARLES,S,MD,M,R,for me to these words of thanks for his exempl...,alcohol,99
1,99109651,PRYOR,DAVID,S,AR,M,D,it is increasingly apparent that many of the s...,alcohol,99
2,99112531,HOLLINGS,ERNEST,S,SC,M,D,adopted without objection on the agriculture a...,alcohol,99
3,99106921,PROXMIRE,WILLIAM,S,WI,M,D,has stressed the need to reduce the demand for...,alcohol,99
4,99109801,SIMPSON,ALAN,S,WY,M,R,either because they do not have access to cook...,alcohol,99
...,...,...,...,...,...,...,...,...,...,...
221599,99108391,METZENBAUM,HOWARD,S,OH,M,D,i the chair for yielding we are moving into th...,trade,99
221600,99110251,FORD,WENDELL,S,KY,M,D,president as we debate the defense authorizati...,trade,99
221601,99114550,THOMAS,WILLIAM,H,CA,M,R,set of standards to which all have agreed and ...,trade,99
221602,99117740,MILLER,GEORGE,H,CA,M,D,just loves to beat legislators over the head a...,trade,99


In [73]:
SAVE_PATH = '/home/rocassius/gen-data/data/div-1.txt'

In [74]:
import json

In [75]:
with open(SAVE_PATH) as json_file:
    data = json.load(json_file)

In [82]:
sesses =  [d['session'] for d in data]

In [83]:
sesses

[104, 99, 55]