In [1]:
import re
import os
import numpy as np
import pandas as pd

In [2]:
DATA_PATH = "gs://rwc1/data/"
# DATA_PATH = "../../data/"
HB_PATH = os.path.join(DATA_PATH, "hein-bound/")

BY_SPEAKER = "byspeaker_2gram_%s.txt"
SPEAKER_MAP = "%s_SpeakerMap.txt"
SPEECHES = "speeches_%s.txt"

In [43]:
# import speeches
speeches = pd.read_csv(os.path.join(HB_PATH, SPEECHES % '111'), sep = "|")

# import vocab master list
phrases_classes = pd.read_csv(os.path.join(DATA_PATH, "vocabulary/master_list.txt"), sep = "|")

# improt voteview data for congressional memeber names
voteview = pd.read_csv(os.path.join(DATA_PATH, "voteview/congress_ideology.csv"))
voteview['bioname'].head(n=4)

0      WASHINGTON, George
1    HUNTINGTON, Benjamin
2          SHERMAN, Roger
3       STURGES, Jonathan
Name: bioname, dtype: object

In [5]:
phrases_classes["_classify"].value_counts()

vocab                   4525243
stopword                1331580
bad_syntax               918939
co-occurring              26962
roberts                   10990
riddicks                   7585
roberts_and_riddicks        819
Name: _classify, dtype: int64

In [6]:
# stop phrases to search for
phrases_classes["_classify"].value_counts().sum() - phrases_classes["_classify"].value_counts()[0]

2296875

In [None]:
# manual stop words from Gentzkow et al.
manual_stopwords = ['absent','committee','gentlelady','hereabout','hereinafter','hereto','herewith' 'nay',
'pro','sir','thereabout','therebeforn','therein','theretofore','therewithal','whereat','whereinto','whereupon',
 'yea','adjourn','con','gentleman','hereafter','hereinbefore','heretofore','month','none','republican','speak',
 'thereafter','thereby','thereinafter','thereunder','today','whereby','whereof','wherever','yes','ask','democrat',
 'gentlemen','hereat','hereinto','hereunder','mr','now','say','speaker','thereagainst','therefor','thereof',
 'thereunto','whereabouts','wherefore','whereon','wherewith','yield','can','etc','gentlewoman','hereby','hereof',
 'hereunto','mrs','part','senator','tell','thereat','therefore','thereon','thereupon','whereafter','wherefrom',
 'whereto','wherewithal','chairman','gentleladies','gentlewomen','herein','hereon','hereupon','nai','per','shall',
 'thank','therebefore','therefrom','thereto','therewith','whereas','wherein','whereunder','will']

# list of US states
us_states_stopwords = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

us_states_stopwords = [state.lower() for state in us_states_stopwords]

### Strategy 1: Embedded List of Stop Phrases

The initial strategy is to create an embedded list of stop phrases to search through. This involves splitting the big-gram phrases into a list of paired lists.

`[['bg_par11','bg_par12'], ['bg_par21','bg_par22'],..., ['bg_parN1','bg_parN2']]`

We would then take each speech and call the `split` method on it to create a list of unigrams of the speech.

`speech = 'this is some speech text'`

`speech.split() -> ['this','is','some','speech','text']`

`speech_list = speech.split()`


Using a for loop tracking the index `i` for each word in our speech, we check to see if the pair of unigrams 

`[speech_list[i], speech_list[i+1]]` is in our stop bigrams `[['bg_par11','bg_par12'], ['bg_par21','bg_par22'],..., ['bg_parN1','bg_parN2']]`

See implementation below.

In [7]:
# phrases_classes[phrases_classes['phrase'] == 'adam speak'] = 'madam speaker'

# select stop phrases classes from Gentzkow et al.
stop_classes = ['stopword','co-occurring','roberts','riddicks','roberts_and_riddicks']

# filter based on stop_classes
stop_phrases = phrases_classes[phrases_classes['_classify'].isin(stop_classes)]['phrase']

# split phrases into embedded paired lists
stop_phrases_emlist = stop_phrases.str.split().tolist()
stop_phrases_emlist[:5]

[['0', 'hatfield'], ['0', 'mr'], ['00', 'm'], ['00', 'p'], ['000', 'amend']]

We can test to see how log it takes to search through our list of stop phrases.

In [45]:
import time

# example of matching first phrase
start = time.time()
stop_phrases_emlist[0] in stop_phrases_emlist
end = time.time()

elapsed_first = end - start
print("Seconds elapsed for index 0 match:", elapsed_first)

# example of searching for last phrase

start = time.time()
stop_phrases_emlist[-1:] in stop_phrases_emlist
end = time.time()

elapsed_last = end - start
print("Seconds elapsed for index n-1 match:", elapsed_last)
print("Ratio of longest to shortest:",elapsed_last/elapsed_first)

# session 111 mean speech length
mean_speech_len = speeches['speech'].str.split().apply(len).mean()
print("Mean time to search for non-match phrase in single speech:", mean_speech_len*elapsed_last)
print("Estimated time to process session 111 speeches (minutes):",
      (speeches.shape[0]*mean_speech_len*elapsed_last)/60)

Seconds elapsed for index 0 match: 5.745887756347656e-05
Seconds elapsed for index n-1 match: 0.026239395141601562
Ratio of longest to shortest: 456.66390041493776
Mean time to search for non-match phrase in single speech: 5.132598435162971
Estimated time to process session 111 speeches (minutes): 15335.177604579925


In [31]:
j = 120
test_speech = speeches['speech'][j].lower()
test_speech_list = test_speech.split()
len(test_speech_list)

345

In [46]:
start = time.time()
for i in range(len(test_speech_list)-2):
    if test_speech_list[i+1] == '.':
        pass
    if [test_speech_list[i],test_speech_list[i+1]] in stop_phrases_emlist:
        del test_speech_list[i]
        del test_speech_list[i+1]
end = time.time()
elapsed = end - start
elapsed

18.075706481933594

In [17]:
' '.join(test_speech_list)

'madam speaker. i unanimous that during the session the 111th congress: (1) on legislative days of monday when the house convenes pursuant to house resolution 10. the house shall convene 90 minutes earlier than the time otherwise established by the resolution solely for the purpose of conducting morninghour debate. and (2) on legislative days of tuesday when the house convenes pursuant to house resolution 10: (a) before may 18. 2009. the house will convene for morninghour debate 90 minutes earlier than the time otherwise established by that resolution. and (b) after may 18. 2009. the house shall convene for morninghour debate hour than the time otherwise established by that resolution. and (3) on legislative days of monday or tuesday. when the house convenes for morninghour debate pursuant to an order other than house resolution 10. the house shall resume its 90 after the time otherwise established by that order. (4) the time for morninghour debate shall be limited to the 30 minutes al

### Strategy 2: Knuth-Morris-Pratt Adaptation

Sliding window across raw text that skips indicies in contracst to naive search through all indicies.

In [33]:
# phrases are now single sintring instead of paired list
stop_phrases_list = stop_phrases.tolist()
stop_phrases_list[:5]

['0 hatfield', '0 mr', '00 m', '00 p', '000 amend']

In [37]:
start = time.time()
hits = []
for k in range(len(stop_phrases_list)):
    if stop_phrases_list[i] in test_speech:
        hits.append(k)
end = time.time()
elapsed2 = end - start
print("seconds elapsed:",elapsed2)
hits

seconds elapsed: 1.04793119430542


[]

In [18]:
def kpm_prefix(pattern):
    """KPM search prefix array for indices skipping"""
    m = len(pattern)
    pt = [0]*m
    i = 0
    for j in range(m-1):
        while i > 0 and pattern[i] != pattern[j+1]:                
            i = pt[i-1]
        if pattern[i] == pattern[j+1]:
            i += 1
        pt[j+1] = i        
    return pt

In [19]:
kpm_prefix('aaba')

[0, 1, 0, 1]

In [20]:
def kpm_matcher(text,pattern):
    """KPM matter matcher"""
    match_indeces = []
    n = len(text)
    m = len(pattern)
    pt = kpm_prefix(pattern)
    i = 0
    for j in range(n-1):
        while i > -1 and pattern[i] != text[j+1]: # still creates infite loop if sequential characters are same
            i = pt[i]
        if pattern[i] == text[j+1]:
            i += 1
        if i == m:
            match_indeces.append(j+2-m)
            i = pt[i-1]
    print("End:", pattern)
    return match_indeces

In [39]:
# kpm_matcher('aabaacaadaabaaba','aaba')

In [40]:
# kpm_matcher('acat aacgacacagt aacgacacagt','aacacagt')

In [41]:
# f = 19
# 'acat acgacacagt acgacacagt'[f:f+len('acacagt')]

In [42]:
# stop_phrases_list[:5]

In [22]:
# start = time.time()
# matched_indecies = []
# for phrase in stop_phrases_list:
#     test_speech = re.sub(phrase, '', test_speech)
# end = time.time()
# elapsed = end - start
# elapsed

In [23]:
# phrases_classes[phrases_classes['phrase'] == 'h con']

In [24]:
# start = time.time()
# matched_indecies = []
# for phrase in stop_phrases_list:
#     mtch_indecies = kpm_matcher(test_speech,phrase)
#     matched_indecies.append(mtch_indecies)
# end = time.time()
# elapsed = end - start
# elapsed