In [1]:
import re
import os
import numpy as np
import pandas as pd

In [2]:
os.chdir("../../../scripts/assembly")
from session_speaker_assembly import *
from preprocess import *
from document import *
from constant import *

In [3]:
# import speeches
speeches = pd.read_csv(os.path.join(HB_PATH, SPEECHES % '111'), sep = "|")

# import vocab master list
phrases_classes = pd.read_csv(os.path.join(DATA_PATH, "vocabulary/master_list.txt"), sep = "|")

# improt voteview data for congressional memeber names
voteview = pd.read_csv(os.path.join(DATA_PATH, "voteview/congress_ideology.csv"))
voteview['bioname'].head(n=4)

0      WASHINGTON, George
1    HUNTINGTON, Benjamin
2          SHERMAN, Roger
3       STURGES, Jonathan
Name: bioname, dtype: object

### Preprocessing forecast

How did Gentzkow et al. do stopword removal?
Gentzkow used bigrams..


First, bigrams with bad syntax are flagged. A bigram has bad syntax if it contains
* any numbers, symbols, or punctuation;
* fewer than five characters, including the space; 
* a one-letter word; or 
* a word beginning with the first three letters of a month.


Second, bigrams containing the stem of a US-Congress-specific stopword are flagged. Stopwords come from three sources: 
* the manually selected stopwords in Table 11, 
* the names of states,
* the last names of all congresspeople recorded in the historical source.


Third, bigrams recording procedural speech are determined and flagged. These are bigrams that either directly appear in handbooks describing congressional procedure or frequently co-occur with the direct bigrams.



#### For us,
We don't have to remove bigrams, we can do First and Second at the unigram level.

The only removal we have to do at the bigram level is Roberts and riddicks

In [7]:
# manual stop words from Gentzkow et al.
manual_stopwords = ['absent','committee','gentlelady','hereabout','hereinafter','hereto','herewith' 'nay',
'pro','sir','thereabout','therebeforn','therein','theretofore','therewithal','whereat','whereinto','whereupon',
 'yea','adjourn','con','gentleman','hereafter','hereinbefore','heretofore','month','none','republican','speak',
 'thereafter','thereby','thereinafter','thereunder','today','whereby','whereof','wherever','yes','ask','democrat',
 'gentlemen','hereat','hereinto','hereunder','mr','now','say','speaker','thereagainst','therefor','thereof',
 'thereunto','whereabouts','wherefore','whereon','wherewith','yield','can','etc','gentlewoman','hereby','hereof',
 'hereunto','mrs','part','senator','tell','thereat','therefore','thereon','thereupon','whereafter','wherefrom',
 'whereto','wherewithal','chairman','gentleladies','gentlewomen','herein','hereon','hereupon','nai','per','shall',
 'thank','therebefore','therefrom','thereto','therewith','whereas','wherein','whereunder','will']

# list of US states
us_states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

us_states_stopwords = " ".join(us_states_stopwords).lower().split(" ")

english_stop_words = stopwords.words('english')

# unigram_stops = [y for x in [manual_stopwords, us_states_stopwords] for y in x]

In [53]:
english_stop_words = stopwords.words('english')

In [61]:
english_stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [92]:
def stopword_regex(stopwords):
    """Compile a regular expression to match any of the words in stopwords"""
    
    return re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')


In [95]:
UNIGRAM_STOP = stopword_regex(english_stop_words + us_states_stopwords + manual_stopwords)
# UNIGRAM_STOP = stopword_regex(manual_stopwords[0:5])

In [96]:
UNIGRAM_STOP

re.compile(r"\b(i|me|my|myself|we|our|ours|ourselves|you|you're|you've|you'll|you'd|your|yours|yourself|yourselves|he|him|his|himself|she|she's|her|hers|herself|it|it's|its|itself|they|them|their|theirs|themselves|what|which|who|whom|this|that|that'll|these|those|am|is|are|was|were|be|been|being|have|has|had|having|do|does|did|doing|a|an|the|and|but|if|or|because|as|until|while|of|at|by|for|with|about|against|between|into|through|during|before|after|above|below|to|from|up|down|in|out|on|off|over|under|again|further|then|once|here|there|when|where|why|how|all|any|both|each|few|more|most|other|some|such|no|nor|not|only|own|same|so|than|too|very|s|t|can|will|just|don|don't|should|should've|now|d|ll|m|o|re|ve|y|ain|aren|aren't|couldn|couldn't|didn|didn't|doesn|doesn't|hadn|hadn't|hasn|hasn't|haven|haven't|isn|isn't|ma|mightn|mightn't|mustn|mustn't|needn|needn't|shan|shan't|shouldn|shouldn't|wasn|wasn't|weren|weren't|won|won't|wouldn|wouldn't|alabama|alaska|arizona|arkansas|california|color

In [105]:
s = "somewhere in virginia i will never go wherein  month whereat your data therebefore therefrom "

In [106]:
# re.findall("\\si\\s", s)
re.findall(UNIGRAM_STOP, s)

['in',
 'virginia',
 'i',
 'will',
 'wherein',
 'month',
 'whereat',
 'your',
 'therebefore',
 'therefrom']

In [107]:
s_filt = re.sub(UNIGRAM_STOP,"", s).strip()

# s_filt = re.sub("\svirginia\s"," ", s)
s_filt

'somewhere never go data'

In [5]:
# phrases_classes[phrases_classes["_classify"] == "roberts_and_riddicks"]

In [6]:
# phrases_classes[phrases_classes["_classify"] == "roberts"][:50]