In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import os

In [2]:
os.chdir("../../scripts/assembly")
from session_speaker_assembly import *
from preprocess import *
from document import *
from constant import *

In [3]:
# import speeches
speeches = pd.read_csv(os.path.join(HB_PATH, SPEECHES % '111'), sep = "|")

# import vocab master list
phrases_classes = pd.read_csv(os.path.join(DATA_PATH, "vocabulary/master_list.txt"), sep = "|")

# improt voteview data for congressional memeber names
voteview = pd.read_csv(os.path.join(DATA_PATH, "voteview/congress_ideology.csv"))
voteview['bioname'].head(n=4)

0      WASHINGTON, George
1    HUNTINGTON, Benjamin
2          SHERMAN, Roger
3       STURGES, Jonathan
Name: bioname, dtype: object

In [4]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [5]:
# manual stop words from Gentzkow et al.
manual_stopwords = ['absent','committee','gentlelady','hereabout','hereinafter','hereto','herewith' 'nay',
'pro','sir','thereabout','therebeforn','therein','theretofore','therewithal','whereat','whereinto','whereupon',
 'yea','adjourn','con','gentleman','hereafter','hereinbefore','heretofore','month','none','republican','speak',
 'thereafter','thereby','thereinafter','thereunder','today','whereby','whereof','wherever','yes','ask','democrat',
 'gentlemen','hereat','hereinto','hereunder','mr','now','say','speaker','thereagainst','therefor','thereof',
 'thereunto','whereabouts','wherefore','whereon','wherewith','yield','can','etc','gentlewoman','hereby','hereof',
 'hereunto','mrs','part','senator','tell','thereat','therefore','thereon','thereupon','whereafter','wherefrom',
 'whereto','wherewithal','chairman','gentleladies','gentlewomen','herein','hereon','hereupon','nai','per','shall',
 'thank','therebefore','therefrom','thereto','therewith','whereas','wherein','whereunder','will']

# list of US states
us_states_stopwords = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

us_states_stopwords = [state.lower() for state in us_states_stopwords]

unigram_stops = [y for x in [manual_stopwords, us_states_stopwords] for y in x]

In [6]:
unigram_stops

['absent',
 'committee',
 'gentlelady',
 'hereabout',
 'hereinafter',
 'hereto',
 'herewithnay',
 'pro',
 'sir',
 'thereabout',
 'therebeforn',
 'therein',
 'theretofore',
 'therewithal',
 'whereat',
 'whereinto',
 'whereupon',
 'yea',
 'adjourn',
 'con',
 'gentleman',
 'hereafter',
 'hereinbefore',
 'heretofore',
 'month',
 'none',
 'republican',
 'speak',
 'thereafter',
 'thereby',
 'thereinafter',
 'thereunder',
 'today',
 'whereby',
 'whereof',
 'wherever',
 'yes',
 'ask',
 'democrat',
 'gentlemen',
 'hereat',
 'hereinto',
 'hereunder',
 'mr',
 'now',
 'say',
 'speaker',
 'thereagainst',
 'therefor',
 'thereof',
 'thereunto',
 'whereabouts',
 'wherefore',
 'whereon',
 'wherewith',
 'yield',
 'can',
 'etc',
 'gentlewoman',
 'hereby',
 'hereof',
 'hereunto',
 'mrs',
 'part',
 'senator',
 'tell',
 'thereat',
 'therefore',
 'thereon',
 'thereupon',
 'whereafter',
 'wherefrom',
 'whereto',
 'wherewithal',
 'chairman',
 'gentleladies',
 'gentlewomen',
 'herein',
 'hereon',
 'hereupon',
 

In [7]:
speeches.head()

Unnamed: 0,speech_id,speech
0,1110000001,The Representativeselect and their guests will...
1,1110000002,As directed by law. the Clerk of the House has...
2,1110000003,The quor closes that 428 Represer have respond...
3,1110000004,Credentials. regular in form. have been receiv...
4,1110000005,The Clerk is in receipt of a letter of resigna...


In [11]:
total_speech = " ".join(speeches["speech"].values)

In [12]:
total_speech[:500]

'The Representativeselect and their guests will please remain standing and join in the Pledge of Allegiance. As directed by law. the Clerk of the House has prepared the official roll of the Representativeselect. Certificates of election covering 435 seats in the 111th Congress have been received by the Clerk of the House. and the names of those persons whose credentials show that they were regularly elected as Representatives in accordance with the laws of their respective States or of the United'