In [1]:
import pandas as pd
import numpy as np
import json
import pickle
from tqdm import tqdm

In [130]:
raw_data = pd.read_csv('raw_original_data_floor_speeches_house.csv')
print(raw_data.info())
raw_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85977 entries, 0 to 85976
Data columns (total 5 columns):
Speaker_Bioguide_ID    85977 non-null object
Speaker_Name           85977 non-null object
Text                   85977 non-null object
Date                   85977 non-null object
Legislative Body       85977 non-null object
dtypes: object(5)
memory usage: 3.3+ MB
None


Unnamed: 0,Speaker_Bioguide_ID,Speaker_Name,Text,Date,Legislative Body
0,M001201,Mr. MITCHELL,"Mr. Speaker, I rise today in the spirit of Mad...",2017-07-20,House
1,B001250,Mr. BISHOP of Utah,"Mr. Speaker, I ask unanimous consent that all ...",2017-07-20,House
2,B001250,Mr. BISHOP of Utah,"Mr. Chair, I include in the Record my statemen...",2017-07-20,House
3,B001250,Mr. BISHOP of Utah,"Mr. Chair, I yield 5 minutes to the gentleman ...",2017-07-20,House
4,Y000033,Mr. YOUNG of Alaska,"Mr. Chairman, this is an issue that should hav...",2017-07-20,House


In [4]:
#1. we must have bio info on all the speakers
legis_info = json.load(open('legislator-info-1990-2020.json'))
legis_id_to_info = {}
for x in legis_info:
    legis_id_to_info[x['id']['bioguide']] = x
del legis_info

In [131]:
speakers_to_remove_based_on_non_availbility_of_bio_info = set() 
speakers = list(raw_data['Speaker_Bioguide_ID'])
for s in speakers:
    if s not in legis_id_to_info:
        speakers_to_remove_based_on_non_availbility_of_bio_info.add(s)
print(len(speakers_to_remove_based_on_non_availbility_of_bio_info))

1


In [132]:
raw_data = raw_data[~raw_data['Speaker_Bioguide_ID'].isin(speakers_to_remove_based_on_non_availbility_of_bio_info)]
print(raw_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85733 entries, 0 to 85976
Data columns (total 5 columns):
Speaker_Bioguide_ID    85733 non-null object
Speaker_Name           85733 non-null object
Text                   85733 non-null object
Date                   85733 non-null object
Legislative Body       85733 non-null object
dtypes: object(5)
memory usage: 3.9+ MB
None


In [133]:
#remove speakers if they gave less than 25 speeches - TBIP paper removed senators with less than 24 speeches.
speakers_to_remove_based_on_num_speeches = set()
speakers = set(raw_data['Speaker_Bioguide_ID'])
thresh = 25
for s in speakers:
    n_s = len(raw_data[raw_data['Speaker_Bioguide_ID']==s])
    if n_s < thresh:
        speakers_to_remove_based_on_num_speeches.add(s)
print(len(speakers_to_remove_based_on_num_speeches))

59


In [134]:
raw_data = raw_data[~raw_data['Speaker_Bioguide_ID'].isin(speakers_to_remove_based_on_num_speeches)]
print(raw_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85173 entries, 0 to 85976
Data columns (total 5 columns):
Speaker_Bioguide_ID    85173 non-null object
Speaker_Name           85173 non-null object
Text                   85173 non-null object
Date                   85173 non-null object
Legislative Body       85173 non-null object
dtypes: object(5)
memory usage: 3.9+ MB
None


In [18]:
# stopwords = set()
# names_in_cong_record = list(raw_data['Speaker_Name'])
# for n in names_in_cong_record:
#     l = n.split()
#     for x in l:
#         stopwords.add(x.lower())
# bioguide_ids = set(raw_data['Speaker_Bioguide_ID'])
# for bid in bioguide_ids:
#     name = list(legis_id_to_info[bid]['name'].values())
#     for x in name:
#         for z in x.split():
#             stopwords.add(z.lower().replace('"', '').replace("'", ''))
            
# #also add in the stopwords list used by TBIP paper authors to preprocess senate speeches data - it consists
# #all state names, cities, month names, days of week, and other stopwords/procedural terms - very useful. 
# stopwords_from_senate_speeches_tbip = open('../../setup/stopwords/senate_speeches.txt').readlines()
# stopwords_from_senate_speeches_tbip = list(map(lambda x:x.rstrip(), stopwords_from_senate_speeches_tbip))

# stopwords = stopwords.union(set(stopwords_from_senate_speeches_tbip))
# f = open('stopwords.txt', 'w')
# for i, x in enumerate(list(stopwords)):
#     f.write(x)
#     if i < len(stopwords) - 1:
#         f.write('\n')
# f.close()

In [19]:
stopwords = open('stopwords.txt', 'r').readlines()
stopwords = list(map(lambda x:x.rstrip(), stopwords))

555


In [None]:
#while it is possible to add more jargon terms perhaps, do not want to overdo stopwords, because words can be 
#highly contextual and have meaning.

In [99]:
#rest of the preprocessing is following the script provided in the TBIP repo by Vafa et al. - setup/senate_speeches_to_bag_of_words.py 
import os

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer

In [100]:
speakers = list(raw_data['Speaker_Bioguide_ID'])
print(len(speakers))
speeches = list(raw_data['Text'])
print(len(speeches))

85173
85173


In [101]:
speaker_to_speaker_id = dict(
    [(y, x) for x, y in enumerate(sorted(set(speakers)))])
author_indices = np.array(
    [speaker_to_speaker_id[s] for s in speakers])
print(len(author_indices))
author_map = np.array(list(speaker_to_speaker_id.keys()))
print(len(author_map))

85173
513


In [102]:
print(len(speeches))

85173


In [103]:
count_vectorizer = CountVectorizer(min_df=0.001,
                                   max_df=0.75, 
                                   stop_words=stopwords, 
                                   ngram_range=(1, 3),
                                   token_pattern="[a-zA-Z]+")
# Learn initial document term matrix. This is only initial because we use it to
# identify words to exclude based on author counts.
counts = count_vectorizer.fit_transform(speeches)
vocabulary = np.array(
    [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(), 
                            key=lambda kv: kv[1])])


  'stop_words.' % sorted(inconsistent))


In [104]:
print(counts.shape)
print(len(vocabulary))

(85173, 10588)
10588


In [105]:
author_to_inds = {}
for a in tqdm(list(author_map)):
    inds = []
    author_ind = speaker_to_speaker_id[a]
    for i, ind in enumerate(list(author_indices)):
        if ind==author_ind:
            inds.append(i)
    author_to_inds[a] = inds

100%|██████████| 513/513 [00:16<00:00, 30.23it/s]


In [106]:
def get_per_author_counts(counts, author_to_inds):
    list_of_arrays = []
    for a in author_to_inds:
        inds = author_to_inds[a]
        list_of_arrays.append(np.array(np.sum(counts[inds], 0)))#.reshape((1, counts.shape[1])))
    return np.concatenate(list_of_arrays, 0)

In [107]:
# Remove phrases spoken by less than 50 representatives
min_authors_per_word = 50
counts_per_author = get_per_author_counts(counts, author_to_inds)
print(counts_per_author.shape)

(513, 10588)


In [108]:
acceptable_words = []
for i in range(len(vocabulary)):
    if np.count_nonzero(counts_per_author[:, i]) >= min_authors_per_word:
        acceptable_words.append(i)
print(len(acceptable_words))

10151


In [109]:
count_vectorizer = CountVectorizer(ngram_range=(1, 3),
                                   vocabulary=vocabulary[acceptable_words])
counts = count_vectorizer.fit_transform(speeches)
vocabulary = np.array(
    [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(), 
                            key=lambda kv: kv[1])])

In [110]:
print(counts.shape)
print(len(vocabulary))

(85173, 10151)
10151


In [111]:
# `n_gram_to_unigram` takes as key an index to an n-gram in the vocabulary
# and its value is a list of the vocabulary indices of the corresponding 
# unigrams.
n_gram_indices = np.where(
  np.array([len(word.split(' ')) for word in vocabulary]) > 1)[0]
n_gram_to_unigrams = {}
for n_gram_index in n_gram_indices:
    matching_unigrams = []
    for unigram in vocabulary[n_gram_index].split(' '):
        if unigram in vocabulary:
            matching_unigrams.append(np.where(vocabulary == unigram)[0][0])
    n_gram_to_unigrams[n_gram_index] = matching_unigrams

# `n_grams_to_bigrams` now breaks apart trigrams and higher to find bigrams 
# as subsets of these words.
n_grams_to_bigrams = {}
for n_gram_index in n_gram_indices:
    split_n_gram = vocabulary[n_gram_index].split(' ')
    n_gram_length = len(split_n_gram) 
    if n_gram_length > 2:
        bigram_matches = []
        for i in range(0, n_gram_length - 1):
            bigram = " ".join(split_n_gram[i:(i + 2)])
            if bigram in vocabulary:
                bigram_matches.append(np.where(vocabulary == bigram)[0][0])
        n_grams_to_bigrams[n_gram_index] = bigram_matches

In [112]:
# Go through counts, and remove a unigram each time a bigram superset 
# appears. Also remove a bigram each time a trigram superset appears.
# Note this isn't perfect: if bigrams overlap (e.g. "global health care" 
# contains "global health" and "health care"), we count them both. This
# may introduce a problem where we subract a unigram count twice, so we also
# ensure non-negativity.
#counts_dense = counts.toarray()
for i in tqdm(range(counts.shape[0])):
    n_grams_in_doc = np.where(counts[i, n_gram_indices].toarray() > 0)[0]
    sub_n_grams = n_gram_indices[n_grams_in_doc]
    for n_gram in sub_n_grams:
        counts[i, n_gram_to_unigrams[n_gram]] = sparse.csr_matrix(counts[i, n_gram_to_unigrams[n_gram]].toarray() - counts[i, n_gram])
        if n_gram in n_grams_to_bigrams:
            counts[i, n_grams_to_bigrams[n_gram]] = sparse.csr_matrix(counts[i, n_grams_to_bigrams[n_gram]].toarray() - counts[i, n_gram])
counts[counts < 0] = 0

100%|██████████| 85173/85173 [04:08<00:00, 342.53it/s]


In [113]:
print(counts.shape)

(85173, 10151)


In [116]:
# Remove speeches with no words.
existing_speeches = []#np.where(np.sum(counts_dense, axis=1) > 0)[0]
for i in tqdm(range(counts.shape[0])):
    if counts[i].sum() > 0:
        existing_speeches.append(i)
counts = counts[existing_speeches]
print(counts.shape)
author_indices = author_indices[existing_speeches]
print(author_indices.shape)

100%|██████████| 85173/85173 [00:22<00:00, 3744.90it/s]

(84063, 10151)
(84063,)





In [122]:
# Save data.

# `counts.npz` is a [num_documents, num_words] sparse matrix containing the
# word counts for each document.
sparse.save_npz("clean/counts.npz",
                counts.astype(np.float32))

# `author_indices.npy` is a [num_documents] vector where each entry is an
# integer indicating the author of the corresponding document.
np.save("clean/author_indices.npy", author_indices)

# `vocabulary.txt` is a [num_words] vector where each entry is a string
# denoting the corresponding word in the vocabulary.
np.savetxt("clean/vocabulary.txt", vocabulary, fmt="%s")

# `author_map.txt` is a [num_authors] vector of strings providing the bioguide ID of
# each author in the corpus.
np.savetxt("clean/author_map.txt", author_map, fmt="%s")

In [None]:
# `raw_documents.txt` contains all the documents we ended up using.
raw_documents = [document.replace("\n", ' ').replace("\r", ' ') 
                 for i, document in enumerate(speeches) if i in existing_speeches]

In [124]:
print(len(raw_documents))

84063


In [125]:
f = open('clean/raw_documents.txt', 'w')
for i, doc in enumerate(raw_documents):
    f.write(doc)
    if i < len(raw_documents) - 1:
        f.write('\n')
f.close()

In [127]:
len(existing_speeches)

84063

In [136]:
raw_data = raw_data.iloc[existing_speeches]
print(raw_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84063 entries, 0 to 85975
Data columns (total 5 columns):
Speaker_Bioguide_ID    84063 non-null object
Speaker_Name           84063 non-null object
Text                   84063 non-null object
Date                   84063 non-null object
Legislative Body       84063 non-null object
dtypes: object(5)
memory usage: 3.8+ MB
None


In [137]:
raw_data.to_csv('finalized_tbip_speech_set_raw_original_data_floor_speeches_house.csv', index=False)
#save this if needed. 