## The Baseline of Baselines: LDA on speaker bigrams in the 111th Congress

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# setting up file system
data_path = "gs://rwc1/data/"

In [20]:
# not informative, os can't detect file existence in a gs bucket.
os.path.isfile(os.path.join(data_path, "hein-bound/byspeaker_2gram_110.txt"))

False

In order to run these you need to have the pythonic file system installed on your vm: 
`pip install gcsfs`

In [25]:
# import master vocaulary with bigram classifications
bigrams_master = bigrams = pd.read_csv(os.path.join(data_path, "vocabulary/master_list.txt"), sep = "|")

# improt speaker bigram for session X
speaker_bigrams = pd.read_csv(os.path.join(data_path, "hein-bound/byspeaker_2gram_111.txt"), sep = "|")

# import speaker map for session X
speaker_map = pd.read_csv(os.path.join(data_path, "hein-bound/111_SpeakerMap.txt"), sep = "|")

In [23]:
# Note that the files are being retrieved from here:
print(os.path.join(data_path, "vocabulary/master_list.txt"))
print(os.path.join(data_path, "hein-bound/byspeaker_2gram_111.txt"))

gs://rwc1/data/vocabulary/master_list.txt
gs://rwc1/data/hein-bound/byspeaker_2gram_110.txt


The rest should be the same because we are computing in memory hereafter.

In [27]:
print(bigrams_master.shape)
bigrams_master.head()

(6822118, 2)


Unnamed: 0,phrase,_classify
0,0 0,bad_syntax
1,0 00,bad_syntax
2,0 000,bad_syntax
3,0 0000,bad_syntax
4,0 00000,bad_syntax


In [28]:
print(speaker_bigrams.shape)
speaker_bigrams.head()

(10467042, 3)


Unnamed: 0,speakerid,phrase,count
0,111113931,0 01,1
1,111113931,0 fervent,1
2,111113931,0 hope,1
3,111113931,0 sugar,1
4,111113931,000 20,1


In [29]:
print(speaker_map.shape)
speaker_map.head()

(112550, 10)


Unnamed: 0,speakerid,speech_id,lastname,firstname,chamber,state,gender,party,district,nonvoting
0,111120160,1110000007,LARSON,JOHN,H,CT,M,D,1.0,voting
1,111117010,1110000009,PENCE,MIKE,H,IN,M,R,6.0,voting
2,111118060,1110000013,BOEHNER,JOHN,H,OH,M,R,8.0,voting
3,111120780,1110000014,PELOSI,NANCY,H,CA,F,D,8.0,voting
4,111119830,1110000015,HOYER,STENY,H,MD,M,D,5.0,voting


In [33]:
# import speeches from session X
# line 55163 removed extar pipe "vote. (|) 1116 Messrs"
session_speeches = bigrams = pd.read_csv(
    os.path.join(data_path, "hein-bound/speeches_111.txt"), sep = "|", encoding = "ISO-8859-1")

ParserError: Error tokenizing data. C error: Expected 2 fields in line 55163, saw 3


In [None]:
print(session_speeches.shape)
session_speeches.head()

In [12]:
print("session_bigrams dim", speaker_bigrams.shape)
print("bigrams_master dim", bigrams_master.shape)

session_bigrams dim (10467042, 3)
bigrams_master dim (6822118, 2)


In [58]:
# generating valid session bigrams
session_bigrams = (speaker_bigrams
                   .merge(bigrams_master, how = "inner", on ="phrase")
                  )

In [59]:
valid_session_bigrams = session_bigrams[session_bigrams['_classify'] == "vocab"]

In [70]:
valid_session_bigrams.head()

Unnamed: 0,speakerid,phrase,count,_classify
16135,111113931,abil produc,1,vocab
16136,111114091,abil produc,2,vocab
16137,111114101,abil produc,1,vocab
16138,111115360,abil produc,1,vocab
16139,111116310,abil produc,2,vocab


In [85]:
# iteration ignoring phrase counts
# speaker_phrases = valid_session_bigrams.groupby('speakerid')['phrase'].apply(lambda x: "[%s]" % ', '.join(x))
speaker_phrases = valid_session_bigrams.groupby('speakerid')['phrase'].apply(list)

In [103]:
speaker_phrases#.apply(len).median()

speakerid
111113931    [abil produc, abil secur, abil strike, abl ass...
111113951    [accomplish thing, accord cbo, across aisl, ac...
111113981    [abl provid, abus practic, accord cbo, account...
111114011    [abl provid, abort right, academ institut, acc...
111114021    [academ standard, accomplish just, achiev awar...
111114091    [abil produc, abl provid, abl us, abus practic...
111114101    [abil produc, abl contribut, abl provid, accom...
111114121    [academ success, accept premis, accord cbo, ac...
111114171    [abl assess, accomplish just, accomplish thing...
111114321    [abl effect, abl provid, accomplish busi, achi...
111114331    [accomplish jurist, across america, across cou...
111114451    [baghdad iraq, better credit, bill also, commu...
111114511    [abl contribut, abort right, abund natur, acad...
111114561    [abl provid, account offic, across aisl, acros...
111114621    [account offic, across countri, across state, ...
111114651    [across state, admir much, advoc

In [90]:
dictionary = gensim.corpora.Dictionary(speaker_phrases)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abil produc
1 abil secur
2 abil strike
3 abl assess
4 abl contribut
5 abl effect
6 abl provid
7 abl us
8 abort right
9 abroad carri
10 abund natur


In [92]:
import gensim
bow_corpus = [dictionary.doc2bow(doc) for doc in speaker_phrases]

In [94]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [106]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=100, workers=2)

In [107]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.000*"need victim" + 0.000*"communiti safeti" + 0.000*"congress commiss" + 0.000*"maintain system" + 0.000*"help survivor" + 0.000*"state dc" + 0.000*"illeg trade" + 0.000*"servic crime" + 0.000*"campus public" + 0.000*"year rebuild"
Topic: 1 
Words: 0.000*"troubl troubl" + 0.000*"head execut" + 0.000*"give thing" + 0.000*"japan korea" + 0.000*"like illeg" + 0.000*"told thing" + 0.000*"spend rememb" + 0.000*"bill hadnt" + 0.000*"elect that" + 0.000*"regul theyr"
Topic: 2 
Words: 0.000*"engin depart" + 0.000*"among strongest" + 0.000*"colleg largest" + 0.000*"servic extraordinari" + 0.000*"sad learn" + 0.000*"becom ever" + 0.000*"firefight everi" + 0.000*"colombia remain" + 0.000*"leftbehind parent" + 0.000*"gave nation"
Topic: 3 
Words: 0.000*"secur imper" + 0.000*"impact unit" + 0.000*"understand occur" + 0.000*"world requir" + 0.000*"stryker vehicl" + 0.000*"author servic" + 0.000*"iran becom" + 0.000*"fuel tank" + 0.000*"support must" + 0.000*"area natur"
Topic: 4 

In [None]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [55]:
# speaker with most speeches
speaker_map.groupby('speakerid').agg({'speech_id':'count'}).idxmax()

speech_id    111120961
dtype: int64

In [53]:
# bigram classes
session_bigrams["_classify"].unique()

array(['bad_syntax', 'roberts_and_riddicks', 'co-occurring', 'riddicks',
       'vocab', 'stopword', 'roberts'], dtype=object)

In [38]:
# session speaker count
speaker_map['speakerid'].unique().shape[0]

557

In [46]:
speaker_map.drop(['speech_id'], axis=1).drop_duplicates().head()

Unnamed: 0,speakerid,lastname,firstname,chamber,state,gender,party,district,nonvoting
0,111120160,LARSON,JOHN,H,CT,M,D,1.0,voting
1,111117010,PENCE,MIKE,H,IN,M,R,6.0,voting
2,111118060,BOEHNER,JOHN,H,OH,M,R,8.0,voting
3,111120780,PELOSI,NANCY,H,CA,F,D,8.0,voting
4,111119830,HOYER,STENY,H,MD,M,D,5.0,voting
