In [1]:
import pandas as pd

# Read the pre-processed Congressional Record speeches and immediately draw a
# fixed-seed random subset of 50,000 rows, so every later cell works on the
# same sample each time the notebook is re-run.
df = pd.read_csv(
    '../data/us_congressional_record/us_congress_speeches_processed.csv'
).sample(n=50000, random_state=42)
df

Unnamed: 0.1,Unnamed: 0,doc,party,speech_year,doc_clean
691903,691903,"Speaker, I would like to express my strong su...",Republican,2005,like express support convey land use heliport ...
351367,351367,"Speaker, this bill, H R. 2277 authorizes $75,...",Democrat,1979,bill authorize airport airway trust fund year ...
250310,250310,"Speaker, I appreciate having this opportunity...",Democrat,1972,appreciate have opportunity discuss colleague ...
644446,644446,The gentleman is absolutely correct. I rememb...,Democrat,2000,gentleman remember friend say start road life ...
803736,803736,I thank the gentleman from Florida for some o...,Republican,2002,thank gentleman opportunity share passion view...
...,...,...,...,...,...
79650,79650,"Chairman, I rise in opposition to the amendme...",Democrat,1958,rise opposition amendment offer gentleman answ...
544086,544086,"Speaker, there is one central question that m...",Republican,1993,question answer people issue health care trust...
856936,856936,"Speaker, I very much appreciate the gentleman...",Republican,2003,appreciate gentleman come discuss tonight brin...
811229,811229,"Chairman, will the gentlewoman yield? Chairm...",Democrat,1997,gentlewoman yield like reemphasize gentlewoman...


In [2]:
import sys

# The GTM sources live in a sibling directory rather than an installed package,
# so that directory is put on the import path before importing from it.
sys.path.append('../gtm/')
from corpus import GTMCorpus
from gtm import GTM

# Build the training corpus from the sampled speeches. The same formula string
# (party plus a categorical speech year) is supplied for both the prevalence
# and the content covariates.
train_dataset = GTMCorpus(
    df,
    content="~ party + C(speech_year)",
    prevalence="~ party + C(speech_year)",
)

# Dimensions of the resulting prevalence covariate matrix.
train_dataset.M_prevalence_covariates.shape

  from .autonotebook import tqdm as notebook_tqdm


(50000, 95)

In [3]:
# Inspect the prevalence covariate matrix that GTMCorpus built from the formula
# (one row per sampled speech, one column per entry of prevalence_colnames).
train_dataset.M_prevalence_covariates

array([[1., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [4]:
# Labels of the prevalence covariates (intercept, party dummy, year dummies).
# NOTE(review): presumably listed in the same order as the columns of
# M_prevalence_covariates — confirm in GTMCorpus.
train_dataset.prevalence_colnames

['Intercept',
 'party[T.Republican]',
 'C(speech_year)[T.1921]',
 'C(speech_year)[T.1922]',
 'C(speech_year)[T.1923]',
 'C(speech_year)[T.1924]',
 'C(speech_year)[T.1925]',
 'C(speech_year)[T.1926]',
 'C(speech_year)[T.1928]',
 'C(speech_year)[T.1929]',
 'C(speech_year)[T.1930]',
 'C(speech_year)[T.1931]',
 'C(speech_year)[T.1932]',
 'C(speech_year)[T.1933]',
 'C(speech_year)[T.1934]',
 'C(speech_year)[T.1935]',
 'C(speech_year)[T.1936]',
 'C(speech_year)[T.1937]',
 'C(speech_year)[T.1938]',
 'C(speech_year)[T.1939]',
 'C(speech_year)[T.1940]',
 'C(speech_year)[T.1941]',
 'C(speech_year)[T.1942]',
 'C(speech_year)[T.1943]',
 'C(speech_year)[T.1944]',
 'C(speech_year)[T.1945]',
 'C(speech_year)[T.1946]',
 'C(speech_year)[T.1947]',
 'C(speech_year)[T.1948]',
 'C(speech_year)[T.1949]',
 'C(speech_year)[T.1950]',
 'C(speech_year)[T.1951]',
 'C(speech_year)[T.1952]',
 'C(speech_year)[T.1953]',
 'C(speech_year)[T.1954]',
 'C(speech_year)[T.1955]',
 'C(speech_year)[T.1956]',
 'C(speech_year)[

In [5]:
# Train the model.
# Constructing GTM is what runs the fit: the per-epoch loss and top-word log
# shown under this cell is produced by this call. In the recorded run the
# training stopped early at epoch 14 and reverted to epoch 9, well before
# num_epochs was reached.
tm = GTM(
    train_dataset, 
    n_topics=50,
    doc_topic_prior='dirichlet', # author's note names 'logistic_normal' as the other choice
    alpha=0.02,  # presumably the concentration of the Dirichlet prior — TODO confirm in GTM
    prevalence_covariates_regularization=0.1,
    update_prior=True,  # NOTE(review): looks like the prior is re-estimated from the prevalence covariates — confirm
    encoder_hidden_layers=[], # structure of the encoder neural net (no hidden layer)
    decoder_hidden_layers=[300], # structure of the decoder neural net (one hidden layer of width 300)
    num_epochs=100,  # upper bound on the number of epochs
    print_every=10000,
    log_every=1,  # the recorded log has one entry per epoch
    w_prior=None,
    batch_size=250
)


Epoch   1	Mean Training Loss:5235.6164893

Topic_0: ['bill', 'gentleman', 'go', 'year', 'amendment']
Topic_1: ['bill', 'year', 'program', 'provide', 'increase']
Topic_2: ['bill', 'gentleman', 'year', 'go', 'people']
Topic_3: ['bill', 'year', 'go', 'gentleman', 'amendment']
Topic_4: ['bill', 'gentleman', 'go', 'time', 'say']
Topic_5: ['bill', 'gentleman', 'year', 'go', 'make']
Topic_6: ['bill', 'year', 'go', 'say', 'make']
Topic_7: ['bill', 'gentleman', 'year', 'amendment', 'make']
Topic_8: ['bill', 'say', 'people', 'year', 'make']
Topic_9: ['bill', 'year', 'gentleman', 'people', 'go']
Topic_10: ['bill', 'go', 'year', 'make', 'time']
Topic_11: ['gentleman', 'bill', 'year', 'go', 'say']
Topic_12: ['bill', 'go', 'gentleman', 'year', 'say']
Topic_13: ['bill', 'gentleman', 'people', 'year', 'go']
Topic_14: ['bill', 'go', 'amendment', 'program', 'year']
Topic_15: ['bill', 'gentleman', 'amendment', 'go', 'year']
Topic_16: ['bill', 'go', 'year', 'make', 'say']
Topic_17: ['gentleman', 'bill', 


Epoch   2	Mean Training Loss:1190.1162759

Topic_0: ['gentleman', 'amendment', 'yield', 'think', 'bill']
Topic_1: ['program', 'increase', 'cost', 'industry', 'price']
Topic_2: ['gentleman', 'yield', 'amendment', 'think', 'say']
Topic_3: ['gentleman', 'amendment', 'yield', 'bill', 'committee']
Topic_4: ['gentleman', 'yield', 'amendment', 'think', 'say']
Topic_5: ['gentleman', 'amendment', 'bill', 'yield', 'say']
Topic_6: ['bill', 'year', 'make', 'go', 'time']
Topic_7: ['gentleman', 'amendment', 'bill', 'committee', 'yield']
Topic_8: ['bill', 'program', 'year', 'make', 'provide']
Topic_9: ['people', 'year', 'say', 'go', 'time']
Topic_10: ['year', 'people', 'nation', 'country', 'support']
Topic_11: ['gentleman', 'yield', 'say', 'amendment', 'think']
Topic_12: ['gentleman', 'amendment', 'yield', 'bill', 'committee']
Topic_13: ['gentleman', 'amendment', 'yield', 'bill', 'think']
Topic_14: ['people', 'man', 'freedom', 'life', 'honor']
Topic_15: ['amendment', 'rule', 'committee', 'motion', '


Epoch   3	Mean Training Loss:1000.8871356

Topic_0: ['gentleman', 'yield', 'think', 'amendment', 'thank']
Topic_1: ['price', 'industry', 'market', 'cost', 'program']
Topic_2: ['gentleman', 'yield', 'think', 'thank', 'say']
Topic_3: ['gentleman', 'amendment', 'yield', 'consent', 'bill']
Topic_4: ['gentleman', 'yield', 'think', 'amendment', 'minute']
Topic_5: ['gentleman', 'yield', 'thank', 'amendment', 'time']
Topic_6: ['child', 'health', 'school', 'family', 'education']
Topic_7: ['authorize', 'project', 'authorization', 'section', 'provision']
Topic_8: ['program', 'increase', 'cost', 'bill', 'income']
Topic_9: ['people', 'know', 'say', 'man', 'friend']
Topic_10: ['serve', 'honor', 'man', 'freedom', 'nation']
Topic_11: ['gentleman', 'yield', 'think', 'say', 'thank']
Topic_12: ['gentleman', 'amendment', 'yield', 'bill', 'committee']
Topic_13: ['gentleman', 'yield', 'amendment', 'thank', 'think']
Topic_14: ['honor', 'man', 'freedom', 'tribute', 'love']
Topic_15: ['rule', 'thereto', 'amen


Epoch   4	Mean Training Loss:908.9577148

Topic_0: ['gentleman', 'yield', 'commodity', 'farmer', 'price']
Topic_1: ['cost', 'market', 'price', 'industry', 'import']
Topic_2: ['gentleman', 'yield', 'thank', 'gentlewoman', 'think']
Topic_3: ['gentleman', 'amendment', 'yield', 'tempore', 'consent']
Topic_4: ['gentleman', 'yield', 'amendment', 'think', 'tempore']
Topic_5: ['school', 'child', 'student', 'woman', 'teacher']
Topic_6: ['student', 'school', 'education', 'teacher', 'child']
Topic_7: ['water', 'project', 'recreation', 'authorize', 'development']
Topic_8: ['income', 'tax', 'benefit', 'increase', 'cost']
Topic_9: ['say', 'people', 'go', 'know', 'get']
Topic_10: ['serve', 'honor', 'service', 'tribute', 'dedication']
Topic_11: ['gentleman', 'yield', 'thank', 'friend', 'gentlewoman']
Topic_12: ['amendment', 'gentleman', 'appropriation', 'bill', 'yield']
Topic_13: ['gentleman', 'yield', 'tempore', 'amendment', 'objection']
Topic_14: ['passing', 'honor', 'tribute', 'love', 'hero']
Topi


Epoch   5	Mean Training Loss:876.7112225

Topic_0: ['income', 'price', 'tax', 'industry', 'farmer']
Topic_1: ['industry', 'oil', 'market', 'import', 'price']
Topic_2: ['gentleman', 'yield', 'thank', 'gentlewoman', 'appreciate']
Topic_3: ['amendment', 'tempore', 'designate', 'thereto', 'vessel']
Topic_4: ['gentleman', 'yield', 'amendment', 'gentlewoman', 'reclaim']
Topic_5: ['school', 'teacher', 'student', 'parent', 'child']
Topic_6: ['student', 'school', 'education', 'teacher', 'classroom']
Topic_7: ['water', 'recreation', 'project', 'feasibility', 'irrigation']
Topic_8: ['income', 'benefit', 'tax', 'coverage', 'increase']
Topic_9: ['say', 'people', 'go', 'know', 'thing']
Topic_10: ['honor', 'serve', 'career', 'dedication', 'service']
Topic_11: ['gentleman', 'thank', 'yield', 'friend', 'miss']
Topic_12: ['appropriation', 'bill', 'amendment', 'motion', 'fund']
Topic_13: ['industry', 'market', 'import', 'consumer', 'price']
Topic_14: ['passing', 'love', 'honor', 'tribute', 'hero']
Topic


Epoch   6	Mean Training Loss:888.3719089

Topic_0: ['surplus', 'tax', 'income', 'cent', 'price']
Topic_1: ['industry', 'import', 'market', 'oil', 'production']
Topic_2: ['gentleman', 'yield', 'thank', 'gentlewoman', 'appreciate']
Topic_3: ['vessel', 'wildlife', 'contamination', 'water', 'powerplant']
Topic_4: ['gentleman', 'yield', 'amendment', 'gentlewoman', 'reclaim']
Topic_5: ['school', 'teacher', 'child', 'parent', 'student']
Topic_6: ['student', 'school', 'education', 'teacher', 'program']
Topic_7: ['water', 'recreation', 'reservoir', 'project', 'irrigation']
Topic_8: ['income', 'coverage', 'tax', 'benefit', 'premium']
Topic_9: ['say', 'go', 'get', 'people', 'thing']
Topic_10: ['serve', 'honor', 'career', 'service', 'dedication']
Topic_11: ['gentleman', 'thank', 'friend', 'miss', 'tribute']
Topic_12: ['appropriation', 'bill', 'spending', 'budget', 'revenue']
Topic_13: ['industry', 'import', 'market', 'price', 'oil']
Topic_14: ['love', 'man', 'wife', 'passing', 'honor']
Topic_15: 


Epoch   7	Mean Training Loss:924.6463876

Topic_0: ['surplus', 'cent', 'income', 'tax', 'farmer']
Topic_1: ['industry', 'market', 'import', 'oil', 'production']
Topic_2: ['gentleman', 'yield', 'thank', 'gentlewoman', 'appreciate']
Topic_3: ['contamination', 'vessel', 'evaporation', 'centiliter', 'wildlife']
Topic_4: ['gentleman', 'yield', 'gentlewoman', 'amendment', 'thank']
Topic_5: ['school', 'student', 'child', 'pregnancy', 'teacher']
Topic_6: ['student', 'education', 'school', 'teacher', 'program']
Topic_7: ['water', 'reservoir', 'navigation', 'feasibility', 'recreation']
Topic_8: ['coverage', 'income', 'benefit', 'premium', 'insurance']
Topic_9: ['say', 'go', 'get', 'people', 'thing']
Topic_10: ['serve', 'honor', 'career', 'dedication', 'service']
Topic_11: ['gentleman', 'thank', 'love', 'friend', 'miss']
Topic_12: ['appropriation', 'bill', 'spending', 'revenue', 'motion']
Topic_13: ['import', 'industry', 'market', 'oil', 'price']
Topic_14: ['love', 'man', 'wife', 'honor', 'tribu


Epoch   8	Mean Training Loss:875.2583270

Topic_0: ['cent', 'surplus', 'tax', 'income', 'farmer']
Topic_1: ['industry', 'market', 'import', 'produce', 'percent']
Topic_2: ['gentleman', 'yield', 'thank', 'gentlewoman', 'appreciate']
Topic_3: ['contamination', 'steelmaking', 'evaporation', 'centiliter', 'vessel']
Topic_4: ['gentleman', 'yield', 'gentlewoman', 'reclaim', 'amendment']
Topic_5: ['school', 'student', 'desegregation', 'pregnancy', 'child']
Topic_6: ['student', 'education', 'school', 'teacher', 'program']
Topic_7: ['water', 'feasibility', 'reservoir', 'desalt', 'recreation']
Topic_8: ['coverage', 'premium', 'income', 'insurance', 'benefit']
Topic_9: ['say', 'go', 'get', 'people', 'thing']
Topic_10: ['serve', 'career', 'achievement', 'honor', 'dedication']
Topic_11: ['love', 'wife', 'miss', 'friend', 'son']
Topic_12: ['appropriation', 'spending', 'bill', 'revenue', 'motion']
Topic_13: ['import', 'oil', 'industry', 'price', 'production']
Topic_14: ['love', 'man', 'freedom', 'so


Epoch   9	Mean Training Loss:857.0579782

Topic_0: ['cent', 'surplus', 'tax', 'farmer', 'income']
Topic_1: ['industry', 'market', 'cost', 'percent', 'import']
Topic_2: ['gentleman', 'yield', 'thank', 'gentlewoman', 'friend']
Topic_3: ['contamination', 'evaporation', 'consumptive', 'allencompassing', 'steelmaking']
Topic_4: ['gentleman', 'yield', 'gentlewoman', 'reclaim', 'tempore']
Topic_5: ['school', 'pregnancy', 'desegregation', 'violence', 'student']
Topic_6: ['student', 'education', 'school', 'teacher', 'college']
Topic_7: ['water', 'desalt', 'recreation', 'navigation', 'reservoir']
Topic_8: ['coverage', 'premium', 'income', 'insurance', 'benefit']
Topic_9: ['go', 'say', 'get', 'people', 'thing']
Topic_10: ['achievement', 'service', 'veteran', 'award', 'serve']
Topic_11: ['love', 'son', 'hero', 'humor', 'friend']
Topic_12: ['appropriation', 'gentleman', 'motion', 'spending', 'amendment']
Topic_13: ['import', 'oil', 'steel', 'trade', 'price']
Topic_14: ['love', 'freedom', 'man', 'g


Epoch  10	Mean Training Loss:859.7552722

Topic_0: ['cent', 'farmer', 'surplus', 'wheat', 'bushel']
Topic_1: ['industry', 'market', 'cost', 'product', 'import']
Topic_2: ['gentleman', 'friend', 'yield', 'thank', 'gentlewoman']
Topic_3: ['contamination', 'greenway', 'upcountry', 'essex', 'mineout']
Topic_4: ['gentleman', 'yield', 'gentlewoman', 'reclaim', 'tempore']
Topic_5: ['school', 'pregnancy', 'desegregation', 'child', 'student']
Topic_6: ['student', 'education', 'school', 'teacher', 'college']
Topic_7: ['water', 'desalination', 'desalt', 'navigation', 'energy']
Topic_8: ['coverage', 'premium', 'insurance', 'income', 'benefit']
Topic_9: ['go', 'say', 'get', 'people', 'thing']
Topic_10: ['achievement', 'service', 'veteran', 'graduate', 'career']
Topic_11: ['love', 'hero', 'son', 'wife', 'humor']
Topic_12: ['appropriation', 'gentleman', 'motion', 'amendment', 'spending']
Topic_13: ['import', 'steel', 'oil', 'trade', 'export']
Topic_14: ['love', 'man', 'son', 'courage', 'hero']
Topic


Epoch  11	Mean Training Loss:865.7283594

Topic_0: ['cent', 'bushel', 'farmer', 'wheat', 'surplus']
Topic_1: ['industry', 'cost', 'product', 'market', 'percent']
Topic_2: ['gentleman', 'friend', 'thank', 'yield', 'gentlewoman']
Topic_3: ['sulphur', 'ecosystem', 'steamboat', 'contamination', 'evaporator']
Topic_4: ['gentleman', 'yield', 'gentlewoman', 'reclaim', 'tempore']
Topic_5: ['pregnancy', 'school', 'child', 'desegregation', 'crime']
Topic_6: ['student', 'teacher', 'education', 'school', 'classroom']
Topic_7: ['water', 'desalt', 'oil', 'energy', 'navigation']
Topic_8: ['coverage', 'premium', 'insurance', 'income', 'beneficiary']
Topic_9: ['go', 'get', 'say', 'money', 'people']
Topic_10: ['achievement', 'service', 'graduate', 'veteran', 'award']
Topic_11: ['championship', 'love', 'hero', 'sing', 'sadness']
Topic_12: ['appropriation', 'motion', 'gentleman', 'spending', 'amendment']
Topic_13: ['steel', 'import', 'trade', 'export', 'industry']
Topic_14: ['freedom', 'man', 'love', 'co


Epoch  12	Mean Training Loss:868.1837610

Topic_0: ['cent', 'wheat', 'bushel', 'farmer', 'surplus']
Topic_1: ['industry', 'cost', 'percent', 'market', 'product']
Topic_2: ['gentleman', 'friend', 'passing', 'thank', 'yield']
Topic_3: ['ecosystem', 'monitoring', 'escapement', 'contamination', 'neutrino']
Topic_4: ['gentleman', 'yield', 'tempore', 'gentlewoman', 'reclaim']
Topic_5: ['pregnancy', 'child', 'school', 'crime', 'desegregation']
Topic_6: ['student', 'education', 'teacher', 'school', 'classroom']
Topic_7: ['water', 'energy', 'desalt', 'reactor', 'oil']
Topic_8: ['premium', 'coverage', 'income', 'insurance', 'beneficiary']
Topic_9: ['go', 'get', 'people', 'money', 'pay']
Topic_10: ['service', 'achievement', 'veteran', 'graduate', 'student']
Topic_11: ['championship', 'genocide', 'mourn', 'sing', 'coach']
Topic_12: ['motion', 'spending', 'appropriation', 'gentleman', 'recommit']
Topic_13: ['steel', 'import', 'trade', 'export', 'industry']
Topic_14: ['man', 'freedom', 'parade', 'l


Epoch  13	Mean Training Loss:867.7683496

Topic_0: ['bushel', 'cent', 'wheat', 'farmer', 'corn']
Topic_1: ['industry', 'percent', 'cost', 'market', 'product']
Topic_2: ['gentleman', 'passing', 'thank', 'bereavement', 'friend']
Topic_3: ['ecosystem', 'monitoring', 'escapement', 'evaporation', 'btu']
Topic_4: ['gentleman', 'yield', 'reclaim', 'gentlewoman', 'tempore']
Topic_5: ['desegregation', 'child', 'crime', 'school', 'teacher']
Topic_6: ['student', 'education', 'teacher', 'school', 'classroom']
Topic_7: ['water', 'energy', 'desalt', 'distillation', 'desalination']
Topic_8: ['coverage', 'premium', 'income', 'beneficiary', 'insurance']
Topic_9: ['care', 'go', 'percent', 'money', 'get']
Topic_10: ['service', 'achievement', 'veteran', 'skill', 'graduate']
Topic_11: ['hero', 'championship', 'love', 'mourn', 'sing']
Topic_12: ['spending', 'appropriation', 'budget', 'amendment', 'conferee']
Topic_13: ['steel', 'trade', 'import', 'export', 'oil']
Topic_14: ['freedom', 'man', 'love', 'hero'


Epoch  14	Mean Training Loss:864.3323972

Topic_0: ['cent', 'dairy', 'bushel', 'corn', 'barley']
Topic_1: ['percent', 'cost', 'industry', 'market', 'price']
Topic_2: ['gentleman', 'passing', 'thank', 'championship', 'friend']
Topic_3: ['evaporation', 'ecosystem', 'escapement', 'reliction', 'technology']
Topic_4: ['gentleman', 'yield', 'reclaim', 'gentlewoman', 'tempore']
Topic_5: ['offender', 'child', 'desegregation', 'pregnancy', 'crime']
Topic_6: ['education', 'student', 'teacher', 'school', 'classroom']
Topic_7: ['water', 'reactor', 'energy', 'lake', 'desalination']
Topic_8: ['coverage', 'premium', 'income', 'pension', 'beneficiary']
Topic_9: ['care', 'profit', 'percent', 'go', 'money']
Topic_10: ['service', 'achievement', 'art', 'graduate', 'veteran']
Topic_11: ['championship', 'hero', 'mourn', 'love', 'sadness']
Topic_12: ['appropriation', 'recommit', 'conferee', 'motion', 'waive']
Topic_13: ['steel', 'trade', 'export', 'oil', 'import']
Topic_14: ['freedom', 'hero', 'love', 'hono

Early stopping at Epoch 14. Reverting to Epoch 9


In [17]:
# Assess the quality of the learned word embeddings:
# list the 8 vocabulary words that score highest against a specific query word.

import torch
import torch.nn.functional as F

specific_word = 'patriot'

# Position of the query word in the training vocabulary. list.index raises a
# ValueError that names the missing word when it is out of vocabulary.
word_id = list(train_dataset.vocab).index(specific_word)

# Put the network in evaluation mode before reading anything from it.
tm.AutoEncoder.eval()

# Pure inspection of trained weights: no autograd graph is needed.
with torch.no_grad():
    # Transposed weight of the decoder layer named 'dec_1'; column `word_id`
    # is used as the embedding of the query word.
    # NOTE(review): assumes 'dec_1' is the layer that maps to the vocabulary — confirm in GTM.
    words = tm.AutoEncoder.decoder['dec_1'].weight.T

    # Dot product between the query word's vector and every word vector.
    logit = torch.matmul(words.T[word_id], words)

    # `logit` has a single axis; naming it explicitly avoids the
    # implicit-dimension deprecation warning of F.softmax.
    beta = F.softmax(logit, dim=0)

    # Indices of the 8 highest-scoring words (the query word itself included).
    indices = torch.topk(beta, 8).indices

indices = indices.cpu().tolist()
[tm.id2token[idx] for idx in indices]

  beta = F.softmax(logit)


['patriot',
 'heroism',
 'hero',
 'abolitionist',
 'comrade',
 'invader',
 'glory',
 'oppressor']

In [18]:
# Estimate covariate effects on the topics; the displayed result has one row
# per (topic, covariate) pair with `mean` and `sd` columns.
# n_samples=10 matches the 10 iterations of the progress bar (roughly 55 s each
# in the recorded run, so this cell is slow).
# NOTE(review): topic_ids=None presumably means "all topics" — confirm in GTM.estimate_effect.
dfc = tm.estimate_effect(train_dataset, n_samples=10, topic_ids=None)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [09:13<00:00, 55.31s/it]


In [30]:
# Keep only the effect estimates that belong to topic 49.
dfc.loc[dfc['topic'].eq(49)]

Unnamed: 0,topic,covariate,mean,sd
4655,49,Intercept,0.029131,0.004138
4656,49,party[T.Republican],0.001078,0.000076
4657,49,C(speech_year)[T.1921],-0.000941,0.004953
4658,49,C(speech_year)[T.1922],-0.007769,0.005213
4659,49,C(speech_year)[T.1923],0.008030,0.005422
...,...,...,...,...
4745,49,C(speech_year)[T.2010],-0.002466,0.004058
4746,49,C(speech_year)[T.2011],-0.002068,0.004197
4747,49,C(speech_year)[T.2012],-0.001290,0.003996
4748,49,C(speech_year)[T.2013],-0.001662,0.004133


In [None]:
import statsmodels.api as sm

# Cross-check of the covariate effects: regress one topic's column of the
# document-topic matrix (topic 43) on the prevalence covariates with plain OLS.
# The covariate matrix already carries an 'Intercept' column, so no constant
# is added here.
Y = tm.get_doc_topic_distribution(train_dataset)
X = train_dataset.M_prevalence_covariates
model = sm.OLS(Y[:, 43], X)
results = model.fit()
covs = train_dataset.prevalence_colnames

# One row per covariate, so each coefficient sits next to its label and the
# coefficient column keeps a numeric dtype (stacking names and values as two
# rows would make every column object-typed).
pd.DataFrame({'covariate': covs, 'coef': results.params})

In [None]:
# Show the documents the model associates most strongly with topic 47.
# NOTE(review): ranking criterion and number of documents come from GTM.get_top_docs defaults — confirm.
tm.get_top_docs(train_dataset, topic_id = 47)

In [None]:
import pyLDAvis

# Export the fitted model in the keyword layout that pyLDAvis.prepare accepts.
ldavis_format = tm.get_ldavis_data_format(train_dataset)

# sort_topics=False keeps pyLDAvis from reordering the topics.
gtm_vis_data = pyLDAvis.prepare(sort_topics=False, **ldavis_format)

# Render the interactive visualisation inline in the notebook.
pyLDAvis.display(gtm_vis_data)

In [None]:
# Draw a word cloud for topic 18.
tm.plot_wordcloud(topic_id = 18)