In [1]:
# Load a random sample of speeches delivered on the floor of the US Congress between 1994 and 2024
import pandas as pd
# Read the sampled speeches CSV (relative path) into a DataFrame.
speeches_csv = '../data/us_congress_speeches_sample.csv'
df = pd.read_csv(speeches_csv)

# Report the number of rows that were loaded.
n_speeches = len(df)
print("Number of speeches: {}".format(n_speeches))

# Path setup
import sys
sys.path.append('../src/')
from corpus import Corpus
from models import GTM
from utils import bert_embeddings_from_list
from sklearn.feature_extraction.text import CountVectorizer
import torch

import os
# Set TOKENIZERS_PARALLELISM to "false" to avoid some warnings.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

def embed_fn_bert(texts):
    """Embed `texts` with the multilingual SBERT checkpoint.

    Thin wrapper around `bert_embeddings_from_list` that pins the model name,
    batch size and maximum sequence length, and selects "cuda" when
    `torch.cuda.is_available()` is true, otherwise "cpu".

    Args:
        texts: texts to embed; forwarded unchanged as the `texts` argument.

    Returns:
        Whatever `bert_embeddings_from_list` returns for these settings.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    embedding_options = {
        "sbert_model_to_load": "paraphrase-multilingual-mpnet-base-v2",
        "batch_size": 8,
        "max_seq_length": 128,
        "device": target_device,
    }
    return bert_embeddings_from_list(texts=texts, **embedding_options)

# CountVectorizer with its default settings, used by the bag-of-words view.
default_vectorizer = CountVectorizer()

# The "text" modality reads the "doc_clean" column and declares two views:
# an "embedding" view computed by embed_fn_bert and a "bow" view that uses
# default_vectorizer.
embedding_view = {"type": "embedding", "embed_fn": embed_fn_bert}
bow_view = {"type": "bow", "vectorizer": default_vectorizer}
text_modality = {
    "column": "doc_clean",
    "views": {"embedding": embedding_view, "bow": bow_view},
}
modalities = {"text": text_modality}

# Wrap the DataFrame and the modality spec into the training corpus.
train_dataset = Corpus(df, modalities=modalities)

Number of speeches: 28731


In [3]:
# Encoder settings for the "text_embedding" entry; handed to GTM as encoder_args.
text_embedding_encoder = {
    "hidden_dims": [256, 64],
    "activation": "relu",
    "bias": True,
    "dropout": 0.2,
}
encoder_args = {"text_embedding": text_embedding_encoder}

# Decoder settings for the "text_bow" entry; handed to GTM as decoder_args.
text_bow_decoder = {
    "hidden_dims": [64, 256],
    "activation": "relu",
    "bias": True,
    "dropout": 0.2,
}
decoder_args = {"text_bow": text_bow_decoder}

In [4]:
# Train a 20-topic GTM on the training corpus, using the encoder/decoder
# settings defined in encoder_args and decoder_args.
gtm_settings = {
    "train_data": train_dataset,
    "n_topics": 20,
    "encoder_args": encoder_args,
    "decoder_args": decoder_args,
}
tm = GTM(**gtm_settings)


Epoch   1	Mean Training Loss:8.7709431


Epoch   2	Mean Training Loss:8.5055890


Epoch   3	Mean Training Loss:8.3729568


Epoch   4	Mean Training Loss:8.2937131


Epoch   5	Mean Training Loss:8.1979330


Epoch   6	Mean Training Loss:8.1677784


Epoch   7	Mean Training Loss:8.1438777


Epoch   8	Mean Training Loss:8.1272968


Epoch   9	Mean Training Loss:8.1174837


Epoch  10	Mean Training Loss:8.0993053


Epoch  11	Mean Training Loss:8.0860619


Epoch  12	Mean Training Loss:8.0751244


Epoch  13	Mean Training Loss:8.0665987


Epoch  14	Mean Training Loss:8.0561958


Epoch  15	Mean Training Loss:8.0527145


Epoch  16	Mean Training Loss:8.0473793


Epoch  17	Mean Training Loss:8.0375476


Epoch  18	Mean Training Loss:8.0334343


Epoch  19	Mean Training Loss:8.0243886


Epoch  20	Mean Training Loss:8.0181381


Epoch  21	Mean Training Loss:8.0110870


Epoch  22	Mean Training Loss:8.0065778


Epoch  23	Mean Training Loss:7.9978029


Epoch  24	Mean Training Loss:7.9946293


Epoch  25	Mean 

In [5]:
# Print one "<topic>: <top-5 words>" line per entry returned by get_topic_words.
print(
    *(
        "{}: {}".format(str(topic_name), str(top_words))
        for topic_name, top_words in tm.get_topic_words(topK=5).items()
    ),
    sep="\n",
)

Topic_0: ['cancer', 'health', 'care', 'child', 'patient']
Topic_1: ['budget', 'go', 'cut', 'spending', 'debt']
Topic_2: ['service', 'community', 'honor', 'recognize', 'life']
Topic_3: ['team', 'win', 'championship', 'game', 'season']
Topic_4: ['nuclear', 'country', 'weapon', 'other', 'world']
Topic_5: ['human', 'right', 'resolution', 'peace', 'other']
Topic_6: ['service', 'family', 'life', 'honor', 'man']
Topic_7: ['energy', 'oil', 'fuel', 'gas', 'bill']
Topic_8: ['tax', 'health', 'cut', 'go', 'family']
Topic_9: ['know', 'go', 'friend', 'good', 'get']
Topic_10: ['such', 'bill', 'subcommittee', 'section', 'include']
Topic_11: ['school', 'student', 'education', 'program', 'teacher']
Topic_12: ['budget', 'go', 'bill', 'spending', 'debt']
Topic_13: ['go', 'tax', 'job', 'country', 'get']
Topic_14: ['bill', 'law', 'legislation', 'other', 'use']
Topic_15: ['bill', 'business', 'follow', 'leader', 'no']
Topic_16: ['go', 'judge', 'law', 'election', 'other']
Topic_17: ['bill', 'business', 'compan

In [21]:
# We use multilingual embeddings as input (for encoding) and decode Bag of Words matrices.
# This means the model can also predict topic shares out of sample for other languages (e.g., French).

import pandas as pd
import numpy as np

# French example sentences used for out-of-sample topic prediction.
sentences = [
    "L'arme nucléaire est un atout géopolitique indéniable.",
    "Cette équipe de football est incroyable.",
    "Une nouvelle taxe sur la consommation entrera en vigueur dès janvier.",
    "Il faut réformer l'école publique avant que ce ne soit trop tard.",
]

# Single-column DataFrame holding the sentences under 'speech'.
df2 = pd.DataFrame(sentences, columns=['speech'])

# Only the embedding view is declared here; the text is read from the
# "speech" column and embedded with embed_fn_bert.
speech_views = {
    "embedding": {"type": "embedding", "embed_fn": embed_fn_bert},
}
modalities = {"text": {"column": "speech", "views": speech_views}}

# Build the corpus for the French sentences.
french_dataset = Corpus(df2, modalities=modalities)

# Topic shares returned by the trained model for the French corpus.
the_topic_shares = tm.get_doc_topic_distribution(french_dataset)

# Index of the largest share along axis 1, one per row.
top_topics = np.argmax(the_topic_shares, axis=1)

# Top-5 words per topic, keyed as "Topic_<id>".
topic_words = tm.get_topic_words(topK=5)

# Print each sentence followed by its highest-share topic and that topic's words.
for i, pair in enumerate(zip(sentences, top_topics)):
    sentence = pair[0]
    topic_id = int(pair[1])  # plain Python int for formatting the lookup key
    topic_key = "Topic_{}".format(topic_id)
    words = ", ".join(topic_words[topic_key])
    print(f"Sentence {i+1}: {sentence}")
    print(f"→ Top Topic {topic_id}: {words}\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Sentence 1: L'arme nucléaire est un atout géopolitique indéniable.
→ Top Topic 4: nuclear, country, weapon, other, world

Sentence 2: Cette équipe de football est incroyable.
→ Top Topic 3: team, win, championship, game, season

Sentence 3: Une nouvelle taxe sur la consommation entrera en vigueur dès janvier.
→ Top Topic 8: tax, health, cut, go, family

Sentence 4: Il faut réformer l'école publique avant que ce ne soit trop tard.
→ Top Topic 11: school, student, education, program, teacher

