In [None]:
import numpy as np
import pandas as pd

# One seed drives both the placebo label and the train/test split so that
# Restart & Run All reproduces the same data every time.
SEED = 42
rng = np.random.default_rng(SEED)

# Load the corpus and discard rows that have no cleaned document text.
df = pd.read_csv('../data/us_politicians.csv').dropna(subset=['doc_clean'])

# Placebo label: standard-normal noise drawn independently of the documents.
# Drawn from the seeded generator above (the unseeded global NumPy RNG would
# give a different label column on every run).
df = df.assign(random=rng.normal(size=len(df)))

# 90% of rows go to train; the remaining rows form the test split.
# The held-out rows are selected via the pre-reset index, so `test` must be
# built before either frame is re-indexed.
train = df.sample(frac=0.9, random_state=SEED)
test = df.drop(train.index).reset_index(drop=True)
train = train.reset_index(drop=True)

In [2]:
import sys
sys.path.append('../gtm/')
from corpus import GTMCorpus
from gtm import GTM
from sklearn.feature_extraction.text import CountVectorizer

# ---- 1. Vectorizer: vocabulary learned from the training split only ----
# `fit` returns the fitted vectorizer itself, so construction and fitting
# can be chained. The vocabulary is capped at 13,000 terms.
vectorizer = CountVectorizer(max_features=13000).fit(train["doc_clean"])

# ---- 2. Modalities: one "default" modality with a single bag-of-words view ----
# The view reuses the vectorizer fitted above, and the same `modalities`
# object is handed to both corpora below.
bow_view = {"type": "bow", "vectorizer": vectorizer}
modalities = {
    "default": {
        "column": "doc_clean",
        "views": {"bow": bow_view},
    }
}

# ---- 3. GTMCorpus datasets: identical settings for both splits ----
# NOTE(review): "~random-1" / "~1" look like formula strings (the `random`
# column without an intercept / intercept only) — confirm against GTMCorpus.
corpus_kwargs = dict(modalities=modalities, labels="~random-1", content="~1")

train_dataset = GTMCorpus(train, **corpus_kwargs)
test_dataset = GTMCorpus(test, **corpus_kwargs)

In [3]:
# Train the model: the training corpus and the test corpus are passed
# positionally; the hyperparameters are collected in one mapping.
gtm_settings = {
    "n_topics": 20,
    # 'regressor' for continuous variables such as GDP
    "predictor_type": "regressor",
    # NOTE(review): presumably the weight of the prediction loss — confirm in GTM.
    "w_pred_loss": 1,
}

tm = GTM(train_dataset, test_dataset, **gtm_settings)


Epoch   1	Mean Training Loss:12.0643077

Topic_0: ['good', 'less', 'information', 'state', 'fear', 'figure', 'address', 'oppose']
Topic_1: ['subcommittee', 'delay', 'describe', 'interested', 'policy', 'participate', 'majority', 'consider']
Topic_2: ['money', 'turn', 'most', 'investment', 'aware', 'security', 'relief', 'vast']
Topic_3: ['regard', 'resolution', 'immediate', 'emergency', 'seem', 'close', 'large', 'person']
Topic_4: ['statement', 'particular', 'attempt', 'produce', 'testimony', 'give', 'value', 'majority_leader']
Topic_5: ['accord', 'find', 'late', 'apply', 'maintenance', 'damage', 'oppose', 'revenue']
Topic_6: ['next', 'requirement', 'decision', 'agency', 'particular', 'information', 'approach', 'dispose']
Topic_7: ['thank', 'know', 'colleague', 'preserve', 'interested', 'institution', 'money', 'good']
Topic_8: ['general', 'condition', 'program', 'station', 'decision', 'many', 'product', 'appropriate']
Topic_9: ['great', 'point', 'capital', 'propose', 'avoid', 'weapon', 

In [5]:
# Preview the first ten predictions for the training corpus;
# to_numpy=False keeps the result as a tensor rather than a NumPy array.
tm.get_predictions(train_dataset, to_numpy=False)[:10]

tensor([[0.0697],
        [0.0683],
        [0.1211],
        [0.0688],
        [0.0650],
        [0.0661],
        [0.1159],
        [0.0706],
        [0.0758],
        [0.0651]], device='cuda:0')