In [None]:
import sys
from random import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from scipy.optimize import linear_sum_assignment
from scipy.stats import gaussian_kde, norm
from tqdm import tqdm

# The project modules live outside the notebook directory, so the search path
# has to be extended before they can be imported.
sys.path.append('../gtm/')
sys.path.append('../simulation/')
from corpus import GTMCorpus
from gtm import GTM
from simulations import generate_docs_by_lda

In [None]:
# ---------------------------------------------------------------------------
# Simulation configuration and result containers.
# ---------------------------------------------------------------------------

# Filled by the simulation loop, one entry per run.
cossim_list = []                # per-topic cosine similarities (true vs. estimated)
true_doc_topic_list = []        # true document-topic frames
estimated_doc_topic_list = []   # estimated frames, columns aligned to the true topics

# Model / data-generating settings.
doc_topic_prior = 'dirichlet'
update_prior = True
num_epochs = 2
num_iter = 100      # number of simulation runs
num_jobs = 4
num_topics = 4
num_covs = 5
num_docs = 5000
min_words = 50
max_words = 100
voc_size = 1000

# True parameters handed to the document generator.
np.random.seed(42)
lambda_ = np.random.rand(num_covs, num_topics)
# Topic 0 is the reference category: subtracting its column makes it all zeros.
lambda_ = lambda_ - lambda_[:, 0][:, None]
sqrt_sigma = np.random.rand(num_topics, num_topics)
# Matrix product (not element-wise `*`): A @ A.T is symmetric positive
# semi-definite, whereas the element-wise product A * A.T is only symmetric.
# NOTE(review): sigma is presumably only consumed as a covariance matrix by
# non-Dirichlet priors -- confirm against generate_docs_by_lda.
sigma = sqrt_sigma @ sqrt_sigma.T

# dict_betas[topic][covariate] collects one coefficient estimate per run.
dict_betas = {
    topic: {cov: [] for cov in range(num_covs)}
    for topic in range(num_topics)
}

def _match_topics(true_df, estimated_df):
    """Pair every true topic with exactly one estimated topic.

    The score of a (true, estimated) pair is the dot product of the two
    document-topic columns. The one-to-one assignment with the largest total
    score is found with ``linear_sum_assignment`` (scores are negated because
    it minimises cost).

    Returns:
        dict mapping the column position of each true topic to the column
        position of the estimated topic matched to it.
    """
    score_matrix = true_df.to_numpy(dtype=float).T @ estimated_df.to_numpy(dtype=float)
    true_topics, estimated_topics = linear_sum_assignment(-score_matrix)
    return {int(t): int(e) for t, e in zip(true_topics, estimated_topics)}


def _columnwise_cosine(left_df, right_df):
    """Cosine similarity between equally named columns of two data frames.

    Returns a list with one score per column of ``left_df``, in column order.
    """
    scores = []
    for col in left_df.columns:
        left = left_df.loc[:, col].to_numpy()
        right = right_df.loc[:, col].to_numpy()
        scores.append(
            np.dot(left, right) / (np.linalg.norm(left) * np.linalg.norm(right))
        )
    return scores


# Built from num_covs instead of spelling out five covariates by hand.
# Assumes the simulated frame names its covariate columns cov_0, cov_1, ...,
# as the previously hard-coded formula did.
prevalence_formula = (
    "~ " + " + ".join("cov_{}".format(k) for k in range(num_covs)) + " - 1"
)

for run in tqdm(range(num_iter), desc="Simulation runs"):

    # Kept inside the loop so that every model receives its own list objects
    # for the layer settings.
    gtm_model_args = {
        "n_topics": num_topics,
        "num_epochs": num_epochs,
        "update_prior": update_prior,
        "w_prior": None,
        "doc_topic_prior": doc_topic_prior,
        "decoder_type": "mlp",
        "encoder_hidden_layers": [],
        "decoder_hidden_layers": [],
        "decoder_bias": False,
        "batch_size": 200,
        "print_every": 10000,
        "log_every": 100,
        "seed": 42,
    }

    # A new data set per run: the run index is the generator seed.
    df_true_dist_list_gtm, df_test = generate_docs_by_lda(
        num_docs,
        num_topics,
        num_covs,
        doc_topic_prior,
        lambda_,
        sigma,
        min_words,
        max_words,
        voc_size,
        num_jobs,
        seed=run,
    )

    test_dataset = GTMCorpus(df_test, prevalence=prevalence_formula)
    tm_test = GTM(train_data=test_dataset, **gtm_model_args)

    df_doc_topic_gtm = pd.DataFrame(
        tm_test.get_doc_topic_distribution(test_dataset),
        index=["Doc{}".format(d) for d in range(num_docs)],
        columns=["Topic{}".format(k) for k in range(num_topics)],
    )
    true_df = df_true_dist_list_gtm[0]
    estimated_df = df_doc_topic_gtm

    # Topic labels are arbitrary in the fitted model, so align the estimated
    # columns with the true ones before comparing anything.
    corres_num_topic_dict_gtm_bis = _match_topics(true_df, estimated_df)
    matched_positions = [
        corres_num_topic_dict_gtm_bis[topic] for topic in range(num_topics)
    ]
    reanged_df_gtm = estimated_df.iloc[:, matched_positions].copy()
    reanged_df_gtm.columns = true_df.columns

    true_doc_topic_list.append(true_df)
    estimated_doc_topic_list.append(reanged_df_gtm)

    # Re-base the estimated coefficients on the topic matched to true topic 0,
    # mirroring the normalisation applied to the true lambda_.
    lambda_hat = tm_test.prior.lambda_
    lambda_hat = lambda_hat - lambda_hat[:, corres_num_topic_dict_gtm_bis[0]][:, None]

    # Distinct loop names: the run index must not be shadowed here.
    for topic in range(num_topics):
        for cov in range(num_covs):
            dict_betas[topic][cov].append(
                lambda_hat[cov, corres_num_topic_dict_gtm_bis[topic]]
            )

    # Cosine similarity between true and aligned estimated proportions, per topic.
    cossim_list.append(_columnwise_cosine(true_df, reanged_df_gtm))

In [None]:
# Per-topic cosine similarity between true and estimated document-topic
# proportions: bar height is the mean over the simulation runs, the error bar
# is the standard deviation over the runs.
fig = plt.figure(figsize=(10, 5), facecolor="white", tight_layout=True)
ax = fig.add_subplot(1, 1, 1, xlabel="Topic", ylabel="Cosine Similarity")

# One row per run, one column per topic.
run_labels = list(map("Iter_{}".format, range(num_iter)))
df_score_res = pd.DataFrame(cossim_list, index=run_labels)
mean = df_score_res.mean(axis=0)
std = df_score_res.std(axis=0)

x = np.arange(num_topics)
ax.set_xticks(x)
ax.bar(x, mean, yerr=std)
# To keep a copy on disk: plt.savefig('doc_topic_proportions.png')

In [None]:
# Kernel density of the estimates of a single prevalence coefficient across
# all simulation runs, with the true coefficient marked by a vertical line.
i = 1  # topic index (column of lambda_)
c = 3  # covariate index (row of lambda_)

data = dict_betas[i][c]
true_value = lambda_[c, i]

kde = gaussian_kde(data)
x = np.linspace(min(data) - 1, max(data) + 1, 1000)

fig, ax = plt.subplots()
ax.fill_between(x, kde(x), alpha=0.5, color='lightblue', label='Kernel Density of Estimates')
# The line sits at the exact true value; rounding is applied to the legend
# text only, so the marker is not shifted by the rounding error.
ax.axvline(
    x=true_value,
    color='black',
    linestyle='-',
    linewidth=2,
    label='True Value = {}'.format(round(true_value, 2)),
)
ax.set_xlabel('Estimates')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# True prevalence coefficients (rows: covariates, columns: topics), shown
# with two decimals.
np.round(lambda_, 2)

In [None]:
# Estimated coefficients from the last simulation run. The columns of
# lambda_hat follow the fitted model's own topic numbering, so they are
# reordered with that run's topic matching: column k then refers to the same
# topic as column k of the true lambda_.
lambda_hat[:, [corres_num_topic_dict_gtm_bis[k] for k in range(num_topics)]].round(2)