In [3]:
import pandas as pd
import sys
sys.path.append('../gtm/')
from corpus import GTMCorpus
from gtm_customized import GTM
import pickle as p
import os
# Language-pair tag; in the cells shown here it is only interpolated into the
# filename of the cached dataset pickle.
language = 'en-zh'

In [4]:
def load_examples(language='en'):
	"""Load the one-document-per-line corpus of ``language`` into a DataFrame.

	Reads ``../data/wiki_shorts/<language>/corpus/docs.txt`` and returns a
	DataFrame with a single string column named ``doc_clean``.

	The file is raw text rather than real CSV, so CSV conveniences that would
	alter the text are switched off:

	* ``quoting=csv.QUOTE_NONE`` keeps ``"`` characters literal; with the default
	  quoting a line starting with ``"`` can lose its quotes or absorb the
	  following lines.
	* ``dtype=str`` / ``keep_default_na=False`` keep lines such as ``2024`` or
	  ``NA`` as text instead of turning them into numbers or NaN.

	Args:
		language: sub-directory name under ``../data/wiki_shorts``.

	Returns:
		pandas.DataFrame with one row per parsed line and the column ``doc_clean``.
	"""
	import csv  # local import so this cell does not depend on another cell's imports

	df = pd.read_csv(
		'../data/wiki_shorts/{}/corpus/docs.txt'.format(language),
		header=None,
		delimiter='\t',
		quoting=csv.QUOTE_NONE,
		dtype=str,
		keep_default_na=False,
	)
	# Assigning a one-element list fails loudly if a line contained a tab and
	# therefore produced more than one column.
	df.columns = ['doc_clean']
	return df

In [5]:
# Build the GTMCorpus over the English and Chinese documents once and cache it
# as a pickle; later runs reload the cached object. Delete the pickle file to
# force a rebuild.
dataset_cache_path = 'train_dataset_intfloat-e5-large2-{}.pkl'.format(language)

if os.path.exists(dataset_cache_path):
	print('Loading train_dataset_intfloat-e5-large2-{}.pkl'.format(language))
	# NOTE: unpickling can execute arbitrary code; only load caches this notebook wrote.
	with open(dataset_cache_path, 'rb') as f:
		train_dataset = p.load(f)
else:
	# Tag every document with its language, then stack both corpora into one frame.
	df_en = load_examples('en').assign(language='en')
	df_zh = load_examples('zh').assign(language='zh')
	df = pd.concat([df_en, df_zh], ignore_index=True)

	train_dataset = GTMCorpus(
		df,
		count_words=True,
		embeddings_type='SentenceTransformer',
		sbert_model_to_load='intfloat/multilingual-e5-large',
		content=None,
		batch_size=64,
		max_seq_length=512)

	print('Saving train_dataset_intfloat-e5-large2-{}.pkl'.format(language))
	with open(dataset_cache_path, 'wb') as f:
		p.dump(train_dataset, f)

print('dataset loaded')

In [29]:
# Instantiate the GTM model on the cached corpus with six topics.
# NOTE(review): num_epochs=0 together with ckpt=... suggests this cell restores a
# previously trained model from the checkpoint instead of training here —
# confirm against the GTM implementation.
tm = GTM(
	train_dataset,
	n_topics=6,
	doc_topic_prior='dirichlet', # alternatives noted by the author: logistic_normal, dirichlet
	alpha=0.02,
	update_prior=False,
	encoder_input='embeddings', # alternatives noted by the author: 'bow', 'embeddings'
	separate_decoders=True,
	encoder_hidden_layers=[], # structure of the encoder neural net (no hidden layers)
	decoder_hidden_layers=[256], # structure of the decoder neural net (one hidden layer of 256)
	encoder_bias=True,
	decoder_bias=True,
	num_epochs=0,
	print_every=10000,
	dropout=0.0,
	learning_rate=0.01,
	log_every=1,
	w_prior=None,
	batch_size=512,
	patience=5,
	save_path='../ckpt2/task3',
	ckpt='../ckpt2/task3/best_model.ckpt'
)

In [30]:
import numpy as np

def inspect(tm, ds):
	"""Print per-topic document counts and a few sample documents per topic.

	Every document is assigned to its highest-scoring topic, i.e. the argmax over
	the last axis of ``tm.get_doc_topic_distribution(ds)``.

	Args:
		tm: model exposing ``n_topics`` and ``get_doc_topic_distribution(ds)``.
		ds: dataset whose ``df`` DataFrame has a ``doc_clean`` column.

	For each topic up to five distinct documents are sampled at random; a topic
	with fewer than five documents shows all of them, and a topic with none
	prints a placeholder line instead of raising.
	"""
	doc_topic_distribution = tm.get_doc_topic_distribution(ds)
	# Hard topic assignment, computed once and reused below.
	assigned_topic = doc_topic_distribution.argmax(-1)

	print('Number of documents per topic')
	# Driven by tm.n_topics so the counts stay in sync with the sampling loop.
	for topic in range(tm.n_topics):
		print('Topic {}: {}'.format(topic, (assigned_topic == topic).sum()))

	# show up to five distinct random documents per topic
	for topic in range(tm.n_topics):
		print('Topic {}'.format(topic))
		print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
		candidates = np.where(assigned_topic == topic)[0]
		if len(candidates) == 0:
			# np.random.choice raises ValueError when asked to sample from an empty array.
			print('(no documents assigned to this topic)')
			continue
		sample_size = min(5, len(candidates))
		for i in np.random.choice(candidates, size=sample_size, replace=False):
			print('=' * 50)
			print(ds.df.iloc[i]['doc_clean'])
			print('----------')
			print('Topic distribution = {}'.format(doc_topic_distribution[i]))

In [31]:
'''
Epoch 100	Mean Training Loss:1846.3313570
'''

In [32]:
# Echo the model object so the notebook shows its repr as the cell output.
tm

In [33]:
# Print per-topic document counts and sample documents for the full dataset.
inspect(tm, train_dataset)

In [34]:
# write a function to write all examples of the same topic to a file
def write_topic_to_file(topic_id, ds, doc_topic_distribution, path):
	"""Write every document whose highest-scoring topic is ``topic_id`` to ``path``.

	Documents are written one per line, in dataset order. If no document is
	assigned to the topic, an empty file is written.

	Args:
		topic_id: index of the topic to export.
		ds: dataset whose ``df`` DataFrame has a ``doc_clean`` column.
		doc_topic_distribution: per-document topic scores; the argmax over the
			last axis is used as the document's topic.
		path: output file; overwritten if it already exists.
	"""
	member_rows = np.where(doc_topic_distribution.argmax(-1) == topic_id)[0]
	# Select the column once rather than materialising a row object per document.
	documents = ds.df['doc_clean'].iloc[member_rows]
	# Explicit UTF-8: the corpus contains Chinese text, and the platform default
	# encoding is not guaranteed to be able to represent it.
	with open(path, 'w', encoding='utf-8') as f:
		for doc in documents:
			f.write(doc + '\n')

In [35]:
# Per-document topic scores for the whole dataset; reused by the export cells below.
doc_topic_distribution = tm.get_doc_topic_distribution(train_dataset)

In [36]:
import os
# Make sure the output directory for the per-topic files exists.
# exist_ok=True replaces the separate exists() check and its check-then-create race.
os.makedirs('../data/task3', exist_ok=True)

In [37]:
# Export each topic's documents to ../data/task3/topic_<id>.txt.
# The loop follows tm.n_topics instead of repeating the call for a hard-coded 0..5.
for topic_id in range(tm.n_topics):
	write_topic_to_file(
		topic_id,
		train_dataset,
		doc_topic_distribution,
		'../data/task3/topic_{}.txt'.format(topic_id))

In [38]:
# read labels
def read_labels(language='en'):
	"""Map each document of ``language`` to its integer label.

	Line ``i`` of ``../data/wiki_shorts/<language>/corpus/docs.txt`` is paired
	with line ``i`` of ``../data/wiki_shorts/<language>/labels.txt``.

	The dictionary key is the first 100 characters of the document line with
	surrounding whitespace stripped. Documents that share the same key collapse
	into one entry, and the later line's label wins.

	Args:
		language: sub-directory name under ``../data/wiki_shorts``.

	Returns:
		dict mapping document key (str) to label (int).

	Raises:
		IndexError: if there are fewer label lines than document lines.
		ValueError: if a label line is not an integer.
	"""
	base_dir = '../data/wiki_shorts/{}'.format(language)
	# Explicit UTF-8: the corpus contains Chinese text, and the platform default
	# encoding is not guaranteed to decode it.
	with open(base_dir + '/corpus/docs.txt', 'r', encoding='utf-8') as file:
		docs = file.readlines()
	with open(base_dir + '/labels.txt', 'r', encoding='utf-8') as file:
		labels = file.readlines()

	# one to one mapping of docs to labels, aligned by line number
	doc2label = {}
	for i, doc in enumerate(docs):
		doc2label[doc[:100].strip()] = int(labels[i].strip())
	return doc2label

In [39]:
# Document-key -> label lookups for both corpora (English first, then Chinese).
doc2label_en, doc2label_zh = (read_labels(lang) for lang in ('en', 'zh'))

In [40]:
# for each topic, find its majority label
from collections import defaultdict
def find_majority_label(topic):
	"""Return label statistics for the documents written to one topic file.

	Reads ``../data/task3/topic_<topic>.txt`` and looks every line up by its
	first 100 characters (stripped) in the module-level ``doc2label_en`` mapping,
	falling back to ``doc2label_zh``.

	Args:
		topic: topic index used to build the file name.

	Returns:
		``(label2cnt, labels, predicted)``: the count per label, the label of
		every line in file order, and the most frequent label (first one seen on
		a tie). ``predicted`` is ``None`` when the topic file is empty.

	Raises:
		KeyError: if a line's key is in neither mapping.
	"""
	label2cnt = defaultdict(int)
	labels = []
	# Explicit UTF-8: the topic files contain Chinese text.
	with open('../data/task3/topic_{}.txt'.format(topic), 'r', encoding='utf-8') as file:
		for line in file:
			k = line[:100].strip()
			v = doc2label_en[k] if k in doc2label_en else doc2label_zh[k]
			label2cnt[v] += 1
			labels.append(v)
	# A topic with no documents leaves label2cnt empty, and max() raises
	# ValueError on an empty dict.
	predicted = max(label2cnt, key=label2cnt.get) if label2cnt else None
	return label2cnt, labels, predicted

In [41]:
# True labels and majority label for each topic file.
# Only the last two elements of the result are kept. The label counts are not
# bound to `_`, because IPython stops refreshing `_`/`__`/`___` once user code
# assigns to them.
labels_0, predicted_0 = find_majority_label(0)[1:]
labels_1, predicted_1 = find_majority_label(1)[1:]
labels_2, predicted_2 = find_majority_label(2)[1:]
labels_3, predicted_3 = find_majority_label(3)[1:]
labels_4, predicted_4 = find_majority_label(4)[1:]
labels_5, predicted_5 = find_majority_label(5)[1:]

In [42]:
# Majority label chosen for topics 0..5, in topic order.
print(predicted_0, predicted_1, predicted_2, predicted_3, predicted_4, predicted_5)

In [43]:
# True labels of all documents, grouped by topic (topic 0 first, topic 5 last).
final_labels = [
	*labels_0,
	*labels_1,
	*labels_2,
	*labels_3,
	*labels_4,
	*labels_5,
]

In [44]:
# Total number of labelled documents across the six topic files.
len(final_labels)

In [45]:
# Predicted label per document: every document inherits the majority label of
# its topic, in the same topic-grouped order as final_labels.
final_pred = [
	predicted
	for predicted, topic_labels in (
		(predicted_0, labels_0),
		(predicted_1, labels_1),
		(predicted_2, labels_2),
		(predicted_3, labels_3),
		(predicted_4, labels_4),
		(predicted_5, labels_5),
	)
	for _ in topic_labels
]

In [46]:
# Number of predictions; built to have one entry per entry of final_labels.
len(final_pred)

In [47]:
# Raw topic id per document (no mapping to gold labels), in the same
# topic-grouped order as final_labels.
model_pred = [
	topic_id
	for topic_id, topic_labels in enumerate(
		(labels_0, labels_1, labels_2, labels_3, labels_4, labels_5)
	)
	for _ in topic_labels
]

In [48]:
from sklearn.metrics import f1_score, accuracy_score, adjusted_rand_score
# Score the topic assignment against the gold labels.
# F1 and accuracy compare each document's true label with its topic's majority
# label; the adjusted Rand score compares true labels with the raw topic ids.
f1_macro = f1_score(final_labels, final_pred, average='macro')
f1_micro = f1_score(final_labels, final_pred, average='micro')
acc = accuracy_score(final_labels, final_pred)
ars = adjusted_rand_score(final_labels, model_pred)

In [49]:
# Report the four scores, one per line.
for template, score in (
	('f1_macro = {}', f1_macro),
	('f1_micro = {}', f1_micro),
	('acc = {}', acc),
	('ars = {}', ars),
):
	print(template.format(score))