In [19]:
'''
To reproduce the results, first download the dataset from the following link:
https://drive.google.com/file/d/1YlS3Ojni-MFSB5f1rXgd-Su4G4kLxC3Y/view?usp=sharing

Then unzip dataset.zip in the project directory; the cells below read ./dataset/docs_original_<language>.txt.
'''

'\nTo reproduce the results, first download the dataset from the following link:\n\nThen unzip the dataset.zip under the project directory.\n'

In [20]:
import pandas as pd
import sys
from gtm.corpus import GTMCorpus
from gtm.gtm import GTM
import pickle as p
import os
from compute_metrics import compute
import sys

import warnings
warnings.filterwarnings('ignore')

In [21]:
# Sentence-embedding model id. It is passed to GTMCorpus as `sbert_model_to_load`,
# the part after the '/' is used in the pickled-dataset cache file names, and the
# full id is interpolated into most of the metric output paths below.
MODEL_NAME = 'intfloat/multilingual-e5-large'

In [22]:
def load_examples(language='en', data_dir='./dataset', max_rows=None):
  """Load the tab-separated document file of one language into a DataFrame.

  Reads ``<data_dir>/docs_original_<language>.txt`` (no header row), names its
  two columns ``index`` and ``doc_clean`` and adds a constant ``language`` column.

  Args:
    language: Language code used in the file name and stored in ``language``.
    data_dir: Directory that contains the ``docs_original_*.txt`` files.
    max_rows: If given, read only the first ``max_rows`` rows (useful for a
      quick smoke test); ``None`` reads the whole file.

  Returns:
    A DataFrame with the columns ``index``, ``doc_clean`` and ``language``.

  Raises:
    FileNotFoundError: If the file for ``language`` does not exist.
    ValueError: If the file does not have exactly two tab-separated columns.
  """
  path = os.path.join(data_dir, 'docs_original_{}.txt'.format(language))
  examples = pd.read_csv(path, header=None, sep='\t', nrows=max_rows)
  # Assigning the names afterwards fails loudly when the column count is not 2.
  examples.columns = ['index', 'doc_clean']
  examples['language'] = language
  return examples

In [23]:
# Read both monolingual corpora with the shared loader (English first, then Chinese).
df_en, df_zh = load_examples('en'), load_examples('zh')

# Report the corpus sizes as a quick sanity check on the downloaded data.
print(
  'English examples = {}'.format(len(df_en)),
  'Chinese examples = {}'.format(len(df_zh)),
  sep='\n',
)

English examples = 11043
Chinese examples = 10135


In [24]:
# Per-language sizes; create_dataset() forwards them to GTMCorpus.
num_en, num_zh = len(df_en), len(df_zh)

# Stack the corpora with the English rows first and the Chinese rows after them.
# NOTE(review): both frames already carry an 'index' column, so reset_index()
# keeps the old row labels in a new 'level_0' column instead -- confirm that
# GTMCorpus expects (or ignores) that extra column.
df = pd.concat((df_en, df_zh)).reset_index()
print('Total examples = {}'.format(len(df)))

Total examples = 21178


In [25]:
def create_dataset(language='en'):
  """Return the GTMCorpus for `language`, building it once and caching it as a pickle.

  The corpus is built from the module-level `df`, `num_en`, `num_zh` and
  `MODEL_NAME`. The cache file is `train_dataset_<model>-<language>.pkl` in the
  current working directory, where `<model>` is the last '/'-separated part of
  `MODEL_NAME`.

  NOTE: the cache key contains only the model and the language, so delete the
  .pkl file after changing `df` or the GTMCorpus arguments; otherwise the stale
  cached object is returned.

  Args:
    language: Language code forwarded to GTMCorpus and used in the cache file name.

  Returns:
    The freshly built GTMCorpus, or whatever object the cache file holds.
  """
  # Built once (it used to be repeated four times); [-1] also works for model
  # ids that contain no '/'.
  cache_path = 'train_dataset_{}-{}.pkl'.format(MODEL_NAME.split('/')[-1], language)

  if os.path.exists(cache_path):
    # SECURITY: pickle.load can execute arbitrary code -- only load cache files
    # that you created yourself.
    with open(cache_path, 'rb') as f:
      return p.load(f)

  print('Loading examples for {}'.format(language))
  train_dataset = GTMCorpus(
    df,
    embeddings_type='SentenceTransformer',
    vectorizer_args = {'ngram_range':(1, 1), 'max_df':0.99, 'min_df':0.001, 'stop_words':'english'},
    sbert_model_to_load=MODEL_NAME,
    content=None,
    prevalence=None,
    batch_size=128,
    max_seq_length=512,
    num_en=num_en,
    num_zh=num_zh,
    language=language)

  print('Saving {}'.format(cache_path))
  # Dump to a temporary name and rename afterwards, so an interrupted dump never
  # leaves a truncated file that the os.path.exists() check would later trust.
  tmp_path = cache_path + '.tmp'
  with open(tmp_path, 'wb') as f:
    p.dump(train_dataset, f)
  os.replace(tmp_path, cache_path)
  return train_dataset

In [26]:
# Build (or load from the pickle cache) one corpus per language, English first.
train_dataset_en, train_dataset_zh = create_dataset('en'), create_dataset('zh')

In [27]:
# English topic model with bag-of-words encoder input.
# The recorded output shows per-epoch losses and topics printed by this cell,
# i.e. training runs as part of constructing the GTM object.
tm_en = GTM(
    train_dataset_en,
    # --- topic model / prior ---
    n_topics=6,
    doc_topic_prior='dirichlet',
    update_prior=False,
    # --- network ---
    encoder_input='bow',
    encoder_hidden_layers=[256],
    decoder_hidden_layers=[256],
    # --- training loop ---
    num_epochs=1000,
    patience=3,
    learning_rate=1e-3,
    num_workers=0,
    # --- checkpoints ---
    ckpt_folder='./ckpt_task1_redo_en',
    # Presumably resumes from a saved checkpoint when enabled -- confirm in gtm.gtm.GTM.
    # ckpt='./ckpt_task1_redo_en/best_model.ckpt',
)


Epoch   1	Mean Training Loss:1421.7371607

Topic_0: ['release', 'make', 'year', 'state', 'new', 'use', 'film', 'time']
Topic_1: ['film', 'use', 'year', 'new', 'time', 'make', 'restaurant', 'food']
Topic_2: ['new', 'film', 'release', 'know', 'make', 'use', 'include', 'year']
Topic_3: ['album', 'release', 'woman', 'record', 'government', 'year', 'right', 'group']
Topic_4: ['film', 'use', 'new', 'make', 'include', 'know', 'family', 'year']
Topic_5: ['specie', 'bird', 'chinese', 'use', 'know', 'film', 'family', 'white']



Epoch   2	Mean Training Loss:1344.6944238

Topic_0: ['woman', 'year', 'state', 'right', 'government', 'school', 'use', 'time']
Topic_1: ['film', 'restaurant', 'company', 'new', 'food', 'open', 'location', 'star']
Topic_2: ['film', 'new', 'year', 'make', 'state', 'know', 'use', 'release']
Topic_3: ['album', 'release', 'record', 'hop', 'hip', 'music', 'group', 'right']
Topic_4: ['bird', 'specie', 'white', 'breed', 'make', 'chicken', 'black', 'genus']
Topic_5: ['chinese', 

In [28]:
# In-language evaluation: English BOW model on the English corpus.
# NOTE(review): MODEL_NAME contains a '/', so the formatted path gains a nested
# component -- confirm compute() handles that as intended.
print('Computing metrics for English... BOW')
compute(
    tm_en,
    train_dataset_en,
    './output_task1_BOW_en_{}'.format(MODEL_NAME),
)

Computing metrics for English... BOW
3 5 0 1 2 4
f1_macro = 0.9397780386824346
f1_micro = 0.941139183192973
acc = 0.941139183192973
ars = 0.8682099367464389
--------------------------------------------------


In [29]:
# Chinese topic model with bag-of-words encoder input; same hyper-parameters as
# the English BOW model, only the corpus and the checkpoint folder differ.
tm_zh = GTM(
    train_dataset_zh,
    # --- topic model / prior ---
    n_topics=6,
    doc_topic_prior='dirichlet',
    update_prior=False,
    # --- network ---
    encoder_input='bow',
    encoder_hidden_layers=[256],
    decoder_hidden_layers=[256],
    # --- training loop ---
    num_epochs=1000,
    patience=3,
    learning_rate=1e-3,
    num_workers=0,
    # --- checkpoints ---
    ckpt_folder='./ckpt_task1_redo_zh',
    # Presumably resumes from a saved checkpoint when enabled -- confirm in gtm.gtm.GTM.
    # ckpt='./ckpt_task1_redo_zh/best_model.ckpt',
)


Epoch   1	Mean Training Loss:853.1959248

Topic_0: ['台湾', '中国', '电影', '美国', '香港', '他们', '日本', '英语']
Topic_1: ['中国', '台湾', '认为', '美国', '国家', '他们', '地区', '历史']
Topic_2: ['台湾', '中国', '美国', '电影', '大学', '日本', '香港', '第一']
Topic_3: ['台湾', '大学', '音乐', '电影', '中国', '美国', '第一', '专辑']
Topic_4: ['中国', '台湾', '日本', '美国', '香港', '主要', '电影', '地区']
Topic_5: ['中国', '学名', '地区', '台湾', '主要', '可以', '分布', '动物']



Epoch   2	Mean Training Loss:799.8463066

Topic_0: ['电影', '他们', '自己', '上映', '美国', '故事', '动画', '发现']
Topic_1: ['主义', '美国', '中国', '运动', '年月', '教会', '台湾', '政府']
Topic_2: ['美国', '电影', '中国', '台湾', '英语', '日本', '作品', '第一']
Topic_3: ['台湾', '音乐', '大学', '中国', '专辑', '担任', '歌手', '乐团']
Topic_4: ['台湾', '中国', '学名', '香港', '动物', '分布', '地区', '主要']
Topic_5: ['学名', '中国', '地区', '分布', '台湾', '物种', '牠们', '动物']



Epoch   3	Mean Training Loss:790.3492098

Topic_0: ['电影', '他们', '上映', '自己', '动画', '美国', '发现', '剧情']
Topic_1: ['台湾', '主义', '中国', '运动', '政府', '年月', '认为', '革命']
Topic_2: ['电影', '美国', '作品', '台湾', '日本', '英语', '中国', '自己

In [30]:
# In-language evaluation: Chinese BOW model on the Chinese corpus.
# NOTE(review): MODEL_NAME contains a '/', so the formatted path gains a nested
# component -- confirm compute() handles that as intended.
print('Computing metrics for Chinese... BOW')
compute(
    tm_zh,
    train_dataset_zh,
    './output_task1_BOW_zh_{}'.format(MODEL_NAME),
)

Computing metrics for Chinese... BOW
0 3 0 1 4 2
f1_macro = 0.5650677911750467
f1_micro = 0.6476566354218056
acc = 0.6476566354218056
ars = 0.41295399098946156
--------------------------------------------------


In [31]:
# Cross-lingual evaluation: the Chinese-trained BOW model applied to the English corpus.
# NOTE(review): MODEL_NAME contains a '/', so the formatted path gains a nested
# component -- confirm compute() handles that as intended.
print('Computing metrics for Chinese... BOW, zh2en')
compute(
    tm_zh,
    train_dataset_en,
    './output_task1_BOW_zh_zh2en_{}'.format(MODEL_NAME),
)

Computing metrics for Chinese... BOW, zh2en
5 3 0 1 4 2
f1_macro = 0.22986404299182628
f1_micro = 0.2611609164176401
acc = 0.2611609164176401
ars = 0.021097720141060483
--------------------------------------------------


In [32]:
# Cross-lingual evaluation: the English-trained BOW model applied to the Chinese corpus.
# NOTE(review): MODEL_NAME contains a '/', so the formatted path gains a nested
# component -- confirm compute() handles that as intended.
print('Computing metrics for English... BOW, en2zh')
compute(
    tm_en,
    train_dataset_zh,
    './output_task1_BOW_zh_en2zh_{}'.format(MODEL_NAME),
)

Computing metrics for English... BOW, en2zh
3 1 0 1 1 3
f1_macro = 0.12706910645802394
f1_micro = 0.23058707449432658
acc = 0.23058707449432658
ars = 0.003997926122177804
--------------------------------------------------


In [33]:
# English topic model with document embeddings as encoder input.
# NOTE: this rebinds `tm_en`, replacing the BOW model created earlier; any
# earlier compute(tm_en, ...) cell re-run after this point uses this model.
tm_en = GTM(
    train_dataset_en,
    # --- topic model / prior ---
    n_topics=6,
    doc_topic_prior='dirichlet',
    update_prior=False,
    # --- network ---
    encoder_input='embeddings',
    encoder_hidden_layers=[256],
    decoder_hidden_layers=[256],
    # --- training loop ---
    num_epochs=1000,
    patience=3,
    learning_rate=1e-3,
    num_workers=0,
    # --- checkpoints ---
    ckpt_folder='./ckpt_task1_redo_en_emb',
    # Presumably resumes from a saved checkpoint when enabled -- confirm in gtm.gtm.GTM.
    # ckpt='./ckpt_task1_redo_en_emb/best_model.ckpt',
)


Epoch   1	Mean Training Loss:1430.8183728

Topic_0: ['film', 'state', 'release', 'include', 'album', 'record', 'year', 'group']
Topic_1: ['state', 'release', 'film', 'album', 'use', 'group', 'year', 'know']
Topic_2: ['film', 'state', 'use', 'new', 'release', 'album', 'know', 'include']
Topic_3: ['release', 'album', 'year', 'film', 'new', 'state', 'include', 'know']
Topic_4: ['release', 'year', 'album', 'new', 'film', 'state', 'know', 'use']
Topic_5: ['state', 'album', 'film', 'release', 'know', 'year', 'record', 'new']



Epoch   2	Mean Training Loss:1389.9713526

Topic_0: ['include', 'film', 'make', 'use', 'new', 'know', 'year', 'state']
Topic_1: ['release', 'state', 'film', 'know', 'include', 'use', 'year', 'new']
Topic_2: ['new', 'release', 'know', 'include', 'album', 'year', 'film', 'use']
Topic_3: ['release', 'album', 'record', 'new', 'year', 'group', 'include', 'know']
Topic_4: ['release', 'album', 'year', 'new', 'right', 'film', 'include', 'group']
Topic_5: ['film', 'know', 'us

In [34]:
# In-language evaluation: English embeddings model on the English corpus.
# The output path now carries the MODEL_NAME suffix like every other compute()
# cell in this notebook, so runs with different embedding models no longer
# share one output location.
# NOTE(review): MODEL_NAME contains a '/', so the formatted path gains a nested
# component -- confirm compute() handles that as intended.
print('Computing metrics for English... Embeddings')
compute(tm_en, train_dataset_en, './output_task1_redo_en_emb_{}'.format(MODEL_NAME))

Computing metrics for English... Embeddings
2 4 0 1 3 5
f1_macro = 0.8803765050948384
f1_micro = 0.8813728153581454
acc = 0.8813728153581454
ars = 0.7483313628795738
--------------------------------------------------


In [35]:
# Chinese topic model with document embeddings as encoder input.
# NOTE: this rebinds `tm_zh`, replacing the BOW model created earlier; any
# earlier compute(tm_zh, ...) cell re-run after this point uses this model.
tm_zh = GTM(
    train_dataset_zh,
    # --- topic model / prior ---
    n_topics=6,
    doc_topic_prior='dirichlet',
    update_prior=False,
    # --- network ---
    encoder_input='embeddings',
    encoder_hidden_layers=[256],
    decoder_hidden_layers=[256],
    # --- training loop ---
    num_epochs=1000,
    patience=3,
    learning_rate=1e-3,
    num_workers=0,
    # --- checkpoints ---
    ckpt_folder='./ckpt_task1_redo_zh_emb',
    # Presumably resumes from a saved checkpoint when enabled -- confirm in gtm.gtm.GTM.
    # ckpt='./ckpt_task1_redo_zh_emb/best_model.ckpt',
)


Epoch   1	Mean Training Loss:859.0559301

Topic_0: ['台湾', '音乐', '大学', '电影', '中国', '美国', '主义', '开始']
Topic_1: ['台湾', '美国', '大学', '电影', '中国', '音乐', '成为', '第一']
Topic_2: ['台湾', '电影', '中国', '美国', '音乐', '大学', '年月', '开始']
Topic_3: ['台湾', '中国', '大学', '电影', '音乐', '美国', '第一', '开始']
Topic_4: ['台湾', '电影', '中国', '美国', '大学', '音乐', '开始', '第一']
Topic_5: ['台湾', '电影', '音乐', '中国', '美国', '开始', '大学', '自己']



Epoch   2	Mean Training Loss:833.9036723

Topic_0: ['台湾', '电影', '中国', '美国', '音乐', '英语', '他们', '大学']
Topic_1: ['台湾', '电影', '美国', '中国', '他们', '第一', '大学', '成为']
Topic_2: ['台湾', '中国', '电影', '美国', '音乐', '第一', '大学', '日本']
Topic_3: ['台湾', '中国', '音乐', '大学', '电影', '美国', '第一', '专辑']
Topic_4: ['台湾', '中国', '电影', '美国', '大学', '音乐', '第一', '日本']
Topic_5: ['台湾', '电影', '美国', '中国', '他们', '开始', '音乐', '自己']



Epoch   3	Mean Training Loss:821.9422477

Topic_0: ['电影', '学名', '他们', '发现', '自己', '美国', '上映', '英语']
Topic_1: ['电影', '他们', '美国', '自己', '英语', '中国', '发现', '学名']
Topic_2: ['台湾', '中国', '大学', '年月', '音乐', '第一', '香港', '运动

In [36]:
# In-language evaluation: Chinese embeddings model on the Chinese corpus.
# NOTE(review): MODEL_NAME contains a '/', so the formatted path gains a nested
# component -- confirm compute() handles that as intended.
print('Computing metrics for Chinese... Embeddings')
compute(
    tm_zh,
    train_dataset_zh,
    './output_task1_redo_zh_emb_{}'.format(MODEL_NAME),
)

Computing metrics for Chinese... Embeddings
2 4 3 1 3 0
f1_macro = 0.632074770278098
f1_micro = 0.7209669462259497
acc = 0.7209669462259497
ars = 0.4976229846514392
--------------------------------------------------


In [37]:
# Cross-lingual evaluation: the Chinese-trained embeddings model applied to the English corpus.
# NOTE(review): MODEL_NAME contains a '/', so the formatted path gains a nested
# component -- confirm compute() handles that as intended.
print('Computing metrics for Chinese... Embeddings, zh2en')
compute(
    tm_zh,
    train_dataset_en,
    './output_task1_redo_zh_emb_zh2en_{}'.format(MODEL_NAME),
)

Computing metrics for Chinese... Embeddings, zh2en
2 4 3 1 4 0
f1_macro = 0.6537581842360735
f1_micro = 0.7064203567871049
acc = 0.7064203567871049
ars = 0.530294426266735
--------------------------------------------------


In [38]:
# Cross-lingual evaluation: the English-trained embeddings model applied to the Chinese corpus.
# NOTE(review): MODEL_NAME contains a '/', so the formatted path gains a nested
# component -- confirm compute() handles that as intended.
print('Computing metrics for English... Embeddings, en2zh')
compute(
    tm_en,
    train_dataset_zh,
    './output_task1_redo_zh_emb_en2zh_{}'.format(MODEL_NAME),
)

Computing metrics for English... Embeddings, en2zh
2 4 0 1 3 5
f1_macro = 0.5769282299039263
f1_micro = 0.5800690675875678
acc = 0.5800690675875678
ars = 0.3326120011724781
--------------------------------------------------
