In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [34]:
# Read the training metadata file (JSON Lines: one JSON record per line) into a DataFrame
wiki_train_path = "../Datasets/Wiki/train.metadata.jsonl"
df_wiki_train = pd.read_json(wiki_train_path, lines=True)

# Show which columns are available
print(df_wiki_train.columns)


Index(['id', 'text', 'supercategory', 'category', 'subcategory', 'page_name',
       'tokenized_text'],
      dtype='object')


In [35]:
# How many texts are there? Prints (number of rows, number of columns).
print((len(df_wiki_train.index), len(df_wiki_train.columns)))

(14290, 7)


In [36]:
# Preview the first five rows (head() returns 5 rows by default)
df_wiki_train.head()

Unnamed: 0,id,text,supercategory,category,subcategory,page_name,tokenized_text
0,2_CicelyMaryBarker,Cicely Mary Barker = Cicely Mary Barker ( 28 J...,Language and literature,Language and literature,"Writers, publishers, and critics",Cicely Mary Barker,june february english illustrator best known s...
1,210_GertrudeBarrowsBennett,Gertrude Barrows Bennett = Gertrude Barrows Be...,Language and literature,Language and literature,"Writers, publishers, and critics",Gertrude Barrows Bennett,major female writer fantasy science fiction un...
2,470_MeraldaWarren,Meralda Warren = Meralda Elva Junior Warren ( ...,Language and literature,Language and literature,"Writers, publishers, and critics",Meralda Warren,junior warren born june artist poet remote bri...
3,588_WilhelmBusch,Wilhelm Busch = Heinrich Christian Wilhelm Bus...,Language and literature,Language and literature,"Writers, publishers, and critics",Wilhelm Busch,christian april january german poet illustrato...
4,743_GabrielGarcíaMárquez,Gabriel García Márquez = Gabriel José de la Co...,Language and literature,Language and literature,"Writers, publishers, and critics",Gabriel García Márquez,american spanish march april novelist short st...


In [None]:
# How many distinct supercategories, categories and subcategories are there?
# One count is printed per line, in that order.
for label_column in ("supercategory", "category", "subcategory"):
    print(df_wiki_train[label_column].nunique())

15
45
279


In [None]:
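# gensim is required by the cells below (this notebook was last run with gensim 4.3.3).
# If it is missing, install it into the kernel's environment:
# %pip install gensim==4.3.3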

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.1 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.4 kB)
Downloading gensim-4.3.3-cp312-cp312-macosx_11_0_arm64.whl (24.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading smart_open-7.1.0-py3-none-any.whl (61 kB)
Downloading wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl (38 kB)
Installing collected packages: wrapt, smart-open, gensim
Successfully installed gensim-4.3.3 smart-open-7.1.0 wrapt-1.17.2


In [20]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
#from gensim.models.wrappers import LdaMallet
from gensim.matutils import Sparse2Corpus

In [None]:
# === 2. Tokenize ===
# 'tokenized_text' holds whitespace-separated tokens; split each document into a token list.
tokenized_docs = list(map(str.split, df_wiki_train['tokenized_text']))

# === 3. Create dictionary and corpus ===
# The dictionary is the vocabulary index (token <-> integer id).
dictionary = Dictionary(tokenized_docs)
# Drop tokens appearing in fewer than 5 documents or in more than 90% of documents.
dictionary.filter_extremes(no_above=0.9, no_below=5)
# Bag-of-words per document: [(word_id_1, count), (word_id_2, count), ...]
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# === 4. Train the LDA model ===
K = 20  # number of topics; reused by the cells below
lda_model = LdaModel(
    corpus=corpus,
    num_topics=K,
    id2word=dictionary,
    passes=10,        # full sweeps over the corpus during training (akin to epochs)
    iterations=100,   # cap on per-document inference iterations
    random_state=42,  # fixed seed so training is repeatable
)



In [53]:
# === 5. Get D x K document-topic matrix ===
# Start from zeros so any topic the model does not report for a document stays at 0.
n_docs = len(corpus)
doc_topic_matrix = np.zeros((n_docs, K))
for doc_idx, doc_bow in enumerate(corpus):
    # minimum_probability=0 asks for the distribution over topics without a user-set cutoff
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    for topic_idx, topic_prob in topic_distribution:
        doc_topic_matrix[doc_idx, topic_idx] = topic_prob

doc_topic_matrix.shape

(14290, 20)

In [54]:
# === 6. Get K x V topic-word matrix ===
# One row per topic, one column per vocabulary entry.
topic_word_matrix = lda_model.get_topics()
np.shape(topic_word_matrix)

(20, 15000)

In [None]:
# === 7. Save outputs ===
# np.save does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True keeps the cell safe to re-run.
os.makedirs("../ExperimentResults/LDA/Wiki/k_20", exist_ok=True)

# theta: document-topic matrix, beta: topic-word matrix
np.save("../ExperimentResults/LDA/Wiki/k_20/train_theta_k_20.npy", doc_topic_matrix)
np.save("../ExperimentResults/LDA/Wiki/k_20/train_beta_k_20.npy", topic_word_matrix)

In [None]:
# Persist the vocabulary mappings and the bag-of-words corpus
import json
import pickle

# token -> id mapping
with open("../ExperimentResults/LDA/Wiki/k_20/vocab_k_20.json", "w") as vocab_file:
    json.dump(dictionary.token2id, vocab_file)

# id -> token mapping (the inverse of token2id).
# Note: JSON object keys are always strings, so the integer ids are stored as strings in this file.
id2word = {token_id: token for token, token_id in dictionary.token2id.items()}
with open("../ExperimentResults/LDA/Wiki/k_20/id2word_k_20.json", "w") as id2word_file:
    json.dump(id2word, id2word_file)

# Bag-of-words corpus, pickled as a Python list
with open("../ExperimentResults/LDA/Wiki/k_20/corpus_bow_k_20.pkl", "wb") as corpus_file:
    pickle.dump(corpus, corpus_file)

In [57]:

# === 8. Save top words per topic to a .txt file ===
n_words = 20  # number of highest-probability words kept per topic

# show_topic yields (word, probability) pairs; only the words are kept here.
top_words_per_topic = [
    [word for word, prob in lda_model.show_topic(topic_idx, topn=n_words)]
    for topic_idx in range(K)
]

# Save to file, one "Topic <i>: w1, w2, ..." line per topic.
# The encoding is set explicitly so non-ASCII tokens are written the same way on
# every platform instead of depending on the locale's default encoding.
with open("../ExperimentResults/LDA/Wiki/k_20/topics.txt", "w", encoding="utf-8") as f:
    for i, words in enumerate(top_words_per_topic):
        f.write(f"Topic {i}: {', '.join(words)}\n")





In [62]:
# === 9. (Optional) Assign dominant topic to each doc ===
# Column index of the largest entry in each row of the document-topic matrix
dominant_topic_id = doc_topic_matrix.argmax(axis=1)

# Label every topic with its single top word
topic_representative_word = {}
for topic_id in range(K):
    top_word, top_prob = lda_model.show_topic(topic_id, topn=1)[0]
    topic_representative_word[topic_id] = top_word

# Translate each document's dominant topic id into that topic's label word
assigned_topic_words = [topic_representative_word[doc_topic] for doc_topic in dominant_topic_id]

# Attach both the topic id and its label word to the DataFrame
df_wiki_train['dominant_topic'] = dominant_topic_id
df_wiki_train['assigned_topic'] = assigned_topic_words

df_wiki_train.head(5)


Unnamed: 0,id,text,supercategory,category,subcategory,page_name,tokenized_text,dominant_topic,assigned_topic
0,2_CicelyMaryBarker,Cicely Mary Barker = Cicely Mary Barker ( 28 J...,Language and literature,Language and literature,"Writers, publishers, and critics",Cicely Mary Barker,june february english illustrator best known s...,11,book
1,210_GertrudeBarrowsBennett,Gertrude Barrows Bennett = Gertrude Barrows Be...,Language and literature,Language and literature,"Writers, publishers, and critics",Gertrude Barrows Bennett,major female writer fantasy science fiction un...,11,book
2,470_MeraldaWarren,Meralda Warren = Meralda Elva Junior Warren ( ...,Language and literature,Language and literature,"Writers, publishers, and critics",Meralda Warren,junior warren born june artist poet remote bri...,11,book
3,588_WilhelmBusch,Wilhelm Busch = Heinrich Christian Wilhelm Bus...,Language and literature,Language and literature,"Writers, publishers, and critics",Wilhelm Busch,christian april january german poet illustrato...,11,book
4,743_GabrielGarcíaMárquez,Gabriel García Márquez = Gabriel José de la Co...,Language and literature,Language and literature,"Writers, publishers, and critics",Gabriel García Márquez,american spanish march april novelist short st...,11,book


In [63]:
# Export the dominant topic id and its label word for every document.
# Only these two columns are written and the index is omitted, so rows can be
# matched back to documents by position only.
topic_columns = ['dominant_topic', 'assigned_topic']
df_wiki_train[topic_columns].to_csv("../ExperimentResults/LDA/Wiki/k_20/train_dominant_topic.csv", index=False)