# Group members
- Vishnu Ram A/L Karthigesa Naidu SW01083727
- Thong Hao Hong SW01083725
- Jeevesh SW01083692

In [3]:


# STEP 1: Import libraries
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import string




In [4]:
# STEP 2: Load dataset (only 'text' column)
# usecols tells the parser to read just the 'text' column, instead of loading
# every column and discarding the rest afterwards. Rows whose 'text' is
# missing are then dropped.
df = pd.read_csv('news_dataset.csv', usecols=['text']).dropna()

# STEP 3: Preprocessing function
def preprocess(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase the text and tokenize it with
    ``word_tokenize``; keep only purely alphabetic tokens (``str.isalpha``),
    which drops punctuation and numbers; remove English stop words; lemmatize
    the remaining tokens with ``WordNetLemmatizer``.

    Args:
        text: The document as a ``str``.

    Returns:
        list: The surviving tokens, in their original order.
    """
    # The stop-word set and the lemmatizer do not depend on ``text``, so they
    # are fetched from a shared cache rather than rebuilt for every document.
    stop_words, lemmatizer = _preprocess_resources()
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]


# Filled on the first call to _preprocess_resources(); empty until then.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build both objects before storing either, so a failure here
        # cannot leave the cache half-populated.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']

# Run the preprocessing function over every document.
df['tokens'] = df['text'].apply(preprocess)

# STEP 4: Create dictionary and corpus
# The dictionary is built from the token lists; each document is then
# converted with doc2bow to form the corpus.
# NOTE(review): the vocabulary is used unfiltered. The recorded run shows one
# topic made up almost entirely of single-letter tokens ("q", "g", "r", ...);
# consider pruning the dictionary (e.g. dictionary.filter_extremes) -- confirm
# against the intended analysis before changing results.
dictionary = corpora.Dictionary(df['tokens'])
corpus = list(map(dictionary.doc2bow, df['tokens']))

# STEP 5: LDA Model
# Settings are gathered in one place; random_state is fixed at 42.
lda_settings = dict(
    num_topics=4,
    random_state=42,
    passes=10,
    alpha='auto',
)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

# Score the fitted model with the 'c_v' coherence measure over the token lists.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df['tokens'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model_lda.get_coherence()

print("Coherence Score:", coherence_score)

# Show the ten highest-weighted words of each topic, numbering topics from 1.
topics = lda_model.print_topics(num_words=10)
for topic_index, topic_terms in topics:
    print(f"Topic {topic_index + 1}: {topic_terms}")

Coherence Score: 0.6281369823043936
Topic 1: 0.011*"x" + 0.010*"key" + 0.007*"use" + 0.007*"system" + 0.007*"file" + 0.006*"chip" + 0.005*"one" + 0.005*"encryption" + 0.005*"program" + 0.004*"would"
Topic 2: 0.007*"year" + 0.006*"game" + 0.006*"would" + 0.005*"one" + 0.005*"team" + 0.005*"get" + 0.004*"like" + 0.004*"good" + 0.004*"new" + 0.004*"time"
Topic 3: 0.058*"q" + 0.048*"max" + 0.031*"g" + 0.030*"r" + 0.026*"p" + 0.026*"db" + 0.024*"n" + 0.023*"x" + 0.017*"w" + 0.017*"k"
Topic 4: 0.009*"people" + 0.008*"would" + 0.007*"one" + 0.005*"think" + 0.005*"government" + 0.004*"know" + 0.004*"say" + 0.004*"right" + 0.004*"u" + 0.004*"time"
