In [1]:
pip install nltk gensim pandas matplotlib


Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import nltk
import gensim
import gensim.corpora as corpora
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from gensim.models import CoherenceModel, LdaModel
import matplotlib.pyplot as plt

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
df = pd.read_csv('news_dataset.csv')
df = df[['text']].dropna()

# Preprocessing
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]
    return tokens

df['tokens'] = df['text'].apply(preprocess)

# Dictionary and Corpus
id2word = corpora.Dictionary(df['tokens'])
corpus = [id2word.doc2bow(text) for text in df['tokens']]

# Build LDA Model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=4, random_state=42, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)

# Topics
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

# Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['tokens'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda:.4f}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kingd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kingd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kingd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Topic: 0 
Words: 0.012*"would" + 0.010*"peopl" + 0.009*"one" + 0.007*"know" + 0.006*"govern" + 0.006*"say" + 0.006*"think" + 0.006*"like" + 0.006*"go" + 0.006*"right"

Topic: 1 
Words: 0.011*"post" + 0.010*"presid" + 0.008*"group" + 0.008*"new" + 0.008*"privaci" + 0.007*"administr" + 0.006*"secur" + 0.006*"inform" + 0.006*"stephanopoulo" + 0.006*"interest"

Topic: 2 
Words: 0.111*"q" + 0.044*"x" + 0.037*"n" + 0.035*"e" + 0.027*"k" + 0.019*"p" + 0.019*"c" + 0.015*"f" + 0.014*"bh" + 0.014*"r"

Topic: 3 
Words: 0.035*"key" + 0.027*"use" + 0.024*"encrypt" + 0.017*"chip" + 0.015*"db" + 0.013*"system" + 0.010*"anonym" + 0.009*"file" + 0.008*"data" + 0.008*"comput"

Coherence Score: 0.6380


In [13]:
##Devendran a/l Pathamanathan SW01083554
##Ishamie Syazwina IS01083940

## Interpretation of Coherence Score

##The coherence score is used to check how good the topics are. 
##A higher score which is close to 1.0, means the topics makes much more sense and the keywords in each topic are related. 
##In this case, the coherence score of 0.6380 shows that the model managed to group the text into topics that are understandable. 
##This means the topic modeling was successful. Thank you <3.

