In [11]:
!pip install pandas nltk gensim pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   -------------------- ------------------- 1.3/2.6 MB 8.4 MB/s eta 0:00:01
   ---------------------------------------- 2.6/2.6 MB 7.9 MB/s eta 0:00:00
Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [12]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models


In [29]:
df = pd.read_csv(r"C:\Users\night\Downloads\news_dataset.csv")
texts = df['text'].dropna().tolist()


In [31]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\night\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\night\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
ps = PorterStemmer()

In [35]:
def preprocess(text):
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    tokens = [ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words]  # Remove stopwords and apply stemming
    return tokens

In [37]:
processed_texts = [preprocess(text) for text in texts]

In [38]:
dictionary = corpora.Dictionary(processed_texts)
corpus = [dictionary.doc2bow(text) for text in processed_texts]


In [39]:
lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)


In [40]:
coherence_model = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

In [41]:
topics = lda_model.print_topics(num_words=5)

In [44]:
print(f"Coherence Score: {coherence_score}")
print("\nTopics:")
for topic in topics:
    print(topic)

Coherence Score: 0.6168683749401351

Topics:
(0, '0.016*"use" + 0.010*"key" + 0.007*"file" + 0.007*"system" + 0.007*"encrypt"')
(1, '0.005*"year" + 0.005*"armenian" + 0.005*"game" + 0.005*"team" + 0.004*"state"')
(2, '0.063*"x" + 0.059*"q" + 0.053*"max" + 0.034*"g" + 0.033*"r"')
(3, '0.011*"would" + 0.009*"one" + 0.009*"peopl" + 0.007*"think" + 0.006*"like"')


In [45]:
pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
3      0.178340  0.013921       1        1  45.820775
0      0.121021  0.181499       2        1  27.951748
1      0.098085 -0.206076       3        1  19.940552
2     -0.397446  0.010656       4        1   6.286924, topic_info=        Term         Freq        Total Category  logprob  loglift
1586       x  5396.000000  5396.000000  Default  30.0000  30.0000
3247       q  3975.000000  3975.000000  Default  29.0000  29.0000
2971     max  3522.000000  3522.000000  Default  28.0000  28.0000
1051       g  2250.000000  2250.000000  Default  27.0000  27.0000
4047       r  2233.000000  2233.000000  Default  26.0000  26.0000
...      ...          ...          ...      ...      ...      ...
2112       b   985.555926  1161.903034   Topic4  -4.2181   2.6021
2501       c   974.298169  1321.513240   Topic4  -4.2296   2.4619
7385  printf   133.115833   145.466407 