In [None]:
!pip install gensim nltk

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import nltk,re
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train=fetch_20newsgroups(subset='train',shuffle=True,random_state=42)
documents=newsgroups_train.data

In [None]:
def clean_text(text):
  text=re.sub(r'\W+',' ',text.lower())
  tokens=[word for word in text.split() if word not in stop_words]
  return tokens

In [None]:
#Tokenize all documents
tokenize_docs=[clean_text(doc) for doc in documents]

In [None]:
dictionary=Dictionary(tokenize_docs)
dictionary.filter_extremes(no_below=10,no_above=0.9)
corpus=[dictionary.doc2bow(doc) for doc in tokenize_docs]

In [None]:
lda_model=LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=8,
    random_state=42,
    alpha='auto',
    per_word_topics=True
    )



In [None]:
print("\n TOP WORDS IN EACH TOPIC:")
for idx,topic in lda_model.show_topics(formatted=False):
  print(f"Topic{idx+1}:{'|'.join([word for word,_ in topic])}")


 TOP WORDS IN EACH TOPIC:
Topic1:god|x|one|would|people|edu|jesus|say|think|believe
Topic2:people|would|edu|one|government|com|key|think|writes|like
Topic3:ax|max|q|r|3|p|0d|g|7|g9v
Topic4:_|edu|1|x|com|graphics|2|file|mail|image
Topic5:edu|com|writes|article|posting|would|host|one|nntp|university
Topic6:0|1|2|3|4|5|7|6|8|25
Topic7:edu|com|1|one|use|drive|2|would|windows|system
Topic8:x|1|w|r|_|c|p|g|6|8


In [None]:
coherence_model_lda=CoherenceModel(model=lda_model,texts=tokenize_docs,dictionary=dictionary,coherence='c_v')
coherence_lda=coherence_model_lda.get_coherence()
print(f"\n Coherence Score:{coherence_lda:.4f}")


 Coherence Score:0.6447


# Parameter What it Does Typical Values

In [None]:
! pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

In [None]:
vis_data=pyLDAvis.gensim_models.prepare(lda_model,corpus,dictionary,sort_topics=False)


In [None]:
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)