In [16]:
import gensim
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel
import nltk
from nltk.corpus import stopwords
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt

In [17]:
# Download the stopwords from NLTK
nltk.download('stopwords')
# Hold the English stop words in a set: preprocess() checks membership once
# per token, and a set answers that in constant time instead of scanning the
# list that NLTK returns.
stop_words = set(stopwords.words('english'))

# Sample documents: a tiny hand-written corpus used to demonstrate the pipeline.
documents = [
    "Natural language processing is a fascinating field of study.",
    "Machine learning and deep learning are subsets of artificial intelligence.",
    "Text mining involves the process of extracting meaningful information from text.",
    "Topic modeling is a technique for discovering abstract topics within a collection of documents.",
    "LDA is a popular topic modeling algorithm in the field of NLP.",
    "Deep learning has revolutionized the field of artificial intelligence."
]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Step 1: Preprocess the Text
def preprocess(text):
    """Turn one raw document into a list of content-bearing tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess`` (called with
    ``deacc=True``), and every token found in the notebook-level
    ``stop_words`` collection is dropped.

    Args:
        text: The raw document string.

    Returns:
        The surviving tokens, in the order the tokenizer produced them.
    """
    return [
        word
        for word in gensim.utils.simple_preprocess(text, deacc=True)
        if word not in stop_words
    ]

# One token list per sample document, in the same order as `documents`.
processed_docs = list(map(preprocess, documents))

In [19]:
# Step 2: Create a Dictionary and Corpus
# The Dictionary maps every distinct token in the processed documents to an
# integer id.
dictionary = corpora.Dictionary(processed_docs)
# Each document is then encoded as a bag-of-words: a list of
# (token id, count) pairs.
corpus = list(map(dictionary.doc2bow, processed_docs))

In [20]:
# Print the dictionary: one "id: token" line per entry.
print("Dictionary:")
id_token_pairs = dictionary.items()
for key, value in id_token_pairs:
    print(f"{key}: {value}")

Dictionary:
0: fascinating
1: field
2: language
3: natural
4: processing
5: study
6: artificial
7: deep
8: intelligence
9: learning
10: machine
11: subsets
12: extracting
13: information
14: involves
15: meaningful
16: mining
17: process
18: text
19: abstract
20: collection
21: discovering
22: documents
23: modeling
24: technique
25: topic
26: topics
27: within
28: algorithm
29: lda
30: nlp
31: popular
32: revolutionized


In [21]:
# Print the corpus: one bag-of-words vector (list of (token id, count)
# pairs) per document.
print("\nCorpus:")
for bow_vector in corpus:
    print(bow_vector)


Corpus:
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]
[(6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1)]
[(12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2)]
[(19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1)]
[(1, 1), (23, 1), (25, 1), (28, 1), (29, 1), (30, 1), (31, 1)]
[(1, 1), (6, 1), (7, 1), (8, 1), (9, 1), (32, 1)]


In [22]:
# Step 3: Train the LDA Model
num_topics = 3
lda_model = LdaModel(
    corpus=corpus,          # bag-of-words vectors built in Step 2
    id2word=dictionary,     # lets the model report words instead of ids
    num_topics=num_topics,
    passes=10,              # number of training passes over the corpus
    random_state=42,        # fixed seed for reproducible training
)

In [23]:
# Step 4: Print the Topics
# Passing -1 requests every topic; each entry is a (topic id, formatted
# "weight*word + ..." string) pair.
all_topics = lda_model.print_topics(-1)
for idx, topic in all_topics:
    print(f'Topic: {idx} \nWords: {topic}')

Topic: 0 
Words: 0.079*"field" + 0.078*"processing" + 0.078*"fascinating" + 0.078*"study" + 0.078*"language" + 0.078*"natural" + 0.020*"modeling" + 0.020*"topic" + 0.020*"revolutionized" + 0.020*"deep"
Topic: 1 
Words: 0.101*"learning" + 0.071*"intelligence" + 0.071*"deep" + 0.071*"artificial" + 0.040*"abstract" + 0.040*"documents" + 0.040*"collection" + 0.040*"topics" + 0.040*"discovering" + 0.040*"technique"
Topic: 2 
Words: 0.090*"text" + 0.051*"modeling" + 0.051*"topic" + 0.051*"nlp" + 0.051*"algorithm" + 0.051*"popular" + 0.051*"lda" + 0.051*"field" + 0.051*"meaningful" + 0.051*"process"
