In [5]:
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import nltk
import pandas as pd

In [6]:
# Sample data
documents = [
    "Natural language processing is a fascinating field of study.",
    "Machine learning and deep learning are subsets of artificial intelligence.",
    "Text mining involves the process of extracting meaningful information from text.",
    "Topic modeling is a technique for discovering abstract topics within a collection of documents.",
    "LDA is a popular topic modeling algorithm in the field of NLP.",
    "Deep learning has revolutionized the field of artificial intelligence."
]

In [7]:
nltk.download('stopwords')

# Define stop words
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Step 1: Preprocess the Text
def preprocess(text):
    tokens = simple_preprocess(text, deacc=True)  # Tokenization and normalization
    tokens = [token for token in tokens if token not in stop_words]  # Stop word removal
    return tokens

In [9]:
# Apply preprocessing to each document
processed_docs = [preprocess(doc) for doc in documents]

# Print processed documents
for i, doc in enumerate(processed_docs):
    print(f"Document {i + 1}: {doc}")

Document 1: ['natural', 'language', 'processing', 'fascinating', 'field', 'study']
Document 2: ['machine', 'learning', 'deep', 'learning', 'subsets', 'artificial', 'intelligence']
Document 3: ['text', 'mining', 'involves', 'process', 'extracting', 'meaningful', 'information', 'text']
Document 4: ['topic', 'modeling', 'technique', 'discovering', 'abstract', 'topics', 'within', 'collection', 'documents']
Document 5: ['lda', 'popular', 'topic', 'modeling', 'algorithm', 'field', 'nlp']
Document 6: ['deep', 'learning', 'revolutionized', 'field', 'artificial', 'intelligence']


In [10]:
# Create a dictionary and corpus
dictionary = Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [11]:
# Build the LDA model
num_topics = 3
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)

In [12]:
# Topic-word distribution
print("\nTopic-Word Distribution:")
topics = lda_model.show_topics(formatted=False)
topic_word_dist = {}
for topic_num, words in topics:
    topic_word_dist[topic_num] = {word: prob for word, prob in words}
    print(f"Topic {topic_num}: {topic_word_dist[topic_num]}")


Topic-Word Distribution:
Topic 0: {'field': 0.07617569, 'processing': 0.075079665, 'fascinating': 0.07493144, 'study': 0.07486473, 'language': 0.07442345, 'natural': 0.07428721, 'text': 0.022472566, 'extracting': 0.0217278, 'involves': 0.021021795, 'process': 0.020985322}
Topic 1: {'learning': 0.09580206, 'intelligence': 0.0667359, 'deep': 0.06666222, 'artificial': 0.06646762, 'field': 0.039246384, 'topic': 0.038355812, 'machine': 0.03829403, 'subsets': 0.038262185, 'abstract': 0.038261633, 'revolutionized': 0.03822168}
Topic 2: {'text': 0.07931718, 'modeling': 0.05485446, 'field': 0.054565594, 'topic': 0.054432344, 'nlp': 0.053706195, 'algorithm': 0.05359196, 'popular': 0.053562265, 'lda': 0.053146146, 'meaningful': 0.046936154, 'process': 0.0456014}


In [13]:
# Document-topic distribution
print("\nDocument-Topic Distribution:")
doc_topic_dist = []
for i, doc in enumerate(corpus):
    doc_topics = lda_model.get_document_topics(doc)
    doc_topic_dist.append(doc_topics)
    print(f"Document {i}: {doc_topics}")


Document-Topic Distribution:
Document 0: [(0, 0.90296334), (1, 0.04835667), (2, 0.048679963)]
Document 1: [(0, 0.04207211), (1, 0.9159677), (2, 0.041960176)]
Document 2: [(0, 0.037648376), (1, 0.038033746), (2, 0.9243179)]
Document 3: [(0, 0.03388926), (1, 0.93064), (2, 0.03547071)]
Document 4: [(0, 0.043428745), (1, 0.043789327), (2, 0.9127819)]
Document 5: [(0, 0.050577495), (1, 0.89983165), (2, 0.04959086)]


In [14]:
# Conditional probabilities for each word in each document
print("\nConditional Probabilities for each word in each document:")
conditional_probs = []
for i, doc in enumerate(corpus):
    doc_conditional_probs = []
    for word_id, freq in doc:
        word = dictionary[word_id]
        word_conditional_probs = []
        for topic_num in range(num_topics):
            prob = lda_model.get_term_topics(word_id, minimum_probability=0)[topic_num][1]
            word_conditional_probs.append((topic_num, prob))
        doc_conditional_probs.append((word, word_conditional_probs))
    conditional_probs.append(doc_conditional_probs)
    print(f"Document {i}: {doc_conditional_probs}")


Conditional Probabilities for each word in each document:
Document 0: [('fascinating', [(0, 0.050226413), (1, 0.0015088858), (2, 0.002024885)]), ('field', [(0, 0.05145469), (1, 0.02625679), (2, 0.036553912)]), ('language', [(0, 0.049725533), (1, 0.0015475627), (2, 0.0021446473)]), ('natural', [(0, 0.04959125), (1, 0.0014811677), (2, 0.0022877285)]), ('processing', [(0, 0.050372623), (1, 0.0014498696), (2, 0.0020572364)]), ('study', [(0, 0.050160613), (1, 0.0014648661), (2, 0.0021093497)])]
Document 1: [('artificial', [(0, 0.0032517237), (1, 0.053283826), (2, 0.0025308228)]), ('deep', [(0, 0.0032583605), (1, 0.05347902), (2, 0.0023859714)]), ('intelligence', [(0, 0.003171573), (1, 0.05355291), (2, 0.0023964662)]), ('learning', [(0, 0.0032791658), (1, 0.08282172), (2, 0.002440297)]), ('machine', [(0, 0.0028751972), (1, 0.025329415), (2, 0.0020620103)]), ('subsets', [(0, 0.00291496), (1, 0.02529845), (2, 0.0020549772)])]
Document 2: [('extracting', [(0, 0.003881533), (1, 0.0070027392), (

In [15]:
# Topic distribution for each document
print("\nTopic Distribution for each document:")
for i, doc in enumerate(corpus):
    doc_topic_dist = lda_model.get_document_topics(doc)
    print(f"Document {i}: {doc_topic_dist}")


Topic Distribution for each document:
Document 0: [(0, 0.90296745), (1, 0.04835655), (2, 0.048675966)]
Document 1: [(0, 0.042072743), (1, 0.91596746), (2, 0.04195976)]
Document 2: [(0, 0.03764845), (1, 0.038036905), (2, 0.9243146)]
Document 3: [(0, 0.03388926), (1, 0.93064), (2, 0.03547073)]
Document 4: [(0, 0.043428488), (1, 0.043789923), (2, 0.9127816)]
Document 5: [(0, 0.050557405), (1, 0.8998589), (2, 0.04958371)]


In [16]:
# Display the results in a tabular format using pandas
# Topic-word distribution
topic_word_df = pd.DataFrame(topic_word_dist).fillna(0)
print("\nTopic-Word Distribution (Tabular):")
print(topic_word_df)


Topic-Word Distribution (Tabular):
                       0         1         2
field           0.076176  0.039246  0.054566
processing      0.075080  0.000000  0.000000
fascinating     0.074931  0.000000  0.000000
study           0.074865  0.000000  0.000000
language        0.074423  0.000000  0.000000
natural         0.074287  0.000000  0.000000
text            0.022473  0.000000  0.079317
extracting      0.021728  0.000000  0.000000
involves        0.021022  0.000000  0.000000
process         0.020985  0.000000  0.045601
learning        0.000000  0.095802  0.000000
intelligence    0.000000  0.066736  0.000000
deep            0.000000  0.066662  0.000000
artificial      0.000000  0.066468  0.000000
topic           0.000000  0.038356  0.054432
machine         0.000000  0.038294  0.000000
subsets         0.000000  0.038262  0.000000
abstract        0.000000  0.038262  0.000000
revolutionized  0.000000  0.038222  0.000000
modeling        0.000000  0.000000  0.054854
nlp             0.0

In [17]:
# Document-topic distribution
doc_topic_df = pd.DataFrame([[prob for _, prob in lda_model.get_document_topics(doc, minimum_probability=0)] for doc in corpus])
doc_topic_df.columns = [f'Topic {i}' for i in range(num_topics)]
print("\nDocument-Topic Distribution (Tabular):")
print(doc_topic_df)


Document-Topic Distribution (Tabular):
    Topic 0   Topic 1   Topic 2
0  0.902967  0.048356  0.048676
1  0.042072  0.915968  0.041960
2  0.037648  0.038035  0.924317
3  0.033889  0.930640  0.035470
4  0.043428  0.043789  0.912783
5  0.050554  0.899862  0.049584


In [18]:
# Conditional probabilities for each word in each document
conditional_probs_df = []
for i, doc in enumerate(conditional_probs):
    for word, probs in doc:
        for topic_num, prob in probs:
            conditional_probs_df.append((f'Doc {i}', word, f'Topic {topic_num}', prob))
conditional_probs_df = pd.DataFrame(conditional_probs_df, columns=['Document', 'Word', 'Topic', 'Probability'])
print("\nConditional Probabilities (Tabular):")
print(conditional_probs_df)


Conditional Probabilities (Tabular):
    Document            Word    Topic  Probability
0      Doc 0     fascinating  Topic 0     0.050226
1      Doc 0     fascinating  Topic 1     0.001509
2      Doc 0     fascinating  Topic 2     0.002025
3      Doc 0           field  Topic 0     0.051455
4      Doc 0           field  Topic 1     0.026257
..       ...             ...      ...          ...
118    Doc 5        learning  Topic 1     0.082822
119    Doc 5        learning  Topic 2     0.002440
120    Doc 5  revolutionized  Topic 0     0.002912
121    Doc 5  revolutionized  Topic 1     0.025259
122    Doc 5  revolutionized  Topic 2     0.002084

[123 rows x 4 columns]
