Practical - 5 : Implement LSA (Latent Semantic Analysis) and Topic Model.

In [1]:
# Import libraries
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Sample documents
documents = [
    "Natural language processing enables computers to understand human language.",
    "Machine learning improves NLP with deep learning models.",
    "Topic modeling and LSA are used for document analysis.",
    "Latent Dirichlet Allocation is a generative topic model.",
    "Word embeddings like Word2Vec and Glove help in NLP tasks.",
    "Support Vector Machines can be used for text classification."
]

In [3]:
# Remove stopwords
stop_words = stopwords.words('english')

PART 1: LSA using SVD

In [4]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words=stop_words)
X_tfidf = tfidf.fit_transform(documents)

# Apply LSA (Truncated SVD)
lsa_model = TruncatedSVD(n_components=2)
lsa_topic_matrix = lsa_model.fit_transform(X_tfidf)

# Display topics
print("\n🔹 LSA Topics (Top words per component):")
terms = tfidf.get_feature_names_out()
for i, comp in enumerate(lsa_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:5]
    print(f"Topic {i + 1}: ", [term for term, _ in sorted_terms])


🔹 LSA Topics (Top words per component):
Topic 1:  ['topic', 'used', 'analysis', 'document', 'lsa']
Topic 2:  ['learning', 'nlp', 'word', 'word2vec', 'embeddings']


PART 2: Topic Modeling using LDA

In [5]:
# Count Vectorization
cv = CountVectorizer(stop_words=stop_words)
X_cv = cv.fit_transform(documents)

# Apply LDA
lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(X_cv)

# Display LDA topics
print("\n🔹 LDA Topics (Top words per topic):")
for idx, topic in enumerate(lda.components_):
    top_words = [cv.get_feature_names_out()[i] for i in topic.argsort()[-5:]]
    print(f"Topic {idx + 1}: {top_words}")


🔹 LDA Topics (Top words per topic):
Topic 1: ['enables', 'understand', 'topic', 'used', 'language']
Topic 2: ['word', 'embeddings', 'word2vec', 'learning', 'nlp']
