<a href="https://colab.research.google.com/github/SHANUMOLVJ/LDA-topic-modeling/blob/main/LDA_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries (if not already installed)
!pip install gensim pyLDAvis matplotlib seaborn



In [None]:
# Step 1: Data Collection
from sklearn.datasets import fetch_20newsgroups
import nltk

In [None]:
# Download (or load from the local cache) the 20 Newsgroups corpus and keep the raw post texts.
# NOTE(review): posts are loaded with headers, footers and quoted replies intact, so header
# tokens end up in the vocabulary; fetch_20newsgroups accepts
# remove=('headers', 'footers', 'quotes') if that is unwanted.
newsgroups_data = fetch_20newsgroups(
    subset='all',
)
documents = newsgroups_data['data']

In [None]:
# Step 2: Text Preprocessing
# Download the NLTK resources used below (a fresh Colab runtime ships none of them).
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK releases;
# 'punkt' is kept so that older releases keep working as well.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Text-cleaning helpers. The stop-word set and the lemmatizer are built once and reused,
# because preprocess_text is called once per document.
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset (built on first use)."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _wordnet_lemmatizer():
    """Return a shared WordNetLemmatizer instance (built on first use)."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps: lowercase, replace every character that is not a-z or whitespace
    with a space, tokenize with NLTK, drop English stop words, lemmatize.

    Args:
        text: The raw document as a string.

    Returns:
        A list of token strings; empty if nothing survives the filtering.
    """
    text = text.lower()
    # Substitute a space (not the empty string) so that punctuation separates
    # words instead of gluing them together: "don't" becomes "don t" (both
    # pieces are stop words) rather than "dont", which would slip past the
    # stop-word list, and "nntp-posting-host" no longer collapses into one token.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    stop_words = _english_stop_words()
    lemmatize = _wordnet_lemmatizer().lemmatize
    # Stop words are removed before lemmatization, as in the original pipeline.
    return [lemmatize(word) for word in tokens if word not in stop_words]

In [None]:
# Run every raw post through preprocess_text, giving one token list per document.
processed_documents = list(map(preprocess_text, documents))

In [None]:
# Step 3: Topic Modeling with LDA
# Import the modelling and visualisation libraries.
import gensim
from gensim import corpora
import pyLDAvis
from IPython.display import display

# Recent pyLDAvis releases renamed the gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`. Import whichever one exists and expose it under the old
# attribute name so that `pyLDAvis.gensim.prepare(...)` works with either version.
try:
    import pyLDAvis.gensim_models
    pyLDAvis.gensim = pyLDAvis.gensim_models
except ImportError:
    import pyLDAvis.gensim

In [None]:
# Build the gensim vocabulary from the token lists, then encode every document
# as a bag-of-words using that vocabulary.
dictionary = corpora.Dictionary(processed_documents)
corpus = list(map(dictionary.doc2bow, processed_documents))

  and should_run_async(code)


In [None]:
# Fit a 5-topic LDA model with 15 passes over the corpus.
# random_state is fixed so that the topics are the same on every Restart & Run All.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

  and should_run_async(code)


In [None]:
# Show every topic (-1 = all of them) together with its highest-weighted words.
all_topics = lda_model.print_topics(-1)
for topic_id, weighted_words in all_topics:
    print(f'Topic: {topic_id} \nWords: {weighted_words}\n')

Topic: 0 
Words: 0.017*"x" + 0.012*"file" + 0.009*"image" + 0.008*"window" + 0.006*"program" + 0.004*"use" + 0.004*"line" + 0.004*"version" + 0.004*"software" + 0.004*"system"

Topic: 1 
Words: 0.009*"line" + 0.008*"subject" + 0.008*"organization" + 0.008*"writes" + 0.006*"article" + 0.006*"game" + 0.006*"would" + 0.005*"year" + 0.005*"think" + 0.005*"dont"

Topic: 2 
Words: 0.009*"god" + 0.007*"people" + 0.007*"one" + 0.006*"would" + 0.005*"say" + 0.004*"subject" + 0.004*"line" + 0.004*"writes" + 0.004*"u" + 0.004*"christian"

Topic: 3 
Words: 0.014*"line" + 0.013*"subject" + 0.012*"organization" + 0.006*"university" + 0.006*"writes" + 0.006*"nntppostinghost" + 0.005*"one" + 0.005*"would" + 0.005*"article" + 0.005*"like"

Topic: 4 
Words: 0.005*"would" + 0.005*"one" + 0.004*"subject" + 0.004*"organization" + 0.004*"line" + 0.004*"key" + 0.004*"use" + 0.004*"u" + 0.004*"people" + 0.003*"article"



  and should_run_async(code)


In [None]:
# Prepare the pyLDAvis data for the fitted model from the model itself,
# the bag-of-words corpus and the vocabulary.
lda_vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary,
)

  and should_run_async(code)


In [None]:
# Render the prepared pyLDAvis data inline; the cell's last expression is what gets displayed.
lda_vis_output = pyLDAvis.display(lda_vis)
lda_vis_output

  and should_run_async(code)


In [None]:
# Step 4: Word Embeddings with Word2Vec
from gensim.models import Word2Vec
import numpy as np

  and should_run_async(code)


In [None]:
# Learn 100-dimensional word embeddings from the preprocessed token lists.
# Words seen fewer than 2 times are ignored; the context window is 5 tokens.
# NOTE(review): training uses 4 worker threads, which presumably makes the vectors
# vary slightly between runs — confirm against the gensim docs if exact
# reproducibility matters.
word2vec_model = Word2Vec(
    sentences=processed_documents,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

  and should_run_async(code)


In [None]:
# Represent each document as an averaged word vector
def document_vector(doc, model=None):
    """Average the embeddings of the in-vocabulary words of one document.

    Args:
        doc: Iterable of token strings.
        model: Object exposing the embeddings as ``model.wv`` (supports
            ``word in model.wv`` and ``model.wv[word]``). Defaults to the
            notebook-level ``word2vec_model``.

    Returns:
        The element-wise mean of the word vectors, or ``None`` when none of the
        document's words are in the vocabulary. (Averaging an empty list would
        otherwise yield a scalar NaN plus a RuntimeWarning.)
    """
    keyed_vectors = (word2vec_model if model is None else model).wv
    known_vectors = [keyed_vectors[word] for word in doc if word in keyed_vectors]
    if not known_vectors:
        # No usable word: signal "no vector" explicitly so callers can filter it out.
        return None
    return np.mean(known_vectors, axis=0)

# One averaged embedding per preprocessed document, in corpus order.
doc_vectors = list(map(document_vector, processed_documents))

  and should_run_async(code)


In [None]:
# Step 5: Document Similarity and Clustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt  # Ensure this import is present
import seaborn as sns

  and should_run_async(code)


In [None]:
# Drop documents that have no usable embedding: either an explicit None or a NaN result
# (np.mean over an empty list of word vectors yields NaN, which `is not None` alone misses).
# Re-running this cell is harmless: the filter is idempotent.
doc_vectors = [
    vec for vec in doc_vectors
    if vec is not None and not np.any(np.isnan(vec))
]

  and should_run_async(code)


In [None]:
# Pairwise cosine similarity between all document vectors
# (square matrix: one row and one column per document).
similarity_matrix = cosine_similarity(
    X=doc_vectors,
)

  and should_run_async(code)


In [None]:
# Print the matrix; NumPy abbreviates large arrays with "..." so the output stays short.
matrix_header = "cosine similarity matrix:\n"
print(matrix_header, similarity_matrix)

cosine similarity matrix:
 [[1.0000001  0.5677768  0.70964104 ... 0.64623606 0.6856889  0.8031547 ]
 [0.5677768  1.0000001  0.36961216 ... 0.83540845 0.7299082  0.7191541 ]
 [0.70964104 0.36961216 1.0000001  ... 0.4968099  0.6096514  0.7591161 ]
 ...
 [0.64623606 0.83540845 0.4968099  ... 1.0000001  0.68885905 0.823871  ]
 [0.6856889  0.7299082  0.6096514  ... 0.68885905 1.0000002  0.80723405]
 [0.8031547  0.7191541  0.7591161  ... 0.823871   0.80723405 1.0000004 ]]


  and should_run_async(code)


In [None]:
# Perform KMeans clustering: each document is described by its row of the similarity matrix.
# n_init is set explicitly to the value scikit-learn was already using (10), which silences
# the FutureWarning about the changing default; random_state makes the cluster
# assignment reproducible across runs.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(similarity_matrix)

  and should_run_async(code)
  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# Plot only a leading square slice of the similarity matrix. The full matrix has one
# row and one column per document, and drawing every cell of it is far too large to
# render (and unreadable at this figure size anyway).
HEATMAP_MAX_DOCS = 200  # raise or lower to trade detail for rendering time
heatmap_data = similarity_matrix[:HEATMAP_MAX_DOCS, :HEATMAP_MAX_DOCS]

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(heatmap_data, annot=False, cmap='viridis', ax=ax)
ax.set_title('Document Similarity Heatmap')
ax.set_xlabel(f'Document index (first {len(heatmap_data)})')
ax.set_ylabel(f'Document index (first {len(heatmap_data)})')
plt.show()
