<a href="https://colab.research.google.com/github/Rakshithts/myprojects/blob/main/NLP_assignment_bbc_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gensim
!pip install pyLDAvis
!pip install bertopic umap-learn hdbscan


In [None]:
#Load and Preview Data
import pandas as pd

# Read the BBC news CSV into a DataFrame.
csv_path = "/content/bbc-text.csv"
df = pd.read_csv(csv_path)

# Quick look: the first rows, then the number of articles per category.
print(df.head())
category_counts = df['category'].value_counts()
print(category_counts)


In [None]:
#Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK resources used below: the stop-word lists and WordNet.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Module-level helpers read by preprocess(): one reusable lemmatizer and the
# English stop words held in a set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize one raw document into a space-separated string of lemmas.

    The text is lower-cased, every non-word character and every digit is
    replaced by a space, the result is split on whitespace, stop words are
    dropped, and each remaining token is lemmatized.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for a missing cell) is
            treated as an empty document.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives cleaning or when ``text`` is not a string.
    """
    # Missing cells reach this function as None/NaN; calling .lower() on them
    # would raise AttributeError and abort cleaning of the whole column.
    if not isinstance(text, str):
        return ''
    # Underscore is a word character for \W, so it is left in place.
    letters_only = re.sub(r'\W|\d', ' ', text.lower())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )

# Clean every article, then split each cleaned string back into its tokens.
df['clean_text'] = df['text'].map(preprocess)
tokenized_docs = list(map(str.split, df['clean_text']))


In [None]:
#LDA Topic Modeling (Gensim)
from gensim import corpora
from gensim.models import LdaModel

# Build the token <-> id mapping, then encode each document as a bag of words.
dictionary = corpora.Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# Fit a 5-topic LDA model; the fixed random_state keeps runs repeatable.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    random_state=42,
)
lda_model.print_topics()


In [None]:

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch pyLDAvis to notebook rendering, build the visualisation data from the
# fitted LDA model, and show it as the cell's output.
pyLDAvis.enable_notebook()
vis_data = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(vis_data)


In [None]:
#LSA Topic Modeling (TruncatedSVD)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# TF-IDF features over the cleaned text; terms in more than 95% of documents
# or in fewer than 2 documents are excluded.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95)
X_tfidf = vectorizer.fit_transform(df['clean_text'])

# LSA: project the TF-IDF matrix onto 5 SVD components.
lsa_model = TruncatedSVD(random_state=42, n_components=5)
lsa_topic_matrix = lsa_model.fit_transform(X_tfidf)

# For each component, list the 10 terms with the largest (signed) weight.
terms = vectorizer.get_feature_names_out()
for topic_idx, weights in enumerate(lsa_model.components_):
    ranked = sorted(range(len(weights)), key=lambda j: weights[j], reverse=True)
    top_terms = [terms[j] for j in ranked[:10]]
    print(f"LSA Topic {topic_idx}: {top_terms}")


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# LDA: one word cloud per topic, built from the topic's 20 highest-weighted
# words and their weights.
# Loop over the model's own topic count rather than a literal 5, so this cell
# stays correct if num_topics is changed where the model is trained.
for topic_id in range(lda_model.num_topics):
    word_weights = dict(lda_model.show_topic(topic_id, topn=20))
    cloud = WordCloud(background_color='white').fit_words(word_weights)
    plt.figure()
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(f"LDA Topic {topic_id}")
    plt.show()


In [None]:
import seaborn as sns
import numpy as np

# Heatmap of the LDA topic mixture for the first (up to) 10 documents.
# Sizes come from the corpus and the model instead of literals, so a shorter
# corpus or a different num_topics cannot cause an IndexError.
n_docs = min(10, len(corpus))
n_topics = lda_model.num_topics

# minimum_probability=0 requests (effectively) every topic for each document.
# With the default threshold, low-probability topics are omitted from the
# result and would appear in the heatmap as exact zeros.
topic_dist = [
    lda_model.get_document_topics(corpus[i], minimum_probability=0)
    for i in range(n_docs)
]

# Rows are documents, columns are topic ids.
matrix = np.zeros((n_docs, n_topics))
for i, dist in enumerate(topic_dist):
    for topic_id, prob in dist:
        matrix[i, topic_id] = prob

sns.heatmap(matrix, annot=True, cmap="YlGnBu")
plt.xlabel("Topics")
plt.ylabel("Documents")
plt.title(f"Topic Distribution in First {n_docs} Docs")
plt.show()


In [None]:
#BERTopic Modeling


from bertopic import BERTopic

# BERTopic is fitted on the cleaned text, one string per article.
docs = list(df['clean_text'])
topic_model = BERTopic(
    language="english",
    calculate_probabilities=True,
    verbose=True,
)

# topics: one topic id per document; probs: returned alongside because
# calculate_probabilities=True.
topics, probs = topic_model.fit_transform(docs)

# Preview the topic overview table.
topic_model.get_topic_info().head()


In [None]:
import numpy as np
import pandas as pd
from datetime import timedelta

# Synthetic timestamps: one consecutive calendar day per row, starting on
# 2021-01-01. They stand in for real dates and carry no information about
# when an article was actually published.
row_count = len(df)
df['date'] = pd.date_range('2021-01-01', periods=row_count, freq='D')

In [None]:
# Topic representation per timestamp, using the simulated 'date' column.
topics_over_time = topic_model.topics_over_time(docs, df['date'])

# Label every document with its assigned topic id and that topic's top word.
# Topic id -1 (no topic assigned) gets the fixed label "No topic".
df['dominant_topic'] = topics
df['topic_name'] = df['dominant_topic'].apply(
    lambda t: topic_model.get_topic(t)[0][0] if t != -1 else "No topic"
)

# Keep the figure as the cell's LAST expression. Jupyter only renders the
# value of the final expression, so when this call sat in the middle of the
# cell its figure was discarded and never shown.
topic_model.visualize_topics_over_time(topics_over_time)

In [None]:
#Coherence Score for LDA and LSA
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LsiModel

# Settings shared by both coherence evaluators: c_v coherence measured against
# the tokenized documents and the gensim dictionary.
coherence_kwargs = dict(
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v',
)

# LDA: evaluator for the model fitted earlier. Nothing is computed here;
# get_coherence() is called in the cell that prints the scores.
coherence_lda = CoherenceModel(model=lda_model, **coherence_kwargs)

# LSA (LSI): a gensim LsiModel with 5 topics, fitted on the bag-of-words corpus.
# Note this is a separate model from the scikit-learn TruncatedSVD fitted on
# TF-IDF features earlier in the notebook.
lsi_model = LsiModel(corpus=corpus, id2word=dictionary, num_topics=5)
coherence_lsa = CoherenceModel(model=lsi_model, **coherence_kwargs)

from gensim.models.coherencemodel import CoherenceModel

# Prepare BERTopic topics in gensim format: one list of words per topic.
# Topic -1 (no topic assigned) is excluded.
bertopic_topics = topic_model.get_topics()
valid_topic_ids = [t for t in bertopic_topics if t != -1]

# Keep only words that exist in the gensim dictionary. BERTopic can pad a topic
# that has fewer words than requested with empty-string placeholders, and its
# own vocabulary is not guaranteed to match the dictionary. Depending on the
# gensim version, CoherenceModel may raise on topics containing unknown tokens.
known_tokens = dictionary.token2id
bertopic_topic_words = []
for topic_id in valid_topic_ids:
    words = [
        word
        for word, _ in bertopic_topics[topic_id]
        if word in known_tokens
    ]
    # A topic left with no usable words cannot be scored, so skip it.
    if words:
        bertopic_topic_words.append(words)

# Build the evaluator; get_coherence() is called in the cell that prints
# the scores.
bertopic_coherence_model = CoherenceModel(
    topics=bertopic_topic_words,
    texts=tokenized_docs,  # from earlier preprocessing
    dictionary=dictionary,  # from LDA prep
    coherence='c_v'
)



In [None]:
# Compute and report the c_v coherence of each model, four decimals each.
# Each score is computed immediately before its own line is printed.
print("Model Coherence Scores:")

lda_score = coherence_lda.get_coherence()
print(f"✅ LDA Coherence:      {lda_score:.4f}")

lsa_score = coherence_lsa.get_coherence()
print(f"✅ LSA Coherence:      {lsa_score:.4f}")

bertopic_score = bertopic_coherence_model.get_coherence()
print(f"✅ BERTopic Coherence: {bertopic_score:.4f}")


In [None]:
from transformers import pipeline

# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# topic id -> generated summary text
topic_summaries = {}

for topic_id in df['dominant_topic'].unique():
    # Raw (uncleaned) text of the first five articles assigned to this topic.
    in_topic = df['dominant_topic'] == topic_id
    sample_docs = df.loc[in_topic, 'text'].tolist()[:5]

    # Only the first 1000 characters of the concatenation are summarized;
    # slicing leaves shorter strings untouched.
    excerpt = " ".join(sample_docs)[:1000]

    result = summarizer(excerpt, max_length=120, min_length=40, do_sample=False)
    summary_text = result[0]['summary_text']
    topic_summaries[topic_id] = summary_text
    print(f"Topic {topic_id} Summary:\n{summary_text}\n")

