In [1]:
from google.colab import files
uploaded = files.upload()


Saving bbc_news.csv to bbc_news.csv


In [1]:
# ==============================
# üß© STEP 1: Install Dependencies
# ==============================
!pip install gensim pyLDAvis nltk spacy scikit-learn
!python -m spacy download en_core_web_sm

# ==============================
# üìö STEP 2: Import Libraries
# ==============================
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.coherencemodel import CoherenceModel

# ==============================
# üìÇ STEP 3: Load Uploaded Dataset
# ==============================
df = pd.read_csv('bbc_news.csv')
print("Dataset Loaded Successfully ‚úÖ")
print(df.head())

# Make sure there's a text column
if 'text' not in df.columns:
    print("\n‚ö†Ô∏è Please confirm the column name that contains article text.")
else:
    print("\nTotal Articles:", len(df))

# ==============================
# üßπ STEP 4: Preprocessing
# ==============================
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])

def preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    doc = nlp(" ".join(tokens))
    lemmas = [token.lemma_ for token in doc if token.lemma_ not in stop_words and len(token.lemma_) > 2]
    return lemmas

print("\nPreprocessing the text, please wait...")
processed_texts = [preprocess(t) for t in df['text']]
print("‚úÖ Preprocessing Complete")

# ==============================
# üìñ STEP 5: Prepare Dictionary & Corpus
# ==============================
dictionary = corpora.Dictionary(processed_texts)
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# ==============================
# üî• STEP 6: LDA Topic Modeling
# ==============================
num_topics = 5

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

print("\n=======================")
print("üß† LDA Topics Extracted")
print("=======================")
for idx, topic in lda_model.print_topics(-1):
    print(f"\nTopic {idx}:\n{topic}")

# ==============================
# üìä STEP 7: pyLDAvis Visualization
# ==============================
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_vis)

# ==============================
# üìà STEP 8: Evaluate Coherence
# ==============================
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_texts,
    dictionary=dictionary,
    coherence='c_v'
)
coherence_lda = coherence_model_lda.get_coherence()
print("\nLDA Coherence Score:", coherence_lda)

# ==============================
# ü§ñ STEP 9: NMF Topic Modeling
# ==============================
tfidf = TfidfVectorizer(max_df=0.5, min_df=5, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'].astype(str))

nmf = NMF(n_components=num_topics, random_state=1, init='nndsvd').fit(tfidf_matrix)
feature_names = tfidf.get_feature_names_out()

print("\n=======================")
print("üß© NMF Topics Extracted")
print("=======================")
for topic_idx, topic in enumerate(nmf.components_):
    print(f"\nNMF Topic #{topic_idx}:")
    print(" | ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))

print("\n‚úÖ Task 5 Completed Successfully!")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.8/12.8 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Dataset Loaded Successfully ‚úÖ
                                               title  \
0  Ukraine: Angry Zelensky vows to punish Russian...   
1  War in Ukraine: Taking cover in a town under a...   
2         Ukraine war 'catastrophic for global food

[nltk_data] Downloading package punkt to /root/nltk_data...
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
[nltk_data]   Unzipping tokenizers/punkt.zip.
  return datetime.utcnow().replace(tzinfo=utc)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().repl


Preprocessing the text, please wait...


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


KeyError: 'text'

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [2]:
import pandas as pd

df = pd.read_csv('bbc_news.csv')
print(df.columns)


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


Index(['title', 'pubDate', 'guid', 'link', 'description'], dtype='object')


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [3]:
# =============================
# üìò TASK 5: Topic Modeling on News Articles
# =============================

# Install libraries (run only once)
!pip install gensim pyLDAvis nltk scikit-learn wordcloud

# -----------------------------
# üì¶ 1. Import Required Libraries
# -----------------------------
import pandas as pd
import nltk
import re
import gensim
from gensim import corpora
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
nltk.download('stopwords')

# -----------------------------
# üìÇ 2. Load Dataset
# -----------------------------
# Replace with your uploaded filename if different
df = pd.read_csv('/content/BBC News Train.csv')
print("Columns in dataset:", df.columns.tolist())
df.head()

# -----------------------------
# ‚úèÔ∏è 3. Preprocess Text Data
# -----------------------------
stop_words = set(stopwords.words('english'))

def preprocess(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(tokens)

# Apply preprocessing to the description column
df['clean_text'] = df['description'].apply(preprocess)

# -----------------------------
# üìä 4. Tokenize for Gensim
# -----------------------------
tokenized_texts = [text.split() for text in df['clean_text']]

# Create Dictionary and Corpus
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

# -----------------------------
# üß© 5. LDA Topic Modeling
# -----------------------------
lda_model = gensim.models.LdaModel(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=5,
                                   random_state=42,
                                   passes=10)

# Display topics
print("\nüß† LDA Topics:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

# -----------------------------
# üåê 6. Visualize LDA Topics (pyLDAvis)
# -----------------------------
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
lda_display

# -----------------------------
# ‚òÅÔ∏è 7. WordCloud Visualization
# -----------------------------
for idx, topic in lda_model.show_topics(formatted=False, num_words=15):
    plt.figure(figsize=(6, 4))
    plt.imshow(WordCloud(background_color='white').fit_words(dict(topic)))
    plt.axis("off")
    plt.title(f"LDA Topic #{idx}")
    plt.show()

# -----------------------------
# ‚ö° 8. Compare with NMF
# -----------------------------
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['clean_text'])

nmf_model = NMF(n_components=5, random_state=42)
nmf_model.fit(X)

feature_names = vectorizer.get_feature_names_out()

def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))
        print()

print("\nüß© NMF Topics:")
display_topics(nmf_model, feature_names, 10)

# -----------------------------
# üåü 9. WordCloud for NMF Topics
# -----------------------------
for topic_idx, topic in enumerate(nmf_model.components_):
    plt.figure(figsize=(6, 4))
    plt.imshow(WordCloud(background_color='white').fit_words(
        {feature_names[i]: topic[i] for i in topic.argsort()[:-15 - 1:-1]}))
    plt.axis("off")
    plt.title(f"NMF Topic #{topic_idx}")
    plt.show()


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date



  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date



  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date



  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date



  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

FileNotFoundError: [Errno 2] No such file or directory: '/content/BBC News Train.csv'

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [4]:
# -----------------------------
# Task 5 - Text Classification using BBC News Dataset
# -----------------------------

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# -----------------------------
# Step 1: Load the Dataset
# -----------------------------
df = pd.read_csv('/content/bbc_news.csv')
print("Columns in dataset:", df.columns.tolist())
df.head()

# -----------------------------
# Step 2: Check for Missing Values
# -----------------------------
print("\nMissing values in dataset:")
print(df.isnull().sum())

# -----------------------------
# Step 3: Data Cleaning
# -----------------------------
# Some datasets have columns named "description", "text", or "content"
# Let's identify the main text column automatically
text_column = None
for col in df.columns:
    if 'text' in col.lower() or 'description' in col.lower() or 'content' in col.lower():
        text_column = col
        break

if text_column is None:
    raise KeyError("No text column found! Please check your CSV file column names.")

print(f"\n‚úÖ Using text column: {text_column}")

# Drop any rows with missing text or category
df = df.dropna(subset=[text_column, 'category'])

# -----------------------------
# Step 4: Split the Data
# -----------------------------
X = df[text_column]
y = df['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# Step 5: Convert Text to TF-IDF Features
# -----------------------------
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# -----------------------------
# Step 6: Train Naive Bayes Classifier
# -----------------------------
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# -----------------------------
# Step 7: Evaluate Model
# -----------------------------
y_pred = model.predict(X_test_tfidf)
acc = accuracy_score(y_test, y_pred)

print(f"\nüéØ Model Accuracy: {acc*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# -----------------------------
# Step 8: Confusion Matrix Visualization
# -----------------------------
plt.figure(figsize=(8,6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

Columns in dataset: ['title', 'pubDate', 'guid', 'link', 'description']

Missing values in dataset:
title          0
pubDate        0
guid           0
link           0
description    0
dtype: int64

‚úÖ Using text column: description


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


KeyError: ['category']

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [5]:
# -----------------------------
# Task 5: Topic Modeling on BBC News Articles
# -----------------------------

# Install required libraries
!pip install gensim pyLDAvis nltk --quiet

# -----------------------------
# Step 1: Import libraries
# -----------------------------
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models
import pyLDAvis.gensim_models
import pyLDAvis
import re
import string

# -----------------------------
# Step 2: Load the dataset
# -----------------------------
df = pd.read_csv('/content/bbc_news.csv')
print("Columns:", df.columns.tolist())

# We'll use the 'description' column as text
df = df[['description']].dropna()

# -----------------------------
# Step 3: Text Preprocessing
# -----------------------------
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()                                     # lowercase
    text = re.sub(f"[{string.punctuation}]", " ", text)      # remove punctuation
    text = re.sub(r'\d+', '', text)                         # remove numbers
    words = text.split()
    words = [w for w in words if w not in stop_words and len(w) > 3]  # remove stopwords and short words
    return words

df['tokens'] = df['description'].apply(clean_text)

# -----------------------------
# Step 4: Create Dictionary and Corpus
# -----------------------------
dictionary = corpora.Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(text) for text in df['tokens']]

# -----------------------------
# Step 5: Train LDA Model
# -----------------------------
lda_model = models.LdaModel(corpus,
                            num_topics=5,
                            id2word=dictionary,
                            passes=10,
                            random_state=42)

# -----------------------------
# Step 6: Display Topics
# -----------------------------
print("\nüß© Top Words Per Topic:\n")
for idx, topic in lda_model.print_topics(num_topics=5, num_words=10):
    print(f"Topic #{idx+1}: {topic}\n")

# -----------------------------
# Step 7: Visualize Topics using pyLDAvis
# -----------------------------
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

Columns: ['title', 'pubDate', 'guid', 'link', 'description']


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=


üß© Top Words Per Topic:

Topic #1: 0.009*"people" + 0.008*"could" + 0.008*"government" + 0.007*"trump" + 0.005*"says" + 0.005*"help" + 0.005*"health" + 0.005*"front" + 0.004*"allegations" + 0.004*"writes"

Topic #2: 0.017*"says" + 0.014*"league" + 0.014*"manchester" + 0.011*"city" + 0.008*"israel" + 0.008*"united" + 0.008*"premier" + 0.007*"manager" + 0.006*"football" + 0.006*"boss"

Topic #3: 0.017*"england" + 0.015*"world" + 0.012*"first" + 0.009*"final" + 0.007*"second" + 0.007*"time" + 0.007*"paris" + 0.007*"women" + 0.007*"watch" + 0.006*"year"

Topic #4: 0.011*"people" + 0.008*"says" + 0.007*"killed" + 0.007*"police" + 0.007*"president" + 0.007*"died" + 0.006*"ukraine" + 0.006*"year" + 0.005*"london" + 0.005*"south"

Topic #5: 0.018*"says" + 0.010*"party" + 0.009*"election" + 0.009*"former" + 0.009*"minister" + 0.008*"said" + 0.007*"first" + 0.007*"police" + 0.006*"labour" + 0.006*"government"



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
