In [None]:
# 📦 Import required libraries
# NOTE: nltk is a third-party package and must be installed first
# (e.g. `pip install nltk`); otherwise the imports below fail with
# ModuleNotFoundError.
import re
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation, PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# 🔽 Download NLTK data
# clean_text relies on the stop-word lists and on WordNet for lemmatization.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# 📥 Step 1: Load Dataset
print("Loading dataset...")
# Request every post, with headers, footers and quoted replies removed.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
docs = newsgroups.data

# 🧹 Step 2: Clean and Preprocess Text
@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once.

    A failed load (e.g. the corpus has not been downloaded yet) raises and is
    not cached, so a later call retries.
    """
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance reused across calls."""
    return WordNetLemmatizer()


def clean_text(text):
    """Normalize a raw document into a space-separated string of lemmas.

    The text is lower-cased, every run of non-word characters is replaced by
    a single space, English stop words are dropped and each remaining token
    is lemmatized with WordNetLemmatizer's default settings.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned tokens joined by single spaces (empty string if no token
        survives).
    """
    # The stop-word set and the lemmatizer are loop-invariant: building them
    # on every call re-read the corpus once per document.
    stop_words = _english_stop_words()
    lemmatize = _shared_lemmatizer().lemmatize

    tokens = re.sub(r'\W+', ' ', text.lower()).split()
    return ' '.join(lemmatize(token) for token in tokens if token not in stop_words)

# Limit to first 2000 documents for performance
print("Cleaning text...")
docs_cleaned = [clean_text(doc) for doc in docs[:2000]]

# 🔠 Step 3: TF-IDF Vectorization for K-Means
# Terms appearing in more than half of the documents (max_df=0.5) or in fewer
# than five documents (min_df=5) are excluded from the vocabulary.
print("Vectorizing with TF-IDF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=5, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(docs_cleaned)

# 🔀 Step 4: Apply K-Means Clustering
print("Applying K-Means clustering...")
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 historically, 'auto' from 1.4); pinning it keeps the clustering
# comparable across versions and avoids the FutureWarning of 1.2-1.3.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=42)
kmeans.fit(X_tfidf)

# 📉 Step 5: Visualize Clusters with a 2D Truncated SVD Projection
# TruncatedSVD is used here (not PCA): it is applied directly to the TF-IDF
# matrix returned by the vectorizer.
print("Reducing dimensions for visualization...")
svd = TruncatedSVD(n_components=2, random_state=42)
scatter_data = svd.fit_transform(X_tfidf)

# One point per document, colored by the K-Means cluster it was assigned to.
plt.figure(figsize=(10, 6))
plt.scatter(scatter_data[:, 0], scatter_data[:, 1], c=kmeans.labels_, cmap='tab20', s=10)
plt.title('K-Means Clustering of 20 Newsgroups (2D SVD Projection)')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.colorbar(label='Cluster ID')
plt.tight_layout()
plt.show()

# 🧮 Step 6: Count Vectorizer for LDA
# LDA is fitted on raw term counts, so a separate CountVectorizer is built
# instead of reusing the TF-IDF matrix.
print("Vectorizing with CountVectorizer for LDA...")
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_count = count_vectorizer.fit_transform(docs_cleaned)

# 📊 Step 7: Apply LDA for Topic Modeling
print("Applying Latent Dirichlet Allocation (LDA)...")
# Fit 10 topics with the online learning method; random_state fixes the seed.
lda = LatentDirichletAllocation(n_components=10, max_iter=10, learning_method='online', random_state=42)
lda.fit(X_count)

# 📌 Step 8: Print Top Words in Topics
def print_topics(model, vectorizer, top_n=10):
    """Print the highest-weighted vocabulary terms of every topic in a model.

    Args:
        model: Fitted topic model exposing ``components_``, one row of term
            weights per topic.
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` maps
            column indices to terms.
        top_n: Number of terms to list per topic, heaviest first.

    Returns:
        None. A header line and a comma-separated term list are printed for
        each topic, numbered from 1.
    """
    vocabulary = vectorizer.get_feature_names_out()
    for idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading top_n indices.
        heaviest_first = weights.argsort()[::-1]
        selected = heaviest_first[:top_n]
        header = f"\n🔹 Topic #{idx + 1}:"
        print(header)
        print(", ".join(vocabulary[i] for i in selected))

# Report the fitted LDA topics using print_topics' default of 10 terms each.
print("\n🧠 Top Words in LDA Topics:")
print_topics(lda, count_vectorizer)


ModuleNotFoundError: No module named 'nltk'