In [None]:
!pip install -q sentence-transformers matplotlib seaborn gensim scikit-learn nltk wordcloud

In [None]:
import re
import os
import math
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from typing import List, Tuple, Dict, Optional

# NLP
import nltk
from nltk.corpus import reuters, stopwords
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize

# Vectorization & Clustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, normalized_mutual_info_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install gensim==3.8.3

In [None]:
!pip install sumy

In [None]:
from sumy.summarizers.lex_rank import LexRankSummarizer

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def summarize_cluster(text, sentences=2):
    """
    Produce an extractive summary of ``text`` with sumy's LexRank summarizer.

    Args:
        text: Plain text to summarize; parsed with sumy's English tokenizer.
        sentences: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences converted to strings and joined by single spaces.
    """
    parsed = PlaintextParser.from_string(text, Tokenizer("english"))
    lexrank = LexRankSummarizer()
    picked = lexrank(parsed.document, sentences)
    return " ".join(str(sentence) for sentence in picked)

In [None]:
# Optional dependency probe: record in `sbert_available` whether
# sentence-transformers could be imported, so later code can check the flag
# instead of failing at import time.
try:
    from sentence_transformers import SentenceTransformer
    sbert_available = True
except Exception:  # any failure while importing marks SBERT as unavailable
    sbert_available = False

In [None]:
def load_reuters_dataset(max_docs: Optional[int] = 1000) -> pd.DataFrame:
    """
    Read documents from the NLTK Reuters corpus into a DataFrame.

    Args:
        max_docs: Upper bound on the number of file ids taken from the start of
            ``reuters.fileids()``; ``None`` takes all of them.

    Returns:
        A DataFrame with one row per document and the columns:
        - doc_id: the corpus file id
        - title: the first sentence, cut to its first 12 words plus "..." when
          longer; "Reuters article <doc_id>" when no sentence is found
        - text: the raw document text
        - categories: the list of categories of the document
    """
    max_title_words = 12
    records = []
    for file_id in reuters.fileids()[:max_docs]:
        body = reuters.raw(file_id)
        sentences = sent_tokenize(body)
        if sentences:
            headline = sentences[0].strip()
            headline_words = headline.split()
            if len(headline_words) > max_title_words:
                headline = " ".join(headline_words[:max_title_words]) + "..."
        else:
            # No sentence could be extracted: fall back to a synthetic title.
            headline = "Reuters article " + file_id
        records.append({
            'doc_id': file_id,
            'title': headline,
            'text': body,
            'categories': reuters.categories(file_id),
        })
    df = pd.DataFrame(records)
    print(f"Loaded {len(df)} Reuters documents.")
    return df

In [None]:
import nltk
# Fetch the NLTK 'stopwords' and 'punkt' resources. 'stopwords' is read by
# stopwords.words('english') in this notebook; 'punkt' is presumably what the
# NLTK sentence/word tokenizers need -- TODO confirm for the installed version.
nltk.download('stopwords')
nltk.download('punkt')


In [None]:
# English stop words as a set; preprocess_text drops tokens found in it.
STOPWORDS = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text: str,
                    remove_numbers: bool = True,
                    remove_punct: bool = True,
                    do_lemmatize: bool = True) -> str:
    """
    Normalize a raw document into a space-separated string of tokens.

    The text is lower-cased, newlines become spaces and URL-like substrings
    (``http...`` / ``www....``) are blanked out. Digit runs and
    non-word/non-space characters are optionally blanked too. The result is
    tokenized with ``word_tokenize``; tokens of two characters or fewer and
    tokens in ``STOPWORDS`` are dropped, and the survivors are optionally
    lemmatized.

    Args:
        text: Raw input text.
        remove_numbers: Replace runs of digits with a space.
        remove_punct: Replace every character that is neither a word character
            nor whitespace with a space.
        do_lemmatize: Pass each kept token through the module-level lemmatizer.

    Returns:
        The processed tokens joined by single spaces.
    """
    normalized = text.lower().replace('\n', ' ')

    # Each entry is (enabled?, pattern); order matters and matches the
    # sequence URL -> digits -> punctuation.
    substitutions = (
        (True, r'http\S+|www\.\S+'),
        (remove_numbers, r'\d+'),
        (remove_punct, r'[^\w\s]'),
    )
    for enabled, pattern in substitutions:
        if enabled:
            normalized = re.sub(pattern, ' ', normalized)

    kept = []
    for token in word_tokenize(normalized):
        if len(token) <= 2 or token in STOPWORDS:
            continue
        kept.append(token)

    if do_lemmatize:
        kept = [lemmatizer.lemmatize(token) for token in kept]
    return " ".join(kept)

def preprocess_dataframe(df: pd.DataFrame, text_col: str = 'text', new_col: str = 'clean_text') -> pd.DataFrame:
    """
    Add a preprocessed-text column to ``df``.

    Note that ``df`` is modified in place: ``new_col`` is written onto the
    frame that was passed in, and that same frame is returned.

    Args:
        df: Frame holding the raw text.
        text_col: Name of the column with the raw text.
        new_col: Name of the column that receives ``preprocess_text`` output.

    Returns:
        The same DataFrame object, now containing ``new_col``.
    """
    cleaned = df[text_col].apply(preprocess_text)
    df[new_col] = cleaned
    return df


In [None]:
def vectorize_text(docs, max_features=5000, ngram_range=(1,2)):
    """
    Fit a TF-IDF vectorizer on ``docs`` and transform them.

    Args:
        docs: Iterable of document strings.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        ngram_range: Forwarded to ``TfidfVectorizer(ngram_range=...)``.

    Returns:
        ``(vectorizer, vectors)`` -- the fitted vectorizer first, then the
        result of ``fit_transform``. (``vectorize_tfidf`` in this notebook
        returns the pair in the opposite order.)
    """
    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    return tfidf, tfidf.fit_transform(docs)


In [None]:
def run_kmeans(vectors, k=5, random_state=42):
    """
    Cluster ``vectors`` with KMeans.

    Args:
        vectors: Feature matrix handed to ``KMeans.fit_predict``.
        k: Number of clusters.
        random_state: Seed forwarded to KMeans.

    Returns:
        ``(model, labels)`` -- the fitted KMeans instance and the labels
        returned by ``fit_predict``.
    """
    model = KMeans(n_clusters=k, random_state=random_state, n_init=10)
    assignments = model.fit_predict(vectors)
    return model, assignments
def run_dbscan(X, eps=0.8, min_samples=5):
    """
    Cluster ``X`` with DBSCAN.

    Args:
        X: Feature matrix; anything exposing ``toarray()`` is converted with
            it before clustering.
        eps: Forwarded to ``DBSCAN(eps=...)``.
        min_samples: Forwarded to ``DBSCAN(min_samples=...)``.

    Returns:
        ``(model, labels)`` -- the DBSCAN instance and the labels returned by
        ``fit_predict``.
    """
    features = X.toarray() if hasattr(X, "toarray") else X
    model = DBSCAN(eps=eps, min_samples=min_samples)
    return model, model.fit_predict(features)


In [None]:
def top_tfidf_terms_per_cluster(tfidf_matrix, labels, vectorizer, top_n: int = 8) -> Dict[int, List[Tuple[str, float]]]:
    """
    Rank the vocabulary terms of each cluster by mean TF-IDF weight.

    Args:
        tfidf_matrix: Document-term matrix (dense array or sparse matrix) with
            one row per document.
        labels: Cluster label of every row of ``tfidf_matrix``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        top_n: Maximum number of terms to report per cluster; values <= 0
            yield empty lists.

    Returns:
        A dict mapping each cluster label (as a plain ``int``) to a list of
        ``(term, mean_weight)`` tuples sorted by descending weight. Terms whose
        mean weight is not positive are left out, so a list can be shorter than
        ``top_n``.
    """
    terms = np.asarray(vectorizer.get_feature_names_out())
    labels = np.asarray(labels)
    limit = max(int(top_n), 0)
    cluster_terms: Dict[int, List[Tuple[str, float]]] = {}

    for cluster in sorted(set(labels.tolist())):
        mask = (labels == cluster)

        # ravel() (not squeeze) keeps a 1-D vector even when the vocabulary has
        # a single term; sparse means come back as a 1 x n_features matrix.
        cluster_vec = np.asarray(tfidf_matrix[mask].mean(axis=0)).ravel()

        # Full descending order first, then cut: `[-top_n:]` would return the
        # whole vocabulary for top_n == 0.
        ranked = np.argsort(cluster_vec)[::-1][:limit]

        cluster_terms[int(cluster)] = [
            (str(terms[i]), float(cluster_vec[i]))
            for i in ranked
            if cluster_vec[i] > 0  # a zero-weight term does not describe the cluster
        ]

    return cluster_terms

In [None]:
from typing import Dict, List, Tuple
import numpy as np

def top_tfidf_terms_per_cluster(tfidf_matrix, labels, vectorizer, top_n: int = 8) -> Dict[int, List[Tuple[str, float]]]:
    """
    Rank the vocabulary terms of each cluster by mean TF-IDF weight.

    Args:
        tfidf_matrix: Document-term matrix (dense array or sparse matrix) with
            one row per document.
        labels: Cluster label of every row of ``tfidf_matrix``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        top_n: Maximum number of terms to report per cluster; values <= 0
            yield empty lists.

    Returns:
        A dict mapping each cluster label (as a plain ``int``) to a list of
        ``(term, mean_weight)`` tuples sorted by descending weight. Terms whose
        mean weight is not positive are left out, so a list can be shorter than
        ``top_n``.
    """
    terms = np.asarray(vectorizer.get_feature_names_out())
    labels = np.asarray(labels)
    limit = max(int(top_n), 0)
    cluster_terms: Dict[int, List[Tuple[str, float]]] = {}

    for cluster in sorted(set(labels.tolist())):
        mask = (labels == cluster)

        # ravel() (not squeeze) keeps a 1-D vector even when the vocabulary has
        # a single term; sparse means come back as a 1 x n_features matrix.
        cluster_vec = np.asarray(tfidf_matrix[mask].mean(axis=0)).ravel()

        # Full descending order first, then cut: `[-top_n:]` would return the
        # whole vocabulary for top_n == 0.
        ranked = np.argsort(cluster_vec)[::-1][:limit]

        cluster_terms[int(cluster)] = [
            (str(terms[i]), float(cluster_vec[i]))
            for i in ranked
            if cluster_vec[i] > 0  # a zero-weight term does not describe the cluster
        ]

    return cluster_terms

In [None]:
from typing import Dict, List, Tuple, Optional
import pandas as pd

def print_cluster_summary(
    df: pd.DataFrame,
    labels: List[int],
    cluster_terms: Dict[int, List[Tuple[str, float]]],
    summaries: Optional[Dict[int, str]] = None,
    top_docs_per_cluster: int = 5
):
    """
    Print a human-readable report for every cluster.

    For each distinct label (in sorted order) the report shows the cluster
    size, up to six of its top terms, the optional summary, and the first
    ``top_docs_per_cluster`` documents with a snippet of at most 35 words of
    their ``clean_text``.

    Args:
        df: Documents; must provide ``doc_id``, ``title`` and ``clean_text``.
            The frame is copied, so the caller's frame is left untouched.
        labels: Cluster label for every row of ``df``.
        cluster_terms: Mapping from cluster label to ``(term, score)`` pairs.
        summaries: Optional mapping from cluster label to a summary string.
        top_docs_per_cluster: How many documents to list per cluster.
    """
    max_label_terms = 6
    max_snippet_words = 35
    separator = "=" * 80

    annotated = df.copy()
    annotated['cluster'] = labels

    for cluster_id in sorted(set(labels)):
        members = annotated[annotated['cluster'] == cluster_id]

        print(separator)
        print(f"Cluster {cluster_id} | {len(members)} docs")

        ranked_terms = cluster_terms.get(cluster_id, [])
        if ranked_terms:
            label_terms = ", ".join(term for term, _score in ranked_terms[:max_label_terms])
        else:
            label_terms = "N/A"
        print(f"Cluster label (top terms): {label_terms}")

        if summaries and cluster_id in summaries:
            print(f"Cluster summary: {summaries[cluster_id]}")

        print("\nTop documents (title + snippet):")

        for _, doc in members.head(top_docs_per_cluster).iterrows():
            words = doc['clean_text'].split()
            # Mark truncated snippets with a trailing ellipsis.
            suffix = "..." if len(words) > max_snippet_words else ""
            snippet = " ".join(words[:max_snippet_words]) + suffix

            print(f" - [{doc['doc_id']}] {doc['title']}")
            print(f"   snippet: {snippet}")
            print()


In [None]:
def run_pipeline(
    max_docs: int = 2000,
    n_clusters: int = 10,
    clustering_algo: str = 'kmeans',
    dbscan_eps: float = 0.7,
    dbscan_min_samples: int = 5,
    tfidf_max_features: int = 10000,
    random_state: int = 42,
    summarize: bool = True,
    use_sbert: bool = False
):
    """
    Run the whole document-clustering workflow on the Reuters corpus.

    Steps: load documents, drop those of 50 characters or fewer, preprocess,
    build TF-IDF features, optionally compute SBERT embeddings, cluster,
    derive top terms and summaries per cluster, evaluate, print a report and
    draw PCA / t-SNE plots.

    Args:
        max_docs: Maximum number of documents to load.
        n_clusters: Number of clusters for KMeans.
        clustering_algo: ``'kmeans'`` or ``'dbscan'``.
        dbscan_eps: ``eps`` passed to DBSCAN.
        dbscan_min_samples: ``min_samples`` passed to DBSCAN.
        tfidf_max_features: Vocabulary size cap for TF-IDF.
        random_state: Seed passed to KMeans.
        summarize: Whether to build a text summary per cluster.
        use_sbert: Cluster on SBERT embeddings instead of dense TF-IDF when
            sentence-transformers is available.

    Returns:
        Dict with keys ``df``, ``labels``, ``cluster_terms``,
        ``cluster_summaries``, ``eval`` and ``model``.

    Raises:
        ValueError: If ``clustering_algo`` is not ``'kmeans'`` or ``'dbscan'``.
    """
    # Fail fast: reject a bad algorithm name before the expensive loading,
    # preprocessing and vectorization steps run.
    if clustering_algo not in ('kmeans', 'dbscan'):
        raise ValueError("clustering_algo must be 'kmeans' or 'dbscan'")

    df = load_reuters_dataset(max_docs=max_docs)
    df = df[df['text'].str.len() > 50].reset_index(drop=True)
    print(f"After filtering short docs: {len(df)} documents")

    print("Preprocessing text (lowercase, tokenize, stopword removal, lemmatize)...")
    df = preprocess_dataframe(df, text_col='text', new_col='clean_text')

    print("Vectorizing with TF-IDF...")
    tfidf_X, vectorizer = vectorize_tfidf(
        df['clean_text'].tolist(),
        max_features=tfidf_max_features
    )

    # Pick the representation to cluster on. TF-IDF is always computed because
    # the cluster labels below are derived from it.
    if use_sbert and sbert_available:
        print("Computing SBERT embeddings...")
        embeddings = np.array(sbert_embeddings(df['clean_text'].tolist()))
    else:
        if use_sbert:
            print("sentence-transformers not installed; using TF-IDF embeddings.")
        embeddings = tfidf_X.toarray()

    if clustering_algo == 'kmeans':
        print(f"Clustering with KMeans, k={n_clusters} ...")
        labels, model = cluster_kmeans(
            embeddings,
            k=n_clusters,
            random_state=random_state
        )
    else:
        print(f"Clustering with DBSCAN, eps={dbscan_eps}, min_samples={dbscan_min_samples} ...")
        labels, model = cluster_dbscan(
            embeddings,
            eps=dbscan_eps,
            min_samples=dbscan_min_samples
        )
    # The boolean masks built below require array semantics for `labels == c`.
    labels = np.asarray(labels)

    print("Generating cluster labels (top TF-IDF terms per cluster)...")
    cluster_terms = top_tfidf_terms_per_cluster(
        tfidf_X,
        labels,
        vectorizer,
        top_n=12
    )

    cluster_summaries = {}
    if summarize:
        print("Generating cluster summaries (TextRank / gensim)...")
        for cluster in sorted(set(labels)):
            sub_df = df[labels == cluster]
            # Title plus the first 1000 characters of every member document.
            docs_for_summary = [
                title + ". " + txt[:1000]
                for title, txt in zip(sub_df['title'], sub_df['text'])
            ]
            if docs_for_summary:
                cluster_summaries[cluster] = summarize_cluster_text(
                    docs_for_summary,
                    summary_sentences=2
                )
            else:
                cluster_summaries[cluster] = ""

    print("Evaluating clustering...")
    eval_res = evaluate_clustering(
        embeddings,
        labels,
        ground_truth=None
    )
    print("Evaluation metrics:", eval_res)

    print_cluster_summary(
        df,
        labels,
        cluster_terms,
        summaries=cluster_summaries,
        top_docs_per_cluster=5
    )

    # NOTE(review): visualize_clusters is not defined in the cells reviewed
    # here -- confirm it is defined before this function is called.
    print("Visualizing clusters (PCA then t-SNE)...")
    for method in ('pca', 'tsne'):
        visualize_clusters(embeddings, labels, method=method,
                           title_suffix=f"algo={clustering_algo}")

    return {
        'df': df,
        'labels': labels,
        'cluster_terms': cluster_terms,
        'cluster_summaries': cluster_summaries,
        'eval': eval_res,
        'model': model
    }


In [None]:
import nltk


# Download every NLTK resource requested by this notebook: the tokenizer
# models ('punkt', 'punkt_tab'), the 'reuters' corpus, the 'stopwords' list,
# and 'wordnet' / 'omw-1.4' (presumably for WordNetLemmatizer -- TODO confirm).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('reuters')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
def load_reuters_dataset(max_docs: Optional[int] = 1000) -> pd.DataFrame:
    """
    Read documents from the NLTK Reuters corpus into a DataFrame.

    Args:
        max_docs: Upper bound on the number of file ids taken from the start of
            ``reuters.fileids()``; ``None`` takes all of them.

    Returns:
        A DataFrame with the columns ``doc_id``, ``title``, ``text`` and
        ``categories``. The title is the first sentence of the document (or
        "Reuters article <doc_id>" when no sentence is found), cut to its first
        12 words followed by "..." when it is longer than that.
    """
    def _short_title(fid, raw):
        # First sentence, or a synthetic title for documents without one.
        sentences = sent_tokenize(raw)
        title = "Reuters article " + fid if not sentences else sentences[0].strip()
        words = title.split()
        if len(words) <= 12:
            return title
        return " ".join(words[:12]) + "..."

    rows = []
    for fid in reuters.fileids()[:max_docs]:
        raw = reuters.raw(fid)
        rows.append({
            'doc_id': fid,
            'title': _short_title(fid, raw),
            'text': raw,
            'categories': reuters.categories(fid),
        })
    df = pd.DataFrame(rows)
    print(f"Loaded {len(df)} Reuters documents.")
    return df

In [None]:
# English stop words as a set; preprocess_text drops tokens found in it.
STOPWORDS = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text: str,
                    remove_numbers: bool = True,
                    remove_punct: bool = True,
                    do_lemmatize: bool = True) -> str:
    """
    Turn raw text into a cleaned, space-separated token string.

    Processing order: lower-case, newline -> space, blank out URL-like
    substrings, optionally blank out digit runs and non-word/non-space
    characters, tokenize with ``word_tokenize``, drop tokens of two characters
    or fewer and tokens in ``STOPWORDS``, optionally lemmatize.

    Args:
        text: Raw input text.
        remove_numbers: Replace runs of digits with a space.
        remove_punct: Replace characters that are neither word characters nor
            whitespace with a space.
        do_lemmatize: Lemmatize every kept token with the module-level
            lemmatizer.

    Returns:
        The processed tokens joined by single spaces.
    """
    cleaned = re.sub(r'http\S+|www\.\S+', ' ', text.lower().replace('\n', ' '))
    cleaned = re.sub(r'\d+', ' ', cleaned) if remove_numbers else cleaned
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned) if remove_punct else cleaned

    words = []
    for token in word_tokenize(cleaned):
        # Skip very short tokens and stop words.
        if len(token) <= 2 or token in STOPWORDS:
            continue
        words.append(lemmatizer.lemmatize(token) if do_lemmatize else token)
    return " ".join(words)

def preprocess_dataframe(df: pd.DataFrame, text_col: str = 'text', new_col: str = 'clean_text') -> pd.DataFrame:
    """
    Store ``preprocess_text`` output for ``text_col`` in ``new_col``.

    The frame passed in is modified in place and also returned.

    Args:
        df: Frame holding the raw text.
        text_col: Column containing the raw text.
        new_col: Column that receives the cleaned text.

    Returns:
        The same DataFrame object, now containing ``new_col``.
    """
    cleaned_series = df[text_col].apply(preprocess_text)
    df[new_col] = cleaned_series
    return df

def vectorize_tfidf(docs: List[str], max_features: int = 10000, ngram_range=(1,2)):
    """
    Build TF-IDF features for ``docs``.

    Args:
        docs: Document strings to vectorize.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        ngram_range: Forwarded to ``TfidfVectorizer(ngram_range=...)``.

    Returns:
        ``(X, vectorizer)`` -- the ``fit_transform`` result first, then the
        fitted vectorizer.
    """
    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    features = tfidf.fit_transform(docs)
    return features, tfidf

def sbert_embeddings(docs: List[str], model_name: str = 'all-MiniLM-L6-v2', batch_size: int = 32):
    """
    Encode ``docs`` with a SentenceTransformer model.

    Args:
        docs: Texts to embed.
        model_name: Model identifier passed to ``SentenceTransformer``.
        batch_size: Batch size passed to ``encode``.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``docs``.

    Raises:
        RuntimeError: If the ``sbert_available`` flag is false.
    """
    if sbert_available:
        encoder = SentenceTransformer(model_name)
        return encoder.encode(docs, show_progress_bar=True, batch_size=batch_size)
    raise RuntimeError("sentence-transformers not available.")


def cluster_kmeans(X, k: int = 10, random_state: int = 42):
    """
    Fit KMeans on ``X``.

    Args:
        X: Feature matrix passed to ``KMeans.fit``.
        k: Number of clusters.
        random_state: Seed forwarded to KMeans.

    Returns:
        ``(labels, model)`` -- the fitted model's ``labels_`` followed by the
        model itself.
    """
    model = KMeans(n_clusters=k, random_state=random_state, n_init=10)
    model.fit(X)
    assignments = model.labels_
    return assignments, model

def cluster_dbscan(X, eps: float = 0.5, min_samples: int = 5, metric: str = 'cosine'):
    """
    Cluster ``X`` with DBSCAN.

    Args:
        X: Feature matrix; anything exposing ``toarray()`` is converted with
            it before clustering.
        eps: Forwarded to ``DBSCAN(eps=...)``.
        min_samples: Forwarded to ``DBSCAN(min_samples=...)``.
        metric: Forwarded to ``DBSCAN(metric=...)``.

    Returns:
        ``(labels, model)`` -- the labels from ``fit_predict`` followed by the
        DBSCAN instance.
    """
    features = X.toarray() if hasattr(X, "toarray") else X
    model = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    return model.fit_predict(features), model


def top_tfidf_terms_per_cluster(tfidf_matrix, labels, vectorizer, top_n=8) -> Dict[int, List[Tuple[str, float]]]:
    """
    Rank the vocabulary terms of each cluster by mean TF-IDF weight.

    Args:
        tfidf_matrix: Document-term matrix (dense array or sparse matrix) with
            one row per document.
        labels: Cluster label of every row of ``tfidf_matrix``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        top_n: Maximum number of terms to report per cluster; values <= 0
            yield empty lists.

    Returns:
        A dict mapping each cluster label (as a plain ``int``) to a list of
        ``(term, mean_weight)`` tuples sorted by descending weight. Terms whose
        mean weight is not positive are left out, so a list can be shorter than
        ``top_n``.
    """
    terms = np.asarray(vectorizer.get_feature_names_out())
    labels = np.asarray(labels)
    limit = max(int(top_n), 0)
    cluster_terms: Dict[int, List[Tuple[str, float]]] = {}
    for cluster in sorted(set(labels.tolist())):
        mask = (labels == cluster)
        # ravel() (not squeeze) keeps a 1-D vector even when the vocabulary has
        # a single term; sparse means come back as a 1 x n_features matrix.
        cluster_vec = np.asarray(tfidf_matrix[mask].mean(axis=0)).ravel()
        # Full descending order first, then cut: `[-top_n:]` would return the
        # whole vocabulary for top_n == 0.
        ranked = np.argsort(cluster_vec)[::-1][:limit]
        cluster_terms[int(cluster)] = [
            (str(terms[i]), float(cluster_vec[i]))
            for i in ranked
            if cluster_vec[i] > 0  # a zero-weight term does not describe the cluster
        ]
    return cluster_terms


def summarize_cluster_text(docs: List[str], summary_sentences: int = 2, ratio: Optional[float]=None) -> str:
    """
    Summarize the concatenation of ``docs``.

    gensim's TextRank summarizer (``gensim.summarization.summarize``, present
    in gensim 3.x) is used when it can be imported. If it is missing, fails, or
    yields an empty result, the first ``summary_sentences`` sentences of the
    combined text are returned instead.

    Args:
        docs: Texts belonging to one cluster.
        summary_sentences: Target summary length in sentences; converted to a
            word budget of 20 words per sentence for gensim.
        ratio: If given (and non-zero), passed to gensim as ``ratio`` instead
            of the word budget.

    Returns:
        A single-line summary string; empty when there is no text.
    """
    combined = "\n".join(docs)
    if not combined.strip():
        return ""

    # Local, guarded import: the summarizer is an optional dependency and the
    # sentence-based fallback below must keep working without it.
    try:
        from gensim.summarization import summarize as gensim_summarize
    except ImportError:
        gensim_summarize = None

    if gensim_summarize is not None:
        try:
            if ratio:
                summary = gensim_summarize(combined, ratio=ratio)
            else:
                summary = gensim_summarize(combined, word_count=summary_sentences * 20)
        except Exception:
            # Best effort: any summarizer failure (e.g. input too short)
            # falls through to the leading-sentences fallback.
            summary = ""
        if summary and summary.strip():
            return summary.replace("\n", " ")

    sentences = sent_tokenize(combined)
    return " ".join(sentences[:summary_sentences])


def evaluate_clustering(embeddings_or_X, labels, ground_truth: Optional[List]=None) -> Dict[str, float]:
    """
    Compute clustering quality metrics.

    Args:
        embeddings_or_X: Feature matrix the labels were computed on (dense or
            sparse).
        labels: Predicted cluster label per sample.
        ground_truth: Optional reference labels; when given, NMI is added.

    Returns:
        Dict with ``silhouette`` and ``davies_bouldin`` (and ``NMI`` when
        ``ground_truth`` is given). A metric that cannot be computed is
        reported as NaN.
    """
    nan = float('nan')
    n_clusters = len(set(labels))
    # Both internal metrics are only attempted when there is more than one
    # cluster and fewer clusters than samples.
    scorable = 1 < n_clusters < len(labels)

    res = {'silhouette': nan, 'davies_bouldin': nan}
    if scorable:
        # `except Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            res['silhouette'] = float(silhouette_score(embeddings_or_X, labels))
        except Exception:
            res['silhouette'] = nan
        try:
            dense = embeddings_or_X.toarray() if hasattr(embeddings_or_X, "toarray") else embeddings_or_X
            res['davies_bouldin'] = float(davies_bouldin_score(dense, labels))
        except Exception:
            res['davies_bouldin'] = nan

    if ground_truth is not None:
        try:
            res['NMI'] = float(normalized_mutual_info_score(ground_truth, labels))
        except Exception:
            res['NMI'] = nan
    return res

In [None]:
# Run the pipeline on at most 600 Reuters documents with KMeans (k=12) and
# per-cluster summaries; the returned dict is kept in `results`.
results = run_pipeline(
    max_docs=600,
    n_clusters=12,
    clustering_algo='kmeans',
    summarize=True
)
