In [None]:
!pip install pypdf sentence-transformers umap-learn hdbscan scikit-learn joblib bertopic[all] nltk gensim tqdm spacy
# optionally: python -m spacy download en_core_web_sm

In [None]:
# Download the small English spaCy model; pipeline_module.py tries to load
# "en_core_web_sm" for lemmatization and falls back to WordNet when it is absent.
!python -m spacy download en_core_web_sm

In [None]:
import nltk

# Tokenizer model, stopword lists and WordNet data used by the pipeline module.
for resource_name in ("punkt", "stopwords", "wordnet"):
    nltk.download(resource_name)
# Kept as the final bare expression so the cell still echoes the download status.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from google.colab import files

# Opens the Colab upload widget; the result maps each uploaded filename to its content.
uploaded = files.upload()
if not uploaded:
    # Indexing into an empty result would only give a bare IndexError.
    raise RuntimeError("No file was uploaded; re-run this cell and choose a PDF.")

# Only the first uploaded file is used, e.g. "document.pdf".
pdf_path = next(iter(uploaded))
pdf_path

Saving topic_modelling.pdf to topic_modelling.pdf


'topic_modelling.pdf'

In [None]:
%%writefile pipeline_module.py

# pipeline_module.py
import re
import os
from typing import List, Optional, Dict, Any, Tuple
import joblib
import numpy as np
from tqdm import tqdm

# PDF reading
from pypdf import PdfReader

# NLP
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer

# embedding / dimensionality reduction / clustering
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors

# BERTopic & coherence
from bertopic import BERTopic
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

# Ensure the NLTK resources this module relies on are installed, downloading any
# that are missing. "punkt_tab" is included alongside "punkt" because recent NLTK
# releases load the sentence tokenizer used by sent_tokenize from it.
for _resource_path in ("tokenizers/punkt",
                       "tokenizers/punkt_tab",
                       "corpora/stopwords",
                       "corpora/wordnet"):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # The download id is the last path component, e.g. "corpora/wordnet" -> "wordnet".
        nltk.download(_resource_path.split("/")[-1])

# Prefer spaCy for lemmatization when both the package and the en_core_web_sm model
# can be loaded; any failure leaves USE_SPACY False so the WordNet fallback is used.
try:
    import spacy
    nlp_spacy = spacy.load("en_core_web_sm", disable=["parser", "ner"])
except Exception:
    USE_SPACY = False
else:
    USE_SPACY = True

# --------------------- Preprocessing utilities ---------------------
URL_RE = re.compile(r"https?://\S+|www\.\S+")

# A token is either a run of word characters or a single punctuation/symbol character.
_TOKEN_RE = re.compile(r"\w+|[^\w\s]")
_PUNCT_RE = re.compile(r"[^\w\s]")
# Tokens that start with a digit ("2024", "3rd") are passed through untouched.
_NUMERIC_RE = re.compile(r"\d+[\w\d]*")
# Stopword sets keyed by language, so the list is built once instead of per sentence.
_STOPWORD_CACHE = {}


def _stopword_set(lang):
    """Return the NLTK stopword list for ``lang`` as a frozenset, loading it only once."""
    if lang not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[lang] = frozenset(nltk_stopwords.words(lang))
    return _STOPWORD_CACHE[lang]


def preprocess_text_keep_punct(text: str, lowercase: bool = True,
                                remove_urls: bool = True,
                                remove_stopwords: bool = True,
                                lemmatize: bool = True,
                                stopword_lang: str = "english") -> str:
    """
    Normalize ``text`` while keeping punctuation and numbers as separate tokens.

    Steps, each controlled by its flag:
    - remove_urls: replace http(s)://... and www.... spans with a space
    - lowercase: lowercase the whole text
    - remove_stopwords: drop tokens found in the NLTK stopword list for
      ``stopword_lang`` (matched case-insensitively)
    - lemmatize: lemmatize word tokens with spaCy when USE_SPACY is true,
      otherwise with NLTK's WordNetLemmatizer

    Punctuation tokens and tokens starting with a digit are never removed or
    lemmatized. When ``lowercase`` is False the original casing of the tokens is
    left alone by this function (the lemmatizer may still change it).

    Returns the surviving tokens joined by single spaces.
    """
    if remove_urls:
        text = URL_RE.sub(" ", text)
    if lowercase:
        text = text.lower()
    tokens = _TOKEN_RE.findall(text)
    stopset = _stopword_set(stopword_lang) if remove_stopwords else frozenset()

    if lemmatize and USE_SPACY:
        # spaCy works on running text, so the tokens are re-joined and re-parsed.
        kept = []
        for tok in nlp_spacy(" ".join(tokens)):
            if tok.is_space:
                continue
            # keep punctuation and numbers as they are
            if tok.is_punct or tok.like_num or _PUNCT_RE.fullmatch(tok.text):
                kept.append(tok.text)
                continue
            lemma = tok.lemma_.strip()
            # Check both the lemma and the surface form; compare in lowercase so
            # stopwords are still caught when lowercase=False.
            if lemma.lower() in stopset or tok.text.lower() in stopset:
                continue
            kept.append(lemma if lemma else tok.text)
        return " ".join(kept)

    # Fallback path: WordNet lemmatization, or no lemmatization at all.
    lemmatizer = WordNetLemmatizer() if lemmatize else None
    kept = []
    for tok in tokens:
        if _PUNCT_RE.fullmatch(tok) or _NUMERIC_RE.fullmatch(tok):
            kept.append(tok)
            continue
        if tok.lower() in stopset:
            continue
        kept.append(lemmatizer.lemmatize(tok) if lemmatizer is not None else tok)
    return " ".join(kept)

# --------------------- PDF extraction (with page mapping) ---------------------
def extract_text_and_pages(pdf_path: str) -> Tuple[str, List[Tuple[int, str]]]:
    """
    Read a PDF and return its text both as one string and page by page.

    Returns (full_text, pages) where ``pages`` is a list of
    (page_number, page_text) with 1-based page numbers and ``full_text`` is the
    page texts joined with newlines. A page whose text cannot be extracted
    contributes an empty string and triggers a RuntimeWarning, so extraction
    stays best-effort without failing silently.
    """
    import warnings

    reader = PdfReader(pdf_path)
    pages = []
    for page_number, page in enumerate(reader.pages, start=1):
        try:
            text = page.extract_text() or ""
        except Exception as exc:
            warnings.warn(
                f"Could not extract text from page {page_number} of {pdf_path!r}: {exc}",
                RuntimeWarning,
            )
            text = ""
        pages.append((page_number, text))
    full_text = "\n".join(page_text for _, page_text in pages)
    return full_text, pages

# --------------------- sentence tokenize with page mapping ---------------------
def sentences_from_pages(pages: List[Tuple[int, str]]) -> Tuple[List[str], List[int]]:
    """
    Split every non-blank page into sentences with nltk's sent_tokenize.

    Returns two aligned lists: the stripped, non-empty sentences and, for each
    sentence, the page number it came from.
    """
    tagged = [
        (piece.strip(), number)
        for number, body in pages
        if body and body.strip()
        for piece in sent_tokenize(body)
        if piece.strip()
    ]
    sentences = [sentence for sentence, _ in tagged]
    page_nums = [number for _, number in tagged]
    return sentences, page_nums

# --------------------- Embedding ---------------------
def compute_embeddings(chunks: List[str], model_name: str = "all-miniLM-L6-v2", normalize_emb: bool = True):
    """
    Encode ``chunks`` with the SentenceTransformer named ``model_name``.

    The model is instantiated on every call. The encoded array is returned
    as-is, or row-normalized with sklearn's ``normalize`` when ``normalize_emb``
    is true.
    """
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(chunks, convert_to_numpy=True, show_progress_bar=True)
    return normalize(vectors, axis=1) if normalize_emb else vectors

# --------------------- Validation metrics ---------------------
def intra_cluster_cosine(embeddings: np.ndarray, labels: np.ndarray) -> Dict[int, float]:
    """
    Mean pairwise cosine similarity of the members of each cluster.

    The label -1 (noise) is skipped. A cluster with fewer than two members maps
    to NaN because it has no pairs to average.
    """
    scores = {}
    for cluster_id in set(labels):
        if cluster_id == -1:
            continue
        members = np.flatnonzero(labels == cluster_id)
        if members.size < 2:
            scores[cluster_id] = float("nan")
        else:
            pairwise = cosine_similarity(embeddings[members])
            # Strict upper triangle: every unordered pair once, no self-similarity.
            rows, cols = np.triu_indices_from(pairwise, k=1)
            scores[cluster_id] = float(pairwise[rows, cols].mean())
    return scores

def knn_overlap_score(embeddings: np.ndarray, labels: np.ndarray, k: int = 10) -> float:
    """
    Average fraction of each point's nearest neighbours that share its label.

    For every point, up to ``k`` nearest neighbours (capped at n_points - 1) are
    looked up in ``embeddings`` and the share carrying the same label is
    computed; the result is the mean of those shares. The point itself is not
    counted among its neighbours, so a point that is alone in its label scores 0.
    Labels are compared as-is, i.e. -1 is treated like any other label.

    Returns NaN when there are fewer than two points or ``k`` < 1, since no
    neighbour can be queried.
    """
    labels = np.asarray(labels)
    n_points = embeddings.shape[0]
    if n_points < 2 or k < 1:
        return float("nan")
    index = NearestNeighbors(n_neighbors=min(k, n_points - 1)).fit(embeddings)
    # Calling kneighbors() without X queries the indexed points themselves and
    # leaves each point out of its own neighbour list; passing the training data
    # explicitly would return every point as its own nearest neighbour.
    neighbours = index.kneighbors(return_distance=False)
    # Every row has the same number of neighbours, so the mean over the boolean
    # matrix equals the mean of the per-point overlap fractions.
    same_label = labels[neighbours] == labels[:, None]
    return float(same_label.mean())

def topic_coherence_cv(topic_model: BERTopic, chunks: List[str]) -> float:
    """
    Compute gensim's 'c_v' coherence for the topics of a fitted BERTopic model.

    ``chunks`` are lowercased and tokenized into word / punctuation tokens to
    build the reference corpus and dictionary. For every topic except -1 the
    words returned by ``topic_model.get_topics()`` are used, restricted to words
    that occur in that dictionary: entries the corpus does not contain (for
    example empty placeholder strings) cannot be scored and would otherwise
    risk an error inside CoherenceModel.

    Returns NaN when no topic has any usable word.
    """
    token_re = re.compile(r"\w+|[^\w\s]")  # simple tokenization preserving punctuation
    tokenized = [token_re.findall(chunk.lower()) for chunk in chunks]
    dictionary = Dictionary(tokenized)
    vocabulary = dictionary.token2id

    topic_word_lists = []
    for tid, words in topic_model.get_topics().items():
        if tid == -1:
            continue
        known_words = [word for word, _ in words if word in vocabulary]
        if known_words:
            topic_word_lists.append(known_words)
    if not topic_word_lists:
        return float("nan")

    cm = CoherenceModel(topics=topic_word_lists, texts=tokenized, dictionary=dictionary, coherence='c_v')
    return float(cm.get_coherence())

def silhouette_umap(umap_embeddings: np.ndarray, labels: np.ndarray) -> Optional[float]:
    """
    Euclidean silhouette score of the clustering in the reduced embedding space.

    Points labelled -1 (noise) are excluded before scoring, matching the other
    validation metrics in this module, which also ignore -1; scoring noise as if
    it were one cluster would distort the result.

    Returns None when the score is undefined (fewer than two clusters, or as
    many clusters as remaining points) or when scoring fails.
    """
    points = np.asarray(umap_embeddings)
    labels = np.asarray(labels)
    clustered = labels != -1
    kept_labels = labels[clustered]
    n_clusters = np.unique(kept_labels).size
    # silhouette needs 2 <= n_clusters <= n_samples - 1
    if n_clusters < 2 or n_clusters >= kept_labels.size:
        return None
    try:
        return float(silhouette_score(points[clustered], kept_labels, metric="euclidean"))
    except Exception:
        # Best-effort metric: a scoring failure must not abort the whole pipeline.
        return None

def pdf_page_consistency(labels: np.ndarray, page_nums: List[int]) -> Dict[int, float]:
    """
    For each cluster, the mean absolute page distance over all pairs of members
    (lower means the cluster is concentrated on nearby pages).

    The label -1 (noise) is skipped and a cluster with fewer than two members
    maps to 0.0. ``labels`` and ``page_nums`` must be aligned element by element.
    """
    res = {}
    pages_arr = np.array(page_nums)
    for u in set(labels):
        if u == -1:
            continue
        idx = np.where(labels == u)[0]
        m = len(idx)
        if m < 2:
            res[u] = 0.0
            continue
        # With the pages sorted ascending, sum_{i<j} (p_j - p_i) equals
        # sum_i p_i * (2*i - (m - 1)); this avoids looping over all m*(m-1)/2 pairs.
        ordered = np.sort(pages_arr[idx])
        weights = 2 * np.arange(m) - (m - 1)
        total_distance = float(np.dot(ordered, weights))
        res[u] = total_distance / (m * (m - 1) / 2)
    return res

# --------------------- Main pipeline function ---------------------
def process_pdf_pipeline(pdf_path: str,
                         expected_num_topics: Optional[int] = None,
                         chunking_method: str = "sentences",
                         chunk_params: Optional[Dict[str, Any]] = None,
                         pca_components: Optional[float] = 0.95,
                         umap_components: int = 15,
                         umap_neighbors: int = 15,
                         hdb_min_cluster_size: int = 8,
                         hdb_min_samples: Optional[int] = None,
                         save_artifacts_dir: str = "artifacts") -> Dict[str, Any]:
    """
    Run the full pipeline: PDF -> sentences -> preprocessing -> sentence
    embeddings -> PCA -> UMAP -> HDBSCAN topics via BERTopic -> validation metrics.

    Parameters
    - pdf_path: PDF file to process.
    - expected_num_topics: if given, compared with the number of generated
      (non-noise) topics; the comparison is reported, it does not change the model.
    - chunking_method: only "sentences" is implemented; anything else raises.
    - chunk_params: accepted for future chunking methods; currently unused.
    - pca_components: passed to PCA(n_components=...); None skips PCA.
    - umap_components / umap_neighbors: UMAP output dimension and neighbourhood size.
    - hdb_min_cluster_size / hdb_min_samples: HDBSCAN parameters.
    - save_artifacts_dir: directory (created if missing) where fitted models and
      intermediate arrays are written with joblib / BERTopic.save.

    Returns a dictionary with chunks, embeddings, models, topics, probs,
    topic_keywords, validation metrics, merged per-topic text and the artifacts path.

    Raises
    - NotImplementedError: unsupported ``chunking_method``.
    - ValueError: no sentence could be extracted from the PDF.
    """
    # Reject unsupported options before touching the filesystem or loading any model.
    if chunking_method != "sentences":
        raise NotImplementedError("Only 'sentences' chunking_method is implemented in this function.")

    # Pass-through "reducer" so BERTopic clusters the embeddings it is given as-is.
    from bertopic.dimensionality import BaseDimensionalityReduction

    os.makedirs(save_artifacts_dir, exist_ok=True)

    # Step 0: extract text & pages, then split into sentences with page numbers
    _, pages = extract_text_and_pages(pdf_path)
    sentences, page_nums = sentences_from_pages(pages)
    if not sentences:
        # Fail here with a clear message instead of deep inside PCA/UMAP.
        raise ValueError(f"No extractable text found in {pdf_path!r}; nothing to cluster.")

    # Step 1: Preprocess each sentence (lowercasing, stopword removal, lemmatization,
    # keep punctuation/numbers, remove URLs)
    preprocessed = [
        preprocess_text_keep_punct(s,
                                   lowercase=True,
                                   remove_urls=True,
                                   remove_stopwords=True,
                                   lemmatize=True)
        for s in sentences
    ]

    # Save the preprocessed chunks + pages mapping
    joblib.dump((preprocessed, page_nums), os.path.join(save_artifacts_dir, "chunks_and_pages.joblib"))

    # Step 2: Embedding using all-MiniLM-L6-v2
    orig_emb = compute_embeddings(preprocessed, model_name="all-miniLM-L6-v2", normalize_emb=True)
    joblib.dump(orig_emb, os.path.join(save_artifacts_dir, "orig_embeddings.joblib"))

    # Step 3: PCA (optional)
    if pca_components is not None:
        pca = PCA(n_components=pca_components, random_state=42)
        pca_emb = pca.fit_transform(orig_emb)
        joblib.dump(pca, os.path.join(save_artifacts_dir, "pca.joblib"))
    else:
        pca = None
        pca_emb = orig_emb

    # Step 4: UMAP (for clustering) - not 2D
    umap_model = UMAP(n_components=umap_components, n_neighbors=umap_neighbors,
                      min_dist=0.0, metric="cosine", random_state=42)
    umap_emb = umap_model.fit_transform(pca_emb)
    joblib.dump(umap_model, os.path.join(save_artifacts_dir, "umap.joblib"))

    # Step 5: HDBSCAN configuration. The model is fitted once, by BERTopic below.
    hdb = HDBSCAN(min_cluster_size=hdb_min_cluster_size,
                  min_samples=hdb_min_samples,
                  metric="euclidean",
                  cluster_selection_method="eom",
                  prediction_data=True)

    # Step 6: BERTopic. The embeddings passed in are already UMAP-reduced, so the
    # dimensionality-reduction stage is replaced by a no-op; with umap_model=None
    # BERTopic would run its own default UMAP on top of umap_emb and the topics
    # would not correspond to the space that is saved and validated here.
    topic_model = BERTopic(
        embedding_model=None,
        umap_model=BaseDimensionalityReduction(),
        hdbscan_model=hdb,
        vectorizer_model=None,
        representation_model=None,
        nr_topics=None,
        calculate_probabilities=True,
        verbose=False
    )

    topics, probs = topic_model.fit_transform(preprocessed, embeddings=umap_emb)
    # Persist the clusterer only now, after BERTopic has fitted it.
    joblib.dump(hdb, os.path.join(save_artifacts_dir, "hdbscan.joblib"))
    topic_model.save(os.path.join(save_artifacts_dir, "bertopic_model"))

    # gather topic keywords (noise topic -1 is skipped)
    topics_info = {tid: topic_model.get_topic(tid) for tid in set(topics) if tid != -1}

    # Validation metrics
    topic_labels = np.array(topics)
    intra_cos = intra_cluster_cosine(orig_emb, topic_labels)
    knn_overlap = knn_overlap_score(orig_emb, topic_labels, k=10)
    coherence_cv = topic_coherence_cv(topic_model, preprocessed)
    sil = silhouette_umap(umap_emb, topic_labels)
    page_consistency = pdf_page_consistency(topic_labels, page_nums)

    # Test expected topic count if provided
    generated_topic_count = len([t for t in set(topics) if t != -1])
    expected_match = None
    if expected_num_topics is not None:
        expected_match = (generated_topic_count == expected_num_topics)

    # Merge chunks by topic -> topic-based chunking. Sentences keep their document
    # order inside each topic; noise sentences are collected under the key "noise".
    topic_chunks = {}
    for i, t in enumerate(topics):
        key = "noise" if t == -1 else int(t)
        topic_chunks.setdefault(key, []).append(preprocessed[i])

    # Concatenate each topic's chunks into one larger chunk for RAG context
    merged_topic_chunks = {k: " ".join(v) for k, v in topic_chunks.items()}

    # Save artifacts & outputs
    joblib.dump({
        "chunks": preprocessed,
        "page_nums": page_nums,
        "orig_emb": orig_emb,
        "pca_emb": pca_emb,
        "umap_emb": umap_emb,
        "labels": topics,
        "probs": probs
    }, os.path.join(save_artifacts_dir, "pipeline_outputs.joblib"))

    result = {
        "chunks": preprocessed,
        "page_nums": page_nums,
        "orig_embeddings": orig_emb,
        "pca_model": pca,
        "pca_embeddings": pca_emb,
        "umap_model": umap_model,
        "umap_embeddings": umap_emb,
        "hdbscan_model": hdb,
        "topic_model": topic_model,
        "topics": topics,
        "probs": probs,
        "topic_keywords": topics_info,
        "validation": {
            "intra_cluster_cosine": intra_cos,
            "knn_overlap": knn_overlap,
            "coherence_cv": coherence_cv,
            "silhouette_umap": sil,
            "page_consistency": page_consistency
        },
        "generated_topic_count": generated_topic_count,
        "expected_topic_count_match": expected_match,
        "merged_topic_chunks": merged_topic_chunks,
        "artifacts_dir": os.path.abspath(save_artifacts_dir)
    }

    return result

# --------------------- Example usage ---------------------
if __name__ == "__main__":
    # Demo run against the sample PDF; adjust the path and expected_num_topics as needed.
    demo = process_pdf_pipeline("topic_modelling.pdf", expected_num_topics=12,
                                hdb_min_cluster_size=8, umap_components=15)
    print("Artifacts saved in:", demo["artifacts_dir"])
    print("Generated topic count:", demo["generated_topic_count"])
    print("Expected match?:", demo["expected_topic_count_match"])
    print("Top topic keywords (sample):")
    # Show the keywords of at most six topics.
    keyword_sample = list(demo["topic_keywords"].items())[:6]
    for topic_id, keywords in keyword_sample:
        print(topic_id, keywords)

Overwriting pipeline_module.py


In [None]:
import importlib

import pipeline_module

# %%writefile only rewrites the file on disk. If pipeline_module was already
# imported in this kernel, a plain import would keep using the old code, so the
# module is reloaded before the function is pulled in.
importlib.reload(pipeline_module)
from pipeline_module import process_pdf_pipeline

result = process_pdf_pipeline(pdf_path,
                              expected_num_topics=5,  # optional
                              hdb_min_cluster_size=8,
                              umap_components=15)

result["generated_topic_count"], result["topic_keywords"]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

TypeError: BERTopic.fit_transform() got an unexpected keyword argument 'umap_embeddings'