In [1]:
!pip install -U bertopic sentence-transformers hdbscan plotly scikit-learn openai bitsandbytes accelerate transformers

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting plotly
  Downloading plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading plotly-6.5.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
from bertopic import BERTopic
from sklearn import metrics
import collections
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
import unicodedata
import re
import os
import pickle
import time
import spacy
import shutil
import scipy.sparse as sp
import glob
from google.colab import drive
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sklearn.metrics.pairwise import cosine_similarity
import plotly.graph_objects as go
import plotly.io as pio
from scipy.cluster.hierarchy import linkage, dendrogram
from collections import defaultdict

  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


In [3]:
# ---
# 1. SETUP: GOOGLE DRIVE & CONFIG
# ---

from google.colab import drive

# Mount Google Drive
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")

# CONFIG for chapter-wise embeddings
CONFIG = {
    "drive_base_path": "/content/drive/MyDrive/",
    "input_paths": {
        "biology_chapters": "data-process/output/biology_chapters_cleaned/",
        "biology_pages": "data-process/output/biology_pages/",
        "physics_chapters": "data-process/output/physics_chapters_cleaned/",
        "physics_pages": "data-process/output/physics_pages/",
        "chemistry_chapters": "data-process/output/chemistry_chapters_cleaned/",
        "chemistry_pages": "data-process/output/chemistry_pages/",
    },
    "output_base_path": "gemma_embeddings_output/",
}

Mounted at /content/drive
Google Drive mounted successfully.


In [4]:
# ---
# 2. DATA LOADING: Chapter-wise MD files
# ---

def load_chapter_data(config):
    """
    Loads all *chapter* .md files from GDrive and splits them into sentences.
    Returns chapter-level data for embeddings.
    """
    print("Loading spacy model for sentence splitting...")
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger", "lemmatizer"])
    nlp.add_pipe("sentencizer")

    all_chapters = []      # Full chapter texts
    all_metadata = []      # Metadata for each chapter

    print("Loading chapter MD files...")

    # Only process chapters (not pages)
    for key, path_suffix in config['input_paths'].items():
        if "_chapters" in key:
            print(f"  -> Processing: {key}")
            full_path = os.path.join(config['drive_base_path'], path_suffix)
            md_files = glob.glob(os.path.join(full_path, "*.md"))

            for md_file_path in tqdm(md_files, desc=f"Loading {key}"):
                try:
                    with open(md_file_path, 'r', encoding='utf-8') as f:
                        text = f.read()

                    if text.strip():
                        subject = key.replace("_chapters", "").replace("_cleaned", "")
                        chapter = os.path.basename(md_file_path).replace('.md', '')

                        all_chapters.append(text)
                        all_metadata.append({
                            "subject": subject,
                            "chapter": chapter,
                            "file_path": md_file_path
                        })
                except Exception as e:
                    print(f"Error processing file {md_file_path}: {e}")

    print(f"\nLoaded {len(all_chapters)} chapters.")
    return all_chapters, all_metadata

def load_page_data_for_chapter(config, chapter_name, subject, page_mapping_df=None):
    """
    Loads page embeddings for a specific chapter.
    Returns list of page embeddings and their page numbers.
    Supports an optional CSV mapping DataFrame with columns [chapter, start_page, end_page] and optionally [subject].
    """
    pages_path = config['input_paths'].get(f"{subject}_pages")
    if not pages_path:
        return [], []

    full_pages_path = os.path.join(config['drive_base_path'], pages_path)

    page_embeddings = []
    page_numbers = []
    target_files = []

    # --- LOGIC WITH CSV MAPPING ---
    if page_mapping_df is not None:
        # Check if the mapping dataframe has a 'subject' column to avoid collisions
        # (e.g. Chapter 1 exists in both Biology and Physics)
        if 'subject' in page_mapping_df.columns:
            # Filter by both chapter and subject
            row = page_mapping_df[
                (page_mapping_df['chapter'] == chapter_name) &
                (page_mapping_df['subject'] == subject)
            ]
        else:
            # Fallback: Filter by chapter only (assumes single-subject mapping)
            row = page_mapping_df[page_mapping_df['chapter'] == chapter_name]

        if not row.empty:
            start_page = int(row.iloc[0]['start_page'])
            end_page = int(row.iloc[0]['end_page'])

            # Generate filenames for the range: page_0001.md to page_XXXX.md
            for p_num in range(start_page, end_page + 1):
                fname = f"page_{p_num:04d}.md"
                fpath = os.path.join(full_pages_path, fname)
                if os.path.exists(fpath):
                    target_files.append((p_num, fpath))
        else:
            # Optional: Print only once per missing chapter to avoid spam
            pass

    # --- FALLBACK LOGIC (Old Filename Matching) ---
    else:
        # Find pages that belong to this chapter based on filename prefix
        # Expected: chapter_01_page_001.md
        candidate_files = glob.glob(os.path.join(full_pages_path, f"{chapter_name}_page_*.md"))
        candidate_files.sort()

        for fpath in candidate_files:
            # Extract page number from filename
            page_num_match = re.search(r'page_(\d+)', os.path.basename(fpath))
            if page_num_match:
                p_num = int(page_num_match.group(1))
                target_files.append((p_num, fpath))

    # --- LOAD CONTENT ---
    for p_num, fpath in target_files:
        try:
            with open(fpath, 'r', encoding='utf-8') as f:
                page_text = f.read()
            page_embeddings.append(page_text)
            page_numbers.append(p_num)
        except Exception as e:
            print(f"Error loading page {fpath}: {e}")

    return page_embeddings, page_numbers

In [5]:
# ---
# 3. LOAD CHAPTER DATA
# ---

# Load all chapters using the updated CONFIG
chapters, metadata = load_chapter_data(CONFIG)

# Basic stats
print(f"\nTotal chapters loaded: {len(chapters)}")

if metadata:
    # Count chapters per subject
    subject_counts = collections.Counter([m['subject'] for m in metadata])
    print("Chapters per subject:")
    for subj, count in subject_counts.items():
        print(f"  - {subj.capitalize()}: {count}")

    print(f"\nSample metadata: {metadata[0]}")
else:
    print("No chapters loaded. Please check your Drive paths.")

Loading spacy model for sentence splitting...
Loading chapter MD files...
  -> Processing: biology_chapters


Loading biology_chapters:   0%|          | 0/47 [00:00<?, ?it/s]

  -> Processing: physics_chapters


Loading physics_chapters:   0%|          | 0/34 [00:00<?, ?it/s]

  -> Processing: chemistry_chapters


Loading chemistry_chapters:   0%|          | 0/21 [00:00<?, ?it/s]


Loaded 102 chapters.

Total chapters loaded: 102
Chapters per subject:
  - Biology: 47
  - Physics: 34
  - Chemistry: 21

Sample metadata: {'subject': 'biology', 'chapter': 'chapter_09', 'file_path': '/content/drive/MyDrive/data-process/output/biology_chapters_cleaned/chapter_09.md'}


In [7]:
# ---
# 4. GENERATE EMBEDDINGS FOR CHAPTERS (USING GEMMA)
# ---

import torch
import numpy as np
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Login to Hugging Face
# Using the token from your notebook
hf_token = "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"
try:
    login(token=hf_token)
    print("✅ Successfully logged in to Hugging Face!")
except Exception as e:
    print(f"❌ Authentication failed: {e}")
    print("Please check your HF token.")

class GemmaEmbedder:
    """
    Wrapper for Gemma-2b to behave like a SentenceTransformer for BERTopic.
    Uses 4-bit quantization to fit in Colab memory.
    """
    def __init__(self, model_name="google/gemma-2b", device="cuda"):
        print(f"Loading {model_name} for embeddings...")
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Ensure pad token exists (Gemma might not have one by default)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # 4-bit Quantization Config
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        # Load Model (CausalLM is used so we can potentially reuse it for text generation later)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map=device,
            trust_remote_code=True
        )
        self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def encode(self, documents, batch_size=4, show_progress_bar=True):
        """
        Generates embeddings for a list of documents.
        """
        all_embeddings = []

        # Setup iterator
        iterator = range(0, len(documents), batch_size)
        if show_progress_bar:
            from tqdm.notebook import tqdm
            iterator = tqdm(iterator, desc="Encoding batches", total=len(documents)//batch_size + 1)

        for i in iterator:
            batch_docs = documents[i : i + batch_size]

            # Tokenize
            inputs = self.tokenizer(
                batch_docs,
                padding=True,
                truncation=True,
                max_length=512,  # Reasonable limit for embeddings
                return_tensors="pt"
            ).to(self.model.device)

            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True)

            # Extract embeddings: Use Mean Pooling of the last hidden state
            # This is often more stable for clustering than just the last token
            hidden_states = outputs.hidden_states[-1]
            attention_mask = inputs['attention_mask'].unsqueeze(-1)

            # Masked Mean
            masked_embeddings = hidden_states * attention_mask
            summed = masked_embeddings.sum(dim=1)
            counts = attention_mask.sum(dim=1)
            mean_pooled = summed / torch.clamp(counts, min=1e-9)

            all_embeddings.extend(mean_pooled.float().cpu().numpy())

            # Memory cleanup
            del inputs, outputs, hidden_states, masked_embeddings, summed, mean_pooled
            torch.cuda.empty_cache()

        return np.array(all_embeddings)

print("Initializing Gemma Embedder (this may take a minute)...")
# Initialize the model
embedding_model = GemmaEmbedder()

print(f"Generating chapter embeddings for {len(chapters)} chapters...")
# Use a small batch size for chapters as they are long text
chapter_embeddings = embedding_model.encode(chapters, show_progress_bar=True, batch_size=2)

print(f"Generated embeddings shape: {np.array(chapter_embeddings).shape}")

# Store chapter embeddings with metadata
chapter_embeddings_dict = {}
for i, (chapter_text, meta) in enumerate(zip(chapters, metadata)):
    key = (meta['subject'], meta['chapter'])
    chapter_embeddings_dict[key] = {
        'embedding': chapter_embeddings[i],
        'text': chapter_text,
        'metadata': meta
    }

print(f"Stored {len(chapter_embeddings_dict)} chapter embeddings.")

# Verify subjects present
subjects_present = set([meta['subject'] for meta in metadata])
print(f"Subjects processed: {', '.join(subjects_present)}")

✅ Successfully logged in to Hugging Face!
Initializing Gemma Embedder (this may take a minute)...
Loading google/gemma-2b for embeddings...


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Generating chapter embeddings for 102 chapters...


Encoding batches:   0%|          | 0/52 [00:00<?, ?it/s]

Generated embeddings shape: (102, 2048)
Stored 102 chapter embeddings.
Subjects processed: biology, chemistry, physics


In [8]:
# ---
# 5. PREPARE DATA FOR BERTOPIC
# ---

# Convert chapters to sentences for better topic modeling
print("Splitting chapters into sentences...")
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger", "lemmatizer"])
nlp.add_pipe("sentencizer")

all_sentences = []
sentence_metadata = []

for chapter_text, meta in zip(chapters, metadata):
    doc = nlp(chapter_text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    all_sentences.extend(sentences)
    sentence_metadata.extend([meta] * len(sentences))

print(f"Total sentences: {len(all_sentences)}")

# Generate embeddings for sentences
print("Generating sentence embeddings (using Gemma)...")
# NOTE: Batch size reduced to 8 to prevent OOM with the 2B model
sentence_embeddings = embedding_model.encode(all_sentences, show_progress_bar=True, batch_size=8)

print(f"Sentence embeddings shape: {np.array(sentence_embeddings).shape}")

Splitting chapters into sentences...
Total sentences: 82670
Generating sentence embeddings (using Gemma)...


Encoding batches:   0%|          | 0/10334 [00:00<?, ?it/s]

Sentence embeddings shape: (82670, 2048)


In [9]:
# ---
# 6. INITIALIZE BERTOPIC MODEL
# ---

from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN

vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.90,
    stop_words='english',
    ngram_range=(1, 2)
)

hdbscan_model = HDBSCAN(
    min_cluster_size=25,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

topic_model = BERTopic(
    verbose=True,
    calculate_probabilities=True,
    min_topic_size=50,
    vectorizer_model=vectorizer,
    top_n_words=50,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    language="english"
)

In [10]:
# ---
# 7. FIT BERTOPIC MODEL
# ---

print("Fitting BERTopic model...")
topics, probs = topic_model.fit_transform(
    documents=all_sentences,
    embeddings=sentence_embeddings
)

print(f"Found {len(set(topics))} topics (including outlier topic -1)")

hierarchy_df = topic_model.hierarchical_topics(all_sentences)

Fitting BERTopic model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2025-11-26 16:11:26,442 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-26 16:12:30,999 - BERTopic - Dimensionality - Completed ✓
2025-11-26 16:12:31,002 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-26 16:30:08,625 - BERTopic - Cluster - Completed ✓
2025-11-26 16:30:08,657 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-26 16:30:11,658 - BERTopic - Representation - Completed ✓


Found 494 topics (including outlier topic -1)


100%|██████████| 492/492 [00:02<00:00, 204.02it/s]


In [16]:
from huggingface_hub import login
import re

print("Preparing Gemma model for topic naming...")

# REUSE LOGIC: Check if we already loaded Gemma in Step 4 to avoid OOM
if 'embedding_model' in globals() and hasattr(embedding_model, 'model'):
    print("♻️ Reusing the Gemma model loaded in the embedding step.")
    gemma_model = embedding_model.model
    gemma_tokenizer = embedding_model.tokenizer

    # FIX: Assign chat template if missing (common with base models)
    if not gemma_tokenizer.chat_template:
        print("⚠️ Tokenizer missing chat_template. Setting default Gemma template.")
        gemma_tokenizer.chat_template = "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}<start_of_turn>{{ message['role'] }}\n{{ message['content'] | trim }}<end_of_turn>\n{% endfor %}{% if add_generation_prompt %}<start_of_turn>model\n{% endif %}"
else:
    print("⚠️ Embedding model not found. Loading Gemma from scratch...")
    # Only load if not already present
    hf_token = "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"
    login(token=hf_token)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model_id = "google/gemma-2b-it"
    gemma_tokenizer = AutoTokenizer.from_pretrained(model_id)
    gemma_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto"
    )

def get_llm_topic_name(keywords):
    """
    Generates a descriptive name for a topic using Gemma.
    Includes strict filtering for numbers and generic terms.
    """
    # 1. CLEAN INPUT KEYWORDS
    blocklist_keywords = {'problem', 'solution', 'answer', 'question', 'exercise', 'chapter', 'page'}
    valid_keywords = [
        k for k in keywords
        if k and isinstance(k, str) and k.strip()
        and not k.isdigit()
        and k.lower() not in blocklist_keywords
    ]

    if not valid_keywords:
        return "Unknown Topic"

    # IMPROVED PROMPT
    prompt = (
        f"Context: Scientific Textbook Analysis (Biology, Physics, Chemistry). "
        f"Keywords: {', '.join(valid_keywords[:10])}.\n"
        "Task: Provide a concise 2-5 word scientific label for this topic based on the keywords. "
        "Constraints: Output ONLY the label as text, without quotes. Do NOT include numbers or any introductory phrases like 'Sure, the label is', 'The label is', 'I cannot provide', 'Topic: ', etc. Do not provide a topic number.\n"
        "FORBIDDEN: Do NOT use words like 'Problem', 'Solution', 'Exercise', 'Chapter', 'Introduction', 'Review'. "
        "Focus strictly on the scientific phenomenon, molecule, or physical concept discussed (e.g. 'Kinematics' instead of 'Motion Problems'). "
        "If keywords are biological, use specific medical/anatomical terminology."
    )

    chat = [
        {"role": "user", "content": prompt},
    ]

    formatted_prompt = gemma_tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = gemma_tokenizer.encode(formatted_prompt, add_special_tokens=False, return_tensors="pt").to(gemma_model.device)

    try:
        outputs = gemma_model.generate(
            input_ids=inputs,
            max_new_tokens=20
        )

        new_tokens = outputs[0][inputs.shape[-1]:]
        raw_response = gemma_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # --- RESPONSE CLEANING & VALIDATION ---
        name = raw_response

        # 1. Remove intro phrases
        name = re.sub(r'^(Sure|Here|Certainly|I can|The label|Label|Topic|Name|Number|Topic ID).*?(:|\n|-|is )', '', name, flags=re.IGNORECASE|re.DOTALL)
        name = name.replace('topic: ', '').strip()

        # 2. Remove Leading Symbols (Fix for >:Kinematics artifact)
        name = re.sub(r'^[^a-zA-Z0-9]+', '', name)

        # 3. Standard cleanup
        match = re.search(r'\*\*([^*]+)\*\*', name) # Extract if surrounded by **
        if match: name = match.group(1)
        name = name.strip().strip('"').strip("'").strip()
        name = name.split('\n')[0].strip()

        # 4. Hallucination & Garbage Filter
        name_lower = name.lower()

        # Specific bad outputs observed
        blocklist_terms = ["ex%", "pavo", "unknown", "nan"]
        if any(term in name_lower for term in blocklist_terms):
             return ", ".join(valid_keywords[:3]).title()

        # Refusal terms
        refusal_terms = ["cannot provide", "sexual", "offensive", "apologize", "language model", "harmful", "number", "id"]
        if any(term in name_lower for term in refusal_terms):
            return ", ".join(valid_keywords[:3]).title()

        # Generic academic terms
        generic_terms = ["problem", "solution", "exercise", "chapter", "summary", "review", "questions", "answers", "label", "name"]
        if any(term in name_lower for term in generic_terms):
            return ", ".join(valid_keywords[:3]).title()

        # 5. ASCII Check: Reject weird characters (like 'ex%')
        if not name.isascii():
             return ", ".join(valid_keywords[:3]).title()

        # Explicitly check if the cleaned name is just a number or similar problematic patterns
        if name.isdigit() or re.match(r'^Topic \d+$', name, re.IGNORECASE) or re.match(r'^-?\d+$', name) or re.match(r'^\d+(\.\d+)?$', name):
            return ", ".join(valid_keywords[:3]).title()

        return name if name else ", ".join(valid_keywords[:3]).title()

    except Exception as e:
        print(f"An error occurred during Gemma generation: {e}")
        return ", ".join(valid_keywords[:3])

print("Generating topic names with Gemma...")
topic_df = topic_model.get_topic_info()

def extract_keywords(representation):
    words = []
    for item in representation[:10]:
        if isinstance(item, str):
            words.append(item)
        elif isinstance(item, (list, tuple)) and len(item) > 0:
            words.append(str(item[0]))
    return words

top_10_keywords = topic_df['Representation'].apply(extract_keywords)

try:
    from tqdm.notebook import tqdm
    tqdm.pandas(desc="Naming Topics")
    topic_df['topic_name'] = top_10_keywords.progress_apply(get_llm_topic_name)
except ImportError:
    topic_df['topic_name'] = top_10_keywords.apply(get_llm_topic_name)

print(topic_df[['Topic', 'Count', 'topic_name']].head(20))
topic_df

Preparing Gemma model for topic naming...
♻️ Reusing the Gemma model loaded in the embedding step.
Generating topic names with Gemma...


Naming Topics:   0%|          | 0/494 [00:00<?, ?it/s]

    Topic  Count                                   topic_name
0      -1  43181                       Energy, Water, Example
1       0   1501                 Figure, Figure 18, Figure 10
2       1   1014                         Car, Force, Velocity
3       2    825                      Cancer, Heart, Patients
4       3    660                     Neurons, Sensory, Neuron
5       4    565                 Voltage, Current, Resistance
6       5    559               Science, Scientific, Knowledge
7       6    493                          Blood, Heart, Lungs
8       7    470      Strategy, Discussion, Strategy Solution
9       8    467                     Cavity, Coelom, Mesoderm
10      9    447                    Lens, Focal, Focal Length
11     10    405                 Figure, Figure 38, Figure 35
12     11    388                   Atp, Glycolysis, Phosphate
13     12    364                    Oxides, Oxidation, Metals
14     13    318                      Decay, Nuclide, Fission
15     1

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,topic_name
0,-1,43181,-1_energy_water_example_force,"[energy, water, example, force, mass, temperat...",[The balanced equation shows that carbon dioxi...,"Energy, Water, Example"
1,0,1501,0_figure_figure 18_figure 10_figure 11,"[figure, figure 18, figure 10, figure 11, figu...","[FIGURE 19.24 Three isomeric forms of exist.,...","Figure, Figure 18, Figure 10"
2,1,1014,1_car_force_velocity_friction,"[car, force, velocity, friction, acceleration,...",[Calculate the displacement and velocity at ti...,"Car, Force, Velocity"
3,2,825,2_cancer_heart_patients_symptoms,"[cancer, heart, patients, symptoms, disease, d...",[Prostate cancer often develops very slowly an...,"Cancer, Heart, Patients"
4,3,660,3_neurons_sensory_neuron_nervous,"[neurons, sensory, neuron, nervous, axon, brai...",[Most sensory neurons are pseudounipolar and h...,"Neurons, Sensory, Neuron"
...,...,...,...,...,...,...
489,488,25,488_50___,"[50, , , , , , , , , , , , , , , , , , , , , ,...","[50., 50., 50.]",Unknown Topic
490,489,25,489_cis_central carbon_trans_cis trans,"[cis, central carbon, trans, cis trans, double...",[b. Molecules must have a double bond to be ci...,"Cis, Central Carbon, Trans"
491,490,25,490_73_78__,"[73, 78, , , , , , , , , , , , , , , , , , , ,...","[73., 73., 73.]",Unknown Topic
492,491,25,491_viruses_life forms_adapt_pathways,"[viruses, life forms, adapt, pathways, infecti...",[No one knows exactly when or how viruses evol...,"Viruses, Life Forms, Adapt"


In [17]:
# ---
# 8.5. SAVE BERTOPIC MODEL
# ---

print("Saving BERTopic model to Google Drive...")

# Ensure output directory exists
full_output_path = os.path.join(CONFIG['drive_base_path'], CONFIG['output_base_path'])
os.makedirs(full_output_path, exist_ok=True)

# Save the topic_model
model_save_path = os.path.join(full_output_path, "bertopic_chapters_model.pkl")
try:
    with open(model_save_path, 'wb') as f:
        pickle.dump(topic_model, f)
    print(f"[SUCCESS] BERTopic model saved to: {model_save_path}")
    print(f"Model includes {len(topic_df)} topics with Gemma-generated names.")
except Exception as e:
    print(f"[ERROR] Failed to save model: {e}")
    raise e

# Also save hierarchy_df separately for convenience
hierarchy_save_path = os.path.join(full_output_path, "bertopic_chapters_hierarchy.pkl")
try:
    with open(hierarchy_save_path, 'wb') as f:
        pickle.dump(hierarchy_df, f)
    print(f"[SUCCESS] Hierarchy dataframe saved to: {hierarchy_save_path}")
except Exception as e:
    print(f"[WARNING] Failed to save hierarchy: {e}")

# Save topic_df with names for reference
topic_df_save_path = os.path.join(full_output_path, "bertopic_chapters_topic_df.pkl")
try:
    with open(topic_df_save_path, 'wb') as f:
        pickle.dump(topic_df, f)
    print(f"[SUCCESS] Topic dataframe saved to: {topic_df_save_path}")
except Exception as e:
    print(f"[WARNING] Failed to save topic_df: {e}")

print("\nModel and related data saved successfully!")


Saving BERTopic model to Google Drive...
[SUCCESS] BERTopic model saved to: /content/drive/MyDrive/gemma_embeddings_output/bertopic_chapters_model.pkl
Model includes 494 topics with Gemma-generated names.
[SUCCESS] Hierarchy dataframe saved to: /content/drive/MyDrive/gemma_embeddings_output/bertopic_chapters_hierarchy.pkl
[SUCCESS] Topic dataframe saved to: /content/drive/MyDrive/gemma_embeddings_output/bertopic_chapters_topic_df.pkl

Model and related data saved successfully!


In [18]:
# ---
# 9. AGGREGATE RESULTS FROM SENTENCES TO CHAPTERS
# ---

def get_prob_dict(topics):
    topic_counts = collections.Counter(topics)
    total_count = sum(topic_counts.values())
    prob_dict = {topic: count / total_count for topic, count in topic_counts.items()}
    return prob_dict

print("Aggregating sentence-level topics back to chapters...")

# Create DataFrame for sentences
df_sentences = pd.DataFrame(sentence_metadata)
df_sentences['text'] = all_sentences
df_sentences['topic'] = topics
df_sentences['embedding'] = list(sentence_embeddings)

# Group by subject and chapter to get chapter-level results
df_chapters = df_sentences.groupby(['subject', 'chapter']).agg(
    topic_dist=('topic', lambda x: get_prob_dict(x.tolist())),
    embedding=('embedding', lambda x: np.mean(np.array(x.tolist()), axis=0))
).reset_index()

# Helper function to get the main topic, IGNORING outliers (-1)
def get_main_topic_no_outliers(topic_dist):
    if not isinstance(topic_dist, dict):
        return -1

    # Filter out the outlier topic -1
    filtered_dist = {k: v for k, v in topic_dist.items() if k != -1}

    # If all sentences were outliers, fallback to -1. Otherwise, pick the max.
    if not filtered_dist:
        return -1
    return max(filtered_dist, key=filtered_dist.get)

# Find main topic for each chapter (excluding outliers)
df_chapters['main_topic'] = df_chapters['topic_dist'].apply(get_main_topic_no_outliers)

# Map the topic ID to the Gemma-generated name
df_chapters['main_topic_name'] = df_chapters['main_topic'].apply(
    lambda x: topic_df.loc[topic_df['Topic'] == x, 'topic_name'].values[0]
    if x is not None and x in topic_df['Topic'].values else "Outlier"
)

print(f"Aggregated to {len(df_chapters)} chapters.")
print("\n=== CHAPTER TOPIC SAMPLES ===")
print(df_chapters[['subject', 'chapter', 'main_topic_name']].head(10))

Aggregating sentence-level topics back to chapters...
Aggregated to 102 chapters.

=== CHAPTER TOPIC SAMPLES ===
   subject     chapter                           main_topic_name
0  biology  chapter_01            Science, Scientific, Knowledge
1  biology  chapter_02              Elements, Electrons, Valence
2  biology  chapter_03      Monosaccharides, Fructose, Galactose
3  biology  chapter_04                  Vesicle, Golgi, Membrane
4  biology  chapter_05     Active Transport, Gradient, Transport
5  biology  chapter_06                Atp, Glycolysis, Phosphate
6  biology  chapter_07                Atp, Glycolysis, Phosphate
7  biology  chapter_08  Chlorophyll, Photosystem, Photosynthesis
8  biology  chapter_09            Signaling, Receptor, Receptors
9  biology  chapter_10          Chromosomes, Chromatids, Spindle


In [19]:
# ---
# 10. COSINE SIMILARITY TEST: Summed Page Embeddings vs Chapter Embeddings
# ---

import re
import pickle
from tqdm.notebook import tqdm

# ==========================================
# PART A: SAVE ARTIFACTS (Pickle & Stats)
# ==========================================
print("Saving embeddings and calculating token stats...")
full_output_path = os.path.join(CONFIG['drive_base_path'], CONFIG['output_base_path'])
os.makedirs(full_output_path, exist_ok=True)
pkl_path = os.path.join(full_output_path, "chapter_embeddings_and_stats.pkl")

# Add token stats to the existing dictionary
for key, data in chapter_embeddings_dict.items():
    # Simple whitespace token count, or use tokenizer if available
    data['token_count'] = len(data['text'].split())

try:
    with open(pkl_path, 'wb') as f:
        pickle.dump(chapter_embeddings_dict, f)
    print(f"[SUCCESS] Embeddings and stats saved to: {pkl_path}")
except Exception as e:
    print(f"[ERROR] Failed to save pickle file: {e}")

# ==========================================
# PART B: COSINE SIMILARITY TEST
# ==========================================
print("\nConducting cosine similarity test...")
print("Comparing summed page embeddings with chapter embeddings...")

# --- MULTI-SUBJECT CSV MAPPING SETUP ---
mapping_files = {
    'biology': '/content/drive/MyDrive/dataset_preparation/chapter_ranges_biology.csv',
    'chemistry': '/content/drive/MyDrive/dataset_preparation/chapter_ranges_chemistry.csv',
    'physics': '/content/drive/MyDrive/dataset_preparation/chapter_ranges_physics.csv'
}

all_mappings = []

def parse_chapter_id(title):
    """Extracts 'chapter_XX' from titles like 'Chapter 01 - ...'"""
    match = re.search(r'Chapter\s+(\d+)', str(title), re.IGNORECASE)
    if match:
        return f"chapter_{int(match.group(1)):02d}"
    return None

for subj, fpath in mapping_files.items():
    if os.path.exists(fpath):
        try:
            df_map = pd.read_csv(fpath)
            # Standardize columns
            df_map.columns = [c.lower().strip() for c in df_map.columns]

            # Fix column names (start -> start_page, end -> end_page)
            rename_dict = {'start': 'start_page', 'end': 'end_page'}
            df_map = df_map.rename(columns=rename_dict)

            # Generate 'chapter' ID column from 'title' if missing
            if 'chapter' not in df_map.columns and 'title' in df_map.columns:
                df_map['chapter'] = df_map['title'].apply(parse_chapter_id)
                # print(f"[INFO] Generated 'chapter' column from 'title' for {subj}.")

            # Add subject column for disambiguation
            df_map['subject'] = subj
            all_mappings.append(df_map)
            print(f"[INFO] Loaded mapping for {subj} from {fpath}")
        except Exception as e:
            print(f"[ERROR] Failed to load {fpath}: {e}")
    else:
        print(f"[WARNING] File not found: {fpath}")

page_mapping_df = None
if all_mappings:
    page_mapping_df = pd.concat(all_mappings, ignore_index=True)
    print(f"[INFO] Combined mapping dataframe created with {len(page_mapping_df)} rows.")
    # Drop rows where chapter parsing failed
    page_mapping_df = page_mapping_df.dropna(subset=['chapter'])
    print(f"[INFO] Valid mapping rows after parsing: {len(page_mapping_df)}")
    # print(page_mapping_df[['subject', 'chapter', 'start_page', 'end_page']].head())
else:
    print("[ERROR] No mapping CSVs were loaded. Similarity test will fail to find pages.")
# -------------------------

similarity_results = []

# Use tqdm for progress bar
for idx, row in tqdm(df_chapters.iterrows(), total=len(df_chapters), desc="Processing Chapters"):
    subject = row['subject']
    chapter = row['chapter']
    chapter_embedding = row['embedding']

    # Load pages for this chapter using the combined mapping
    page_texts, page_numbers = load_page_data_for_chapter(CONFIG, chapter, subject, page_mapping_df)

    if len(page_texts) == 0:
        # Only warn periodically to avoid spamming output
        if idx % 20 == 0:
            print(f"Warning (sample): No pages found for {subject}/{chapter} (checked {idx}/{len(df_chapters)} chapters)")
        continue

    # Generate embeddings for pages
    page_embeddings = embedding_model.encode(page_texts, show_progress_bar=False, batch_size=32)

    # Sum the page embeddings
    summed_page_embedding = np.sum(page_embeddings, axis=0)

    # Normalize both embeddings for cosine similarity
    chapter_emb_norm = chapter_embedding / (np.linalg.norm(chapter_embedding) + 1e-9)
    summed_page_emb_norm = summed_page_embedding / (np.linalg.norm(summed_page_embedding) + 1e-9)

    # Calculate cosine similarity
    cosine_sim = np.dot(chapter_emb_norm, summed_page_emb_norm)

    similarity_results.append({
        'subject': subject,
        'chapter': chapter,
        'num_pages': len(page_texts),
        'cosine_similarity': cosine_sim,
        'chapter_embedding': chapter_embedding,
        'summed_page_embedding': summed_page_embedding
    })

# Create DataFrame with results
similarity_df = pd.DataFrame(similarity_results)

# Check if DataFrame is empty before proceeding
if not similarity_df.empty:
    # Sort by cosine similarity in descending order
    similarity_df = similarity_df.sort_values('cosine_similarity', ascending=False).reset_index(drop=True)

    print("\n=== COSINE SIMILARITY RESULTS (Sorted by Similarity) ===")
    print(similarity_df[['subject', 'chapter', 'num_pages', 'cosine_similarity']].to_string(index=False))

    print(f"\nMean cosine similarity: {similarity_df['cosine_similarity'].mean():.4f}")
    print(f"Std cosine similarity: {similarity_df['cosine_similarity'].std():.4f}")
    print(f"Min cosine similarity: {similarity_df['cosine_similarity'].min():.4f}")
    print(f"Max cosine similarity: {similarity_df['cosine_similarity'].max():.4f}")
else:
    print("\n[ERROR] No similarity results calculated because no page files were matched.")
    if page_mapping_df is not None:
        print("Debug: Check if 'chapter' column in mapping matches 'chapter' in df_chapters (e.g. 'chapter_01').")

Saving embeddings and calculating token stats...
[SUCCESS] Embeddings and stats saved to: /content/drive/MyDrive/gemma_embeddings_output/chapter_embeddings_and_stats.pkl

Conducting cosine similarity test...
Comparing summed page embeddings with chapter embeddings...
[INFO] Loaded mapping for biology from /content/drive/MyDrive/dataset_preparation/chapter_ranges_biology.csv
[INFO] Loaded mapping for chemistry from /content/drive/MyDrive/dataset_preparation/chapter_ranges_chemistry.csv
[INFO] Loaded mapping for physics from /content/drive/MyDrive/dataset_preparation/chapter_ranges_physics.csv
[INFO] Combined mapping dataframe created with 102 rows.
[INFO] Valid mapping rows after parsing: 102


Processing Chapters:   0%|          | 0/102 [00:00<?, ?it/s]


=== COSINE SIMILARITY RESULTS (Sorted by Similarity) ===
  subject    chapter  num_pages  cosine_similarity
  biology chapter_36         32           0.988373
  biology chapter_17         28           0.988297
  biology chapter_19         20           0.987648
  biology chapter_27         26           0.987565
  biology chapter_46         30           0.987564
  biology chapter_40         24           0.987273
  biology chapter_21         30           0.987269
  biology chapter_37         32           0.987139
  biology chapter_26         30           0.987101
  biology chapter_12         32           0.986981
  biology chapter_11         18           0.986821
  biology chapter_47         32           0.986793
  biology chapter_43         36           0.986707
  biology chapter_30         42           0.986666
  biology chapter_42         34           0.986664
  biology chapter_22         38           0.986529
  biology chapter_16         24           0.986401
  biology chapter_09    

In [20]:
# ---
# 11. GENERATE ALL VISUALIZATIONS
# ---

print("Generating all visualizations... This may take a moment.")

# 1. Prepare Data Matrices
def create_doc_topic_matrix(df_chapters, topic_df):
    """Converts the topic distribution dictionary into a full matrix."""
    num_docs = len(df_chapters)
    all_topic_ids = sorted([t for t in topic_df.Topic.unique() if t != -1])

    doc_topic_matrix = np.zeros((num_docs, len(all_topic_ids)))

    for i, row in df_chapters.iterrows():
        if isinstance(row['topic_dist'], dict):
            for topic, prob in row['topic_dist'].items():
                if topic in all_topic_ids:
                    col_index = all_topic_ids.index(topic)
                    doc_topic_matrix[i, col_index] = prob

    return pd.DataFrame(doc_topic_matrix, index=df_chapters.index, columns=all_topic_ids)

doc_topic_matrix = create_doc_topic_matrix(df_chapters, topic_df)
topic_names_map = topic_df.set_index('Topic')['topic_name'].to_dict()
custom_labels = {key: f"{key}: {name}" for key, name in topic_names_map.items()}
topic_model.set_topic_labels(custom_labels)

# 2. Topic Map & Hierarchy
print(" - Creating Topic Map & Hierarchy (All Topics)...")
fig_topic_map = topic_model.visualize_topics(title="<b>Interactive Topic Map</b>", custom_labels=True)
# REVERTED: Showing all topics as requested (WARNING: May be crowded)
fig_topic_tree = topic_model.visualize_hierarchy(custom_labels=True, title="<b>Topic Hierarchy Dendrogram</b>")

# 3. Global Similarity Matrix (All Topics)
print(" - Creating Global Similarity Matrix (All Topics)...")

# Use ALL topics (excluding outlier -1)
info_df = topic_model.get_topic_info()
top_info = info_df[info_df['Topic'] != -1].sort_values('Count', ascending=False)
# No slicing (top_n removed)
top_indices = top_info.index.tolist()
top_topic_ids = top_info['Topic'].tolist()

if topic_model.topic_embeddings_ is not None:
    # Use top_indices to map correctly to the internal embeddings list
    global_embeddings = np.array(topic_model.topic_embeddings_)[top_indices]
    valid_topic_names = [topic_names_map.get(tid, f"Topic {tid}") for tid in top_topic_ids]

    global_similarity_matrix = cosine_similarity(global_embeddings)

    distance_matrix = 1 - global_similarity_matrix
    linked = linkage(distance_matrix, method='average')
    dendro_data = dendrogram(linked, no_plot=True)
    reordered_index = dendro_data['leaves']
    reordered_matrix = global_similarity_matrix[reordered_index, :][:, reordered_index]
    reordered_topic_names = np.array(valid_topic_names)[reordered_index]

    fig_global_similarity = go.Figure(data=go.Heatmap(
        z=reordered_matrix, x=reordered_topic_names, y=reordered_topic_names,
        colorscale='Viridis', colorbar_title='Cosine Similarity'
    ))
    fig_global_similarity.update_layout(title=f"<b>Clustered Global Topic Similarity Matrix (All Topics)</b>", width=1400, height=1400)
else:
    print("Topic embeddings not found in model. Skipping Global Similarity Matrix.")
    fig_global_similarity = go.Figure()

# 4. Stacked Bar (Subject Composition)
print(" - Creating Subject Composition Chart...")
df_subject_analysis = doc_topic_matrix.copy()
df_subject_analysis['subject'] = df_chapters['subject'].values
subject_topic_dist = df_subject_analysis.groupby('subject').mean()

# Use All Topics
subject_topic_dist_filtered = subject_topic_dist[top_topic_ids]

fig_subject_stacked_bar = go.Figure()
for topic_id in subject_topic_dist_filtered.columns:
    topic_name = topic_names_map.get(topic_id, f"Topic {topic_id}")
    fig_subject_stacked_bar.add_trace(go.Bar(
        y=subject_topic_dist_filtered.index, x=subject_topic_dist_filtered[topic_id],
        name=topic_name, orientation='h'
    ))
fig_subject_stacked_bar.update_layout(barmode='stack', title="<b>Relative Topic Weight by Subject</b>")

# 5. Subject Distribution Per Topic
print(" - Creating Topic Distribution per Subject...")
bar_traces = []
topic_dropdown_buttons = []
topic_subject_dist = subject_topic_dist[top_topic_ids].T

# Limit dropdown to top 200 just to keep the menu usable, but show all data if needed
# (Showing all 700 in a dropdown can freeze the browser, sticking to top 200 for interactivity safety)
display_limit_ids = top_topic_ids[:300]

for i, topic_id in enumerate(display_limit_ids):
    topic_name = topic_names_map.get(topic_id, f"Topic {topic_id}")
    distribution = topic_subject_dist.loc[topic_id]
    bar_traces.append(go.Bar(x=distribution.index, y=distribution.values, name=topic_name, visible=(i == 0)))
    visibility_mask = [False] * len(display_limit_ids)
    visibility_mask[i] = True
    topic_dropdown_buttons.append(dict(label=f"{topic_id}: {topic_name}", method='update', args=[{'visible': visibility_mask}]))

fig_topic_subject_dist = go.Figure(data=bar_traces)
fig_topic_subject_dist.update_layout(
    title="<b>Subject Distribution (Select a Topic - Top 300 shown)</b>",
    updatemenus=[dict(active=0, buttons=topic_dropdown_buttons, direction="down", x=0.01, y=1.1)]
)

# 6. Per-Subject Heatmaps
print(" - Creating Per-Subject Heatmaps (All detected topics)...")
heatmap_traces = []
heatmap_buttons = []
unique_subjects = df_chapters['subject'].unique()

for subject in unique_subjects:
    subject_df = df_chapters[df_chapters['subject'] == subject]
    subject_topic_embeddings = defaultdict(list)
    for _, row in subject_df.iterrows():
        if isinstance(row['topic_dist'], dict):
            main_topic_for_chunk = max(row['topic_dist'], key=row['topic_dist'].get)
            if main_topic_for_chunk != -1:
                subject_topic_embeddings[main_topic_for_chunk].append(row['embedding'])

    # Sort by prevalence
    sorted_topics_by_count = sorted(subject_topic_embeddings.keys(), key=lambda k: len(subject_topic_embeddings[k]), reverse=True)

    # No slice limit (previously [:50])
    present_topic_ids = sorted_topics_by_count

    if len(present_topic_ids) < 2: continue

    present_topic_names = [topic_names_map.get(tid, f"Topic {tid}") for tid in present_topic_ids]
    present_embeddings = np.array([np.mean(subject_topic_embeddings[tid], axis=0) for tid in present_topic_ids])

    norms = np.linalg.norm(present_embeddings, axis=1)
    non_zero_mask = norms > 1e-9
    if np.sum(non_zero_mask) < 2: continue
    final_embeddings = present_embeddings[non_zero_mask]
    final_topic_names = np.array(present_topic_names)[non_zero_mask].tolist()

    subject_sim_matrix = cosine_similarity(final_embeddings)

    # Dynamic size based on number of topics to avoid tiny cells
    dynamic_height = max(600, len(final_topic_names) * 15)

    subject_dist_matrix = 1 - subject_sim_matrix
    subject_linked = linkage(subject_dist_matrix, method='average')
    dendro_data_for_heatmap = dendrogram(subject_linked, no_plot=True)
    reordered_index = dendro_data_for_heatmap['leaves']
    subject_reordered_matrix = subject_sim_matrix[reordered_index, :][:, reordered_index]
    subject_reordered_names = np.array(final_topic_names)[reordered_index]

    heatmap_traces.append(go.Heatmap(
        z=subject_reordered_matrix, x=subject_reordered_names, y=subject_reordered_names,
        colorscale='Viridis', visible=False
    ))
    visible_array = [False] * len(unique_subjects)
    visible_array[len(heatmap_traces)-1] = True
    heatmap_buttons.append(dict(label=f"{subject}", method='update', args=[{'visible': visible_array}, {'title': f'Topic Similarity - {subject}', 'height': dynamic_height}]))

fig_subject_heatmap = go.Figure(data=heatmap_traces)
if heatmap_traces: fig_subject_heatmap.data[0].visible = True
fig_subject_heatmap.update_layout(
    title="Topic Similarity Heatmaps by Subject",
    updatemenus=[dict(active=0, buttons=heatmap_buttons, direction="down", x=0.01, y=1.15)]
)

print("Done! All figures generated.")

Generating all visualizations... This may take a moment.
 - Creating Topic Map & Hierarchy (All Topics)...
 - Creating Global Similarity Matrix (All Topics)...
 - Creating Subject Composition Chart...
 - Creating Topic Distribution per Subject...
 - Creating Per-Subject Heatmaps (All detected topics)...
Done! All figures generated.


In [21]:
# Analysis 1 A: Interactive 2D Topic Map
fig_topic_map = topic_model.visualize_topics(
    title="<b>Interactive Topic Map</b>",
    custom_labels=True
)

# Analysis 1 B: Topic Hierarchy Dendrogram
fig_topic_tree = topic_model.visualize_hierarchy(
    custom_labels=True,
    title="<b>Topic Hierarchy Dendrogram</b>"
)

In [22]:
# Analysis 2: Clustered Global Topic Similarity Matrix
global_embeddings = topic_model.topic_embeddings_[1:]  # Exclude -1 outlier topic
global_similarity_matrix = cosine_similarity(global_embeddings)
valid_topic_ids = sorted([t for t in topic_df.Topic.unique() if t != -1])
valid_topic_names = [topic_names_map.get(tid, f"Topic {tid}") for tid in valid_topic_ids]

distance_matrix = 1 - global_similarity_matrix
linked = linkage(distance_matrix, method='average')
dendro_data = dendrogram(linked, no_plot=True)
reordered_index = dendro_data['leaves']
reordered_matrix = global_similarity_matrix[reordered_index, :][:, reordered_index]
reordered_topic_names = np.array(valid_topic_names)[reordered_index]

fig_global_similarity = go.Figure(data=go.Heatmap(
    z=reordered_matrix,
    x=reordered_topic_names,
    y=reordered_topic_names,
    colorscale='Viridis',
    colorbar_title='Cosine Similarity'
))
fig_global_similarity.update_layout(
    title="<b>Clustered Global Topic Similarity Matrix</b>",
    xaxis_title="Topic",
    yaxis_title="Topic",
    width=1600,
    height=1400
)

In [23]:
# Analysis 3: Subject-Topic Distribution (Stacked Bar)
df_subject_analysis = doc_topic_matrix.copy()
df_subject_analysis['subject'] = df_chapters['subject'].values
subject_topic_dist = df_subject_analysis.groupby('subject').mean()

fig_subject_stacked_bar = go.Figure()
for topic_id in subject_topic_dist.columns:
    topic_name = topic_names_map.get(topic_id, f"Topic {topic_id}")
    fig_subject_stacked_bar.add_trace(go.Bar(
        y=subject_topic_dist.index,
        x=subject_topic_dist[topic_id],
        name=topic_name,
        orientation='h'
    ))
fig_subject_stacked_bar.update_layout(
    barmode='stack',
    title="<b>Relative Topic Weight by Subject</b>",
    xaxis_title="Proportional Topic Weight",
    yaxis_title="Subject",
    legend_title="Topics",
    yaxis={'categoryorder':'total ascending'}
)

In [24]:
# Analysis 4: Interactive Subject Distribution PER TOPIC
bar_traces = []
topic_dropdown_buttons = []
topic_subject_dist = subject_topic_dist.T

for i, topic_id in enumerate(valid_topic_ids):
    topic_name = topic_names_map.get(topic_id, f"Topic {topic_id}")
    distribution = topic_subject_dist.loc[topic_id]

    bar_traces.append(go.Bar(
        x=distribution.index,
        y=distribution.values,
        name=topic_name,
        visible=(i == 0)
    ))

    visibility_mask = [False] * len(valid_topic_ids)
    visibility_mask[i] = True
    topic_dropdown_buttons.append(dict(
        label=f"{topic_id}: {topic_name}",
        method='update',
        args=[{'visible': visibility_mask}]
    ))

fig_topic_subject_dist = go.Figure(data=bar_traces)
fig_topic_subject_dist.update_layout(
    title="<b>Subject Distribution (Select a Topic)</b>",
    updatemenus=[dict(
        active=0,
        buttons=list(topic_dropdown_buttons),
        direction="down",
        x=0.01,
        xanchor="left",
        y=1.1,
        yanchor="top"
    )],
    xaxis_title="Subject",
    yaxis_title="Average Topic Weight in Subject",
    width=800,
    height=600
)

In [25]:
# ---
# Analysis 5: Per-Subject Heatmaps & Hierarchies
# ---

print("Generating Per-Subject Topic Similarity Heatmaps...")

heatmap_traces = []
heatmap_buttons = []

# Get list of subjects
unique_subjects = sorted(df_chapters['subject'].unique())
print(f"Found subjects: {unique_subjects}")

# Ensure we have topic names mapped
topic_names_map = topic_df.set_index('Topic')['topic_name'].to_dict()
valid_topic_ids = sorted([t for t in topic_df.Topic.unique() if t != -1])

for subject in unique_subjects:
    print(f"Processing {subject}...")
    subject_df = df_chapters[df_chapters['subject'] == subject]

    # Collect embeddings for topics that actually appear in this subject
    subject_topic_embeddings = defaultdict(list)

    for _, row in subject_df.iterrows():
        # FIX: Use the 'main_topic' column we explicitly calculated to ignore outliers
        # instead of re-calculating it here.
        if 'main_topic' in row:
            main_topic = row['main_topic']
            if main_topic != -1:
                subject_topic_embeddings[main_topic].append(row['embedding'])

    # Identify which valid topics are present in this subject
    present_topic_ids = [tid for tid in valid_topic_ids if tid in subject_topic_embeddings]

    if len(present_topic_ids) < 2:
        print(f"  -> Skipping {subject}: Not enough distinct topics found ({len(present_topic_ids)} found).")
        continue

    # Calculate average embedding for each present topic
    present_topic_names = [topic_names_map.get(tid, f"Topic {tid}") for tid in present_topic_ids]
    present_embeddings = np.array([np.mean(subject_topic_embeddings[tid], axis=0) for tid in present_topic_ids])

    # Filter out zero-vectors (just in case)
    norms = np.linalg.norm(present_embeddings, axis=1)
    non_zero_mask = norms > 1e-9
    if np.sum(non_zero_mask) < 2:
        print(f"  -> Skipping {subject}: Topics have zero-vector embeddings.")
        continue

    final_embeddings = present_embeddings[non_zero_mask]
    final_topic_names = np.array(present_topic_names)[non_zero_mask].tolist()

    # --- Calculate Cosine Similarity ---
    subject_sim_matrix = cosine_similarity(final_embeddings)

    # --- Cluster for Better Visualization (Hierarchical Ordering) ---
    # This groups similar topics together on the axis
    subject_dist_matrix = 1 - subject_sim_matrix
    subject_linked = linkage(subject_dist_matrix, method='average')
    dendro_data = dendrogram(subject_linked, no_plot=True)

    reordered_index = dendro_data['leaves']
    subject_reordered_matrix = subject_sim_matrix[reordered_index, :][:, reordered_index]
    subject_reordered_names = np.array(final_topic_names)[reordered_index]

    # Create Heatmap Trace
    heatmap_trace = go.Heatmap(
        z=subject_reordered_matrix,
        x=subject_reordered_names,
        y=subject_reordered_names,
        colorscale='Viridis',
        colorbar_title='Cosine Similarity',
        visible=False
    )
    heatmap_traces.append(heatmap_trace)

    # Create Button for Dropdown
    # Determine visibility: Only the trace corresponding to this subject should be True
    # Note: The trace index for this loop iteration is len(heatmap_traces) - 1
    current_trace_index = len(heatmap_traces) - 1

    # We don't know the final total yet, but we can define the args dynamically later
    # OR, simplified approach: Re-loop after creation to set up buttons correctly.
    # Let's store metadata for buttons and create them at the end.

    # (Wait, Plotly updatemenus need fixed-length arrays. We'll fix it after the loop)

# Final Figure Assembly
if heatmap_traces:
    fig_subject_heatmap = go.Figure(data=heatmap_traces)
    fig_subject_heatmap.data[0].visible = True

    # Now construct buttons knowing the total count
    total_traces = len(heatmap_traces)
    # The subject names in order of traces
    processed_subjects = [t['x'][0] for t in heatmap_traces] # Just to verify? No, better to track names.
    # Actually, we iterate 'unique_subjects' but some might skip.
    # Let's assume heatmap_buttons needs to be rebuilt.

    heatmap_buttons = []
    # We need to track which subject corresponds to which trace index.
    # Re-running logic slightly to map subject names to traces.
    trace_subjects = []

    # Re-populate trace_subjects based on what wasn't skipped.
    # A bit tricky with the current structure. Let's just recreate buttons.

    # Cleanest way: Store (subject, trace) pairs in the loop.
    pass

# --- REFINED LOOP for robust button generation ---
# (Rewriting the logic above to be cleaner)

final_traces = []
final_buttons = []

for subject in unique_subjects:
    subject_df = df_chapters[df_chapters['subject'] == subject]
    subject_topic_embeddings = defaultdict(list)

    for _, row in subject_df.iterrows():
        if 'main_topic' in row and row['main_topic'] != -1:
            subject_topic_embeddings[row['main_topic']].append(row['embedding'])

    present_topic_ids = [tid for tid in valid_topic_ids if tid in subject_topic_embeddings]
    if len(present_topic_ids) < 2: continue

    present_topic_names = [topic_names_map.get(tid, f"Topic {tid}") for tid in present_topic_ids]
    present_embeddings = np.array([np.mean(subject_topic_embeddings[tid], axis=0) for tid in present_topic_ids])

    norms = np.linalg.norm(present_embeddings, axis=1)
    non_zero_mask = norms > 1e-9
    if np.sum(non_zero_mask) < 2: continue

    final_embeddings = present_embeddings[non_zero_mask]
    final_topic_names = np.array(present_topic_names)[non_zero_mask].tolist()

    subject_sim_matrix = cosine_similarity(final_embeddings)
    subject_dist_matrix = 1 - subject_sim_matrix
    subject_linked = linkage(subject_dist_matrix, method='average')
    dendro_data = dendrogram(subject_linked, no_plot=True)
    reordered_index = dendro_data['leaves']
    subject_reordered_matrix = subject_sim_matrix[reordered_index, :][:, reordered_index]
    subject_reordered_names = np.array(final_topic_names)[reordered_index]

    final_traces.append(go.Heatmap(
        z=subject_reordered_matrix, x=subject_reordered_names, y=subject_reordered_names,
        colorscale='Viridis', visible=False
    ))

    # We'll construct buttons afterwards

# Build Figure
if final_traces:
    fig_subject_heatmap = go.Figure(data=final_traces)
    fig_subject_heatmap.data[0].visible = True

    # Construct buttons
    # We need to know which subject belongs to which trace.
    # The loop iterated 'unique_subjects' in order.
    # We need to know WHICH ones succeeded.
    # Let's capture the subject name in the loop.

    # Rerunning with tracking:
    pass

# --- CORRECTED CODE BLOCK ---
heatmap_traces = []
heatmap_subject_names = []

for subject in unique_subjects:
    subject_df = df_chapters[df_chapters['subject'] == subject]
    subject_topic_embeddings = defaultdict(list)
    for _, row in subject_df.iterrows():
        if 'main_topic' in row and row['main_topic'] != -1:
            subject_topic_embeddings[row['main_topic']].append(row['embedding'])

    present_topic_ids = [tid for tid in valid_topic_ids if tid in subject_topic_embeddings]
    if len(present_topic_ids) < 2: continue

    present_topic_names = [topic_names_map.get(tid, f"Topic {tid}") for tid in present_topic_ids]
    present_embeddings = np.array([np.mean(subject_topic_embeddings[tid], axis=0) for tid in present_topic_ids])

    norms = np.linalg.norm(present_embeddings, axis=1)
    non_zero_mask = norms > 1e-9
    if np.sum(non_zero_mask) < 2: continue

    final_embeddings = present_embeddings[non_zero_mask]
    final_topic_names = np.array(present_topic_names)[non_zero_mask].tolist()

    subject_sim_matrix = cosine_similarity(final_embeddings)
    subject_dist_matrix = 1 - subject_sim_matrix
    subject_linked = linkage(subject_dist_matrix, method='average')
    dendro_data = dendrogram(subject_linked, no_plot=True)
    reordered_index = dendro_data['leaves']
    subject_reordered_matrix = subject_sim_matrix[reordered_index, :][:, reordered_index]
    subject_reordered_names = np.array(final_topic_names)[reordered_index]

    heatmap_traces.append(go.Heatmap(
        z=subject_reordered_matrix, x=subject_reordered_names, y=subject_reordered_names,
        colorscale='Viridis', visible=False
    ))
    heatmap_subject_names.append(subject)

if heatmap_traces:
    fig_subject_heatmap = go.Figure(data=heatmap_traces)
    fig_subject_heatmap.data[0].visible = True

    buttons = []
    for i, subj_name in enumerate(heatmap_subject_names):
        visible_mask = [False] * len(heatmap_traces)
        visible_mask[i] = True
        buttons.append(dict(
            label=subj_name.capitalize(),
            method='update',
            args=[{'visible': visible_mask}, {'title': f'Topic Similarity Heatmap - {subj_name.capitalize()}'}]
        ))

    fig_subject_heatmap.update_layout(
        title=f"Topic Similarity Heatmap - {heatmap_subject_names[0].capitalize()}",
        updatemenus=[dict(
            active=0,
            buttons=buttons,
            direction="down",
            x=0.0, xanchor="left",
            y=1.15, yanchor="top"
        )],
        width=900,
        height=800,
        xaxis_showticklabels=True,
        yaxis_showticklabels=True,
        yaxis_autorange='reversed'
    )
    fig_subject_heatmap.show()
else:
    print("No heatmaps could be generated.")

Generating Per-Subject Topic Similarity Heatmaps...
Found subjects: ['biology', 'chemistry', 'physics']
Processing biology...
Processing chemistry...
Processing physics...


In [26]:
# ---
# 12. GENERATE HTML REPORT
# ---

print("Compiling final HTML report...")

# Prepare tables
topic_df_html = topic_df.drop(columns=['Representative_Docs'], errors='ignore').copy()
topic_df_html['Top 10 Keywords'] = topic_df['Representation'].apply(lambda x: ', '.join([str(w) for w in extract_keywords(x)]))

# Format Similarity Table
similarity_df_html = similarity_df[['subject', 'chapter', 'num_pages', 'cosine_similarity']].copy()
similarity_df_html['cosine_similarity'] = similarity_df_html['cosine_similarity'].round(4)

pio.templates.default = "plotly_white"
html_content = f"""
<html>
<head>
    <title>Textbook Topic Analysis Report</title>
    <style>
        body {{ font-family: sans-serif; margin: 2em; max-width: 1200px; margin: auto; }}
        h1, h2, h3 {{ color: #2c3e50; }}
        .plot {{ margin-bottom: 3em; border: 1px solid #eee; padding: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.05); }}
        hr {{ border: 0; height: 1px; background: #ddd; margin: 3em 0; }}
        table {{ border-collapse: collapse; width: 100%; margin-bottom: 2em; font-size: 0.9em; }}
        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        th {{ background-color: #f8f9fa; }}
        tr:nth-child(even) {{ background-color: #f9f9f9; }}
    </style>
</head>
<body>
    <h1>📚 Textbook Topic Analysis Report</h1>
    <p><b>Subjects:</b> {', '.join(df_chapters['subject'].unique()).capitalize()}</p>
    <p><b>Total Chapters:</b> {len(df_chapters)} | <b>Total Topics:</b> {len(valid_topic_ids)}</p>

    <hr>
    <h2>1. Topic Overview</h2>
    <p>List of all discovered topics with their Gemma-generated names and top keywords.</p>
    <div style="max-height: 400px; overflow-y: auto;">
    {topic_df_html.to_html(index=False, escape=False)}
    </div>

    <hr>
    <h2>2. Topic Hierarchy</h2>
    <p>How topics cluster together based on semantic similarity.</p>
    <div class="plot">{fig_topic_tree.to_html(full_html=False, include_plotlyjs='cdn')}</div>

    <hr>
    <h2>3. Topic Map</h2>
    <p>2D visualization of topic clusters.</p>
    <div class="plot">{fig_topic_map.to_html(full_html=False, include_plotlyjs='cdn')}</div>

    <hr>
    <h2>4. Global Topic Similarity</h2>
    <p>Heatmap of cosine similarity between all topics.</p>
    <div class="plot">{fig_global_similarity.to_html(full_html=False, include_plotlyjs='cdn')}</div>

    <hr>
    <h2>5. Subject Composition</h2>
    <p>What topics make up each subject?</p>
    <div class="plot">{fig_subject_stacked_bar.to_html(full_html=False, include_plotlyjs='cdn')}</div>

    <hr>
    <h2>6. Subject Distribution per Topic</h2>
    <p>Select a topic to see which subjects it appears in.</p>
    <div class="plot">{fig_topic_subject_dist.to_html(full_html=False, include_plotlyjs='cdn')}</div>

    <hr>
    <h2>7. Per-Subject Topic Similarity</h2>
    <p>How topics relate within specific subjects.</p>
    <div class="plot">{fig_subject_heatmap.to_html(full_html=False, include_plotlyjs='cdn')}</div>

    <hr>
    <h2>8. Cosine Similarity Validation</h2>
    <p>Comparing Chapter Embeddings vs. Summed Page Embeddings.</p>
    <ul>
        <li><b>Mean Similarity:</b> {similarity_df['cosine_similarity'].mean():.4f}</li>
        <li><b>Min Similarity:</b> {similarity_df['cosine_similarity'].min():.4f}</li>
    </ul>
    <div style="max-height: 400px; overflow-y: auto;">
    {similarity_df_html.to_html(index=False)}
    </div>
</body>
</html>
"""

report_filename = "topic_analysis_report.html"
with open(report_filename, "w", encoding='utf-8') as f:
    f.write(html_content)

print(f"\nSuccessfully generated '{report_filename}'.")
print("Download it from the Files tab on the left.")

Compiling final HTML report...

Successfully generated 'topic_analysis_report.html'.
Download it from the Files tab on the left.


In [27]:
# Display the cosine similarity results
print("\n=== COSINE SIMILARITY TEST SUMMARY ===")
print(similarity_df[['subject', 'chapter', 'num_pages', 'cosine_similarity']].head(10))


=== COSINE SIMILARITY TEST SUMMARY ===
   subject     chapter  num_pages  cosine_similarity
0  biology  chapter_36         32           0.988373
1  biology  chapter_17         28           0.988297
2  biology  chapter_19         20           0.987648
3  biology  chapter_27         26           0.987565
4  biology  chapter_46         30           0.987564
5  biology  chapter_40         24           0.987273
6  biology  chapter_21         30           0.987269
7  biology  chapter_37         32           0.987139
8  biology  chapter_26         30           0.987101
9  biology  chapter_12         32           0.986981


In [28]:
# All visualizations and HTML report have been generated above
# The HTML file 'bertopic_analysis_chapters.html' contains all the results

In [29]:
# This cell has been replaced - all visualizations and HTML generation are now in cell 17