In [1]:
!nvidia-smi




Fri Dec  5 21:33:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from google.colab import drive
import os
import zipfile

drive.mount('/content/drive')

# Path to your file in Drive (update if stored elsewhere)
zip_path = "/content/drive/MyDrive/cleaned_data.zip"
extract_path = "/content/cleaned_data"

# Unzip the file
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"✅ Extracted cleaned_data.zip to: {extract_path}")
else:
    print("❌ File not found in Google Drive. Check the path.")

Mounted at /content/drive
✅ Extracted cleaned_data.zip to: /content/cleaned_data


In [3]:
# !unzip -q cleaned_data.zip
# !ls -la cleaned_data/ | head -20

# # Count JSON fiy
# les
# import os
# json_count = len([f for f in os.listdir("cleaned_data") if f.endswith(".json")])
# print(f"\n✓ Total JSON files found: {json_count}")


In [4]:
pip install -U langchain langchain-core langchain-community langchain-text-splitters langchain-huggingface


Collecting langchain
  Downloading langchain-1.1.2-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain-core
  Downloading langchain_core-1.1.1-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.1.0-py3-none-any.whl.metadata (2.8 kB)
Collecting uuid-utils<1.0,>=0.12.0 (from langchain-core)
  Downloading uuid_utils-0.12.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from lang

In [5]:
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

from pathlib import Path
import json
from typing import List, Dict, Iterator
import numpy as np
import time
from datetime import datetime
import re

In [6]:
class MedicalJSONLoader(BaseLoader):
    """
    Focused medical JSON loader - removes citation/metadata sections effectively

    Filters:
    - Headers that are just DOI/page numbers
    - Pure reference/bibliography entries
    - Sections where heading == page_title (duplicate intro sections)
    - Metadata without actual medical text
    """

    SKIP_HEADINGS = {
        "references",
        "citations",
        "footnotes",
        "acknowledgments",
        "author information",
        "conflict of interest",
        "funding",
        "data availability",
        "supplementary material",
        "appendix",
        "table of contents",
        "index",
        "disclaimer",
        "copyright"
    }

    def __init__(self, json_directory: str):
        self.json_directory = Path(json_directory)
        self.stats = {
            "total_files": 0,
            "successfully_loaded": 0,
            "failed_files": [],
            "total_sections_with_valid_heading": 0,
            "sections_skipped_empty_heading": 0,
            "sections_skipped_empty_content": 0,
            "sections_skipped_junk_heading": 0,
            "sections_skipped_doi_only": 0,
            "sections_skipped_ref_citations": 0,
            "sections_skipped_duplicate_title": 0,
            "full_text_ignored": 0
        }

    @staticmethod
    def is_doi_or_metadata_only(text: str) -> bool:
        """
        Remove sections that are ONLY DOI, page numbers, or pure citations.
        Keep everything else.
        """
        # If it starts with DOI: and nothing else meaningful
        if text.strip().startswith("DOI:") or text.strip().startswith("doi:"):
            return True

        # If text is ONLY: "Hepatology(1):p 358-379, January 2025. DOI:10.1097..."
        # Pattern: mostly digits, colons, months, DOI
        import re
        # Remove DOI pattern
        no_doi = re.sub(r'doi[:\.][\d./]+', '', text, flags=re.IGNORECASE)
        # Remove page numbers like "358-379" or "p 358-379"
        no_pages = re.sub(r'[p\s:]*\d+-\d+', '', no_doi)
        # Remove months
        months = r'January|February|March|April|May|June|July|August|September|October|November|December'
        no_months = re.sub(months, '', no_pages, flags=re.IGNORECASE)
        # Remove journal volume patterns like "81(1)"
        no_journal = re.sub(r'\d+\(\d+\)', '', no_months)

        # What's left after removing all metadata?
        cleaned = no_journal.strip()

        # If almost nothing is left, it's metadata-only
        if len(cleaned) < 20:
            return True

        return False

    @staticmethod
    def is_reference_entry(text: str) -> bool:
        """
        Detect if text is a bibliography/reference entry.
        """
        import re
        patterns = [
            r'et al\.\s+\w+\s+\d{4}',
            r'\w+\s+et al\.\s*\d{4}',
            r'\d{4};\d+:\d+-\d+',
            r'\|\s*Cited\s+Here',
        ]

        for pattern in patterns:
            if re.search(pattern, text):
                return True

        return False

    @staticmethod
    def are_titles_same(heading: str, page_title: str) -> bool:
        """
        Check if heading is the same as page_title (case-insensitive).
        These are duplicate intro sections we want to skip.
        """
        # Normalize both: lowercase, strip whitespace
        heading_norm = heading.lower().strip()
        title_norm = page_title.lower().strip()

        # Exact match
        if heading_norm == title_norm:
            return True

        # Partial match (heading contains most of page_title or vice versa)
        # e.g., page_title: "AASLD Practice Guideline on acute liver failure"
        #       heading: "AASLD Practice Guideline on acute liver failure and management"
        # This would be a duplicate/intro section

        # If one is significantly contained in the other (80%+ similarity)
        heading_words = set(heading_norm.split())
        title_words = set(title_norm.split())

        if len(heading_words) > 0 and len(title_words) > 0:
            # Calculate overlap
            overlap = len(heading_words & title_words)
            max_len = max(len(heading_words), len(title_words))
            similarity = overlap / max_len

            # If >80% similar, they're probably the same
            if similarity > 0.8:
                return True

        return False

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load documents - focused filtering"""
        json_files = sorted(self.json_directory.glob("*.json"))
        self.stats["total_files"] = len(json_files)

        print(f"📁 JSON files found: {len(json_files)}\n")

        for jf in json_files:
            try:
                with jf.open("r", encoding="utf-8") as f:
                    data = json.load(f)

                # Extract metadata
                file_id = data.get("file_id", "unknown")
                page_title = data.get("page_title", "unknown")
                page_url = data.get("page_url", "")

                if "content" in data and "full_text" in data["content"]:
                    self.stats["full_text_ignored"] += 1

                # Navigate to sections
                if "content" not in data or "sections" not in data["content"]:
                    self.stats["failed_files"].append(f"{jf.name} - No content/sections")
                    continue

                sections = data["content"]["sections"]

                if not isinstance(sections, list) or len(sections) == 0:
                    continue

                # Process each section
                for section_idx, section in enumerate(sections):
                    if not isinstance(section, dict):
                        continue

                    # FILTER #1: Valid heading required
                    heading = section.get("heading", "").strip()
                    if not heading:
                        self.stats["sections_skipped_empty_heading"] += 1
                        continue

                    # FILTER #2: Skip obvious junk heading types
                    heading_lower = heading.lower()
                    if heading_lower in self.SKIP_HEADINGS or any(skip in heading_lower for skip in self.SKIP_HEADINGS):
                        self.stats["sections_skipped_junk_heading"] += 1
                        continue

                    # FILTER #3: Skip if heading == page_title (NEW!)
                    if self.are_titles_same(heading, page_title):
                        self.stats["sections_skipped_duplicate_title"] += 1
                        continue

                    # FILTER #4: Extract content
                    section_content = section.get("content", [])

                    if isinstance(section_content, list):
                        text = " ".join(str(s).strip() for s in section_content if s and str(s).strip())
                    elif isinstance(section_content, str):
                        text = section_content.strip()
                    else:
                        text = str(section_content).strip()

                    # FILTER #5: Skip if empty
                    if not text:
                        self.stats["sections_skipped_empty_content"] += 1
                        continue

                    # FILTER #6: Remove DOI-only/metadata-only sections
                    if self.is_doi_or_metadata_only(text):
                        self.stats["sections_skipped_doi_only"] += 1
                        continue

                    # FILTER #7: Remove pure reference entries
                    if self.is_reference_entry(text):
                        self.stats["sections_skipped_ref_citations"] += 1
                        continue

                    # FILTER #8: Minimum length
                    if len(text) < 50:
                        self.stats["sections_skipped_empty_content"] += 1
                        continue

                    # ✅ YIELD - Medical content
                    yield Document(
                        page_content=text,
                        metadata={
                            "file_id": file_id,
                            "page_title": page_title,
                            "page_url": page_url,
                            "source": jf.name,
                            "section_index": section_idx,
                            "heading": heading,
                            "level": section.get("level", 0),
                            "file_path": str(jf),
                            "char_count": len(text)
                        }
                    )
                    self.stats["total_sections_with_valid_heading"] += 1

                self.stats["successfully_loaded"] += 1

            except Exception as e:
                print(f"⚠️ Error reading {jf.name}: {e}")
                self.stats["failed_files"].append(f"{jf.name} - {str(e)}")
                continue

    def load(self) -> List[Document]:
        """Load all documents"""
        docs = list(self.lazy_load())

        print("\n" + "="*70)
        print("📊 JSON LOADING STATISTICS (FOCUSED FILTERING)")
        print("="*70)
        print(f"✓ Valid medical content sections: {self.stats['total_sections_with_valid_heading']}")
        print(f"✗ Empty headings: {self.stats['sections_skipped_empty_heading']}")
        print(f"✗ Junk heading types: {self.stats['sections_skipped_junk_heading']}")
        print(f"✗ Duplicate title sections: {self.stats['sections_skipped_duplicate_title']} ← NEW!")
        print(f"✗ DOI/metadata only: {self.stats['sections_skipped_doi_only']}")
        print(f"✗ Reference entries: {self.stats['sections_skipped_ref_citations']}")
        print(f"✗ Empty/short content: {self.stats['sections_skipped_empty_content']}")
        print(f"⚠️  Full_text fields ignored: {self.stats['full_text_ignored']}")
        print(f"✓ Total valid documents: {len(docs)}")
        print(f"✓ Files loaded: {self.stats['successfully_loaded']}/{self.stats['total_files']}")
        print("="*70 + "\n")

        return docs


In [7]:
class MedicalDataPipeline:
    """
    End-to-end LangChain pipeline for medical data processing with GPU acceleration
    Designed for Google Colab with T4 GPU
    """

    def __init__(
        self,
        json_directory: str,
        model_name: str = "pritamdeka/S-Bluebert-snli-multinli-stsb",
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        device: str = "cuda"
    ):
        self.json_directory = json_directory
        self.model_name = model_name
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.device = device

        # Initialize components
        self.loader = None
        self.splitter = None
        self.embeddings = None
        self.documents = None
        self.chunks = None
        self.vectors = None
        self.pipeline_start_time = None

    def load_documents(self) -> List[Document]:
        """Step 1: Load medical documents from JSONs"""
        print("\n" + "="*70)
        print("[1/4] LOADING DOCUMENTS FROM MEDICAL JSONS")
        print("="*70)

        start_time = time.time()
        self.loader = MedicalJSONLoader(self.json_directory)
        self.documents = self.loader.load()

        elapsed = time.time() - start_time

        print(f"\n✓ Documents loaded in {elapsed:.1f}s")
        print(f" - Total sections: {len(self.documents)}")
        print(f" - Files successfully loaded: {self.loader.stats['successfully_loaded']}/{self.loader.stats['total_files']}")

        if self.loader.stats["failed_files"]:
            print(f" - Failed files ({len(self.loader.stats['failed_files'])}):")
            for failed in self.loader.stats["failed_files"][:5]:
                print(f" • {failed}")

        # Calculate stats
        total_chars = sum(doc.metadata.get("char_count", 0) for doc in self.documents)
        print(f" - Total characters: {total_chars:,}")

        return self.documents

    def split_documents(self) -> List[Document]:
        """Step 2: Split documents into chunks using LangChain splitter"""
        print("\n" + "="*70)
        print(f"[2/4] SPLITTING DOCUMENTS INTO CHUNKS")
        print("="*70)
        print(f" Configuration:")
        print(f" - Chunk size: {self.chunk_size}")
        print(f" - Overlap: {self.chunk_overlap}")

        start_time = time.time()

        # Use RecursiveCharacterTextSplitter for intelligent chunking
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

        self.chunks = self.splitter.split_documents(self.documents)
        elapsed = time.time() - start_time

        print(f"\n✓ Documents split in {elapsed:.1f}s")
        print(f" - Total chunks: {len(self.chunks):,}")
        print(f" - Average chunk size: {sum(len(c.page_content) for c in self.chunks) / len(self.chunks):.0f} chars")
        print(f" - Chunking ratio: {len(self.chunks) / len(self.documents):.1f}x")

        return self.chunks

    def remove_duplicate_chunks(self, chunks, similarity_threshold=0.95):
        """Remove near-duplicate chunks from the same document"""
        print("\n" + "="*70)
        print("[DEDUPLICATION] Removing duplicate/near-duplicate chunks")
        print("="*70)

        # Group chunks by source file
        chunks_by_file = {}
        for chunk in chunks:
            source = chunk.metadata.get("source", "unknown")
            if source not in chunks_by_file:
                chunks_by_file[source] = []
            chunks_by_file[source].append(chunk)

        deduplicated_chunks = []
        duplicates_removed = 0

        # Within each file, check for duplicates
        for source, file_chunks in chunks_by_file.items():
            seen_texts = set()

            for chunk in file_chunks:
                text = chunk.page_content

                # Method 1: Exact duplicate - normalize whitespace
                text_normalized = " ".join(text.split())
                if text_normalized in seen_texts:
                    duplicates_removed += 1
                    continue

                # Method 2: Very similar text (85%+ character overlap)
                # Check against last 10 seen chunks for performance
                is_duplicate = False
                for seen_text in list(seen_texts)[-10:]:
                    # Calculate character-level similarity
                    common_chars = sum(1 for a, b in zip(text, seen_text) if a == b)
                    max_len = max(len(text), len(seen_text))
                    similarity = common_chars / max_len if max_len > 0 else 0

                    if similarity > 0.85:  # If 85%+ similar, it's a duplicate
                        is_duplicate = True
                        duplicates_removed += 1
                        break

                if not is_duplicate:
                    seen_texts.add(text_normalized)
                    deduplicated_chunks.append(chunk)

        print(f"✓ Duplicates removed: {duplicates_removed}")
        print(f"✓ Remaining chunks: {len(deduplicated_chunks)}")
        print(f"  Reduction: {(duplicates_removed / len(chunks) * 100):.1f}%")
        print("="*70 + "\n")

        return deduplicated_chunks

    def generate_embeddings(self) -> np.ndarray:
        """Step 3: Generate embeddings with S-BlueBERT on GPU"""
        print("\n" + "="*70)
        print("[3/4] GENERATING EMBEDDINGS WITH S-BLUEBERT (GPU)")
        print("="*70)
        print(f" Model: {self.model_name}")
        print(f" Device: {self.device.upper()}")
        print(f" Total chunks to embed: {len(self.chunks):,}")

        # Initialize HuggingFace embeddings
        print(f"\n 🔥 Loading model on {self.device.upper()}...")
        model_load_start = time.time()

        self.embeddings = HuggingFaceEmbeddings(
            model_name=self.model_name,
            model_kwargs={'device': self.device},
            encode_kwargs={
                'batch_size': 256,  # Large batch size for GPU
                'normalize_embeddings': True
            }
        )

        model_load_time = time.time() - model_load_start
        print(f" ✓ Model loaded in {model_load_time:.1f}s")

        # Extract text from chunks
        texts = [chunk.page_content for chunk in self.chunks]

        # Generate embeddings
        print(f"\n 🔥 Embedding {len(texts):,} chunks on GPU...")
        print(f" ⏳ Estimated time: 3-10 minutes (depending on data size)")

        embed_start = time.time()

        # embed_documents() handles progress bar internally
        self.vectors = self.embeddings.embed_documents(texts)
        self.vectors = np.array(self.vectors, dtype='float32')

        embed_time = time.time() - embed_start

        print(f"\n✓ Embedding complete!")
        print(f" - Time taken: {embed_time/60:.1f} minutes")
        print(f" - Speed: {len(texts)/embed_time:.0f} chunks/second")
        print(f" - Embeddings shape: {self.vectors.shape}")
        print(f" - Memory size: {self.vectors.nbytes / (1024**2):.1f} MB")

        return self.vectors

    def save_outputs(self, output_directory: str = "./pipeline_output") -> Dict:
        """Step 4: Save all outputs to Google Drive or local storage"""
        print("\n" + "="*70)
        print("[4/4] SAVING OUTPUTS")
        print("="*70)

        output_path = Path(output_directory)
        output_path.mkdir(parents=True, exist_ok=True)

        start_time = time.time()

        # Save embeddings as numpy array
        embeddings_path = output_path / "embeddings.npy"
        np.save(embeddings_path, self.vectors)
        print(f" ✓ Embeddings saved: {embeddings_path}")

        # Save chunks with metadata
        chunks_data = []
        for i, chunk in enumerate(self.chunks):
            chunks_data.append({
                "index": i,
                "text": chunk.page_content,
                "metadata": chunk.metadata
            })

        chunks_path = output_path / "chunks_metadata.json"
        with chunks_path.open("w", encoding="utf-8") as f:
            json.dump(chunks_data, f, indent=2, ensure_ascii=False)
        print(f" ✓ Chunks metadata saved: {chunks_path}")

        # Save summary statistics
        summary = {
            "timestamp": datetime.now().isoformat(),
            "total_json_files": self.loader.stats["total_files"],
            "files_successfully_loaded": self.loader.stats["successfully_loaded"],
            "total_sections_extracted": len(self.documents),
            "total_chunks": len(self.chunks),
            "embedding_dimension": int(self.vectors.shape[1]),
            "embedding_dtype": str(self.vectors.dtype),
            "embedding_size_mb": round(self.vectors.nbytes / (1024**2), 2),
            "model_name": self.model_name,
            "device": self.device.upper(),
            "chunking_config": {
                "chunk_size": self.chunk_size,
                "chunk_overlap": self.chunk_overlap
            },
            "gpu_info": {
                "type": "T4 (Google Colab)",
                "batch_size": 256
            }
        }

        summary_path = output_path / "pipeline_summary.json"
        with summary_path.open("w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)
        print(f" ✓ Summary saved: {summary_path}")

        # Create index file
        index_path = output_path / "index.txt"
        with index_path.open("w", encoding="utf-8") as f:
            f.write("MEDICAL DATA PIPELINE OUTPUT INDEX\n")
            f.write("="*50 + "\n\n")
            f.write(f"Generated: {datetime.now().isoformat()}\n\n")
            f.write("FILES:\n")
            f.write(f" - embeddings.npy: {self.vectors.shape[0]} chunks × {self.vectors.shape[1]} dimensions\n")
            f.write(f" - chunks_metadata.json: Full text and metadata for each chunk\n")
            f.write(f" - pipeline_summary.json: Detailed statistics\n\n")
            f.write("STATISTICS:\n")
            f.write(f" - JSON files processed: {summary['total_json_files']}\n")
            f.write(f" - Sections extracted: {summary['total_sections_extracted']}\n")
            f.write(f" - Chunks created: {summary['total_chunks']:,}\n")
            f.write(f" - Embedding size: {summary['embedding_size_mb']} MB\n")

        print(f" ✓ Index file created: {index_path}")

        elapsed = time.time() - start_time
        print(f"\n✓ All outputs saved in {elapsed:.1f}s")

        return summary

    def run(self, output_directory: str = "./pipeline_output") -> Dict:
        """Execute the complete pipeline"""
        self.pipeline_start_time = time.time()

        print("\n")
        print("╔" + "="*68 + "╗")
        print("║" + " "*15 + "🚀 MEDICAL DATA PROCESSING PIPELINE" + " "*18 + "║")
        print("║" + " "*20 + "LangChain + S-BlueBERT + GPU" + " "*20 + "║")
        print("║" + " "*25 + "Google Colab" + " "*31 + "║")
        print("╚" + "="*68 + "╝")

        # Execute all steps
        self.load_documents()
        self.split_documents()
        self.chunks = self.remove_duplicate_chunks(self.chunks, similarity_threshold=0.95)
        self.generate_embeddings()
        summary = self.save_outputs(output_directory)

        # Calculate total time
        total_time = time.time() - self.pipeline_start_time

        # Print final summary
        print("\n" + "="*70)
        print("✅ PIPELINE EXECUTION COMPLETE!")
        print("="*70)
        print(f"\n📊 FINAL STATISTICS:")
        print(f" ├─ JSON files processed: {summary['total_json_files']}")
        print(f" ├─ Sections extracted: {summary['total_sections_extracted']:,}")
        print(f" ├─ Chunks created: {summary['total_chunks']:,}")
        print(f" ├─ Embedding dimension: {summary['embedding_dimension']}")
        print(f" ├─ Model: {summary['model_name']}")
        print(f" ├─ Device: {summary['device']}")
        print(f" └─ Total time: {total_time/60:.1f} minutes")
        print(f"\n📁 Output location: {output_directory}/")
        print("="*70 + "\n")

        return summary


In [8]:
pipeline = MedicalDataPipeline(
    json_directory="./cleaned_data/cleaned_data",
    model_name="pritamdeka/S-Bluebert-snli-multinli-stsb",
    chunk_size=512,
    chunk_overlap=25,
    device="cuda"  # Colab T4 GPU
)

# Run the complete pipeline
summary = pipeline.run(output_directory="./pipeline_output")



║               🚀 MEDICAL DATA PROCESSING PIPELINE                  ║
║                    LangChain + S-BlueBERT + GPU                    ║
║                         Google Colab                               ║

[1/4] LOADING DOCUMENTS FROM MEDICAL JSONS
📁 JSON files found: 49


📊 JSON LOADING STATISTICS (FOCUSED FILTERING)
✓ Valid medical content sections: 1507
✗ Empty headings: 220
✗ Junk heading types: 107
✗ Duplicate title sections: 56 ← NEW!
✗ DOI/metadata only: 5
✗ Reference entries: 6
✗ Empty/short content: 343
⚠️  Full_text fields ignored: 48
✓ Total valid documents: 1507
✓ Files loaded: 44/49


✓ Documents loaded in 12.7s
 - Total sections: 1507
 - Files successfully loaded: 44/49
 - Failed files (5):
 • 241d01eb3a55b549889d39cf3d0e3a49cb4019c7a2e9f24587aa4e56488213ed_cleaned.json - No content/sections
 • 92c722c98c27b38c58e94dad15f2b25477c275e5ed15e0ba87684ed25a3faef2_cleaned.json - No content/sections
 • a8418d6c65f306e5d83ec5ec4ac69938a98e576737a7733867802e3097ff3eb7_cle

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

 ✓ Model loaded in 17.2s

 🔥 Embedding 12,788 chunks on GPU...
 ⏳ Estimated time: 3-10 minutes (depending on data size)

✓ Embedding complete!
 - Time taken: 0.9 minutes
 - Speed: 243 chunks/second
 - Embeddings shape: (12788, 768)
 - Memory size: 37.5 MB

[4/4] SAVING OUTPUTS
 ✓ Embeddings saved: pipeline_output/embeddings.npy
 ✓ Chunks metadata saved: pipeline_output/chunks_metadata.json
 ✓ Summary saved: pipeline_output/pipeline_summary.json
 ✓ Index file created: pipeline_output/index.txt

✓ All outputs saved in 0.6s

✅ PIPELINE EXECUTION COMPLETE!

📊 FINAL STATISTICS:
 ├─ JSON files processed: 49
 ├─ Sections extracted: 1,507
 ├─ Chunks created: 12,788
 ├─ Embedding dimension: 768
 ├─ Model: pritamdeka/S-Bluebert-snli-multinli-stsb
 ├─ Device: CUDA
 └─ Total time: 1.4 minutes

📁 Output location: ./pipeline_output/



In [9]:
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity

# Load embeddings and metadata
embeddings = np.load("./pipeline_output/embeddings.npy")
with open("./pipeline_output/chunks_metadata.json", "r", encoding="utf-8") as f:
    chunks_data = json.load(f)

print("="*80)
print("🔬 SEMANTIC SIMILARITY VERIFICATION - MEDICAL CONCEPT CLUSTERING")
print("="*80)
print("\nThis test verifies that semantically related medical concepts are close")
print("in the embedding space (high cosine similarity)\n")

def find_and_display_related_concepts(search_term, top_k=10):
    """
    Find chunks containing a medical concept and show most similar chunks
    """
    print("\n" + "="*80)
    print(f"SEARCHING FOR: '{search_term}'")
    print("="*80)

    # Find all chunks containing the search term
    matching_indices = []
    for idx, chunk in enumerate(chunks_data):
        if search_term.lower() in chunk['text'].lower():
            matching_indices.append(idx)

    if not matching_indices:
        print(f"❌ No chunks found containing '{search_term}'")
        return None

    print(f"\n✓ Found {len(matching_indices)} chunks containing '{search_term}'")
    print(f" Using first occurrence as query...\n")

    # Use first occurrence as the query
    query_idx = matching_indices[0]
    query_embedding = embeddings[query_idx]
    query_text = chunks_data[query_idx]['text']
    query_heading = chunks_data[query_idx]['metadata']['heading']
    query_source = chunks_data[query_idx]['metadata']['source']

    # Display query
    print(f"📄 QUERY CHUNK (Index {query_idx}):")
    print(f" File: {query_source}")
    print(f" Heading: {query_heading}")
    print(f" Text: {query_text[:150]}...")
    print()

    # Calculate similarities with ALL chunks
    similarities = cosine_similarity([query_embedding], embeddings)[0]

    # Get top K most similar (excluding the query itself)
    top_indices = np.argsort(similarities)[::-1][1:top_k+1]

    print(f"🔍 TOP {top_k} MOST SIMILAR CHUNKS:")
    print("-" * 80)

    for rank, idx in enumerate(top_indices, 1):
        sim_score = similarities[idx]
        chunk_text = chunks_data[idx]['text']
        chunk_heading = chunks_data[idx]['metadata']['heading']
        chunk_source = chunks_data[idx]['metadata']['source']

        # Semantic relevance rating
        if sim_score > 0.9:
            relevance = "🟢 EXCELLENT"
        elif sim_score > 0.8:
            relevance = "🟢 VERY GOOD"
        elif sim_score > 0.7:
            relevance = "🟡 GOOD"
        elif sim_score > 0.6:
            relevance = "🟡 MODERATE"
        else:
            relevance = "🔴 WEAK"

        print(f"\n{rank}. Similarity: {sim_score:.4f} | {relevance}")
        print(f" File: {chunk_source}")
        print(f" Heading: {chunk_heading}")
        print(f" Text: {chunk_text[:100]}...")

    return top_indices

# Test with medical terms
find_and_display_related_concepts("liver cirrhosis", top_k=8)
find_and_display_related_concepts("hepatic encephalopathy", top_k=8)
find_and_display_related_concepts("treatment", top_k=8)


🔬 SEMANTIC SIMILARITY VERIFICATION - MEDICAL CONCEPT CLUSTERING

This test verifies that semantically related medical concepts are close
in the embedding space (high cosine similarity)


SEARCHING FOR: 'liver cirrhosis'

✓ Found 108 chunks containing 'liver cirrhosis'
 Using first occurrence as query...

📄 QUERY CHUNK (Index 1580):
 File: 1a4a810ae5257a66c8afbc16350bf4e375ee4aab0ad1e97a0eeebcfd1bed4934_cleaned.json
 Heading: FUTURE RESEARCH
 Text: . Hirooka M, Ochi H, Koizumi Y, Kisaka Y, Abe M, Ikeda Y, et al. Splenic elasticity measured with real-time tissue elastography is a marker of portal ...

🔍 TOP 8 MOST SIMILAR CHUNKS:
--------------------------------------------------------------------------------

1. Similarity: 0.8358 | 🟢 VERY GOOD
 File: fdb941dbf57c3f0668d49c807fba8b278c8b7040b056921d9833666cdd8816a6_cleaned.json
 Heading: Conclusions:
 Text: . Takuma Y, Nouso K, Morimoto Y, Tomokuni J, Sahara A, Takabatake H, et al. Portal hypertension in p...

2. Similarity: 0.8199 | 🟢 

array([ 6491, 10727,  6948,  6672,  6664,  1278, 10778,   233])

In [10]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.1


In [11]:
import faiss
import numpy as np
from pathlib import Path

print("\n" + "="*70)
print("🔨 CREATING FAISS VECTOR DATABASE (CPU-Optimized for Colab)")
print("="*70)

# Verify embeddings are loaded
print(f"\n✓ Embeddings shape: {embeddings.shape}")
print(f"✓ Total vectors: {embeddings.shape[0]}")
print(f"✓ Dimension: {embeddings.shape[1]}")

# Make sure embeddings are float32
embeddings_final = embeddings.astype('float32')

# ============================================================
# Step 1: Create FAISS Index
# ============================================================

print("\n[1/3] Creating FAISS Index...")

# IndexFlatIP = Inner Product (perfect for normalized embeddings)
# This gives us cosine similarity because embeddings are normalized
index = faiss.IndexFlatIP(embeddings_final.shape[1])
print(f"✓ Index type: IndexFlatIP")
print(f"✓ Index dimension: {index.d}")

# ============================================================
# Step 2: Add Embeddings to Index
# ============================================================

print("\n[2/3] Adding embeddings to index...")
print(f"⏳ Processing {embeddings_final.shape[0]} vectors...")

# Add embeddings in batches for memory efficiency
batch_size = 10000
for i in range(0, len(embeddings_final), batch_size):
    batch = embeddings_final[i:i+batch_size]
    index.add(batch)
    end_idx = min(i+batch_size, len(embeddings_final))
    print(f"  ✓ Added {end_idx}/{len(embeddings_final)} vectors")

print(f"\n✓ All embeddings added to index")
print(f"✓ Total vectors in index: {index.ntotal}")

# ============================================================
# Step 3: Save FAISS Index
# ============================================================

print("\n[3/3] Saving FAISS index to disk...")

# Create vector_db directory
vector_db_dir = Path("./vector_db")
vector_db_dir.mkdir(exist_ok=True)

# Save the index
faiss_path = vector_db_dir / "faiss_index.bin"
faiss.write_index(index, str(faiss_path))

file_size_mb = faiss_path.stat().st_size / (1024**2)
print(f"✓ FAISS index saved")
print(f"✓ File path: {faiss_path}")
print(f"✓ File size: {file_size_mb:.2f} MB")

print("\n" + "="*70)
print("✅ FAISS VECTOR DATABASE CREATED SUCCESSFULLY!")
print("="*70 + "\n")



🔨 CREATING FAISS VECTOR DATABASE (CPU-Optimized for Colab)

✓ Embeddings shape: (12788, 768)
✓ Total vectors: 12788
✓ Dimension: 768

[1/3] Creating FAISS Index...
✓ Index type: IndexFlatIP
✓ Index dimension: 768

[2/3] Adding embeddings to index...
⏳ Processing 12788 vectors...
  ✓ Added 10000/12788 vectors
  ✓ Added 12788/12788 vectors

✓ All embeddings added to index
✓ Total vectors in index: 12788

[3/3] Saving FAISS index to disk...
✓ FAISS index saved
✓ File path: vector_db/faiss_index.bin
✓ File size: 37.46 MB

✅ FAISS VECTOR DATABASE CREATED SUCCESSFULLY!



In [12]:
import time

print("="*70)
print("✅ VERIFICATION - Testing FAISS Index")
print("="*70)

# Load index
print("\n[1/2] Loading FAISS index...")
index_test = faiss.read_index(str(vector_db_dir / "faiss_index.bin"))
print(f"✓ Index loaded successfully")
print(f"✓ Total vectors: {index_test.ntotal}")
print(f"✓ Dimension: {index_test.d}")

# Test search performance
print("\n[2/2] Testing search performance...")

# Create test query
test_query = np.random.random((1, embeddings.shape[1])).astype('float32')

# Measure search time
start = time.time()
distances, indices = index_test.search(test_query, k=5)
search_time = (time.time() - start) * 1000  # milliseconds

print(f"\n✓ Search Results:")
print(f"  Time: {search_time:.2f}ms")
print(f"  Top 5 indices: {indices[0]}")
print(f"  Top 5 similarities: {distances[0]}")

print("\n" + "="*70)
print("📊 FAISS DATABASE SUMMARY")
print("="*70)
print(f"✓ Total chunks indexed: {index_test.ntotal}")
print(f"✓ Embedding dimension: {index_test.d}")
print(f"✓ Index type: IndexFlatIP (cosine similarity)")
print(f"✓ Storage location: {vector_db_dir}")
print(f"✓ Environment: Google Colab (CPU)")
print(f"\nFiles created:")
print(f"  ✓ {vector_db_dir / 'faiss_index.bin'}")
print(f"  ✓ {vector_db_dir / 'chunks_metadata.json'}")
print(f"  ✓ {vector_db_dir / 'embeddings.npy'}")
print("="*70 + "\n")


✅ VERIFICATION - Testing FAISS Index

[1/2] Loading FAISS index...
✓ Index loaded successfully
✓ Total vectors: 12788
✓ Dimension: 768

[2/2] Testing search performance...

✓ Search Results:
  Time: 4.00ms
  Top 5 indices: [10434 10397  8183  4727   120]
  Top 5 similarities: [0.4333532  0.33961058 0.33720195 0.33073246 0.33002502]

📊 FAISS DATABASE SUMMARY
✓ Total chunks indexed: 12788
✓ Embedding dimension: 768
✓ Index type: IndexFlatIP (cosine similarity)
✓ Storage location: vector_db
✓ Environment: Google Colab (CPU)

Files created:
  ✓ vector_db/faiss_index.bin
  ✓ vector_db/chunks_metadata.json
  ✓ vector_db/embeddings.npy

