In [None]:
# Unzipping processed files
!unzip /content/chunked_data.zip

Archive:  /content/chunked_data.zip
   creating: content/chunked_data/
  inflating: content/chunked_data/The-Alchemist_chunks.json  
  inflating: content/chunked_data/Ocean_ecogeochemistry_A_review_chunks.json  
  inflating: content/chunked_data/Stats_chunks.json  
  inflating: content/chunked_data/new-approaches-and-procedures-for-cancer-treatment_chunks.json  
  inflating: content/chunked_data/all_chunks.json  
  inflating: content/chunked_data/The_Plan_of_the_Giza_Pyramids_chunks.json  
  inflating: content/chunked_data/M.Sc. Applied Psychology_chunks.json  
  inflating: content/chunked_data/Dataset summaries and citations_chunks.json  


In [None]:
!mkdir chunked_data
!mv /content/content/chunked_data/* /content/chunked_data
!rm -rf /content/content/

In [None]:
!pip install -qU transformers torch sentence-transformers chromadb bitsandbytes tqdm

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m86.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m113.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m52.8 MB/s[0m eta [36m0:

In [None]:
import os
import json
import uuid
import torch
import chromadb
from typing import List, Dict, Any, Optional, Union
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

In [None]:
# Select the compute device once: CUDA when a GPU is visible, otherwise CPU.
# torch is already imported in the notebook's imports cell, so it is not re-imported here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
class VectorDatabaseBuilder:
    """Create a vector database from document chunks using the Nomic embedding model.

    Chunks are read from a JSON file, embedded with a SentenceTransformer model and
    written to a persistent ChromaDB collection configured for cosine distance.
    Building is idempotent: every chunk gets a stable id and is written with
    ``upsert``, so re-running the build does not duplicate entries.
    """

    def __init__(
        self,
        chunks_dir: str = "chunked_data",
        db_dir: str = "vector_db",
        collection_name: str = "DR_X_Publications",
        embedding_model: str = "nomic-ai/nomic-embed-text-v1.5",
        device: Optional[str] = None,
        document_prefix: str = ""
    ):
        """Open (or create) the collection and load the embedding model.

        Args:
            chunks_dir: Directory containing the chunk JSON files.
            db_dir: Directory of the persistent ChromaDB store (created if missing).
            collection_name: Name of the collection to create or reuse.
            embedding_model: Hugging Face id of the SentenceTransformer model.
            device: Device to load the model on (e.g. "cuda", "cpu"). When None,
                CUDA is used if available, otherwise CPU.
            document_prefix: Text prepended to every chunk before it is embedded;
                the stored document text is never modified. The default "" embeds
                the raw text. The Nomic model card describes task prefixes such as
                "search_document: " -- if one is used here, queries must be embedded
                with the matching query prefix.
        """
        self.chunks_dir = chunks_dir
        self.db_dir = db_dir
        self.collection_name = collection_name
        self.document_prefix = document_prefix

        # Resolve the device locally instead of relying on a notebook-level global.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = str(device)

        # Ensure directories exist
        os.makedirs(db_dir, exist_ok=True)

        # Initialize ChromaDB client
        self.client = chromadb.PersistentClient(path=db_dir)

        # Create or get collection
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}  # Use cosine similarity
        )

        # Loading the Nomic embedding model.
        # NOTE(review): trust_remote_code=True executes model code downloaded from the
        # Hub; consider pinning a revision so that code cannot change between runs.
        print(f"Loading embedding model: {embedding_model}")
        self.embedding_model = SentenceTransformer(
            embedding_model, device=self.device, trust_remote_code=True
        )

    def load_chunks(self, filename: str = "all_chunks.json") -> List[Dict[str, Any]]:
        """Load document chunks from a JSON file.

        Args:
            filename: Name of the JSON file inside ``chunks_dir``.

        Returns:
            The list of chunk dictionaries, or an empty list when the file cannot be
            read, is not valid JSON, or does not contain a top-level list.
        """
        filepath = os.path.join(self.chunks_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                chunks = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading chunks from {filepath}: {str(e)}")
            return []

        if not isinstance(chunks, list):
            # Anything but a list would fail later when the chunks are sliced into batches.
            print(
                f"Error loading chunks from {filepath}: "
                f"expected a JSON list, got {type(chunks).__name__}"
            )
            return []

        print(f"Loaded {len(chunks)} chunks from {filepath}")
        return chunks

    def generate_embeddings(
        self, texts: List[str], show_progress_bar: bool = True
    ) -> List[List[float]]:
        """Generate embeddings for a list of texts using the Nomic embedding model.

        Args:
            texts: Texts to embed.
            show_progress_bar: Forwarded to the model's ``encode`` call.

        Returns:
            One embedding (list of floats) per input text; ``[]`` for empty input.

        Raises:
            Exception: Any error raised by the embedding model is logged and re-raised.
        """
        if not texts:
            return []

        try:
            # Generate embeddings
            embeddings = self.embedding_model.encode(
                texts, show_progress_bar=show_progress_bar
            )
            return embeddings.tolist()
        except Exception as e:
            print(f"Error generating embeddings: {str(e)}")
            raise

    @staticmethod
    def _scalar(value: Any, default: Any) -> Any:
        """Coerce a metadata value to a scalar (str/int/float/bool), using default for None."""
        if value is None:
            return default
        if isinstance(value, (str, int, float, bool)):
            return value
        return str(value)

    @staticmethod
    def _chunk_id(chunk: Dict[str, Any], text: str) -> str:
        """Return the chunk's own id as a string, or a stable id derived from its content."""
        explicit = chunk.get("id")
        if explicit is not None and str(explicit) != "":
            return str(explicit)
        # uuid5 is deterministic, so the same chunk maps to the same id on every run.
        key = f"{chunk.get('source', '')}|{chunk.get('chunk_number', '')}|{text}"
        return str(uuid.uuid5(uuid.NAMESPACE_OID, key))

    @classmethod
    def _chunk_metadata(cls, chunk: Dict[str, Any]) -> Dict[str, Any]:
        """Build the flat, scalar-only metadata dict stored next to a chunk."""
        pages = chunk.get("pages")
        if pages is None:
            pages = []
        elif isinstance(pages, (str, int, float)):
            pages = [pages]  # tolerate a single page given as a bare value

        # Text is excluded here to avoid duplicating the stored document.
        metadata = {
            "source": cls._scalar(chunk.get("source"), ""),
            "page_number": ", ".join(map(str, pages)),
            "chunk_number": cls._scalar(chunk.get("chunk_number"), 0),
            "token_count": cls._scalar(chunk.get("token_count"), 0)
        }

        # Add element types and tables if available
        element_types = chunk.get("element_types")
        if element_types is not None:
            if isinstance(element_types, str):
                element_types = [element_types]  # avoid joining a string char by char
            metadata["element_types"] = ", ".join(map(str, element_types))

        if chunk.get("tables") is not None:
            metadata["tables"] = str(chunk["tables"])

        return metadata

    def add_chunks_to_db(self, chunks: List[Dict[str, Any]], batch_size: int = 100) -> None:
        """Add chunks to the vector db with their embeddings.

        Chunks without non-blank string ``text`` and chunks whose id was already seen
        in this call are skipped (and counted in a log line). Remaining chunks are
        embedded and upserted batch by batch.

        Args:
            chunks: Chunk dictionaries; ``text`` is required, ``id``, ``source``,
                ``pages``, ``chunk_number``, ``token_count``, ``element_types`` and
                ``tables`` are optional.
            batch_size: Number of chunks embedded and written per batch.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # Validate and normalise every chunk up front so batches only hold writable rows.
        records = []
        seen_ids = set()
        skipped = 0
        for chunk in chunks:
            text = chunk.get("text") if isinstance(chunk, dict) else None
            if not isinstance(text, str) or not text.strip():
                skipped += 1
                continue
            chunk_id = self._chunk_id(chunk, text)
            if chunk_id in seen_ids:
                # Ids must be unique within a single write.
                skipped += 1
                continue
            seen_ids.add(chunk_id)
            records.append((chunk_id, text, self._chunk_metadata(chunk)))

        if skipped:
            print(f"Skipped {skipped} chunks with missing text or duplicate ids.")

        total_chunks = len(records)
        print(f"Adding {total_chunks} chunks to vector database...")

        # Process in batches to avoid memory issues
        for start in tqdm(range(0, total_chunks, batch_size), desc="Adding chunks to DB"):
            batch = records[start:start + batch_size]
            batch_ids = [record[0] for record in batch]
            batch_texts = [record[1] for record in batch]
            batch_metadata = [record[2] for record in batch]

            # Only the embedding input carries the prefix; stored documents stay raw.
            embedding_inputs = [self.document_prefix + text for text in batch_texts]

            # The outer tqdm bar already reports progress, so silence the inner one.
            batch_embeddings = self.generate_embeddings(
                embedding_inputs, show_progress_bar=False
            )

            # Upsert so that re-running the build updates rows instead of duplicating them.
            self.collection.upsert(
                ids=batch_ids,
                embeddings=batch_embeddings,
                documents=batch_texts,
                metadatas=batch_metadata
            )

    def build_vector_database(self) -> None:
        """Build the complete vector database from chunks.

        Loads the default chunk file, writes its chunks to the collection and prints
        the resulting entry count. Aborts with a message when no chunks were loaded.
        """
        # Load all chunks
        chunks = self.load_chunks()
        if not chunks:
            print("No chunks found. Vector database creation aborted.")
            return

        # Add chunks to the database
        self.add_chunks_to_db(chunks)

        # Count items in the collection
        count = self.collection.count()
        print(f"Vector database built successfully with {count} entries.")

    def get_collection_info(self) -> Dict[str, Any]:
        """Get information about the vector database collection.

        Returns:
            A dict with the collection ``name``, its entry ``count`` and the
            ``location`` (directory) of the persistent store.
        """
        return {
            "name": self.collection_name,
            "count": self.collection.count(),
            "location": self.db_dir
        }

In [None]:
# Gather the builder settings in one place, then construct the builder and
# populate the collection from the chunk files.
print("Building vector database...")
builder_settings = dict(
    chunks_dir="/content/chunked_data",
    db_dir="vector_db",
    collection_name="DR_X_Publications",
    embedding_model="nomic-ai/nomic-embed-text-v1.5",
)
db_builder = VectorDatabaseBuilder(**builder_settings)
db_builder.build_vector_database()

Building vector database...
Loading embedding model: nomic-ai/nomic-embed-text-v1.5


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/103k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Loaded 599 chunks from /content/chunked_data/all_chunks.json
Adding 599 chunks to vector database...


Adding chunks to DB:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Adding chunks to DB:  17%|█▋        | 1/6 [00:03<00:18,  3.78s/it]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Adding chunks to DB:  33%|███▎      | 2/6 [00:07<00:15,  3.84s/it]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Adding chunks to DB:  50%|█████     | 3/6 [00:11<00:11,  3.74s/it]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Adding chunks to DB:  67%|██████▋   | 4/6 [00:13<00:06,  3.30s/it]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Adding chunks to DB:  83%|████████▎ | 5/6 [00:17<00:03,  3.24s/it]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Adding chunks to DB: 100%|██████████| 6/6 [00:34<00:00,  5.78s/it]

Vector database built successfully with 599 entries.





In [None]:
# Show a summary of the collection (name, entry count, on-disk location);
# the bare last expression is rendered as the cell's output.
print("Collection Info...", end="\n\n")

db_builder.get_collection_info()

Collection Info...



{'name': 'DR_X_Publications', 'count': 599, 'location': 'vector_db'}

In [None]:
!zip vector_db.zip -r vector_db

  adding: vector_db/ (stored 0%)
  adding: vector_db/f103d939-d98e-43eb-aa43-400e520ee1e5/ (stored 0%)
  adding: vector_db/f103d939-d98e-43eb-aa43-400e520ee1e5/length.bin (deflated 13%)
  adding: vector_db/f103d939-d98e-43eb-aa43-400e520ee1e5/header.bin (deflated 61%)
  adding: vector_db/f103d939-d98e-43eb-aa43-400e520ee1e5/link_lists.bin (stored 0%)
  adding: vector_db/f103d939-d98e-43eb-aa43-400e520ee1e5/data_level0.bin (deflated 100%)
  adding: vector_db/chroma.sqlite3 (deflated 47%)
