In [None]:
!pip install "protobuf<5" chromadb sentence-transformers PyPDF2




In [None]:
# Mount Google Drive at /content/drive; the PDF input folder and the export
# destination configured further down both live under this mount point.
#(**Naveen Kumar Reddy Singam**)
from google.colab import drive
drive.mount('/content/drive')

import os
import uuid
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import chromadb
import shutil

# Define paths
# pdf_folder_path: Drive folder that is scanned for PDF files.
# chroma_db_path:  local directory handed to the persistent ChromaDB client.
# export_path:     Drive directory the local database directory is copied to at the end.
pdf_folder_path = "/content/drive/MyDrive/AAAI Papers"  # Update this path
chroma_db_path = "/content/chroma_db"
export_path = "/content/drive/MyDrive/chroma_db"

# Initialize ChromaDB with persistence
client = chromadb.PersistentClient(path=chroma_db_path)

# Load domain-specific embedding model
# NOTE(review): the trailing comment's claim about the model's training domain
# comes from the original author — confirm against the model card.
model = SentenceTransformer("allenai-specter")  # Fine-tuned for academic texts

# Collection settings
collection_name = "research_papers"
# ChromaDB selects the index distance function from the "hnsw:space" metadata
# key; other keys (e.g. "distance_metric") are stored but have no effect on it.
collection_metadata = {"hnsw:space": "cosine"}

# Generate a unique identifier for this run
run_id = str(uuid.uuid4())[:8]  # Short unique identifier

# Optional: Clear existing data in the collection
clear_collection = True
if clear_collection:
    # A `where` filter matches metadata fields by name, so {"*": "*"} matches
    # no entries. Dropping the whole collection is the reliable way to empty
    # it, and it also lets the distance metadata below take effect on re-creation.
    # get_or_create first so delete_collection cannot fail on a fresh database.
    client.get_or_create_collection(name=collection_name)
    client.delete_collection(name=collection_name)
    print("Cleared existing collection data.")

# Create ChromaDB collection (or reuse the existing one when not clearing).
# NOTE(review): when an existing collection is reused, it presumably keeps the
# distance function it was created with — confirm against the ChromaDB docs.
collection = client.get_or_create_collection(
    name=collection_name,
    metadata=collection_metadata,
)

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF, passed straight to ``PdfReader``.

    Returns:
        str: The page texts joined in page order with no separator. Pages
        that yield no text contribute an empty string.
    """
    reader = PdfReader(pdf_path)
    # `or ""` guards against a page whose extract_text() yields None (or any
    # other falsy value), which would otherwise break string concatenation.
    # join() also avoids rebuilding the growing string on every page.
    return "".join(page.extract_text() or "" for page in reader.pages)

# Function to chunk text with overlap
def chunk_text_with_overlap(text, chunk_size, overlap):
    """Split text into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        list[str]: Chunks in order of appearance. Empty for empty ``text``;
        the final chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``overlap >= chunk_size``. The window would then never
            advance and chunking could not terminate.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Chunking configuration
# Both values are character counts: chunks are produced by slicing the
# extracted text string, not by tokenizing it.
chunk_size = 500  # Characters per chunk
overlap_size = 100  # Characters repeated between consecutive chunks for context

# Process PDFs and add to ChromaDB
# Sorted so document_id values are reproducible across runs (os.listdir order
# is arbitrary); the extension check is case-insensitive so ".PDF" is included.
pdf_files = sorted(
    f for f in os.listdir(pdf_folder_path) if f.lower().endswith(".pdf")
)

# Number of chunks embedded and written to ChromaDB per call. Batching avoids
# one model call and one database write per chunk.
add_batch_size = 256

for doc_id, pdf_file in enumerate(pdf_files):
    pdf_path = os.path.join(pdf_folder_path, pdf_file)
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text_with_overlap(text, chunk_size, overlap_size)

    # A PDF with no extractable text yields no chunks, so this loop is skipped
    # and nothing is added for that file.
    for batch_start in range(0, len(chunks), add_batch_size):
        batch_chunks = chunks[batch_start:batch_start + add_batch_size]
        # chunk_id stays the chunk's position within its document.
        chunk_ids = range(batch_start, batch_start + len(batch_chunks))
        embeddings = model.encode(batch_chunks).tolist()
        collection.add(
            # Unique ID with run identifier
            ids=[f"{run_id}_{doc_id}_{chunk_id}" for chunk_id in chunk_ids],
            documents=batch_chunks,
            metadatas=[
                {
                    "document_id": doc_id,
                    "chunk_id": chunk_id,
                    "file_name": pdf_file,
                }
                for chunk_id in chunk_ids
            ],
            embeddings=embeddings,
        )

# Report how many entries the collection holds after ingestion
stored_count = collection.count()
print(f"Number of documents in collection: {stored_count}")

# Copy the local ChromaDB directory to Google Drive, merging into the
# destination directory if it already exists
if not os.path.exists(chroma_db_path):
    print("ChromaDB directory not found!")
else:
    shutil.copytree(chroma_db_path, export_path, dirs_exist_ok=True)
    print(f"ChromaDB exported to {export_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/331 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/462k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Cleared existing collection data.
Number of documents in collection: 8244
ChromaDB exported to /content/drive/MyDrive/chroma_db
