In [None]:
pip install faiss-cpu sentence-transformers langchain_chroma langchain_community nest_asyncio && pip install torch torchvision torchaudio && pip install ragatouille colbert && pip install ipywidgets && jupyter nbextension enable --py widgetsnbextension && pip install --upgrade chroma

In [None]:
import os
import json
import logging
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from tqdm import tqdm
from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse
import uvicorn

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize FastAPI app
app = FastAPI()

# Model configuration
MODEL_NAME = "BAAI/bge-m3"
embedding_function = HuggingFaceBgeEmbeddings(model_name=MODEL_NAME)

# Chroma persistent directory
PERSIST_DIR = "./chroma_db"

def get_text_collection(user_id: str):
    """
    Initializes a Chroma vector store for textual data using the provided user ID to create a unique collection name.
    """
    collection_name = f"{user_id}_text_collection"
    return Chroma(
        collection_name=collection_name,
        embedding_function=embedding_function,
         persist_directory=PERSIST_DIR,
    )
def embed_data():
    """
    Embeds all data from 'publications.json' into ChromaDB before the server starts.
    """
    try:
        # Load dataset
        with open("publications.json", "r") as file:
            data = json.load(file)

        # Initialize Chroma collection
        user_id = "default_user"
        logger.info(f"Initializing ChromaDB for user_id: {user_id}")
        text_store = get_text_collection(user_id)
        logger.info("ChromaDB vector store initialized successfully.")

        # Prepare and embed documents
        documents = []
        ids = []

        logger.info(f"Embedding {len(data)} documents from the JSON file...")

        for idx, item in enumerate(data):
            # Extract fields
            title = item.get("Title", f"Untitled Document {idx}")
            url = item.get("URL", "No URL")
            full_abstract = item.get("Full Abstract", "").strip()

            # Skip documents with insufficient data
            if not full_abstract and not title.strip():
                logger.warning(f"Skipping document {idx}: insufficient content.")
                continue

            # Combine title and abstract
            document_content = f"{title}\n{full_abstract or 'No additional content available.'}"

            # Create Document object
            document = Document(
                page_content=document_content,
                metadata={"title": title, "url": url},
            )
            documents.append(document)
            ids.append(f"doc_{idx}")

        # Store documents in the vector store
        logger.info("Storing documents in ChromaDB vector store...")
        text_store.add_documents(documents=documents, ids=ids)
        logger.info(f"Successfully embedded {len(documents)} documents.")
    except Exception as e:
        logger.error(f"Error embedding documents: {e}")
        logger.debug("Traceback:", exc_info=True)

@app.get("/")
async def read_root():
    """
    Root endpoint for testing.
    """
    return {"message": "Welcome to the ChromaDB Semantic Search API"}

@app.get("/search")
async def search(query: str, k: int = Query(default=5, description="Number of results to fetch")):
    """
    Performs a similarity search on the indexed data.
    """
    try:
        user_id = "default_user"  # Use the same user_id as in embed_data
        logger.info(f"Initializing ChromaDB for user_id: {user_id}")
        text_store = get_text_collection(user_id)
        logger.info("ChromaDB vector store initialized successfully.")

        # Perform similarity search
        logger.info(f"Received search query: {query}")
        results = text_store.similarity_search(query, k=k)

        # Format and log results
        formatted_results = []
        for idx, result in enumerate(results):
            title = result.metadata.get("title", "Unknown Title")
            url = result.metadata.get("url", "No URL")
            content = result.page_content
            logger.debug(f"Result {idx + 1}: Title: {title}, URL: {url}, Content: {content[:100]}...")
            formatted_results.append({"title": title, "url": url, "content": content})

        return {"results": formatted_results}
    except Exception as e:
        logger.error(f"Error during search: {e}")
        return JSONResponse(status_code=500, content={"message": "An error occurred during search."})

if __name__ == "__main__":
    # Embed data before starting the server
    embed_data()
    # Run FastAPI app
    uvicorn.run(app, host="127.0.0.1", port=8000)



In [43]:
MODEL_NAME = "all-MiniLM-L6-v2"
embedding_function = HuggingFaceBgeEmbeddings(model_name=MODEL_NAME)

# Chroma persistent directory
PERSIST_DIR = "./chroma_db_3"
import torch
torch.set_default_device("cpu")

def get_text_collection(user_id: str):
    """
    Initializes a Chroma vector store for textual data using the provided user ID to create a unique collection name.
    """
    collection_name = f"{user_id}_text_collection"
    return Chroma(
        collection_name=collection_name,
        embedding_function=embedding_function,
         persist_directory=PERSIST_DIR,
    )

def embed_data(batch_size=100):
    """
    Embeds all data from 'abstracts.json' into ChromaDB in smaller batches to avoid memory issues.
    """
    try:
        # Check if the JSON file exists
        file_path = "/Users/paniz/ReSearch/data/abstracts.json"
        if not os.path.exists(file_path):
            logger.error("Error: abstracts.json not found.")
            return

        # Load dataset
        with open(file_path, "r") as file:
            data = json.load(file)

        # Initialize Chroma collection
        user_id = "default_user"
        logger.info(f"Initializing ChromaDB for user_id: {user_id}")
        text_store = get_text_collection(user_id)
        logger.info("ChromaDB vector store initialized successfully.")

        # Split data into batches
        total_docs = len(data)
        logger.info(f"Embedding {total_docs} documents in batches of {batch_size}...")

        for i in range(0, total_docs, batch_size):
            batch = data[i:i + batch_size]
            documents = []
            ids = []

            for idx, item in enumerate(batch):
                # Extract fields
                title = item.get("Title", f"Untitled Document {i + idx}")
                url = item.get("URL", "No URL")
                full_abstract = item.get("Full Abstract", "").strip()

                # Skip invalid abstracts
                if not full_abstract or "no abstract available" in full_abstract.lower():
                    logger.warning(f"Skipping document {i + idx + 1}: Missing or invalid abstract.")
                    continue

                document_content = full_abstract
                document = Document(
                    page_content=document_content,
                    metadata={"title": title, "url": url}
                )
                documents.append(document)
                ids.append(f"doc_{i + idx}")

            # Store batch in ChromaDB
            if documents:
                logger.info(f"Storing batch {i // batch_size + 1} with {len(documents)} documents...")
                text_store.add_documents(documents=documents, ids=ids)

        logger.info("All batches embedded successfully.")
    except Exception as e:
        logger.error(f"Error embedding documents: {e}")
        logger.debug("Traceback:", exc_info=True)


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps


In [44]:
# verify the embedding

embed_data()

INFO:__main__:Initializing ChromaDB for user_id: default_user
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


INFO:__main__:ChromaDB vector store initialized successfully.
INFO:__main__:Embedding 2832 documents in batches of 100...
INFO:__main__:Storing batch 1 with 72 documents...
INFO:__main__:Storing batch 2 with 71 documents...
INFO:__main__:Storing batch 3 with 62 documents...
INFO:__main__:Storing batch 4 with 78 documents...
INFO:__main__:Storing batch 5 with 52 documents...
INFO:__main__:Storing batch 6 with 89 documents...
INFO:__main__:Storing batch 7 with 81 documents...
INFO:__main__:Storing batch 8 with 62 documents...
INFO:__main__:Storing batch 9 with 78 documents...
INFO:__main__:Storing batch 10 with 64 documents...
INFO:__main__:Storing batch 11 with 77 documents...
INFO:__main__:Storing batch 12 with 89 documents...
INFO:__main__:Storing batch 13 with 94 documents...
INFO:__main__:Storing batch 14 with 93 documents...
INFO:__main__:Storing batch 15 with 86 documents...
INFO:__main__:Storing batch 16 with 90 documents...
INFO:__main__:Storing batch 17 with 89 documents...
INF

In [37]:
@staticmethod
def clear_system_cache() -> None:
    """
    Clears the system cache for the current user.
    """
    try:
        os.system("rm -rf ~/.cache/chroma")
        logger.info("System cache cleared successfully.")
    except Exception as e:
        logger.error(f"Error clearing system cache: {e}")
        logger.debug("Traceback:", exc_info=True)



import os
import json
import logging
from langchain_core.documents import Document
# clear_system_cache() method is not available in the current version of Chroma
from langchain_chroma import Chroma

# clean cache
clear_system_cache()

INFO:__main__:System cache cleared successfully.


In [None]:
import os
import json
import logging
from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import uvicorn
import nest_asyncio

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
nest_asyncio.apply()

# Initialize FastAPI app
app = FastAPI()

# Model configuration
MODEL_NAME = "BAAI/bge-m3"
embedding_function = HuggingFaceBgeEmbeddings(model_name=MODEL_NAME)

# Chroma persistent directory
PERSIST_DIR = "./chroma_db_3"

def get_text_collection(user_id: str):
    """
    Initializes or retrieves a Chroma vector store for textual data using the provided user ID.
    """
    collection_name = f"{user_id}_text_collection"
    return Chroma(
        collection_name=collection_name,
        embedding_function=embedding_function,
        persist_directory=PERSIST_DIR,
    )

@app.get("/search_with_scores")
async def search_with_scores(
    query: str = Query(..., description="Search query text"),
    k: int = Query(5, description="Number of results to fetch")
):
    """
    Perform a similarity search for a given query and return results with similarity scores.
    Add suggestions for documents missing abstracts.
    """
    try:
        # Initialize Chroma collection
        user_id = "default_user"
        text_store = get_text_collection(user_id)

        # Perform similarity search with scores
        results = text_store.similarity_search_with_score(query, k=k)

        if not results:
            return {"results": [], "message": "No matching results found."}

        # Separate results with and without abstracts
        results_with_abstract = []
        suggestions = []

        for doc, score in results:
            if "Abstract not found" in doc.page_content or "failed to load" in doc.page_content:
                suggestions.append({
                    "title": doc.metadata.get("title", "Unknown Title"),
                    "url": doc.metadata.get("url", "No URL"),
                    "similarity_score": score
                })
            else:
                results_with_abstract.append({
                    "title": doc.metadata.get("title", "Unknown Title"),
                    "url": doc.metadata.get("url", "No URL"),
                    "content_snippet": doc.page_content[:100],  # Limit to 100 characters for snippet
                    "similarity_score": score
                })

        response = {"results": results_with_abstract}

        # Add suggestions if available
        if suggestions:
            response["suggestions"] = suggestions

        return response

    except Exception as e:
        logger.error(f"Error during similarity search: {e}")
        return JSONResponse(status_code=500, content={"message": "An error occurred during search."})

if __name__ == "__main__":
    # Ensure the persistent directory exists
    os.makedirs(PERSIST_DIR, exist_ok=True)
    
    # Run the FastAPI app
    uvicorn.run(app, host="127.0.0.1", port=8000)
