In [None]:
!pip install faiss-cpu sentence-transformers langchain_chroma langchain_community nest_asyncio
!pip install torch torchvision torchaudio
!pip install ragatouille colbert
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!pip install --upgrade chroma

In [None]:
import os
import json
import logging
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from tqdm import tqdm
from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse
import uvicorn

# Configure logging: with level=INFO the logger.debug(...) calls used in this
# cell are suppressed; only INFO and above are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize FastAPI app; the @app.get(...) decorators below register routes on it.
app = FastAPI()

# Model configuration
# The embedding model is constructed here, at cell-execution time, and the same
# instance is passed to every vector store built by get_text_collection().
MODEL_NAME = "BAAI/bge-m3"
embedding_function = HuggingFaceBgeEmbeddings(model_name=MODEL_NAME)

# Chroma persistent directory (a relative path, so it resolves against the
# kernel's current working directory)
PERSIST_DIR = "./chroma_db"

def get_text_collection(user_id: str):
    """
    Build the Chroma vector store that holds the text documents of one user.

    The collection is named "<user_id>_text_collection", embeds with the
    module-level ``embedding_function`` and persists under ``PERSIST_DIR``.
    """
    store_options = {
        "collection_name": "{}_text_collection".format(user_id),
        "embedding_function": embedding_function,
        "persist_directory": PERSIST_DIR,
    }
    return Chroma(**store_options)
def embed_data():
    """
    Embeds all data from 'publications.json' into ChromaDB before the server starts.

    The file is read as a JSON array of objects with optional "Title", "URL"
    and "Full Abstract" keys. Each usable record becomes one ``Document`` whose
    content is the title followed by the abstract (or a placeholder sentence
    when the abstract is empty) and whose id is ``doc_<index>``.

    Records that are not JSON objects, or that have neither a title nor an
    abstract, are skipped with a warning. Any failure is logged together with
    its traceback and swallowed; nothing is raised to the caller.
    """
    try:
        # Load dataset. JSON is UTF-8 by specification, so do not depend on the
        # platform's default text encoding.
        with open("publications.json", "r", encoding="utf-8") as file:
            data = json.load(file)

        # Initialize Chroma collection
        user_id = "default_user"
        logger.info(f"Initializing ChromaDB for user_id: {user_id}")
        text_store = get_text_collection(user_id)
        logger.info("ChromaDB vector store initialized successfully.")

        # Prepare and embed documents
        documents = []
        ids = []

        logger.info(f"Embedding {len(data)} documents from the JSON file...")

        for idx, item in enumerate(data):
            if not isinstance(item, dict):
                logger.warning(f"Skipping document {idx}: insufficient content.")
                continue

            # Extract fields. A key that is present but null is treated like a
            # missing key; previously a single null value raised on .strip()
            # and aborted the whole embedding run.
            title = item.get("Title")
            if title is None:
                title = f"Untitled Document {idx}"
            title = str(title)

            url = item.get("URL")
            if url is None:
                url = "No URL"
            url = str(url)

            full_abstract = str(item.get("Full Abstract") or "").strip()

            # Skip documents with insufficient data
            if not full_abstract and not title.strip():
                logger.warning(f"Skipping document {idx}: insufficient content.")
                continue

            # Combine title and abstract
            document_content = f"{title}\n{full_abstract or 'No additional content available.'}"

            # Create Document object
            document = Document(
                page_content=document_content,
                metadata={"title": title, "url": url},
            )
            documents.append(document)
            ids.append(f"doc_{idx}")

        # Nothing usable in the file: do not hand an empty batch to the store.
        if not documents:
            logger.warning("No documents to embed; vector store left unchanged.")
            return

        # Store documents in the vector store
        logger.info("Storing documents in ChromaDB vector store...")
        text_store.add_documents(documents=documents, ids=ids)
        logger.info(f"Successfully embedded {len(documents)} documents.")
    except Exception as e:
        # logger.exception logs at ERROR level *with* the traceback; the former
        # DEBUG-level traceback was invisible at the configured INFO level.
        logger.exception(f"Error embedding documents: {e}")
@app.get("/")
async def read_root():
    """
    Landing endpoint: returns a fixed welcome message, useful as a quick
    check that the API is reachable.
    """
    welcome_text = "Welcome to the ChromaDB Semantic Search API"
    return dict(message=welcome_text)

@app.get("/search")
async def search(
    query: str,
    k: int = Query(default=5, ge=1, description="Number of results to fetch"),
):
    """
    Performs a similarity search on the indexed data.

    Args:
        query: Free-text search query (required query-string parameter).
        k: Maximum number of results to return. Must be at least 1; smaller
            values are rejected by request validation instead of surfacing
            as a generic 500 from the vector store.

    Returns:
        ``{"results": [...]}`` where every entry carries ``title``, ``url``
        and ``content``. If the search itself fails, a ``JSONResponse`` with
        status 500 and a generic message is returned and the error is logged
        with its traceback.
    """
    try:
        user_id = "default_user"  # Use the same user_id as in embed_data
        logger.info(f"Initializing ChromaDB for user_id: {user_id}")
        text_store = get_text_collection(user_id)
        logger.info("ChromaDB vector store initialized successfully.")

        # Perform similarity search
        # NOTE(review): this call is synchronous inside an async endpoint, so it
        # presumably blocks the event loop while embedding/searching - confirm
        # whether that matters for the expected load.
        logger.info(f"Received search query: {query}")
        results = text_store.similarity_search(query, k=k)

        # Format and log results
        formatted_results = []
        for position, result in enumerate(results, start=1):
            title = result.metadata.get("title", "Unknown Title")
            url = result.metadata.get("url", "No URL")
            content = result.page_content
            logger.debug(f"Result {position}: Title: {title}, URL: {url}, Content: {content[:100]}...")
            formatted_results.append({"title": title, "url": url, "content": content})

        return {"results": formatted_results}
    except Exception as e:
        # Keep the client-facing message generic, but record the traceback.
        logger.exception(f"Error during search: {e}")
        return JSONResponse(status_code=500, content={"message": "An error occurred during search."})

if __name__ == "__main__":
    # uvicorn.run() drives the server through asyncio.run(), which cannot be
    # nested inside the event loop a notebook kernel is already running.
    # Patch asyncio first, as the later cells of this notebook do.
    import nest_asyncio

    nest_asyncio.apply()

    # Embed data before starting the server
    embed_data()
    # Run FastAPI app (this call blocks until the server is stopped)
    uvicorn.run(app, host="127.0.0.1", port=8000)



In [16]:
import os
import json
import logging
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse
import uvicorn
import nest_asyncio

# Configure logging: with level=INFO the logger.debug(...) calls in embed_data()
# below are suppressed; only INFO and above are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Patch asyncio so that asyncio.run() can be used while the notebook's own event
# loop is running (uvicorn.run() goes through asyncio.run()).
nest_asyncio.apply()

# Initialize FastAPI app
# NOTE(review): no routes are registered on this instance within this cell.
app = FastAPI()

# Model configuration
# The embedding model is constructed here, at cell-execution time.
MODEL_NAME = "BAAI/bge-m3"
embedding_function = HuggingFaceBgeEmbeddings(model_name=MODEL_NAME)

# Chroma persistent directory (a relative path, so it resolves against the
# kernel's current working directory)
# NOTE(review): embed_data() below calls get_text_collection(), which is not
# defined in this cell; it must already exist in the kernel from another cell.
PERSIST_DIR = "./chroma_db"
def embed_data():
    """
    Embeds all data from 'publications.json' into ChromaDB before the server starts.

    The file is read as a JSON array of objects with optional "Title", "URL"
    and "Full Abstract" keys. Each record becomes one ``Document`` with id
    ``doc_<index>``. When the abstract is empty or contains the text
    "No abstract available", the title and URL are embedded instead of the
    title and abstract.

    A missing file is reported and the function returns early. Records that
    are not JSON objects are skipped with a warning. Any other failure is
    logged together with its traceback and swallowed; nothing is raised.
    """
    try:
        if not os.path.exists("publications.json"):
            logger.error("Error: publications.json not found.")
            return

        # Load dataset. JSON is UTF-8 by specification, so do not depend on the
        # platform's default text encoding.
        with open("publications.json", "r", encoding="utf-8") as file:
            data = json.load(file)

        # Initialize Chroma collection
        user_id = "default_user"
        logger.info(f"Initializing ChromaDB for user_id: {user_id}")
        text_store = get_text_collection(user_id)
        logger.info("ChromaDB vector store initialized successfully.")

        # Prepare and embed documents
        documents = []
        ids = []

        logger.info(f"Embedding {len(data)} documents from the JSON file...")

        for idx, item in enumerate(data):
            if not isinstance(item, dict):
                logger.warning(f"Skipping document {idx}: record is not a JSON object.")
                continue

            # Extract fields. A key that is present but null is treated like a
            # missing key; previously a single null value raised on .strip()
            # and aborted the whole embedding run.
            title = item.get("Title")
            if title is None:
                title = f"Untitled Document {idx}"
            title = str(title)

            url = item.get("URL")
            if url is None:
                url = "No URL"
            url = str(url)

            full_abstract = str(item.get("Full Abstract") or "").strip()

            # Log the embedding content for debugging
            logger.debug(f"Embedding document {idx + 1}:")
            logger.debug(f"  Title: {title}")
            logger.debug(f"  URL: {url}")
            logger.debug(f"  Abstract: {full_abstract[:100]}...")

            # Handle missing abstracts
            if not full_abstract or "No abstract available" in full_abstract:
                logger.warning(f"Document {idx} has no abstract. Embedding title and URL only.")
                document_content = f"{title}\n{url}"
            else:
                document_content = f"{title}\n{full_abstract}"

            # Create Document object
            document = Document(
                page_content=document_content,
                metadata={"title": title, "url": url},
            )
            documents.append(document)
            ids.append(f"doc_{idx}")

        # Nothing usable in the file: do not hand an empty batch to the store.
        if not documents:
            logger.warning("No documents to embed; vector store left unchanged.")
            return

        # Store documents in the vector store
        logger.info("Storing documents in ChromaDB vector store...")
        text_store.add_documents(documents=documents, ids=ids)
        logger.info(f"Successfully embedded {len(documents)} documents.")
    except Exception as e:
        # logger.exception logs at ERROR level *with* the traceback; the former
        # DEBUG-level traceback was invisible at the configured INFO level.
        logger.exception(f"Error embedding documents: {e}")


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-m3
ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-32' coro=<Server.serve() done, defined at /Users/paniz/ReSearch/.venv/lib/python3.12/site-packages/uvicorn/server.py:67> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/Users/paniz/ReSearch/.venv/lib/python3.12/site-packages/uvicorn/main.py", line 579, in run
    server.run()
  File "/Users/paniz/ReSearch/.venv/lib/python3.12/site-packages/uvicorn/server.py", line 65, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/paniz/ReSearch/.venv/lib/python3.12/site-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/paniz/ReSearch/.venv/lib/python3.12/site-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_onc

In [15]:
def clear_system_cache() -> None:
    """
    Clears the Chroma cache directory (~/.cache/chroma) for the current user.

    This is a plain module-level function: the former ``@staticmethod``
    decorator only has meaning inside a class body and made the name
    uncallable on Python versions before 3.10.

    The directory is removed with ``shutil.rmtree`` rather than a shelled-out
    ``rm -rf``, so a failure actually raises and is logged instead of being
    reported as success. A missing directory is not an error. Nothing is
    raised to the caller.
    """
    import shutil  # local import: only this helper needs it

    cache_dir = os.path.expanduser("~/.cache/chroma")
    try:
        shutil.rmtree(cache_dir)
        logger.info("System cache cleared successfully.")
    except FileNotFoundError:
        # Same end state `rm -rf` gives for a path that does not exist.
        logger.info(f"No system cache found at {cache_dir}; nothing to clear.")
    except OSError as e:
        logger.exception(f"Error clearing system cache: {e}")



import os
import json
import logging
from langchain_core.documents import Document
# clear_system_cache() method is not available in the current version of Chroma
from langchain_chroma import Chroma

# Clean cache: run the clear_system_cache() helper defined at the top of this cell.
clear_system_cache()

INFO:__main__:System cache cleared successfully.


In [None]:
import os
import json
import logging
from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import uvicorn
import nest_asyncio

# Configure logging: only INFO and above are emitted at this level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Patch asyncio so that asyncio.run() can be used while the notebook's own event
# loop is running (uvicorn.run() goes through asyncio.run()).
nest_asyncio.apply()

# Initialize FastAPI app
# This is a fresh instance: only routes registered in this cell exist on it,
# which is why "/" and "/search" answer 404 in the server log below.
app = FastAPI()

# Model configuration
# The embedding model is constructed here, at cell-execution time, and the same
# instance is passed to every vector store built by get_text_collection().
MODEL_NAME = "BAAI/bge-m3"
embedding_function = HuggingFaceBgeEmbeddings(model_name=MODEL_NAME)

# Chroma persistent directory (a relative path, so it resolves against the
# kernel's current working directory)
PERSIST_DIR = "./chroma_db"

def get_text_collection(user_id: str):
    """
    Return the Chroma vector store holding the text documents of one user.

    The collection is named "<user_id>_text_collection", embeds with the
    module-level ``embedding_function`` and persists under ``PERSIST_DIR``.
    """
    store_options = {
        "collection_name": "{}_text_collection".format(user_id),
        "embedding_function": embedding_function,
        "persist_directory": PERSIST_DIR,
    }
    return Chroma(**store_options)

@app.get("/search_with_scores")
async def search_with_scores(
    query: str = Query(..., description="Search query text"),
    k: int = Query(5, ge=1, description="Number of results to fetch")
):
    """
    Perform a similarity search for a given query and return results with similarity scores.
    Add suggestions for documents missing abstracts.

    Args:
        query: Free-text search query (required).
        k: Maximum number of hits to fetch. Must be at least 1; smaller values
            are rejected by request validation instead of surfacing as a
            generic 500 from the vector store.

    Returns:
        ``{"results": [...]}`` with ``title``, ``url``, ``content_snippet``
        (first 100 characters) and ``similarity_score`` per hit. Hits whose
        content carries a missing-abstract placeholder are listed under
        ``"suggestions"`` instead (without a snippet); that key is only present
        when there is at least one such hit. When nothing matches, the response
        is ``{"results": [], "message": "No matching results found."}``.
        On failure a 500 ``JSONResponse`` with a generic message is returned.

    NOTE(review): ``similarity_score`` is the raw value returned by
    ``similarity_search_with_score``; for Chroma this is presumably a distance
    (lower means closer) - confirm before presenting it as a similarity.
    """
    # Placeholder texts that mark a document as having no real abstract.
    # The last one is the filler that embed_data() in this notebook writes for
    # records with an empty abstract.
    missing_abstract_markers = (
        "Abstract not found",
        "failed to load",
        "No additional content available.",
    )

    try:
        # Initialize Chroma collection
        user_id = "default_user"
        text_store = get_text_collection(user_id)

        # Perform similarity search with scores
        results = text_store.similarity_search_with_score(query, k=k)

        if not results:
            return {"results": [], "message": "No matching results found."}

        # Separate results with and without abstracts
        results_with_abstract = []
        suggestions = []

        for doc, score in results:
            content = doc.page_content
            entry = {
                "title": doc.metadata.get("title", "Unknown Title"),
                "url": doc.metadata.get("url", "No URL"),
            }
            if any(marker in content for marker in missing_abstract_markers):
                target = suggestions
            else:
                entry["content_snippet"] = content[:100]  # Limit to 100 characters for snippet
                target = results_with_abstract
            # Coerce to a built-in float so the value is always JSON-serializable.
            entry["similarity_score"] = float(score)
            target.append(entry)

        response = {"results": results_with_abstract}

        # Add suggestions if available
        if suggestions:
            response["suggestions"] = suggestions

        return response

    except Exception as e:
        # Keep the client-facing message generic, but record the traceback.
        logger.exception(f"Error during similarity search: {e}")
        return JSONResponse(status_code=500, content={"message": "An error occurred during search."})

if __name__ == "__main__":
    # Ensure the persistent directory exists before any request touches the store
    os.makedirs(PERSIST_DIR, exist_ok=True)
    
    # Run the FastAPI app on localhost:8000. This call blocks the cell until the
    # server is stopped; nest_asyncio.apply() earlier in this cell is what lets
    # it run inside the notebook's event loop.
    uvicorn.run(app, host="127.0.0.1", port=8000)


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-m3
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:     Started server process [57326]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:52685 - "GET / HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:52690 - "GET /search?query=machine%20learning%20security&k=3 HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:52690 - "GET /docs HTTP/1.1" 200 OK
INFO:     127.0.0.1:52690 - "GET /openapi.json HTTP/1.1" 200 OK
INFO:     127.0.0.1:52692 - "GET /search_with_scores?query=name&k=5 HTTP/1.1" 200 OK
INFO:     127.0.0.1:52697 - "GET /search_with_scores?query=machine&k=5 HTTP/1.1" 200 OK
INFO:     127.0.0.1:52809 - "GET /search_with_scores?query=code&k=5 HTTP/1.1" 200 OK
