In [1]:
from sentence_transformers import SentenceTransformer
import openai
import os
import chromadb
from chromadb.config import Settings



In [3]:
def initialize_chromadb(persist_directory="./chroma_db", collection_name="policies"):
    """
    Initialize ChromaDB client and create/get collection.
    
    Args:
        persist_directory: Local directory to persist the database
        collection_name: Name of the collection
    
    Returns:
        ChromaDB collection object
    """
    # Create persist directory if it doesn't exist
    os.makedirs(persist_directory, exist_ok=True)
    
    # Initialize ChromaDB client with persistence
    client = chromadb.PersistentClient(path=persist_directory)
    
    print(f"‚úì ChromaDB initialized at: {persist_directory}")
    
    # Get or create collection
    try:
        # Try to get existing collection
        collection = client.get_collection(name=collection_name)
        print(f"‚úì Retrieved existing collection: {collection_name}")
        print(f"  Current documents: {collection.count()}")
    except:
        # Create new collection if doesn't exist
        collection = client.create_collection(
            name=collection_name,
            metadata={"description": "UF Policy documents with embeddings"}
        )
        print(f"‚úì Created new collection: {collection_name}")
    
    return collection

In [4]:
def load_embedding_model(model_name="BAAI/bge-small-en-v1.5"):
    """
    Load BGE embedding model from HuggingFace.
    
    Available BGE models:
    - BAAI/bge-small-en-v1.5 (fastest, 384 dimensions)
    - BAAI/bge-base-en-v1.5 (balanced, 768 dimensions)
    - BAAI/bge-large-en-v1.5 (best quality, 1024 dimensions)
    
    Args:
        model_name: HuggingFace model identifier
    
    Returns:
        SentenceTransformer model
    """
    print(f"Loading embedding model: {model_name}")
    model = SentenceTransformer(model_name)
    print(f"‚úì Model loaded successfully!")
    print(f"  Embedding dimension: {model.get_sentence_embedding_dimension()}")
    return model

In [5]:
def query_chromadb(collection, model, query_text, n_results=5, filter_by_type=None):
    """
    Query ChromaDB with a text query.
    
    Args:
        collection: ChromaDB collection
        model: SentenceTransformer model for query embedding
        query_text: Query string
        n_results: Number of results to return
        filter_by_type: Filter results by policy type (optional)
    
    Returns:
        Query results
    """
    # Create query embedding
    query_embedding = model.encode([query_text])[0].tolist()
    
    # Prepare filter
    where_filter = None
    if filter_by_type:
        where_filter = {"type": {"$eq": filter_by_type}}
    
    # Query ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        where=where_filter
    )
    
    return results

In [6]:
def main_pipeline(
                  persist_directory="./chroma_db",
                  collection_name="policies",
                  model_name="BAAI/bge-small-en-v1.5",
                  include_types=None,
                  ):
    """
    Main pipeline to load data, create embeddings, and store in ChromaDB.
    
    Args:
        excel_file: Path to Excel file
        persist_directory: ChromaDB storage directory
        collection_name: Name of ChromaDB collection
        model_name: BGE model name
        include_types: List of types to include (e.g., ['Policy', 'Regulation'])
        overwrite: If True, delete existing collection and create new one
    
    Returns:
        collection: ChromaDB collection object
        model: Embedding model
    """

    
    # Step 2: Load embedding model
    print(f"\nStep 2: Loading embedding model...")
    model = load_embedding_model(model_name)
    
    # Step 3: Initialize ChromaDB
    print(f"\nStep 3: Initializing ChromaDB...")
    
    collection = initialize_chromadb(persist_directory, collection_name)
    

    
    print("\n" + "="*80)
    print("‚úì PIPELINE COMPLETE!")
    print(f"‚úì Total documents in collection: {collection.count()}")
    print("="*80)
    
    return collection, model

In [7]:
collection, model = main_pipeline(
                persist_directory="./chroma_db",
                collection_name="policies",
    model_name="BAAI/bge-base-en-v1.5"  # Better quality
)


Step 2: Loading embedding model...
Loading embedding model: BAAI/bge-base-en-v1.5
‚úì Model loaded successfully!
  Embedding dimension: 768

Step 3: Initializing ChromaDB...
‚úì ChromaDB initialized at: ./chroma_db
‚úì Retrieved existing collection: policies
  Current documents: 1887

‚úì PIPELINE COMPLETE!
‚úì Total documents in collection: 1887


{'ids': [['Working_Safely_and_Maintaining_Workplace_Health_Standards_3_1520',
   'Furlough_Policy_(includes_UFF_Faculty_MOU)_2_1853',
   'Paid_Family_Leave_7_1595']],
 'embeddings': None,
 'documents': [['. Employees should not return to work until symptoms have improved. Employees must follow all department reporting procedures for absences, including accurately reporting their work time and absence from the office in myUFL. During the absence, employees may work from home, if approved, and shall use accrued sick leave or leave without pay if unable to work. In situations in which the reason for the absence is due to an employee‚Äôs serious health condition, the absence may be covered by the Family Medical Leave Act (FMLA) . Employees must practice and maintain personal hygiene to help ensure a healthy workplace. Strategies to accomplish this include: Washing hands to aid in keeping employees healthy and preventing the spread of illness to co-workers. All employees should adhere to th

In [9]:
def initialize_openai_client(api_key: str, base_url: str = 'https://api.ai.it.ufl.edu'):
    """
    Initialize OpenAI client with custom base URL.
    
    Args:
        api_key: Your OpenAI API key
        base_url: Custom base URL for API (default: UF API endpoint)
    
    Returns:
        OpenAI client object
    """
    try:
        client = openai.OpenAI(
            api_key=api_key,
            base_url=base_url
        )
        
        print('‚úì OpenAI client created successfully')
        print(f'  Base URL: {base_url}')
        print(f'  Client type: {type(client)}')
        
        # Sanity checks
        has_chat = hasattr(client, 'chat')
        print(f'  Has chat attribute: {has_chat}')
        
        if has_chat:
            has_completions = hasattr(client.chat, 'completions')
            print(f'  Has completions attribute: {has_completions}')
        
        return client
    except Exception as e:
        print(f'‚ùå Error initializing OpenAI client: {e}')
        return None

In [10]:
from typing import Dict, List, Any
def prepare_context_from_results(query_results: Dict[str, Any]) -> tuple:
    """
    Prepare context and metadata from ChromaDB query results.
    
    Args:
        query_results: Dictionary from ChromaDB query (with ids, documents, metadatas, distances)
    
    Returns:
        Tuple of (context_text, source_documents)
    """
    # Extract documents and metadata
    documents = query_results['documents'][0]
    metadatas = query_results['metadatas'][0]
    distances = query_results['distances'][0]
    
    # Build context text
    context_parts = []
    source_documents = []
    
    for i, (doc, metadata, distance) in enumerate(zip(documents, metadatas, distances), 1):
        # Add document to context with reference number
        context_parts.append(f"[Document {i}]")
        context_parts.append(f"Title: {metadata['title']}")
        context_parts.append(f"Type: {metadata['type']}")
        context_parts.append(f"Category: {metadata['category']}")
        context_parts.append(f"Content: {doc}")
        context_parts.append("")  # Empty line between documents
        
        # Store source information
        source_documents.append({
            'title': metadata['title'],
            'type': metadata['type'],
            'category': metadata['category'],
            'distance': distance,
            'link': metadata.get('link', 'N/A')
        })
    
    context_text = "\n".join(context_parts)
    
    return context_text, source_documents

In [45]:
def query_openai_with_context(
    client,
    question: str,
    query_results: Dict[str, Any],
    model: str = "gpt-oss-120b",
    temperature: float = 0.1,
    max_tokens: int = 1000
) -> Dict[str, Any]:
    """
    Query OpenAI with context from ChromaDB results.
    
    Args:
        client: OpenAI client object
        question: User's question
        query_results: Results from ChromaDB query
        model: Model name (e.g., 'gpt-4', 'gpt-3.5-turbo')
        temperature: Sampling temperature (0-2, lower is more focused)
        max_tokens: Maximum tokens in response
    
    Returns:
        Dictionary with answer and source documents
    """
    try:
        # Prepare context from query results
        context_text, source_documents = prepare_context_from_results(query_results)
        print(context_text,source_documents)
        
        # Create system prompt
        system_prompt = """You are a helpful assistant that answers questions based ONLY on the provided context documents.

IMPORTANT RULES:
1. Answer ONLY using information from the provided documents
2. Do NOT use any external knowledge or information not present in the documents
3. If the answer is not in the provided documents, say "I cannot find this information in the provided documents"
4. Cite which document(s) you used by referring to [Document N] format
"""

        # Create user prompt with context
        user_prompt = f"""Context Documents:
{context_text}

Question: {question}

Please answer the question using ONLY the information from the context documents above. Do not use any external knowledge."""

        # Call OpenAI API
        print(f"\nü§ñ Querying {model}...")
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )
        
        # Extract answer
        answer = response.choices[0].message.content
        
        # Prepare result
        result = {
            'question': question,
            'answer': answer,
            'source_documents': source_documents,
            'model': model,
            'temperature': temperature,
            'tokens_used': {
                'prompt': response.usage.prompt_tokens,
                'completion': response.usage.completion_tokens,
                'total': response.usage.total_tokens
            }
        }
        
        print("‚úì Response received successfully")
        
        return result
        
    except Exception as e:
        print(f"‚ùå Error querying OpenAI: {e}")
        return {
            'question': question,
            'answer': f"Error: {str(e)}",
            'source_documents': [],
            'model': model
        }

In [48]:
question_text = "uf teaching assistant pay sat and sunday?"

In [49]:
client = initialize_openai_client(
    api_key="sk-pgNP-BIHOtI8RPt-aC3Stg",
    base_url='https://api.ai.it.ufl.edu'
)
query_result= query_chromadb(
    collection=collection,
    model=model,
    query_text=question_text,
    n_results=3,
    filter_by_type="Policy"  # Only search Policy type
)



result = query_openai_with_context(
    client=client,
    question=question_text,
    query_results=query_result,
    model="gpt-oss-120b",
    temperature=0.1,
    max_tokens=1000
)

‚úì OpenAI client created successfully
  Base URL: https://api.ai.it.ufl.edu
  Client type: <class 'openai.OpenAI'>
  Has chat attribute: True
  Has completions attribute: True
[Document 1]
Title: On-Call Pay and Callbacks
Type: Policy
Category: Human Resources
Content: . An employee who is on-call on Saturday, Sunday, and/or a university holiday or official closing may be paid at the rate of one-third (1/3) of the UF Hourly minimum wage for TEAMS employees per hour for each hour they are required to be available. Callback pay for non-exempt employees is mandatory and is not awarded on a discretionary basis. Callback pay occurs when an employee is ‚Äúcalled back‚Äù to perform work beyond the employee‚Äôs scheduled hours of work for that day, a minimum payment may be required. Non-exempt TEAMS employees who are called back to campus or other appropriate worksites will be credited with either two hours or the actual time worked plus traveling time to and from the employee‚Äôs home‚Äîwhic

In [50]:
result

{'question': 'uf teaching assistant pay sat and sunday?',
 'answer': 'Teaching assistants who are on‚Äëcall on Saturday, Sunday (or a university holiday/official closing) are paid at a rate of **one‚Äëthird (1/3) of the UF hourly minimum wage for TEAMS employees for each hour they must be available**„ÄêDocument\u202f1„Äë.',
 'source_documents': [{'title': 'On-Call Pay and Callbacks',
   'type': 'Policy',
   'category': 'Human Resources',
   'distance': 0.5877289772033691,
   'link': 'N/A'},
  {'title': 'Premium Pay for Holidays and University Emergencies',
   'type': 'Policy',
   'category': 'Human Resources',
   'distance': 0.6286293268203735,
   'link': 'N/A'},
  {'title': 'Non-Student Hourly OPS Employment',
   'type': 'Policy',
   'category': 'Human Resources',
   'distance': 0.6647508144378662,
   'link': 'N/A'}],
 'model': 'gpt-oss-120b',
 'temperature': 0.1,
 'tokens_used': {'prompt': 835, 'completion': 192, 'total': 1027}}