In [1]:
from sentence_transformers import SentenceTransformer
import openai
import os
import chromadb
from chromadb.config import Settings



In [2]:
def initialize_chromadb(persist_directory="./chroma_db", collection_name="policies"):
    """
    Open a persistent ChromaDB store and return one of its collections.

    The persist directory is created when missing. An existing collection with
    the given name is reused; otherwise a new one is created.

    Args:
        persist_directory: Local directory in which the database is persisted
        collection_name: Name of the collection to fetch or create

    Returns:
        ChromaDB collection object
    """
    # Create persist directory if it doesn't exist
    os.makedirs(persist_directory, exist_ok=True)

    # Initialize ChromaDB client with persistence
    client = chromadb.PersistentClient(path=persist_directory)

    print(f"‚úì ChromaDB initialized at: {persist_directory}")

    try:
        collection = client.get_collection(name=collection_name)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit are never mistaken for
        # "collection does not exist".
        collection = client.create_collection(
            name=collection_name,
            metadata={"description": "UF Policy documents with embeddings"}
        )
        print(f"‚úì Created new collection: {collection_name}")
    else:
        # Reported outside the try block: a failure while counting must not
        # trigger an attempt to re-create a collection that already exists.
        print(f"‚úì Retrieved existing collection: {collection_name}")
        print(f"  Current documents: {collection.count()}")

    return collection

In [None]:
def load_embedding_model(model_name="BAAI/bge-small-en-v1.5"):
    """
    Instantiate a SentenceTransformer embedding model and report its size.

    Args:
        model_name: Model identifier handed to SentenceTransformer
            (defaults to "BAAI/bge-small-en-v1.5")

    Returns:
        The loaded SentenceTransformer model
    """
    print(f"Loading embedding model: {model_name}")
    embedder = SentenceTransformer(model_name)

    # Announce success first, then query the model for its vector width.
    print("‚úì Model loaded successfully!")
    dimension = embedder.get_sentence_embedding_dimension()
    print(f"  Embedding dimension: {dimension}")

    return embedder

In [4]:
def query_chromadb(collection, model, query_text, n_results=5, filter_by_type=None):
    """
    Embed a text query and run a similarity search against a collection.

    Args:
        collection: ChromaDB collection to search
        model: Embedding model; its encode() is called with a one-item list
        query_text: Query string
        n_results: Number of results to request
        filter_by_type: When truthy, restrict matches to metadata "type" equal
            to this value; otherwise no metadata filter is applied

    Returns:
        Whatever collection.query() returns
    """
    # encode() receives a batch of one; keep the single vector as a plain list.
    encoded_batch = model.encode([query_text])
    query_vector = encoded_batch[0].tolist()

    where_clause = {"type": {"$eq": filter_by_type}} if filter_by_type else None

    return collection.query(
        query_embeddings=[query_vector],
        n_results=n_results,
        where=where_clause,
    )

In [None]:
def main_pipeline(
                  persist_directory="./chroma_db",
                  collection_name="policies",
                  model_name="BAAI/bge-small-en-v1.5",
                  include_types=None,
                  ):
    """
    Load the embedding model and open the ChromaDB collection.

    Args:
        persist_directory: ChromaDB storage directory
        collection_name: Name of ChromaDB collection
        model_name: Embedding model name passed to load_embedding_model
        include_types: Accepted for compatibility; not used by this function

    Returns:
        Tuple of (collection, model):
            collection: ChromaDB collection object
            model: Embedding model
    """
    # Step 2: Load embedding model
    print(f"\nStep 2: Loading embedding model...")
    embedding_model = load_embedding_model(model_name)

    # Step 3: Initialize ChromaDB
    print(f"\nStep 3: Initializing ChromaDB...")
    collection = initialize_chromadb(persist_directory, collection_name)

    print("\n" + "="*80)
    print("‚úì PIPELINE COMPLETE!")
    print(f"‚úì Total documents in collection: {collection.count()}")
    print("="*80)

    # Return both objects: callers unpack `collection, model = main_pipeline(...)`
    # and need the same embedding model later to encode queries.
    return collection, embedding_model

In [None]:
# Open the persisted "policies" collection and load the bge-base embedding model.
collection, model = main_pipeline(
    persist_directory="./chroma_db",
    collection_name="policies",
    model_name="BAAI/bge-base-en-v1.5",
)

In [None]:
def initialize_openai_client(api_key: str, base_url: str = 'https://api.ai.it.ufl.edu'):
    """
    Build an OpenAI client that talks to a custom endpoint.

    Prints a short report about the created client. Any exception raised while
    building or inspecting it is printed and turned into a None return value.

    Args:
        api_key: API key passed to the client
        base_url: Base URL of the API (default: UF API endpoint)

    Returns:
        OpenAI client object, or None if initialization failed
    """
    try:
        client = openai.OpenAI(api_key=api_key, base_url=base_url)

        print('‚úì OpenAI client created successfully')
        print(f'  Base URL: {base_url}')
        print(f'  Client type: {type(client)}')

        # Sanity checks: report which parts of the chat API the object exposes.
        chat_present = hasattr(client, 'chat')
        print(f'  Has chat attribute: {chat_present}')
        if not chat_present:
            return client

        completions_present = hasattr(client.chat, 'completions')
        print(f'  Has completions attribute: {completions_present}')
        return client
    except Exception as err:
        print(f' Error initializing OpenAI client: {err}')
        return None

In [1]:
from typing import Dict, List, Any
def prepare_context_from_results(query_results: Dict[str, Any]) -> tuple:
    """
    Turn ChromaDB query results into an LLM context string plus source records.

    Only the first query's results are used (index 0 of each result list).
    Metadata fields that are absent fall back to 'N/A', so one incomplete
    record does not abort context construction for the whole query.

    Args:
        query_results: Dictionary from a ChromaDB query; must contain
            'documents', 'metadatas' and 'distances'

    Returns:
        Tuple of (context_text, source_documents) where context_text is the
        newline-joined, numbered document listing and source_documents is a
        list of dicts with keys 'title', 'type', 'category', 'distance', 'link'
    """
    # Extract documents and metadata for the first (only) query
    documents = query_results['documents'][0]
    metadatas = query_results['metadatas'][0]
    distances = query_results['distances'][0]

    context_parts = []
    source_documents = []

    for i, (doc, metadata, distance) in enumerate(zip(documents, metadatas, distances), 1):
        # A record may carry no metadata at all; treat that like an empty dict.
        metadata = metadata or {}
        title = metadata.get('title', 'N/A')
        doc_type = metadata.get('type', 'N/A')
        category = metadata.get('category', 'N/A')

        # Add document to context with reference number
        context_parts.extend([
            f"[Document {i}]",
            f"Title: {title}",
            f"Type: {doc_type}",
            f"Category: {category}",
            f"Content: {doc}",
            "",  # Empty line between documents
        ])

        # Store source information
        source_documents.append({
            'title': title,
            'type': doc_type,
            'category': category,
            'distance': distance,
            'link': metadata.get('link', 'N/A')
        })

    context_text = "\n".join(context_parts)

    return context_text, source_documents

In [None]:
def query_openai_with_context(
    client,
    question: str,
    query_results: Dict[str, Any],
    model: str = "gpt-oss-120b",
    temperature: float = 0.1,
    max_tokens: int = 1000
) -> Dict[str, Any]:
    """
    Ask a chat model to answer a question using retrieved documents as context.

    Args:
        client: OpenAI client object (must expose chat.completions.create)
        question: User's question
        query_results: Results from ChromaDB query
        model: Model name sent to the API (default: 'gpt-oss-120b')
        temperature: Sampling temperature passed to the API
        max_tokens: Maximum tokens in response

    Returns:
        Dictionary with keys 'question', 'answer', 'source_documents', 'model',
        'temperature' and 'tokens_used'. On failure no exception is raised:
        'answer' holds "Error: <message>", 'source_documents' is empty and the
        token counts are None.
    """
    try:
        # Prepare context from query results
        context_text, source_documents = prepare_context_from_results(query_results)
        # NOTE(review): debug dump of the full retrieved context; noisy in batch runs.
        print(context_text,source_documents)

        # Create system prompt
        system_prompt = """You are a helpful assistant that answers questions based ONLY on the provided context documents.

IMPORTANT RULES:
1. Answer ONLY using information from the provided documents and also if you see a content in an docs so try to create answer from the provided doc
2. Do NOT use any external knowledge or information not present in the documents
3. Try to create a answer from the given content its not necessary for exact answers, you can use your logical reasoning here
4. Give answers ina n normal format do not use any stars or any extra special symbols in an answer

"""
        # Disabled rule kept for reference:
        # 3. If the answer is not in the provided documents, say "I cannot find this information in the provided documents"

        # Create user prompt with context
        user_prompt = f"""Context Documents:
{context_text}

Question: {question}

Please answer the question using ONLY the information from the context documents above. Do not use any external knowledge."""

        # Call OpenAI API
        print(f"\nü§ñ Querying {model}...")
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )

        # Extract answer
        answer = response.choices[0].message.content

        # Usage statistics may be absent on the response. Read them defensively
        # so that a missing usage block does not discard a valid answer.
        usage = getattr(response, 'usage', None)

        # Prepare result
        result = {
            'question': question,
            'answer': answer,
            'source_documents': source_documents,
            'model': model,
            'temperature': temperature,
            'tokens_used': {
                'prompt': getattr(usage, 'prompt_tokens', None),
                'completion': getattr(usage, 'completion_tokens', None),
                'total': getattr(usage, 'total_tokens', None)
            }
        }

        print("‚úì Response received successfully")

        return result

    except Exception as e:
        print(f"Error querying OpenAI: {e}")
        # Same keys as the success result so callers can index either one.
        return {
            'question': question,
            'answer': f"Error: {str(e)}",
            'source_documents': [],
            'model': model,
            'temperature': temperature,
            'tokens_used': {
                'prompt': None,
                'completion': None,
                'total': None
            }
        }

In [40]:
# Sample question passed to both the retrieval query and the LLM call.
question_text = "rules for disabled parking?"

In [None]:

# Read the API key from the environment so the secret never lives in the notebook.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
client = initialize_openai_client(
    api_key=os.environ["UF_AI_API_KEY"],
    base_url='https://api.ai.it.ufl.edu'
)

# query_chromadb names its embedding-model parameter `model`; passing it as
# `embedding_model=` raises TypeError.
query_result = query_chromadb(
    collection=collection,
    model=model,
    query_text=question_text,
    n_results=3
)

result = query_openai_with_context(
    client=client,
    question=question_text,
    query_results=query_result,
    model="gpt-oss-120b",
    temperature=0.1,
    max_tokens=1000
)

‚úì OpenAI client created successfully
  Base URL: https://api.ai.it.ufl.edu
  Client type: <class 'openai.OpenAI'>
  Has chat attribute: True
  Has completions attribute: True
[Document 1]
Title: Parking
Type: Regulation
Category: Business Affairs
Content: . (h) Unauthorized parking in Reserved Spaces or Restricted Areas is prohibited. (i) A Vehicle parked overtime at any time limited parking space (meters, time restricted loading zones and Service Drive Areas, etc.) may receive a citation at the time the violation is identified and may receive another citation in the same day if the Vehicle remains in the same space more than two (2) hours from the time of issuance of the first citation. (j) Vehicles may park according to Permit type in the appropriate lots and spaces as identified on the TAPS parking map and parking lot signage. (k) All Vehicle operators using a parking space controlled by a meter must pay to occupy the space in accordance with the instructions on the meter. (l) Onl

In [45]:
# Display the answer text stored in the result of the preceding RAG call.
result['answer']

'- Only authorized vehicles may park in disabled spaces.  \n- Authorization is provided by a State‑issued “Disabled Persons Parking Permit” or a disabled‑person license plate.  \n- Students and employees must purchase a campus permit in addition to the state‑issued permit or plate before parking on campus.  \n- Visitors who have a State‑issued disabled permit or plate may use the designated disabled spaces (and non‑reserved decal‑restricted spaces) on campus.  \n- Parking in a disabled space without the proper authorization is prohibited and can result in citations and possible impoundment of the vehicle.'

In [None]:
## Get answers of the questions from rag system for evaluation

import pandas as pd
from tqdm import tqdm  

def process_questions_from_excel(
    excel_file,
    collection,
    embedding_model,
    client,
    question_column='question',
    answer_column='rag_answer',
    n_results=3,
    llm_model="gpt-oss-120b",
    temperature=0.1,
    max_tokens=1000

):
    """
    Process multiple questions from Excel and store RAG answers in new column

    Each non-empty question is sent through query_chromadb and
    query_openai_with_context. The resulting frame is written next to the
    input as "<name>_with_answers.xlsx", so the input file is never overwritten.

    Parameters:
    -----------
    excel_file : str
        Path to your Excel file
    collection : ChromaDB collection
        Your ChromaDB collection object
    embedding_model :
        Embedding model used to encode each question
    client : OpenAI client
        Your OpenAI/compatible API client
    question_column : str
        Name of column containing questions (default: 'question')
    answer_column : str
        Name of new column to store answers (default: 'rag_answer')
    n_results : int
        Number of results from ChromaDB (default: 3)
    llm_model : str
        Model name (default: "gpt-oss-120b")
    temperature : float
        Temperature for generation (default: 0.1)
    max_tokens : int
        Max tokens for response (default: 1000)

    Returns:
    --------
    df : pandas.DataFrame
        DataFrame with new answer column

    Raises:
    -------
    ValueError
        If `question_column` is not a column of the Excel sheet
    """

    # Read Excel file
    print(f"Reading Excel file: {excel_file}")
    df = pd.read_excel(excel_file)
    print(f"Loaded {len(df)} questions")

    # Check if question column exists
    if question_column not in df.columns:
        raise ValueError(f"Column '{question_column}' not found. Available columns: {df.columns.tolist()}")

    # Initialize answer column if it doesn't exist
    if answer_column not in df.columns:
        df[answer_column] = ""

    # Process each question
    print(f"\n Processing questions...")

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Getting answers"):
        raw_question = row[question_column]

        # Test for missing values BEFORE converting to str: str(NaN) is the
        # non-empty text "nan", which would otherwise be sent as a question.
        if pd.isna(raw_question) or str(raw_question).strip() == "":
            print(f"  Skipping empty question at row {idx}")
            continue

        question_text = str(raw_question)

        try:
            # Query ChromaDB
            query_result = query_chromadb(
                collection=collection,
                model=embedding_model,
                query_text=question_text,
                n_results=n_results
            )

            # Get answer from OpenAI-compatible API
            result = query_openai_with_context(
                client=client,
                question=question_text,
                query_results=query_result,
                model=llm_model,
                temperature=temperature,
                max_tokens=max_tokens
            )

            # Store answer in DataFrame
            df.at[idx, answer_column] = result['answer']

        except Exception as e:
            print(f" Error processing question at row {idx}: {str(e)}")
            df.at[idx, answer_column] = f"ERROR: {str(e)}"

    # Build the output path from the file stem. A plain
    # str.replace('.xlsx', ...) leaves the path unchanged for any other
    # extension, which would overwrite the input file.
    stem, _ = os.path.splitext(excel_file)
    output_file = f"{stem}_with_answers.xlsx"
    print(f"\n Saving results to: {output_file}")
    df.to_excel(output_file, index=False)
    print("Done!")

    return df

In [None]:
# Process the Excel file

# To evaluate another LLM, change only the API key (via the environment) and
# `llm_model` below. Other models to try:
#   llama-3.1-70b-instruct
#   mistral-7b-instruct
#   gemma-3-27b-it
#   gpt-oss-20b
#   mistral-small-3.1

# Secrets come from the environment, never from notebook source.
# NOTE(review): the two keys that used to be hardcoded in this cell should be
# treated as leaked and rotated.
new_api_key = os.environ.get("UF_AI_API_KEY_ALT")  # optional alternate key
client = initialize_openai_client(
    api_key=os.environ["UF_AI_API_KEY"],
    base_url='https://api.ai.it.ufl.edu'
)

# process_questions_from_excel names its embedding-model parameter
# `embedding_model`; passing it as `model=` raises TypeError.
df_with_answers = process_questions_from_excel(
    excel_file="your_questions.xlsx",
    collection=collection,
    embedding_model=model,
    client=client,
    question_column='question',
    answer_column='rag_answer',
    n_results=3,
    llm_model="gpt-oss-120b",
    temperature=0.1,
    max_tokens=1000
)

print("\nFirst few results:")
print(df_with_answers[['question', 'rag_answer']].head())