In [None]:
import os
import glob
from dotenv import load_dotenv
from pathlib import Path
import gradio as gr
from groq import Groq

In [None]:
# Read settings from a local .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Keep the raw key (a string) separate from the SDK client object: the key is
# what gets sliced for the status message, the client is what makes requests.
groq_api_key = os.getenv("GROQ_API_KEY")

if groq_api_key:
    print(f"Groq API Key exists and begins {groq_api_key[:8]}")
    # Chat client used by answer_question() further down the notebook.
    client = Groq(api_key=groq_api_key)
else:
    print("Groq API Key not set")
    # No key means no usable client; leave an explicit placeholder so the name
    # exists and the missing configuration is easy to spot.
    client = None

# Groq-hosted chat model used for every completion request in this notebook.
MODEL = "llama-3.1-8b-instant"



In [None]:
# System message for the chat model: restricts the assistant to answering
# questions about the portfolio owner strictly from the retrieved context.
system_prompt ="""
You represent the AI portfolio assistant for Sharvari.


Your role is to answer questions about Sharvari’s background, profile, projects, skills, work experience, hackathons, education and achievements using ONLY the provided context and you are an expert at it.
You are provided with all the information in different folders that are relevant to her.
Rules:
- Base all answers strictly on the given context.
- Do not invent or assume information that is not explicitly stated.
- If the answer is not present in the context, say so clearly.
- Prefer precise, evidence-based answers over vague summaries.
- When listing items or counting, rely on canonical meta files if available.
- Use a professional, confident, and concise tone.
- Do not mention internal file names unless explicitly asked.
Additional Rule:
- If the context includes an explicit total count (e.g. “Total Awards Count”), use that number directly rather than recalculating.

You are not a general chatbot. You are a factual portfolio assistant.
"""

In [None]:
import os
import glob
import numpy as np
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Directory passed to Chroma as persist_directory when the vector store is built.
db_name = "vector_db"


In [None]:
# Measure the raw size of the knowledge base: the total number of characters
# across every Markdown file found under knowledge-base/.

knowledge_base_path = "knowledge-base/**/*.md"
files = glob.glob(knowledge_base_path, recursive=True)
print(f"Found {len(files)} files in the knowledge base")

# Every file's text is followed by a blank-line separator, so the separators
# are part of the reported total.
entire_knowledge_base = "".join(
    Path(md_file).read_text(encoding="utf-8") + "\n\n" for md_file in files
)

print(f"Total characters in knowledge base: {len(entire_knowledge_base):,}")

In [None]:
# Load every Markdown file in the knowledge base with LangChain's loaders,
# tagging each document with the name of the top-level folder it came from.

# "knowledge-base/*" also matches loose files sitting directly in the root,
# and DirectoryLoader only accepts directories, so keep directories only.
# Sorting gives a stable load order regardless of filesystem listing order.
folders = sorted(
    entry for entry in glob.glob("knowledge-base/*") if os.path.isdir(entry)
)

documents = []
for folder in folders:
    # The folder name doubles as the document category stored in metadata.
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs={'encoding': 'utf-8'},
    )
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

print(f"Loaded {len(documents)} documents")

In [None]:
# Split the loaded documents into overlapping character-based chunks.

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
)
chunks = text_splitter.split_documents(documents)

chunk_total = len(chunks)
print(f"Divided into {chunk_total} chunks")
#print(f"First chunk:\n\n{chunks[0]}")

In [None]:
# Pick an embedding model

# The embedding model is selected by name through the HuggingFaceEmbeddings wrapper.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# When a persisted store directory already exists, open it and delete its
# collection before rebuilding — presumably so the rebuild below does not end
# up alongside vectors from a previous run (confirm against Chroma behaviour).
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Embed all chunks and persist the resulting store under db_name.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# NOTE(review): _collection is an underscore-prefixed (private) attribute of the
# wrapper; it may change between library versions.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Inspect the stored vectors: how many there are and how long each one is.

collection = vectorstore._collection
count = collection.count()

# Fetch a single record together with its embedding to measure the vector length.
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
# query = "How many awards has Sharvari won?"

# docs = vectorstore.similarity_search(query, k=5)

# for i, doc in enumerate(docs):
#     print(f"\n--- Result {i+1} ---")
#     print(doc.metadata["source"])
#     print(doc.page_content[:500])


In [None]:
# User-message template for retrieval-augmented answers. It is filled with
# str.format using two named fields: `context` (the retrieved chunk text) and
# `question` (the user's query).
RAG_PROMPT_TEMPLATE="""
Use the following context to answer the user’s question.

Context:
{context}

Question:
{question}

Instructions:
- Answer in clear, complete sentences.
- If multiple sources support the answer, synthesise them.
- If the question asks for a list or count, provide it explicitly.
- Do not add information beyond the context.
"""

In [None]:
def answer_question(
    query,
    store=None,
    system_prompt_text=None,
    prompt_template=None,
    *,
    llm_client=None,
    model=None,
    k=5,
):
    """Answer a question with retrieval-augmented generation.

    The ``k`` most similar chunks are fetched from the vector store, joined
    into one context string, substituted into the prompt template together
    with the question, and sent to the chat model along with the system prompt.

    Every argument except ``query`` is optional. When an argument is omitted,
    the notebook-level global of the corresponding role is used instead, so a
    plain ``answer_question(query)`` call keeps working.

    Args:
        query: The user's question.
        store: Object exposing ``similarity_search(query, k=...)`` that returns
            items with a ``page_content`` attribute. Defaults to the global
            ``vectorstore``.
        system_prompt_text: Content of the system message. Defaults to the
            global ``SYSTEM_PROMPT``.
        prompt_template: ``str.format`` template with ``{context}`` and
            ``{question}`` fields. Defaults to the global ``RAG_PROMPT_TEMPLATE``.
        llm_client: Client exposing ``chat.completions.create(...)``. Defaults
            to the global ``client``.
        model: Model identifier passed to the client. Defaults to the global
            ``MODEL``.
        k: Number of chunks to retrieve.

    Returns:
        The ``message.content`` of the first choice in the completion response.
    """
    # Globals are resolved here, at call time, rather than in the signature, so
    # the function can be defined before those names are bound; a global is only
    # looked up when its argument was actually left out.
    store = vectorstore if store is None else store
    system_prompt_text = SYSTEM_PROMPT if system_prompt_text is None else system_prompt_text
    prompt_template = RAG_PROMPT_TEMPLATE if prompt_template is None else prompt_template
    llm_client = client if llm_client is None else llm_client
    model = MODEL if model is None else model

    retrieved = store.similarity_search(query, k=k)
    context = "\n\n".join(doc.page_content for doc in retrieved)

    messages = [
        {"role": "system", "content": system_prompt_text},
        {
            "role": "user",
            "content": prompt_template.format(context=context, question=query),
        },
    ]

    response = llm_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.2,
    )

    return response.choices[0].message.content


In [None]:
# answer_question() reads the module-level SYSTEM_PROMPT, which the notebook
# otherwise only binds in a later cell. Bind it here so this cell also works on
# a fresh kernel run from top to bottom.
SYSTEM_PROMPT = system_prompt

# Quick end-to-end check of retrieval + generation; only the question is passed.
print(answer_question("How strong is Sharvari in LLMs?"))


In [None]:
# Global objects
# answer_question() refers to the system prompt by its upper-case name, so
# expose it under that alias. vectorstore and RAG_PROMPT_TEMPLATE are already
# bound at module level by earlier cells and need no re-assignment.
SYSTEM_PROMPT = system_prompt


In [None]:
# Chat UI: a Markdown header, the conversation view, a question box and a
# button that clears the conversation.
with gr.Blocks(title="Sharvari | AI Portfolio Assistant") as demo:

    gr.Markdown(
        """
        # Sharvari — AI Portfolio Assistant
        Ask anything about my background, projects, skills, or experience.
        """
    )

    chatbot = gr.Chatbot(height=420)
    msg = gr.Textbox(
        placeholder="Ask a question (e.g. Why should I hire Sharvari?)",
        label="Your Question"
    )

    clear = gr.Button("Clear Chat")

    def respond(message, chat_history):
        """Handle one submitted question.

        Args:
            message: Text from the question box.
            chat_history: Current conversation as held by the Chatbot component.

        Returns:
            A pair ``(textbox_value, history)``: an empty string to reset the
            question box, and the conversation including the new turn.
        """
        # Work on a copy and tolerate a missing history, so the component's
        # current value is never mutated in place.
        history = list(chat_history or [])

        # Ignore blank submissions instead of spending a retrieval + model call
        # on them and adding an empty turn to the conversation.
        if not message or not message.strip():
            return "", history

        answer = answer_question(message)
        # NOTE(review): (user, assistant) tuples are the legacy Chatbot history
        # format — confirm it is still accepted by the installed Gradio version.
        history.append((message, answer))
        return "", history

    # Enter in the textbox sends the question; outputs reset the box and refresh the chat.
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    # Returning None for the chatbot output empties the conversation.
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()

In [3]:
from evaluation import test

In [4]:
# Load the evaluation test cases via the project's evaluation.test module.
tests = test.load_tests()

In [5]:
# Number of loaded test cases (displayed as the cell's output).
len(tests)

45

In [6]:
# Show the first test case, one field per line.
example = tests[0]
for field_name in ("question", "category", "reference_answer", "keywords"):
    print(getattr(example, field_name))


Who is Sharvari?
direct_fact
Sharvari is an AI Engineer specialising in applied AI systems, including computer vision, large language models, and agent-based workflows.
['Sharvari', 'AI Engineer']


In [27]:
from collections import Counter
# Tally how many test cases fall into each category; the bare name on the last
# line displays the tally as the cell's output.
count = Counter(case.category for case in tests)
count

Counter({'direct_fact': 26, 'holistic': 10, 'numerical': 6, 'comparative': 3})

In [1]:
from evaluation.eval import evaluate_retrieval, evaluate_answer

In [7]:
# Run the retrieval evaluation on the single example and display the result.
evaluate_retrieval(example)

RetrievalEval(mrr=1.0, ndcg=0.9837339917445846, keywords_found=2, total_keywords=2, keyword_coverage=100.0)