In [1]:
!pip install langchain langchain-openai langchain-chroma chromadb openai numpy scikit-learn plotly



In [23]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
import sys 
sys.path.append("../../llm_engineering")
from api_clients import create_clients
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory

In [None]:
!pip install langchain-community

In [None]:
!pip install langchain-huggingface

In [None]:
!pip install sentence-transformers

In [6]:
from langchain_chroma import Chroma
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_huggingface import HuggingFaceEmbeddings


In [24]:
# Directory name the Chroma vector store is persisted under.
db_name = "vector_db"
# Client configuration built by the project-local api_clients helper.
clients = create_clients()

In [26]:
# Show the 'models' entry of the clients mapping.
print(clients['models'])

{'GROQ_MODEL': 'openai/gpt-oss-20b', 'OLLAMA_MODEL': 'ollama3.2'}


In [9]:
# Each sub-folder of the knowledge base is one category; its name becomes the
# doc_type metadata tag on every markdown document loaded from it.
# Only directories are kept (DirectoryLoader raises when handed a plain file), and
# they are sorted so the load order does not depend on filesystem enumeration order.
folders = sorted(path for path in glob.glob("knowledge-base/*") if os.path.isdir(path))
text_loader_kwargs = {'encoding': 'utf-8'}
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    # Tag each document with the folder it came from before collecting it.
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
    documents.extend(folder_docs)

In [10]:
# Break the loaded documents into chunks (target size 1000, overlap 200).
text_splitter = CharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
chunks = text_splitter.split_documents(documents)
# Distinct doc_type tags found across the chunks.
doc_types = {chunk.metadata['doc_type'] for chunk in chunks}

Created a chunk of size 1088, which is longer than the specified 1000


In [11]:
# Embedding function handed to the Chroma store in the cells below.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [None]:
# If a persisted store already exists on disk, drop its collection before rebuilding.
if os.path.exists(db_name):
    Chroma(
        embedding_function=embeddings,
        persist_directory=db_name,
    ).delete_collection()

: 

: 

: 

In [12]:
# Embed every chunk and write the result to the persisted Chroma directory.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)

# Keep a handle on the underlying collection for direct inspection.
collection = vectorstore._collection
print(f"Vectorstore created with {collection.count()} documents")

# Fetch a single stored embedding and measure its length.
sample_embedding = collection.get(include=["embeddings"], limit=1)["embeddings"][0]
dimensions = len(sample_embedding)

Vectorstore created with 246 documents


In [13]:
# Pull every stored embedding, with its text and metadata, back out of the collection.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` (previously the loaded Document objects) and
# `doc_types` (previously a set) to the values stored in the collection, in row order.
# Re-run the loading and splitting cells before re-using those names for their
# original purpose.
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]
# One marker colour per known category. A category outside this table is drawn in
# gray instead of raising, so adding a new knowledge-base folder does not break the plot.
DOC_TYPE_COLORS = {'products': 'blue', 'employees': 'green', 'contracts': 'red', 'company': 'orange'}
colors = [DOC_TYPE_COLORS.get(t, 'gray') for t in doc_types]

In [None]:
# Reduce the embeddings to three components with t-SNE, using a fixed random_state.
tsne = TSNE(random_state=42, n_components=3)
reduced_vectors = tsne.fit_transform(vectors)

# Hover label per point: its category plus the first 100 characters of its text.
hover_labels = [f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)]

# One marker per stored chunk, coloured by category.
points = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_labels,
    hoverinfo='text',
)

fig = go.Figure(data=[points])
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    width=900,
    height=700,
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

In [None]:
!pip install langchain-ollama

In [29]:
from langchain_ollama import ChatOllama

# Ollama chat model used to generate answers.
llm = ChatOllama(model="llama3.2", temperature=0.7)
# Retriever over the vector store, using its default settings.
retriever = vectorstore.as_retriever()

In [30]:
# Check what the retriever returns for a sample question.
retriever.invoke("Who is Avery?")

[Document(id='2c9f2921-5342-4d19-ac40-9783afd7ecc4', metadata={'doc_type': 'employees', 'source': 'knowledge-base\\employees\\Avery Lancaster.md'}, page_content="## Other HR Notes\n- **Professional Development**: Avery has actively participated in leadership training programs and industry conferences, representing Insurellm and fostering partnerships.  \n- **Diversity & Inclusion Initiatives**: Avery has championed a commitment to diversity in hiring practices, seeing visible improvements in team representation since 2021.  \n- **Work-Life Balance**: Feedback revealed concerns regarding work-life balance, which Avery has approached by implementing flexible working conditions and ensuring regular check-ins with the team.\n- **Community Engagement**: Avery led community outreach efforts, focusing on financial literacy programs, particularly aimed at underserved populations, improving Insurellm's corporate social responsibility image.  \n\nAvery Lancaster has demonstrated resilience and a

In [31]:
# Baseline: send the same question straight to the model, with no retrieved context.
llm.invoke("Who is Avery?")

AIMessage(content='There are several individuals named Avery, so it\'s possible that you\'re referring to a specific person. Here are a few notable ones:\n\n1. Avery Whitted: An American professional poker player who won the 2009 World Series of Poker Main Event.\n2. Avery Brooks: An American actor, best known for his roles in the TV series "Star Trek: Deep Space Nine" and the film "Mystic River".\n3. Avery Winter: A British actor, known for his roles in the TV series "EastEnders" and "Doctors".\n4. Avery Wines: An American wine company, founded by James Avery.\n\nIf you could provide more context or information about who Avery is that you\'re referring to, I\'d be happy to try and help further!', additional_kwargs={}, response_metadata={'model': 'llama3.2', 'created_at': '2025-11-12T10:59:56.3395658Z', 'done': True, 'done_reason': 'stop', 'total_duration': 3795504700, 'load_duration': 312822400, 'prompt_eval_count': 29, 'prompt_eval_duration': 22595900, 'eval_count': 157, 'eval_durati

In [32]:
def answer_question(question: str) -> str:
    """Answer a question using only text retrieved from the vector store.

    The question is passed to the module-level ``retriever``; the page content of
    the returned documents is joined into one context string, placed in a system
    prompt, and sent to the module-level ``llm`` together with the question.

    Args:
        question: The user's question.

    Returns:
        The ``content`` of the model's response.
    """
    docs = retriever.invoke(question)

    # Blank line between chunks so their boundaries stay visible to the model.
    context = "\n\n".join(doc.page_content for doc in docs)

    # Assembled line by line so no source-code indentation ends up inside the prompt.
    system_prompt = (
        "You are a helpful assistant. Answer the question based only on this context:\n"
        "Context:\n"
        f"{context}\n"
        "If the answer is not in the context, say you don't know."
    )

    response = llm.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=question),
    ])

    return response.content

In [33]:
# Ask the same sample question again, this time through the retrieval-backed helper.
answer_question("Who is Avery?")

'Avery Lancaster appears to be an employee of Insurellm, a company that specializes in insurance technology. The text provides information about her professional development, diversity and inclusion initiatives, work-life balance efforts, community engagement, and leadership achievements within the company.'