In [1]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from openai import OpenAI

In [2]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = api_key
openai = OpenAI()

In [3]:
MODEL = "gpt-4o-mini"

In [4]:
context = {}
employees = glob.glob("knowledge-base/employees/*")
for employee in employees:
    name = employee.split(" ")[-1][:-3]
    doc = ""
    with open(employee, "r") as f:
        doc = f.read()
    context[name] = doc

In [5]:
products = glob.glob("knowledge-base/products/*")
for product in products:
    name = product.split("/")[-1][:-3]
    doc = ""
    with open(product, "r") as f:
        doc = f.read()
    context[name] = doc


In [6]:
system_message = "You are an expert in answering accurate questions about Insurellm, the Insurance Tech company. Give brief, accurate answers. If you don't know the answer, say so. Do not make anything up if you haven't been provided with relevant context."

In [7]:
def get_relevant_context(message):
    relevant_context = []
    for context_keys, context_details in context.items():
        if context_keys in message:
            relevant_context.append(context_details)
    return relevant_context

In [8]:
get_relevant_context("Lancaster")

["# Avery Lancaster\n\n## Summary\n- **Date of Birth**: March 15, 1985  \n- **Job Title**: Co-Founder & Chief Executive Officer (CEO)  \n- **Location**: San Francisco, California  \n\n## Insurellm Career Progression\n- **2015 - Present**: Co-Founder & CEO  \n  Avery Lancaster co-founded Insurellm in 2015 and has since guided the company to its current position as a leading Insurance Tech provider. Avery is known for her innovative leadership strategies and risk management expertise that have catapulted the company into the mainstream insurance market.  \n\n- **2013 - 2015**: Senior Product Manager at Innovate Insurance Solutions  \n  Before launching Insurellm, Avery was a leading Senior Product Manager at Innovate Insurance Solutions, where she developed groundbreaking insurance products aimed at the tech sector.  \n\n- **2010 - 2013**: Business Analyst at Edge Analytics  \n  Prior to joining Innovate, Avery worked as a Business Analyst, focusing on market trends and consumer preferen

In [9]:
def add_context(message):
    relevant_context = get_relevant_context(message)
    print(relevant_context)
    if relevant_context:
        message += "\n\nThe following additional context might be relevant in answering this question:\n\n"
        for relevant in relevant_context:
            message += f"\n\n{relevant}"
    return message

In [10]:
add_context("who is Avery Lancaster?")

["# Avery Lancaster\n\n## Summary\n- **Date of Birth**: March 15, 1985  \n- **Job Title**: Co-Founder & Chief Executive Officer (CEO)  \n- **Location**: San Francisco, California  \n\n## Insurellm Career Progression\n- **2015 - Present**: Co-Founder & CEO  \n  Avery Lancaster co-founded Insurellm in 2015 and has since guided the company to its current position as a leading Insurance Tech provider. Avery is known for her innovative leadership strategies and risk management expertise that have catapulted the company into the mainstream insurance market.  \n\n- **2013 - 2015**: Senior Product Manager at Innovate Insurance Solutions  \n  Before launching Insurellm, Avery was a leading Senior Product Manager at Innovate Insurance Solutions, where she developed groundbreaking insurance products aimed at the tech sector.  \n\n- **2010 - 2013**: Business Analyst at Edge Analytics  \n  Prior to joining Innovate, Avery worked as a Business Analyst, focusing on market trends and consumer preferen

"who is Avery Lancaster?\n\nThe following additional context might be relevant in answering this question:\n\n\n\n# Avery Lancaster\n\n## Summary\n- **Date of Birth**: March 15, 1985  \n- **Job Title**: Co-Founder & Chief Executive Officer (CEO)  \n- **Location**: San Francisco, California  \n\n## Insurellm Career Progression\n- **2015 - Present**: Co-Founder & CEO  \n  Avery Lancaster co-founded Insurellm in 2015 and has since guided the company to its current position as a leading Insurance Tech provider. Avery is known for her innovative leadership strategies and risk management expertise that have catapulted the company into the mainstream insurance market.  \n\n- **2013 - 2015**: Senior Product Manager at Innovate Insurance Solutions  \n  Before launching Insurellm, Avery was a leading Senior Product Manager at Innovate Insurance Solutions, where she developed groundbreaking insurance products aimed at the tech sector.  \n\n- **2010 - 2013**: Business Analyst at Edge Analytics  \n

In [11]:
def chat(message, history):
    messages = [{"role": "system", "content": system_message}] + history
    user_message = add_context(message)
    messages.append({"role": "user", "content": user_message})

    stream = openai.chat.completions.create(messages=messages, model=MODEL, stream=True)
    reply = ""
    for chunk in stream:
        reply += chunk.choices[0].delta.content or ""
        yield reply

In [None]:
view = gr.ChatInterface(fn=chat, type="messages").launch()

## Using LangChain

In [12]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import CharacterTextSplitter

In [13]:
db_name = "vector_db"

In [14]:
folders = glob.glob("knowledge-base/*")
text_loader_kwargs = {'encoding': 'utf-8'}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [15]:
len(documents)

31

In [16]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)


Created a chunk of size 1088, which is longer than the specified 1000


In [17]:
len(chunks)

123

In [18]:
chunks[5]

Document(metadata={'source': 'knowledge-base/products/Markellm.md', 'doc_type': 'products'}, page_content="# Product Summary\n\n# Markellm\n\n## Summary\n\nMarkellm is an innovative two-sided marketplace designed to seamlessly connect consumers with insurance companies. Powered by advanced matching AI, Markellm transforms the insurance shopping experience, making it more efficient, personalized, and accessible. Whether you're a homeowner searching for the best rates on home insurance or an insurer looking to reach new customers, Markellm acts as the ultimate bridge, delivering tailored solutions for all parties involved. With a user-friendly interface and powerful algorithms, Markellm not only saves time but also enhances decision-making in the often-complex insurance landscape.\n\n## Features\n\n- **AI-Powered Matching**: Markellm utilizes sophisticated AI algorithms to match consumers with the most suitable insurance products based on their individual needs and preferences. This ensu

In [19]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [20]:
embeddings = OpenAIEmbeddings()


In [21]:
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [22]:
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 123 documents


In [23]:
collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


In [24]:
result = collection.get(include=["embeddings", "metadatas", "documents"])
vectors = np.array(result["embeddings"])
documents = result["documents"]
doc_types = [metadata["doc_type"] for metadata in result["metadatas"]]
colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]

In [28]:
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [29]:
# Let's try 3D!

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [56]:
# 1️⃣ add llm

llm = ChatOpenAI(model=MODEL, temperature=0.7)


In [57]:
# 2️⃣ retriever
retriever = vectorstore.as_retriever()


In [58]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory


In [59]:
# 3️⃣  build prompts
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", "Given the chat history and new user question, rewrite the question to be standalone."),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}")
])

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", "Use the retrieved context to answer the user’s question clearly and accurately."),
    MessagesPlaceholder("chat_history"),
    ("human", "Question: {input}\n\nContext:\n{context}")
])

In [60]:
from langchain_classic.chains import create_history_aware_retriever, create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain


In [61]:
# 4️⃣  Create RAG chain (retriever + LLM)
history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)
document_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, document_chain)

In [62]:
# 5️⃣  Add chat memory (replacement for ConversationBufferMemory)
session_store = {}
def get_session_history(session_id: str):
    if session_id not in session_store:
        session_store[session_id] = InMemoryChatMessageHistory()
    return session_store[session_id]

conversation_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
)

In [63]:
# 6️⃣  Run an example conversation
config = {"configurable": {"session_id": "demo"}}

response1 = conversation_chain.invoke({"input": "What’s insurellm?"}, config=config)
print("AI:", response1["answer"])

response2 = conversation_chain.invoke({"input": "what did Avery Lancaster?"}, config=config)
print("AI:", response2["answer"])

Error in RootListenersTracer.on_chain_end callback: KeyError('output')


AI: Insurellm is an insurance tech startup founded by Avery Lancaster in 2015, aimed at innovating the insurance industry. It offers four main software products: Carllm for auto insurance companies, Homellm for home insurance companies, Rellm for the reinsurance sector, and Marketllm, a marketplace connecting consumers with insurance providers. As of 2024, Insurellm has expanded to 200 employees and operates 12 offices across the US, serving over 300 clients worldwide.


Error in RootListenersTracer.on_chain_end callback: KeyError('output')


AI: Avery Lancaster is the Co-Founder and Chief Executive Officer (CEO) of Insurellm, a prominent insurance technology company she co-founded in 2015. Throughout her career at Insurellm, she has demonstrated innovative leadership and expertise in risk management, helping the company become a key player in the insurance market. 

Avery has actively engaged in professional development, participated in diversity and inclusion initiatives, and improved work-life balance for her team. Additionally, she has led community outreach efforts focused on financial literacy for underserved populations. Her leadership has been recognized through various annual performance evaluations, with notable achievements in launching successful products and regaining market leadership.


In [64]:
from langchain_classic.memory import ConversationBufferMemory
from langchain_classic.chains import ConversationalRetrievalChain

In [65]:
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

retriever = vectorstore.as_retriever()

conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/

