# Expert Knowledge Worker using RAG

In [None]:
# general imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
import numpy as np



In [None]:
# langchain imports

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain



In [None]:
# Visualization Imports
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import plotly.io as pio


In [None]:
# Initializing constants shared by the cells below.

# Name of the OpenAI chat model handed to ChatOpenAI.
MODEL: str = "gpt-4o-mini"

# Directory used as the Chroma persist_directory.
db_name: str = "vector_db"

In [None]:
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Fail fast with a readable message when the key is missing: assigning the
# None returned by os.getenv() into os.environ raises an opaque TypeError.
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "in the environment before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key


In [None]:
# Prepping the docs for chunking: load every text file under each
# knowledge-base/ sub-folder and tag it with that sub-folder's name.

# Keep directories only: the glob pattern also matches loose files placed
# directly in knowledge-base/, and DirectoryLoader expects a directory path.
folders = [path for path in glob.glob("knowledge-base/*") if os.path.isdir(path)]

text_loader_kwargs = {'encoding': "utf-8"}

documents = []

for folder in folders:
    # The sub-folder name doubles as the document type (stored in metadata).
    doctype = os.path.basename(folder)
    loader = DirectoryLoader(folder, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doctype
        documents.append(doc)

In [None]:
# Split the loaded documents into overlapping chunks for vectorization.
text_splitter = CharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=400,
)
chunks = text_splitter.split_documents(documents)

# Last expression of the cell: the notebook displays the chunk count.
len(chunks)
 

In [None]:
# Embed the chunks and store them in a Chroma vector datastore (vectorization).

embeddings = OpenAIEmbeddings()

# If a persisted store already exists on disk, drop its collection first so
# the store below is rebuilt from the current chunks.
if os.path.exists(db_name):
    stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    stale_store.delete_collection()

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Fetch a single stored embedding and report how many dimensions it has.
collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Prework for data visualization: pull every stored vector, its text and its
# metadata out of the collection, then pick a plot color per document type.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` (earlier: the loaded document objects) to the
# stored chunk texts; the plotting cells read the texts under this name.
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

# Build the type -> color table once instead of re-creating two lists and
# doing a linear .index() search for every point. Unknown doc types fall back
# to a neutral color rather than aborting the cell with ValueError.
doc_type_colors = {
    'products': 'blue',
    'employees': 'green',
    'contracts': 'red',
    'company': 'orange',
}
fallback_color = 'gray'
colors = [doc_type_colors.get(doc_type, fallback_color) for doc_type in doc_types]

In [None]:

# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per stored chunk, colored by document
# type, with the type and the first 100 characters of text shown on hover.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    # `scene` only configures 3D scenes, so axis titles passed through it are
    # never shown on a 2D scatter; set them on the cartesian axes instead.
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

# Select the notebook renderer before displaying the figure.
pio.renderers.default = 'notebook'
fig.show()

In [None]:
# 3D view of the same vectors.

# Project the embeddings down to three dimensions with t-SNE.
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Hover label per point: document type plus the first 100 characters of text.
hover_text = [f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)]

# Build the 3D scatter trace, one marker per stored chunk.
points_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_text,
    hoverinfo='text',
)
fig = go.Figure(data=[points_3d])

# Title, axis labels, figure size and margins.
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

In [None]:
# Preparing the RAG abstractions: LLM, retriever and memory.

# Chat model used to generate the answers.
llm = ChatOpenAI(model=MODEL, temperature=0.7)

# Retriever backed by the Chroma vector store.
retriever = vectorstore.as_retriever()

# Conversation memory, stored under the 'chat_history' key.
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Combine the three pieces into a single conversational retrieval chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)


In [None]:
def chat(message, history):
    """Answer a chat message by running it through the conversation chain.

    Args:
        message: The user's latest message; passed to the chain as "question".
        history: Accepted as the second callback argument but not read here.

    Returns:
        The "answer" entry of the chain's result.
    """
    response = conversation_chain.invoke({"question": message})
    answer = response["answer"]
    return answer

In [None]:
# Launching the chat interface in Gradio.

# Build the chat UI around `chat`, then start it and open it in the browser.
chat_interface = gr.ChatInterface(chat, type="messages")
view = chat_interface.launch(inbrowser=True)
