In [69]:
import glob
import os

import gradio as gr
import numpy as np
import plotly.graph_objects as go
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.memory import ConversationBufferMemory
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from openai import OpenAI
from sklearn.manifold import TSNE

In [43]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Keep the key around for reference; None when the variable is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Raw OpenAI client, constructed with default settings.
openai = OpenAI()

In [46]:
# Chat model name handed to ChatOpenAI when answering questions.
MODEL = "gpt-4o-mini"

# Directory name used as the persist location by the (disabled) Chroma cells.
vector_db = "vector_db"

In [40]:
# Load every Markdown file under knowledge-base/<doc_type>/ and tag each
# loaded document with the name of the sub-folder it came from.
documents = []

# Keep only real directories: a stray file directly under knowledge-base/
# (README, .DS_Store, ...) is not a valid DirectoryLoader root.
# sorted() makes the load order independent of the OS directory listing order.
folders = sorted(
    path for path in glob.glob('knowledge-base/*') if os.path.isdir(path)
)
for folder in folders:
    # The sub-folder name doubles as the document category.
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader)
    folder_docs = loader.load()

    for doc in folder_docs:
        doc.metadata['doc_type'] = doc_type
        documents.append(doc)

In [44]:
# Cut the loaded documents into overlapping chunks for embedding.
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1088,
)
spllited_text = text_splitter.split_documents(documents)

In [45]:
# Distinct categories present across all chunks (a set, so order is arbitrary).
doc_types = {chunk.metadata['doc_type'] for chunk in spllited_text}
print(f"Document types found: {', '.join(doc_types)}")

Document types found: products, contracts, company, employees


In [47]:
# Embedding model (library defaults) used when building the vector store.
embeddings = OpenAIEmbeddings()

In [None]:
# Disabled (Chroma variant): drop the existing persisted collection before
# rebuilding it. Not needed for the FAISS store this notebook builds instead.
# if os.path.exists(vector_db):
#     Chroma(persist_directory= vector_db, embedding_function= embeddings).delete_collection()

In [None]:
# Disabled (Chroma variant): build a persisted Chroma store from the chunks.
# Any output shown under this cell comes from an earlier run, before the
# code was commented out.
# vectorStore = Chroma.from_documents(embedding= embeddings, persist_directory= vector_db, documents= spllited_text)
# print(f"Vector Store created with {vectorStore._collection.count()} documents.")

Vector Store created with 109 documents.


In [73]:
# Disabled (Chroma variant): fetch one stored embedding and report its length.
# Relies on the Chroma-specific `_collection` attribute; the FAISS cell
# reports the same information through `vectorStore.index.d`.
# collection = vectorStore._collection
# result = collection.get(limit=1, include=['embeddings'])['embeddings'][0]
# dimensions =  len(result)
# print(f"The vector has {dimensions} dimensions.")


In [None]:
# Embed every chunk and index the vectors in an in-memory FAISS store.
vectorStore = FAISS.from_documents(
    documents=spllited_text,
    embedding=embeddings,
)
print(f"Vector Store created with {vectorStore.index.d} dimensions and {vectorStore.index.ntotal} documents")

In [74]:
def _history_to_pairs(history):
    """Convert a Gradio chat history into a list of (human, ai) string tuples.

    Accepts both history layouts Gradio can hand to a chat function:
    a list of [user, assistant] pairs, or a list of
    {"role": ..., "content": ...} message dicts. Incomplete turns and
    non-text content are skipped.
    """
    pairs = []
    pending_user = None
    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get('role'), turn.get('content')
            if not isinstance(content, str):
                continue
            if role == 'user':
                pending_user = content
            elif role == 'assistant' and pending_user is not None:
                pairs.append((pending_user, content))
                pending_user = None
        else:
            user_msg, bot_msg = turn
            if isinstance(user_msg, str) and isinstance(bot_msg, str):
                pairs.append((user_msg, bot_msg))
    return pairs


def RAG_pipeline(prompt, history):
    """Answer `prompt` with retrieval-augmented generation.

    Args:
        prompt: The user's latest message.
        history: Prior conversation as supplied by gr.ChatInterface.

    Returns:
        The chain's answer text.

    The chain is built without a memory object on purpose. A memory created
    inside this function would be empty on every call, and the chain would
    load `chat_history` from it in place of the history passed in, so
    follow-up questions would lose their context. Gradio's own history is
    the single source of truth instead.
    """
    llm = ChatOpenAI(temperature=0.5, model=MODEL)
    retriever = vectorStore.as_retriever()

    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)

    result = conversation_chain.invoke(
        {'question': prompt, 'chat_history': _history_to_pairs(history)}
    )
    return result['answer']

In [75]:
# Wrap RAG_pipeline in a Gradio chat UI and start serving it locally.
gr.ChatInterface(RAG_pipeline).launch()

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


