In [1]:
import langchain
import os
import getpass
from langchain_openai import ChatOpenAI
import langchain_chroma
import shutil
import argparse
from langchain.prompts import ChatPromptTemplate
import time

In [2]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = "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"
os.environ['LANGCHAIN_PROJECT'] = "My First Chatbot"

In [3]:
os.environ["OPENAI_API_KEY"] = getpass.getpass()
llm = ChatOpenAI(model="gpt-4o-mini")

In [4]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document

DATA_PATH = "database"
CHROMA_PATH = "ChromaDatabases"

In [5]:
def load_docs():
    loader = DirectoryLoader(DATA_PATH, glob="*.txt")
    documents = loader.load()
    return documents

In [6]:
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=110,
        chunk_overlap=30,
        length_function = len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks

In [7]:
def saveToChroma(chunks: list[Document], user_id):

    embeddings = OpenAIEmbeddings()

    db = Chroma.from_documents(
        chunks, embeddings, persist_directory=CHROMA_PATH, collection_name=user_id
    )
    # db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH} for {user_id}.")

In [8]:
def generate_data_store(user_id):
    documents = load_docs()
    chunks = split_text(documents)
    # print([doc.page_content for doc in documents[:5]])  # Print first 5 docs
    # listOfProfiles = ["Roshni", "Dhruv"]
    # for user_id in listOfProfiles:
    saveToChroma(chunks, user_id)

In [9]:
generate_data_store('Roshni')

Split 1 documents into 2776 chunks.
Saved 2776 chunks to ChromaDatabases for Roshni.


In [10]:
PROMPT_TEMPLATE = """
Chat with the user using only the following context. You are already given the person through collections:

{context}
---

Now, with the given context, give a reply as if you were that user. Try your best to mimic their personality, even if the response may be slightly negative: and try not to just repeat the question inside the reply unless it makes sense to. Use emojis as much as that person does. Give single line replies. If a question was askeed about past messages, give the answer if available. Reply to each message with the best possible response."{question}
"""

def query_database(query_text, user_id):
    
    embedding_function = OpenAIEmbeddings()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function, collection_name=user_id)

    results = db.similarity_search_with_relevance_scores(query_text, k=50)
    if len(results) == 0 or results[0][1] < 0.7:
        print(f"Unable to find matching results.")
        return

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    model = ChatOpenAI()
    response_text = model.predict(prompt)

    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)


In [None]:
user_id=input("Whom would you like to talk to? (Roshni, Dhruv) : ")
while True:
    query_text = input("Enter your question (write 'exit' to end the conversation): ")
    if query_text == 'exit':
        break
    query_database(query_text, user_id)

Human: 
Chat with the user using only the following context. You are already given the person through collections:

in school exams, and never have i been this worried

---

5/29/24, 23:05 - Dhruv Mukherjee: Yo

5/29/24, 23:05 - Dhruv Mukherjee: How was the exam?

---

about much except for their next exams.

---

6/25/24, 01:27 - Dhruv Mukherjee: wishing well for their exams, just a simple hi here and there.

---

6/10/24, 21:23 - Dhruv Mukherjee: maine toh strategy taiyaar kar liya kal ke exam ka.

---

6/7/24, 03:10 - Dhruv Mukherjee: i dont know what to revise, what to study anymore

---

6/19/24, 18:21 - Dhruv Mukherjee: Well, my friends in Ranchi are excited to meet me, now that exams are over.

---

Stop behaving like Gracy a day before exam

3/29/24, 23:53 - Roshni: Still she passed 😭

So look at urself

---

6/7/24, 03:17 - Dhruv Mukherjee: this is the most worried i have been for a test

---

8/12/24, 22:50 - Dhruv Mukherjee: Anyways, I gtg for some work.

Congrats again on y

In [None]:
# def main():
#     parser = argparse.ArgumentParser()
#     parser.add_argument("query_text", type=str, help="The query text")
#     args = parser.parse_args()
#     query_text = args.query_text

#     embedding_function = OpenAIEmbeddings()
#     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

#     results = db._similarity_search_with_relevance_scores(query_text, k=50)

#     if len(results)==0 or results[0][1]<0.7:
#         print("Unable to finding matching results")

#     context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
#     print(context_text)

In [29]:
print([doc.page_content for doc in documents[:5]])  # Print first 5 docs


NameError: name 'documents' is not defined