In [13]:
import openai
import os
import sys

# Import classes from modules

from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_postgres.vectorstores import PGVector





In [14]:
# Optional first query: the first command-line argument when one was supplied,
# otherwise None.
query = sys.argv[1] if len(sys.argv) > 1 else None

In [15]:
import json
from typing import overload
from langchain_core.documents import Document #type: ignore


@overload
def extract_data(path: str, main_key: str) -> list[Document]: ...


@overload
def extract_data(path: str) -> list[Document]: ...


def extract_data(path: str, main_key: str = "items") -> list[Document]:
    """Turn the records stored in a JSON file into LangChain documents.

    The file is read as UTF-8 JSON and the list found under ``main_key`` is
    used as the record set. Each record becomes one ``Document`` whose
    ``page_content`` is every ``"<key> <value>"`` pair of the record, in the
    record's own key order, concatenated with no separator between pairs.

    Args:
        path: Location of the JSON file to read.
        main_key: Top-level key that holds the list of records.

    Returns:
        One ``Document`` per record, in file order.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If ``main_key`` is not present in the JSON object.
    """
    with open(path, "r", encoding='utf8') as handle:
        records: list[dict[str, str]] = json.load(handle)[main_key]

    return [
        Document(
            page_content="".join(f"{key} {value}" for key, value in record.items()),
        )
        for record in records
    ]

In [16]:
# Load the records from the JSON fixture; extract_data falls back to its
# default main_key because none is passed here.
json_file_path = "/container/test_data.json"
data = extract_data(path=json_file_path)

In [17]:
# Display the documents returned by extract_data.
data

[Document(page_content='Question = Please add watermark to the photo,Answer = adding watermark'),
 Document(page_content='Question = I want to add watermark to the photo,Answer = adding watermark'),
 Document(page_content='Question = add watermark to the photo,Answer = adding watermark'),
 Document(page_content='Question = Add watermark for me,Answer = adding watermark'),
 Document(page_content='Question = Add watermark,Answer = adding watermark'),
 Document(page_content='Question = Watermark please,Answer = adding watermark'),
 Document(page_content='Question = Watermark to the photo,Answer = adding watermark'),
 Document(page_content='Question = Input watermark,Answer = adding watermark'),
 Document(page_content='Question = ใส่ลายน้ำให้หน่อย,Answer = adding watermark')]

In [18]:
# Split the loaded documents on blank lines ("\n\n") with a configured chunk
# size of 1000 and no overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [19]:
# Display the documents produced by the text splitter.
texts

[Document(page_content='Question = Please add watermark to the photo,Answer = adding watermark'),
 Document(page_content='Question = I want to add watermark to the photo,Answer = adding watermark'),
 Document(page_content='Question = add watermark to the photo,Answer = adding watermark'),
 Document(page_content='Question = Add watermark for me,Answer = adding watermark'),
 Document(page_content='Question = Add watermark,Answer = adding watermark'),
 Document(page_content='Question = Watermark please,Answer = adding watermark'),
 Document(page_content='Question = Watermark to the photo,Answer = adding watermark'),
 Document(page_content='Question = Input watermark,Answer = adding watermark'),
 Document(page_content='Question = ใส่ลายน้ำให้หน่อย,Answer = adding watermark')]

In [20]:
# Embed the document chunks, store them in the pgvector collection, and keep
# `vectorstore` available so a retriever can be built from it afterwards.
#
# The connection URL may be overridden with the PGVECTOR_CONNECTION environment
# variable so credentials do not have to live in the notebook; the fallback is
# the original local development database.
connection = os.environ.get(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://langchain:langchain@pgvector:5432/langchain",  # Uses psycopg3!
)
collection_name = "my_docs"
embeddings = OpenAIEmbeddings()
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

# Index the split chunks (`texts`), not the raw `data`, so the splitting step
# actually feeds the store. Stable, collection-prefixed ids let a re-run of
# this cell upsert the same rows instead of inserting a fresh copy of every
# document each time (rows inserted earlier under random ids are not removed).
document_ids = [f"{collection_name}-{index}" for index in range(len(texts))]
vectorstore.add_documents(texts, ids=document_ids)

# Sanity check: the store can return as many documents as were just indexed.
assert len(texts) == len(vectorstore.similarity_search("", k=len(texts))), (
    "vector store returned fewer documents than were indexed"
)
    

In [21]:
# Prompt for the answer-generation step. It has two placeholders, `context`
# and `question`, and instructs the model to reply "adding watermark" when the
# question matches the context, or the literal fallback otherwise.
# The fallback previously read "I don't known."; the model is asked to emit it
# verbatim, so the typo is corrected here.
custom_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    Answer the question based ONLY on the following context, answer : adding watermark
    If the question asked by the user does not match the context, answer : I don't know.


    {context}
    Original question: {question}""",
)

In [22]:
# Read the OpenAI API key from the secret file, dropping surrounding whitespace.
openai_apikey = ""
with open("/run/secrets/openai.txt", "r") as key_file:
    raw_key = key_file.read()
    openai_apikey = raw_key.strip()

# Chat model that generates the answers.
llm1 = ChatOpenAI(
    model="gpt-3.5-turbo-16k",
    api_key=openai_apikey,
)

# Conversational retrieval chain: documents come from the vector store's
# retriever and the final answer is produced with the custom prompt.
chain1 = ConversationalRetrievalChain.from_llm(
    llm=llm1,
    retriever=vectorstore.as_retriever(),
    combine_docs_chain_kwargs={"prompt": custom_template},
)

In [23]:
# Print the chat model's repr to check how it was configured.
print(llm1)

client=<openai.resources.chat.completions.Completions object at 0xffff7c0a0bf0> async_client=<openai.resources.chat.completions.AsyncCompletions object at 0xffff7c0081a0> model_name='gpt-3.5-turbo-16k' openai_api_key=SecretStr('**********') openai_proxy=''


In [24]:
# Interactive chat loop. `chat_history` accumulates (question, answer) pairs
# and is passed back to the chain on every turn.
chat_history = []
# Start with no pending query so the first iteration always prompts the user.
query = None

while True:
    if not query:
        try:
            # Strip whitespace so inputs such as " quit " are still recognised.
            query = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt at the prompt ends the session
            # cleanly instead of surfacing a traceback.
            break
    if not query:
        # Blank line: ask again rather than sending an empty question to the chain.
        continue
    if query in ['quit', 'q', 'exit']:
        break
    result = chain1.invoke({"question": query, "chat_history": chat_history})
    print("Chatbot:", result['answer'])

    chat_history.append((query, result['answer']))
    # Clear the pending query so the next iteration prompts again.
    query = None

Chatbot: I don't know.
