In [12]:
import os
import time
import urllib.parse

from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# Azure OpenAI credentials.
# SECURITY: the API key must be supplied through the environment (or a secrets
# manager), never written into the notebook. A key was previously committed in
# this cell, so it should be treated as leaked and rotated.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set. Export it before starting the kernel "
        "instead of hardcoding it in the notebook."
    )
os.environ["AZURE_OPENAI_ENDPOINT"] = 'https://dataiku-gpt4ommi.openai.azure.com'

# Embedding model configuration
embeddings = AzureOpenAIEmbeddings(
    # NOTE(review): a deployment called "gpt-4o" paired with an embedding model
    # looks suspicious — confirm this Azure deployment really serves
    # text-embedding-3-large.
    deployment="gpt-4o",
    model="text-embedding-3-large",
    openai_api_type="azure",
    # Single source of truth for the endpoint: reuse the value set just above.
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version="2023-05-15",
    # NOTE(review): this is the embeddings client's own chunk_size, unrelated to
    # the text-splitter chunk_size used further down; presumably the number of
    # texts sent per embedding request — confirm 1500 is within the service limit.
    chunk_size=1500,
)

# Input directory of scraped .txt files and output folders for the FAISS indexes.
DATA_DIR = "scrape_results"
FAISS_INDEX_DIR = "faiss_index"
VECTORSTORE_PATH = "combined_vectorstore"

def create_vector_database(data_dir: str = DATA_DIR, chunk_size=1500, chunk_overlap=200):
    """Build a FAISS index from every ``.txt`` file in ``data_dir`` and save it.

    Each file is split on its own, so a chunk never mixes text from two files,
    and every chunk records the path it came from in ``metadata["source"]``.

    Args:
        data_dir: Directory scanned (non-recursively) for ``.txt`` files. It is
            created if it does not exist yet.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` passed to RecursiveCharacterTextSplitter.

    Returns:
        A ``(vector_store, embeddings)`` tuple: the FAISS store that was saved
        to ``FAISS_INDEX_DIR`` and the module-level embeddings client used to
        build it.

    Raises:
        ValueError: If no text chunk could be produced from ``data_dir``.
    """
    os.makedirs(data_dir, exist_ok=True)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    documents = []
    # os.listdir order is unspecified; sort so the index is built the same way
    # on every run.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(data_dir, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and keep indexing the rest.
            print(f"Error reading {filename}: {e}")
            continue
        documents.extend(
            Document(page_content=chunk, metadata={"source": file_path})
            for chunk in text_splitter.split_text(text)
        )

    if not documents:
        # Fail with a clear message instead of handing FAISS an empty list.
        raise ValueError(f"No text chunks could be built from .txt files in {data_dir!r}")

    vector_store = FAISS.from_documents(documents=documents, embedding=embeddings)
    vector_store.save_local(FAISS_INDEX_DIR)
    return vector_store, embeddings

def process_documents_in_batches(documents, embeddings, batch_size=1, delay=1):
    """Yield one FAISS store per batch of ``documents``, pausing between batches.

    Args:
        documents: Sequence of documents; it is sliced into consecutive batches.
        embeddings: Embeddings object handed to ``FAISS.from_documents``.
        batch_size: Number of documents per batch; must be at least 1.
        delay: Seconds to sleep between two consecutive batches. No sleep
            happens before the first batch or after the last one.

    Yields:
        The FAISS store built from each batch, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    for start in range(0, len(documents), batch_size):
        if start:
            # Throttle only between batches; sleeping after the final batch
            # would just delay the caller for nothing.
            time.sleep(delay)
        yield FAISS.from_documents(documents[start:start + batch_size], embeddings)

def main():
    """Chunk every ``.txt`` file in ``DATA_DIR``, embed the chunks and save one FAISS store.

    Files are loaded with TextLoader and split into 1500-character chunks with
    a 200-character overlap. The first chunk seeds the store; the remaining
    chunks are embedded one at a time through ``process_documents_in_batches``
    (which pauses between calls) and merged in. The result is saved to
    ``VECTORSTORE_PATH``.

    A file that cannot be loaded or split is reported and skipped.

    Raises:
        RuntimeError: If no chunk could be produced from ``DATA_DIR``.
    """
    # The splitter is stateless across files, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)

    all_texts = []
    # Sorted for a reproducible build order (os.listdir order is unspecified).
    for file_name in sorted(os.listdir(DATA_DIR)):
        if not file_name.endswith('.txt'):
            continue
        # The percent-decoded name is for display only: the file must be opened
        # under the name it actually has on disk.
        decoded_file_name = urllib.parse.unquote(file_name)
        try:
            file_path = os.path.abspath(os.path.join(DATA_DIR, file_name))
            documents = TextLoader(file_path, encoding='utf-8').load()
            all_texts.extend(text_splitter.split_documents(documents))
            # Loading and splitting are local work, so no throttling is needed
            # here; the pause between embedding calls happens further down.
            print(f"file: {decoded_file_name}")
        except Exception as e:
            print(f"Error file {file_name}: {e}")

    if not all_texts:
        # Fail with a clear message instead of handing FAISS an empty list.
        raise RuntimeError(f"No text chunks were produced from {DATA_DIR!r}; nothing to index.")

    combined_vectorstore = FAISS.from_documents(all_texts[:1], embeddings)
    for vs in process_documents_in_batches(all_texts[1:], embeddings, batch_size=1, delay=1):
        combined_vectorstore.merge_from(vs)

    combined_vectorstore.save_local(VECTORSTORE_PATH)

# Build the index only when this code is executed directly (module name is
# "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Processed file: 1.txt
Processed file: 10.txt
Processed file: 11.txt
Processed file: 12.txt
Processed file: 13.txt
Processed file: 14.txt
Processed file: 15.txt
Processed file: 16.txt
Processed file: 17.txt
Processed file: 18.txt
Processed file: 19.txt
Processed file: 2.txt
Processed file: 20.txt
Processed file: 21.txt
Processed file: 3.txt
Processed file: 4.txt
Processed file: 5.txt
Processed file: 6.txt
Processed file: 7.txt
Processed file: 8.txt
Processed file: 9.txt
Processed file: webpage_structure_car_insurent.txt
Processed file: webpage_structure_customer_impression.txt
Processed file: webpage_structure__category_career_.txt
Processed file: webpage_structure__mobile-app-presscon_.txt
Processed file: webpage_structure__personal-accident-insurance-tpb_.txt
Processed file: webpage_structure__privacy-notice_.txt
Processed file: webpage_structure__responsible-lending-announce_.txt
Processed file: webpage_structure__responsible-lending_announce_.txt
Processed file: webpage_structure__s

In [16]:
from langchain.vectorstores import FAISS
from langchain.tools import StructuredTool
from langchain.chains import RetrievalQA, ConversationChain
from langchain.chat_models import AzureChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain_community.embeddings.azure_openai import AzureOpenAIEmbeddings
from typing import Dict
import os

In [30]:
# Azure OpenAI credentials.
# SECURITY: the API key must be supplied through the environment (or a secrets
# manager), never written into the notebook. A key was previously committed in
# this cell, so it should be treated as leaked and rotated.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set. Export it before starting the kernel "
        "instead of hardcoding it in the notebook."
    )
os.environ["AZURE_OPENAI_ENDPOINT"] = 'https://dataiku-gpt4ommi.openai.azure.com'

# Embeddings client; it has to match the one used when the index was built.
embeddings = AzureOpenAIEmbeddings(
    # NOTE(review): a deployment called "gpt-4o" paired with an embedding model
    # looks suspicious — confirm this Azure deployment really serves
    # text-embedding-3-large.
    deployment="gpt-4o",
    model="text-embedding-3-large",
    openai_api_type="azure",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version="2023-05-15",
    chunk_size=1500,
)

# NOTE(review): vectorstore_path is not referenced anywhere in the visible
# notebook; load_local below is given the folder name, not this file path.
vectorstore_path = "combined_vectorstore/index.faiss"
# SECURITY: allow_dangerous_deserialization=True is an explicit opt-in to
# loading serialized data stored with the index. Only load index folders that
# were produced by this project — never ones from an untrusted source.
combined_vectorstore = FAISS.load_local("combined_vectorstore", embeddings, allow_dangerous_deserialization=True)
# Retrieve the 5 most similar chunks per query.
combined_retriever = combined_vectorstore.as_retriever(search_kwargs={"k": 5})

# Chat model used by both the retrieval chain and the conversation chain.
llm = AzureChatOpenAI(
    openai_api_version="2024-08-01-preview",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment="dataiku-ssci-gpt-4o",
)

# Retrieval QA chain: "stuff" puts the retrieved chunks into a single prompt;
# the source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=combined_retriever,
    return_source_documents=True
)

# Per-user conversation memories, keyed by user id.
user_memories = {}

# Scripted exchanges used to seed every new user's memory. Each entry is saved
# as one human/AI turn: "input" becomes the human message and "output" the AI
# message.
# NOTE(review): the "system"/"assistant" values are therefore stored as human
# messages, not as real system instructions — consider moving these
# instructions into the chain's prompt instead.
pre_history = [
    {"input": "system", "output": "คุณคือ พนักงานของศรีสวัสดิ์ซึ่งเป็นบริษัทปล่อยสินเชื่อ"},
    {"input": "assistant", "output": "เป้าหมายของคุณคือตอบคำถามเกี่ยวกับบริษัทนี้เท่านั้นห้ามพูดถึงเรื่องอื่นๆ"},
    {"input": "assistant", "output": "โปรดตอบคำถามให้เป็นภาษาเดียวกับที่ถูกถามมา"},
    # Typo fixed: "ตอบ" (answer) was misspelled as "อบ", garbling the instruction.
    {"input": "assistant", "output": "โปรดตอบคำถามอย่างละเอียดเสมอๆ"}
]

def add_pre_history(user_id, pre_history):
    """Seed one user's conversation memory with scripted exchanges.

    A ConversationBufferMemory is created and registered in ``user_memories``
    when the user has none yet. Each entry's ``'input'``/``'output'`` pair is
    then saved to that memory, in order.
    """
    try:
        memory = user_memories[user_id]
    except KeyError:
        # First contact with this user: register a fresh memory.
        memory = ConversationBufferMemory()
        user_memories[user_id] = memory

    for turn in pre_history:
        memory.save_context({"input": turn['input']}, {"output": turn['output']})


from typing import Dict, List

def process_query(query: str, user_id: str) -> Dict[str, object]:
    """Answer ``query`` for ``user_id`` using retrieval plus per-user memory.

    The retrieval QA chain produces an initial answer together with its source
    documents. The question and that initial answer are then passed to a
    ConversationChain backed by the user's memory, which produces the final
    reply.

    Args:
        query: The user's question.
        user_id: Key into ``user_memories``. A user without a memory gets one
            seeded from ``pre_history`` first.

    Returns:
        A dict with keys ``"query"``, ``"answer"`` (conversation chain output),
        ``"initial_answer"`` (QA chain output) and ``"sources"`` (a list of
        dicts with ``"number"``, ``"source"`` and ``"excerpt"``).
    """
    # Seed the memory on first contact so an unknown user_id does not raise
    # KeyError on the lookup below.
    if user_id not in user_memories:
        add_pre_history(user_id, pre_history)

    # Conversation chain bound to this user's memory.
    conversation = ConversationChain(
        llm=llm,
        memory=user_memories[user_id],
        verbose=True
    )

    # Retrieval step: initial answer plus the documents it was drawn from.
    qa_result = qa_chain({"query": query})
    initial_answer = qa_result['result']
    sources = qa_result['source_documents']

    # Memory-aware step: let the conversation chain phrase the final reply.
    conversation_result = conversation.predict(input=f"Question: {query}\nInitial Answer: {initial_answer}")

    structured_sources = []
    for i, doc in enumerate(sources, 1):
        content = doc.page_content
        structured_sources.append({
            "number": i,
            "source": doc.metadata.get('source', 'Unknown'),
            # Add the ellipsis only when the text was actually cut.
            "excerpt": content[:200] + "..." if len(content) > 200 else content
        })

    return {
        "query": query,
        "answer": conversation_result,
        "initial_answer": initial_answer,
        "sources": structured_sources
    }

# Update the StructuredTool to reflect the new return type
# kpp_document_tool = StructuredTool.from_function(
#     name="เอกสารการปล่อยสินเชื่อต่างๆ",
#     func=process_query,
#     description="นี่คือข้อมูลและเอกสารการปล่อยสินเชื่อในบริษัทศรีสวัสดิ์ซึ่งมีข้อมูลครบถ้วนทั้งหมด",
#     return_direct=True
# )

In [31]:
def interactive_chat():
    """Run a console chat loop for a single user.

    Prompts for a user id, seeds that user's memory from ``pre_history`` if it
    has none, then answers questions through ``process_query`` until the user
    types ``exit`` (case-insensitive, surrounding whitespace ignored) or the
    input stream ends or is interrupted. Blank questions are ignored.
    """
    user_id = input("Enter your user ID to start chatting: ")
    if user_id not in user_memories:
        add_pre_history(user_id, pre_history)

    print("\nWelcome to the chat! You can start asking questions.")
    print("Type 'exit' to end the chat.\n")
    while True:
        try:
            query = input("Your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt closes the chat cleanly instead of
            # surfacing a traceback.
            print("Ending the chat.")
            break
        if not query:
            # Skip blank input rather than sending an empty question through
            # the retrieval and LLM chains.
            continue
        if query.lower() == 'exit':
            print("Ending the chat.")
            break
        response = process_query(query, user_id)
        print(f"\nAnswer: {response['answer']}")
        print("\nSources:")
        for source in response['sources']:
            print(f"{source['number']}. {source['excerpt']}")

# Start the console chat only when this code is executed directly (module name
# is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    interactive_chat()


Welcome to the chat! You can start asking questions.
Type 'exit' to end the chat.



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
Human: system
AI: คุณคือ พนักงานของศรีสวัสดิ์ซึ่งเป็นบริษัทปล่อยสินเชื่อ
Human: assistant
AI: เป้าหมายของคุณคือตอบคำถามเกี่ยวกับบริษัทนี้เท่านั้นห้ามพูดถึงเรื่องอื่นๆ
Human: assistant
AI: โปรดตอบคำถามให้เป็นภาษาเดียวกับที่ถูกถามมา
Human: assistant
AI: โปรดอบคำถามอย่างละเอียดเสมอๆ
Human: Question: รู้อะไรบ้าง 
Initial Answer: ฉันสามารถช่วยตอบคำถามและให้ข้อมูลในหลากหลายเรื่อง เช่น ด้านการเงิน การลงทุน การประกันภัย การจัดการทรัพย์สิน กฎหมาย การท่องเที่ยว แนะนำสถานที่ต่างๆ รวมถึงข้อมูลทั่วไปที่เกี่ยวกับธุรกิจและการบริการของบริษัทศรีสวัสดิ์ กรุณาบอกความต้องการหรือคำถามขอ