In [None]:
!pip install -r requirements.txt

In [None]:
# Shell out to the `nvidia-smi` command and show its output in the cell.
!nvidia-smi

In [None]:
import torch
import fitz
import os
import re

In [None]:
import fitz

# Data PreProcessing

In [None]:
# Folder (relative to the notebook's working directory) that holds the source PDFs.
pdf_directory = "pdf"

# os.listdir() yields entries in arbitrary, platform-dependent order. Sorting
# makes the list -- and therefore which file ends up at index 0 -- stable
# across runs and machines.
pdf_files = sorted(
    os.path.join(pdf_directory, file)
    for file in os.listdir(pdf_directory)
    if file.endswith('.pdf')
)

In [None]:
# Open every path in pdf_files with fitz.open, keeping the resulting
# document objects in the same order as the paths.
docs = list(map(fitz.open, pdf_files))

# Report how many pages the first document has.
print(docs[0].page_count)

In [None]:
# Concatenate the text of every page of the first document (docs[0]) into one
# string; the other opened documents are not read here.
text = "".join(page.get_text() for page in docs[0])

print(text)

In [None]:
# Regex of fragments to strip from the extracted text. The adjacent raw-string
# pieces are joined by Python into a single alternation:
pattern = (
    # digits/digits/digits, digits:digits followed by letters from A, P, M
    # (e.g. "4/3/24, 2:15 PM")
    r'\b\d+\/\d+\/\d+, \d+:\d+ [APM]+\b'
    # any remaining digits/digits pair (e.g. "1/12").
    # NOTE(review): this also matches such pairs inside body text, for example
    # a section number written as "193/30" -- confirm that is acceptable.
    r'|\d+\/\d+\b'
    # an https:// URL running up to the last "/deka#" on the same line
    r'|https:\/\/.*\/deka#'
)

In [None]:
# Delete every match of `pattern` from the extracted text.
cleaned_text = re.compile(pattern).sub('', text)

In [None]:
# Print the cleaned text in full for manual inspection.
print(cleaned_text)

# Embedding

In [None]:
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [None]:
# Run the embedding model on the first CUDA device when one is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

# NOTE(review): "intfloat/multilingual-e5-large" is loaded through the
# *Instruct* embeddings wrapper; confirm that this wrapper and its default
# instruction strings are what this model expects.
embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs={"device": DEVICE},
    model_name="intfloat/multilingual-e5-large",
)

In [None]:
# Candidate split points handed to the splitter, in the order listed.
split_separators = [
    "\n\n",
    "\n",
    " ",
    ".",
    ",",
    "\u200b",  # zero-width space
    "\uff0c",  # fullwidth comma
    "\u3001",  # ideographic comma
    "\uff0e",  # fullwidth full stop
    "\u3002",  # ideographic full stop
    "",
]

text_splitter = RecursiveCharacterTextSplitter(
    separators=split_separators,
    chunk_overlap=64,
    chunk_size=1024,
)

# Chunk the cleaned text and show how many chunks were produced.
split_text = text_splitter.split_text(cleaned_text)

len(split_text)

In [None]:
# Print the complete list of chunks for manual inspection.
print(split_text)

In [None]:
import hashlib

# Give every chunk a stable id (position + content hash). Without explicit ids
# each run stores the chunks under fresh random ids, so re-running this cell
# against the persisted "db" directory keeps piling up duplicate entries.
# With stable ids an unchanged chunk is written to the same entry again.
chunk_ids = [
    f"{index}-{hashlib.sha1(chunk.encode('utf-8')).hexdigest()}"
    for index, chunk in enumerate(split_text)
]

db = Chroma.from_texts(split_text, embeddings, ids=chunk_ids, persist_directory="db")

In [None]:
# Query the vector store with a sample search string; the returned value is
# displayed as the cell output.
db.similarity_search("มาตรา 383")

# Model Selection

In [None]:
from os.path import expanduser
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA

In [None]:
# Paths of the two GGUF model files, each passed through expanduser().
model_path_opt, model_path_sea = map(
    expanduser,
    (
        "model/openthaigpt-1.0.0-beta-13b-chat.Q3_K_L.gguf",
        "model/seallm-7b-v2.5.Q4_K_M.gguf",
    ),
)

In [None]:
# Callback manager with a single handler that writes to stdout.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Settings for the OpenThaiGPT model, gathered in one place and then handed to
# LlamaCpp as keyword arguments.
opt_llm_options = {
    "model_path": model_path_opt,
    "n_ctx": 2048,
    "n_batch": 512,
    "n_gpu_layers": -1,
    "streaming": False,
    "verbose": True,
    "callback_manager": callback_manager,
}

llm_opt = LlamaCpp(**opt_llm_options)

In [None]:
# Callback manager with a single handler that writes to stdout (rebinds the
# `callback_manager` name for this model).
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Settings for the SeaLLM model, gathered in one place and then handed to
# LlamaCpp as keyword arguments.
sea_llm_options = {
    "model_path": model_path_sea,
    "n_ctx": 3000,
    "n_batch": 512,
    "n_gpu_layers": -1,
    "repeat_penalty": 1,
    "streaming": False,
    "verbose": True,
    "callback_manager": callback_manager,
}

llm_sea = LlamaCpp(**sea_llm_options)

# Question and Answer

OpenThaiGPT ConversationalRetrievalChain Prompt Template

In [None]:
# Prompt for the OpenThaiGPT conversational chain, wrapped in [INST] / <<SYS>>
# markers. Placeholders: {context}, {chat_history}, {question}.
# Fixes to the prompt text: "AIlways" -> "Always" and the missing spaces
# between sentences.
template_opt_con = """
    [INST] <<SYS>>
    You are an expert, helpful, respectful and honest Lawyer assistant. Always answer as helpfully as possible. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
    {context}
    Use the following pieces of context to answer the question at the end. If you don't know the answer to a question, please don't share false information.
    <</SYS>>
    {chat_history}
    {question} [/INST]
"""

OpenThaiGPT RetrievalQA Prompt Template

In [None]:
# Prompt for the OpenThaiGPT single-turn QA chain, wrapped in [INST] / <<SYS>>
# markers. Placeholders: {context}, {question}.
# Fixes to the prompt text: "AIlways" -> "Always" and the missing spaces
# between sentences. A truncated trailing comment was also removed.
template_opt_qa = """
    [INST] <<SYS>>
    You are an expert, helpful, respectful and honest Lawyer assistant. Always answer as helpfully as possible. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
    {context}
    Use the following pieces of context to answer the question at the end. If you don't know the answer to a question, please don't share false information.
    <</SYS>>
    {question} [/INST]
"""

SeaLLMs ConversationalRetrievalChain Prompt Template

In [None]:
# Prompt for the SeaLLM conversational chain, using <|im_start|>role ... <eos>
# turns. Placeholders: {context}, {chat_history}, {question}.
# Fixes: "AIlways" -> "Always", missing spaces between sentences, the broken
# "information. and answer" sentence, and -- most importantly -- the prompt now
# ends by opening the assistant turn so the model is cued to write the answer
# instead of continuing the user turn.
template_sea_con =  """<bos> 
    <|im_start|>system
    You are an expert, helpful, respectful and honest Lawyer assistant. Always answer as helpfully as possible. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
    {context}
    Use the following pieces of context to answer the question at the end. If you don't know the answer to a question, please don't share false information, and answer all the questions in Thai language.
    <eos>
    <|im_start|>user
    {chat_history}
    {question}
    <eos>
    <|im_start|>assistant
"""

SeaLLMs RetrievalQA Prompt Template 

In [None]:
# Prompt for the SeaLLM single-turn QA chain, using <|im_start|>role ... <eos>
# turns. Placeholders: {context}, {question}.
# Fixes: "AIlways" -> "Always", missing spaces between sentences, the broken
# "information.and answer" sentence, and -- most importantly -- the prompt now
# ends by opening the assistant turn so the model is cued to write the answer
# instead of continuing the user turn.
template_sea_qa =  """<bos> 
    <|im_start|>system
    You are an expert, helpful, respectful and honest Lawyer assistant. Always answer as helpfully as possible. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
    {context}
    Use the following pieces of context to answer the question at the end. If you don't know the answer to a question, please don't share false information, and answer all the questions in Thai language.
    <eos>
    <|im_start|>user
    {question}
    <eos>
    <|im_start|>assistant
"""

In [None]:
def _build_chat_memory():
    """Return a new ConversationBufferMemory with the settings both chatbots use.

    NOTE(review): return_messages=True is set together with custom
    human/AI prefixes; confirm the prefixes still take effect in that mode.
    """
    return ConversationBufferMemory(
        memory_key="chat_history",
        human_prefix="### Input",
        ai_prefix="### Response",
        output_key='answer',
        return_messages=True,
    )


# Separate memory objects so the two models do not share a conversation history.
memory_opt = _build_chat_memory()

memory_sea = _build_chat_memory()

In [None]:
# The two conversational templates contain a {chat_history} placeholder in
# addition to {context} and {question}, so it is declared here as well; the
# declared variables now match the placeholders actually present.
prompt_opt_con = PromptTemplate(template=template_opt_con, input_variables=["context", "chat_history", "question"])
prompt_opt_qa = PromptTemplate(template=template_opt_qa, input_variables=["context", "question"])
prompt_sea_con = PromptTemplate(template=template_sea_con, input_variables=["context", "chat_history", "question"])
prompt_sea_qa = PromptTemplate(template=template_sea_qa, input_variables=["context", "question"])


In [None]:
# Retriever settings shared by both OpenThaiGPT chains: similarity search with
# a score threshold of 0.7.
opt_retriever_options = {
    "search_type": "similarity_score_threshold",
    "search_kwargs": {"score_threshold": 0.7},
}

# Multi-turn chain: uses memory_opt and the conversational prompt.
chain_opt = ConversationalRetrievalChain.from_llm(
    llm=llm_opt,
    chain_type="stuff",
    retriever=db.as_retriever(**opt_retriever_options),
    memory=memory_opt,
    combine_docs_chain_kwargs={"prompt": prompt_opt_con},
    return_source_documents=True,
    verbose=True,
)

# Single-turn chain: no memory, QA prompt.
chain_qa_opt = RetrievalQA.from_chain_type(
    llm=llm_opt,
    chain_type="stuff",
    retriever=db.as_retriever(**opt_retriever_options),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_opt_qa},
    verbose=True,
)

In [None]:
# Retriever settings shared by both SeaLLM chains: similarity search with a
# score threshold of 0.5.
sea_retriever_options = {
    "search_type": "similarity_score_threshold",
    "search_kwargs": {"score_threshold": 0.5},
}

# Multi-turn chain: uses memory_sea and the conversational prompt.
chain_sea = ConversationalRetrievalChain.from_llm(
    llm=llm_sea,
    chain_type="stuff",
    retriever=db.as_retriever(**sea_retriever_options),
    memory=memory_sea,
    combine_docs_chain_kwargs={"prompt": prompt_sea_con},
    return_source_documents=True,
    verbose=True,
)

# Single-turn chain: no memory, QA prompt.
chain_qa_sea = RetrievalQA.from_chain_type(
    llm=llm_sea,
    chain_type="stuff",
    retriever=db.as_retriever(**sea_retriever_options),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_sea_qa},
    verbose=True,
)

## Run Result for OpenThai-GPT (ConversationalRetrievalChain)

In [None]:
result1_opt = chain_opt("ยกเลิกสัญญาเช่ารถก่อนกำหนด ต้องจ่ายค่าเสียหายไหม?")

In [None]:
print(result1_opt["answer"])

In [None]:
result2_opt = chain_opt("ยกเลิกสัญญาเช่ารถก่อนกำหนด ต้องจ่าค่าเสียหายเต็มจำนวนไหม?")

In [None]:
print(result2_opt["answer"])

In [None]:
result3_opt = chain_opt("หากคำเสียหายในสัญญาเช่ารถสูงมากๆ กฎหมายข้อไหนช่วยได้บ้าง?")

In [None]:
print(result3_opt["answer"])

In [None]:
result4_opt = chain_opt("ดอกเบี้ยในสัญญาสามารถสูงสุดต่อปีได้ ร้อยละ เท่าไหร่?")

In [None]:
print(result4_opt["answer"])

In [None]:
result5_opt = chain_opt("อธิบายกฎหมาย ป.พ.พ มาตรา 383")

In [None]:
print(result5_opt["answer"])

## Run Result for OpenThai-GPT (RetrievalQA)

In [None]:
result1_opt_qa = chain_qa_opt("ยกเลิกสัญญาเช่ารถก่อนกำหนด ต้องจ่ายค่าเสียหายไหม?")

In [None]:
print(result1_opt_qa["result"])

In [None]:
result2_opt_qa = chain_qa_opt("ยกเลิกสัญญาเช่ารถก่อนกำหนด ต้องจ่าค่าเสียหายเต็มจำนวนไหม?")

In [None]:
print(result2_opt_qa["result"])

In [None]:
result3_opt_qa = chain_qa_opt("หากคำเสียหายในสัญญาเช่ารถสูงมากๆ กฎหมายข้อไหนช่วยได้บ้าง?")

In [None]:
print(result3_opt_qa["result"])

In [None]:
result4_opt_qa = chain_qa_opt("ดอกเบี้ยในสัญญาสามารถสูงสุดต่อปีได้ ร้อยละ เท่าไหร่?")

In [None]:
print(result4_opt_qa["result"])

In [None]:
result5_opt_qa = chain_qa_opt("อธิบายกฎหมาย ป.พ.พ มาตรา 383")

In [None]:
print(result5_opt_qa["result"])

## Run Result for SeaLLM (ConversationalRetrievalChain)

In [None]:
result1_sea = chain_sea("ยกเลิกสัญญาเช่ารถก่อนกำหนด ต้องจ่ายค่าเสียหายไหม?")

In [None]:
print(result1_sea["answer"])

In [None]:
result2_sea = chain_sea("ยกเลิกสัญญาเช่ารถก่อนกำหนด ต้องจ่าค่าเสียหายเต็มจำนวนไหม?")

In [None]:
print(result2_sea["answer"])

In [None]:
result3_sea = chain_sea("หากคำเสียหายในสัญญาเช่ารถสูงมากๆ กฎหมายข้อไหนช่วยได้บ้าง?")

In [None]:
print(result3_sea["answer"])

In [None]:
result4_sea = chain_sea("ดอกเบี้ยในสัญญาสามารถสูงสุดต่อปีได้ ร้อยละ เท่าไหร่?")

In [None]:
print(result4_sea["answer"])

In [None]:
result5_sea = chain_sea("อธิบายกฎหมาย ป.พ.พ มาตรา 383")

In [None]:
print(result5_sea["answer"])

## Run Result for SeaLLM (RetrievalQA)

In [None]:
result1_sea_qa = chain_qa_sea("ยกเลิกสัญญาเช่ารถก่อนกำหนด ต้องจ่ายค่าเสียหายไหม?")

In [None]:
print(result1_sea_qa["result"])

In [None]:
result2_sea_qa = chain_qa_sea("ยกเลิกสัญญาเช่ารถก่อนกำหนด ต้องจ่าค่าเสียหายเต็มจำนวนไหม?")

In [None]:
print(result2_sea_qa["result"])

In [None]:
result3_sea_qa = chain_qa_sea("หากคำเสียหายในสัญญาเช่ารถสูงมากๆ กฎหมายข้อไหนช่วยได้บ้าง?")

In [None]:
print(result3_sea_qa["result"])

In [None]:
result4_sea_qa = chain_qa_sea("ดอกเบี้ยในสัญญาสามารถสูงสุดต่อปีได้ ร้อยละ เท่าไหร่?")

In [None]:
print(result4_sea_qa["result"])

In [None]:
result5_sea_qa = chain_qa_sea("อธิบายกฎหมาย ป.พ.พ มาตรา 383")

In [None]:
print(result5_sea_qa["result"])