In [None]:
%%capture
! pip install -U langchain-nomic langchain_community tiktoken langchainhub chromadb langchain langgraph gpt4all

In [None]:
! pip install pprintpp

Collecting pprintpp
  Downloading pprintpp-0.4.0-py2.py3-none-any.whl (16 kB)
Installing collected packages: pprintpp
Successfully installed pprintpp-0.4.0


In [None]:
import os
from getpass import getpass

from langchain_community.llms import HuggingFaceEndpoint

# get a token: https://huggingface.co/docs/api-inference/quicktour#get-your-api-token
#
# Never store a real token in the notebook: anything written here is saved with the
# file and ends up in version control. Reuse a token already exported in the
# environment, otherwise prompt for it without echoing the input.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

### Build Vector database (Search relevant info.)



In [None]:
# One-off preprocessing, kept commented out for provenance: it derives
# reference_address.csv from /content/dict.csv by keeping only the four address
# columns (subdistrict, district, province, zipcode) and writing them as UTF-8
# without the pandas index.
# import pandas as pd
# df = pd.read_csv('/content/dict.csv')
# df[['subdistrict', 'district', 'province', 'zipcode']].to_csv('reference_address.csv', encoding='utf-8', index=False)

In [None]:
### Index: load the reference address CSV, split it, and embed it into a Chroma collection
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma

### LLM
# Model name for the optional local (Ollama) LLM; only referenced from commented-out code.
local_llm = "llama3"

# --- Load: CSVLoader turns every CSV row into one document ---
csv_path = "/content/reference_address.csv"
# NOTE(review): because `fieldnames` is supplied, the underlying csv.DictReader treats the
# first row of the file as data. If the CSV starts with a header row, that header is loaded
# (and indexed) as a document — confirm against the file.
csv_reader_options = {
  'delimiter': ',',
  'fieldnames': ['subdistrict', 'district', 'province', 'zipcode'],
}
loader = CSVLoader(file_path=csv_path, encoding="utf-8", csv_args=csv_reader_options)
document = loader.load()
print(f"Loaded CSV file: {csv_path}")

# --- Split: cap chunks at 150 characters with no overlap between neighbours ---
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=150,
  chunk_overlap=0,
)
texts = text_splitter.split_documents(document)
print(f"Texts splitted for CSV file: {csv_path}")

# --- Embed and store: GPT4All embeddings in the "rag-chroma" collection ---
# NOTE(review): the collection is written under /content/chroma_db; presumably re-running
# this cell against an existing directory adds the same chunks again — verify.
# Alternative that was tried: AzureOpenAIEmbeddings(azure_deployment="EMBED",
# openai_api_version="2023-05-15") as the `embedding`, without a persist directory.
embedding_model = GPT4AllEmbeddings()
vectorstore = Chroma.from_documents(
  documents=texts,
  collection_name="rag-chroma",
  embedding=embedding_model,
  persist_directory="/content/chroma_db",
)

# Retriever over the collection, asked for 5 results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

Loaded CSV file: /content/reference_address.csv
Texts splitted for CSV file: /content/reference_address.csv


Downloading: 100%|██████████| 45.9M/45.9M [00:00<00:00, 69.0MiB/s]
Verifying: 100%|██████████| 45.9M/45.9M [00:00<00:00, 481MiB/s]


In [None]:
### Generate
from langchain.prompts import PromptTemplate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
import pprint

# Prompt (Llama 3 chat format: system turn, user turn, then an open assistant turn).
# The template takes two inputs:
#   question - the raw Thai address text to parse
#   context  - supporting reference text (e.g. a glossary of Thai address terms)
# Every name listed in `input_variables` must appear as a placeholder in the template,
# otherwise the value passed at invoke time never reaches the model.
# The final header uses the `assistant` role, which is the role the Llama 3 instruct
# models are trained to continue from.
prompt = PromptTemplate(
  template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
  Please parse the following Thai address and convert it into a JSON object with the specified format.
  The format should include the fields: house_number, building_name, floor, room, moo, trok_soi, road, subdistrict, district, province, post_code, and unhandle_case.
  Ensure to parse the data with 100% confidence, and if any field is uncertain or lacks enough information, leave that field empty in the output.
  <|eot_id|><|start_header_id|>user<|end_header_id|>
  Question: {question}
  Context: {context}
  <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
  input_variables=["question","context"],
  )

# LLM option:

# Local LLM model
# llm = ChatOllama(model=local_llm, format="json", temperature=0)

# Llama-3-8B-Instruct served through the Hugging Face Inference endpoint.
# NOTE(review): sampling with temperature=0.6 / top_p=0.98 means repeated runs on the same
# address can return different parses — consider a lower temperature for this extraction task.
repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
llm = HuggingFaceEndpoint(
  repo_id=repo_id,
  max_new_tokens=350,
  temperature=0.6,
  top_p=0.98
)

# llama-3-70b-Instruct
# from langchain_community.chat_models import ChatDatabricks
# llm = ChatDatabricks(
#   target_uri="databricks",
#   endpoint="databricks-meta-llama-3-70b-instruct",
#   temperature=0.01,
#   max_tokens=128
# )


# Post-processing
def format_docs(docs):
    """Join the ``page_content`` of every document in ``docs`` into one string.

    Consecutive documents are separated by a blank line; an empty iterable
    yields an empty string.
    """
    separator = "\n\n"
    contents = [entry.page_content for entry in docs]
    return separator.join(contents)

# Chain: fill the prompt, call the LLM, then parse the model's text reply as JSON.
rag_chain = prompt | llm | JsonOutputParser()

# Run
# Glossary mapping each output field to its Thai address term (with the common
# abbreviation in parentheses); supplied to the chain as the `context` input.
context = """
Thai addressing system:
house_number: บ้านเลขที่
moo: หมู่(ห.)
trok_soi: ซอย/แยก(ซ.)
road: ถนน(ถ.)
subdistrict: ตำบล(ต.)/แขวง
district: อำเภอ(อ.)
province: จังหวัด(จ.)
unhandle_case: ใส่ข้อความที่ไม่สามารถระบุได้

 """

# Reference examples of the expected parse (not sent to the model).

#Example 1: if any field is uncertain or lacks enough information, leave that field empty in the output
#Prompt: 58 ซอยสามัคคี 40 คลองท่อมเหนือ กระบี่ 11000
#Output:
#{'building_name': '',
# 'district': '',
# 'floor': '',
# 'house_number': '58',
# 'moo': '',
# 'post_code': '11000',
# 'province': 'กระบี่',
# 'road': '',
# 'room': '',
# 'subdistrict': 'คลองท่อมเหนือ',
# 'trok_soi': 'สามัคคี 40',
# 'unhandle_case': ''}

#Example 2
#Prompt: 129 เดอะมอลล์ ชอปปิ้ง คอมเพล็กซ์ ชั้นที่ 3 ห้องเลขที่ 3S-C2,A2 ถนนรัชดาภิเษก (ท่าพระ-ตากสิน) ต.บุคคโล อ.เขตธนบุรี จ.กรุงเพมหานคร 10600
#Output:
# {'building_name': 'เดอะมอลล์ ชอปปิ้ง คอมเพล็กซ์',
# 'district': 'เขตธนบุรี',
# 'floor': 'ชั้นที่ 3',
# 'house_number': '129',
# 'moo': '',
# 'post_code': '10600',
# 'province': 'กรุงเพมหานคร',
# 'road': 'ถนนรัชดาภิเษก (ท่าพระ-ตากสิน)',
# 'room': '3S-C2,A2',
# 'subdistrict': 'ต.บุคคโล',
# 'trok_soi': '',
# 'unhandle_case': ''}

# To edit input: keep exactly one `question` assignment uncommented.
#question = """123 ซอย สบายใจ แยก 3 แขวงสามเสนนอก เขตห้วยขวาง กรุงเทพมหานคร 10310"""
question = """129 เดอะมอลล์ ชอปปิ้ง คอมเพล็กซ์ ชั้นที่ 3 ห้องเลขที่ 3S-C2,A2 ถนน รัชดาภิเษก (ท่าพระ-ตากสิน) ต.บุคคโล อ.เขตธนบุรี จ.กรุงเพมหานคร 10600"""
#question = """58 ซอยสามัคคี 40   คลองท่อมเหนือ กระบี่ 11000"""
#question = """58 ซอยสามัคคี 40  คลองท่อมเหนือ กระบี่ 11000"""
# Look up the reference-address rows closest to the input and print them for inspection.
# NOTE(review): `docs` is only printed — the retrieved rows are not passed to `rag_chain`
# (format_docs is never called here), so generation does not depend on retrieval. Confirm intent.
docs = retriever.invoke(question)
pprint.pprint(docs)
# Run the chain; `generation` holds the value returned by the JSON output parser.
generation = rag_chain.invoke({"context": context, "question": question})
# For example:
#   Text: 129 เดอะมอลล์ ชอปปิ้ง คอมเพล็กซ์ ชั้นที่ 3 ห้องเลขที่ 3S-C2,A2 ถนน รัชดาภิเษก (ท่าพระ-ตากสิน) ต.บุคคโล อ.เขตธนบุรี จ.กรุงเพมหานคร 10600
#   Named entities: {'house_number': '129', 'building_name': 'เดอะมอลล์ ชอปปิ้ง คอมเพล็กซ์', 'floor': 'ชั้นที่ 3', 'room': 'ห้องเลขที่ 3S-C2', 'moo': '', 'trok_soi': 'ท่าพระ-ตากสิน', 'road': 'รัชดาภิเษก', 'subdistrict': 'บุคคโล', 'district': 'ธนบุรี', 'province': 'กรุงเทพมหานคร', 'post_code': '10600', 'unhandle_case': 'A2'}

# generation = rag_chain.invoke({"context": context, "question": question})
pprint.pprint(generation)



Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful
[Document(page_content='subdistrict: คลองสองต้นนุ่น\ndistrict: ลาดกระบัง\nprovince: กรุงเทพมหานคร\nzipcode: 10520', metadata={'row': 193, 'source': '/content/reference_address.csv'}),
 Document(page_content='subdistrict: บ้านคลองสวน\ndistrict: พระสมุทรเจดีย์\nprovince: สมุทรปราการ\nzipcode: 10290', metadata={'row': 5192, 'source': '/content/reference_address.csv'}),
 Document(page_content='subdistrict: อีสานเขต\ndistrict: เฉลิมพระเกียรติ\nprovince: บุรีรัมย์\nzipcode: 31110', metadata={'row': 2763, 'source': '/content/reference_address.csv'}),
 Document(page_content='subdistrict: พระพุทธ\ndistrict: เฉลิมพระเกียรติ\nprovince: นครราชสีมา\nzipcode: 30230', metadata={'row': 1937, 'source': '/content/reference_address.csv'}),
 Document(page_content='

In [None]:
# Display the PromptTemplate currently bound to `prompt`.
# NOTE(review): the saved output of this cell shows a template worded differently from the
# one defined in the Generate cell, so it appears to come from an earlier run — re-run to refresh.
prompt

PromptTemplate(input_variables=['context', 'question'], template='<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are expect to perform parse Thai address.\n  You get Thai address text and return parse result in following entities [house_number, building_name, floor, room, moo, trok_soi, road, subdistrict, district, province, post_code, unhandle_case] as a JSON format.\n  based on provided document (exactly match) and do not insert entities by yourself."<|eot_id|><|start_header_id|>user<|end_header_id|>\n  Question: {question}\n  Context: {context}\n  Named entities: <|eot_id|><|start_header_id|>parse assistant<|end_header_id|>')

In [None]:
# Validate result input/output
import pprint
from collections import OrderedDict

# Echo the address text currently bound to `question`.
pprint.pprint(question)
# NOTE(review): `pprint` is bound to the module here, so the line below would need
# `pprint.pprint(...)` if it were re-enabled.
#pprint(OrderedDict(generation))

'58 ซอยสามัคคี 40   คลองท่อมเหนือ กระบี่ 11000'


In [None]:
# Display the parsed result produced by the most recent `rag_chain.invoke(...)` call.
# NOTE(review): check the saved output against the input it was generated from; the prompt
# asks for uncertain fields to be left empty.
generation

{'house_number': '58',
 'building_name': 'ซอยสามัคคี 40',
 'floor': '',
 'room': '',
 'moo': '',
 'trok_soi': 'สามัคคี',
 'road': '',
 'subdistrict': 'คลองท่อมเหนือ',
 'district': 'กระบี่',
 'province': 'กระบี่',
 'post_code': '11000',
 'unhandle_case': ''}

In [37]:
# Clone the TESTGIT repository into ./TESTGIT, relative to the notebook's working directory.
!git clone https://github.com/Sakanarmmfec/TESTGIT.git

Cloning into 'TESTGIT'...


In [40]:
!git pull origin main

remote: Repository not found.
fatal: repository 'https://github.com/Sakanarmmfec/titanic.git/' not found


In [43]:
import os
import subprocess
from getpass import getpass

# Input your GitHub username and personal access token
username = input("Enter your GitHub username: ")
token = getpass("Enter your GitHub personal access token: ")

# Configure Git to use your username and token.
# Commands are passed as argument lists (no shell), so quotes or shell metacharacters
# in the typed values are never interpreted as shell syntax.
subprocess.run(["git", "config", "--global", "user.name", username], check=True)

# Do not write the token to ~/.gitconfig: `user.password` is not a setting git reads for
# authentication, and the value would sit on disk in plaintext. Instead, hand the
# credential to git's in-memory credential cache (expires after one hour).
# NOTE(review): the `cache` helper relies on Unix sockets — fine on Colab/Linux; confirm for other platforms.
subprocess.run(
    ["git", "config", "--global", "credential.helper", "cache --timeout=3600"],
    check=True,
)
subprocess.run(
    ["git", "credential", "approve"],
    input=f"protocol=https\nhost=github.com\nusername={username}\npassword={token}\n\n",
    text=True,
    check=True,
)

# Add, commit, and push changes


Enter your GitHub username: Sakanarmmfec
Enter your GitHub personal access token: ··········


0

In [44]:
# Append a "# TESTGIT" heading line to README.md in the current working directory
# (the shell creates the file if it does not exist).
# NOTE(review): `>>` appends, so re-running this cell adds the heading again; also confirm
# whether the file is meant to be created inside the cloned TESTGIT/ directory.
!echo "# TESTGIT" >> README.md