In [1]:
import os
import pymupdf as pymu
import re
import unicodedata 
import requests
from dotenv import load_dotenv
import google.generativeai as genai
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
from typing import Annotated
from typing_extensions import TypedDict
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from IPython.display import Image, display

from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.prompts.few_shot import FewShotPromptTemplate
import pandas as pd


In [3]:

def process(page):
    """Clean the text of one PDF page and turn article markers into headings.

    Args:
        page: Either a plain string, or any object exposing ``get_text()``
            (this notebook passes the pages obtained by iterating a document
            opened with ``pymupdf``).

    Returns:
        The cleaned text: NFC-normalised, watermark removed, Thai digits
        converted to ASCII digits, ``- N -`` page markers removed, leading
        non-Thai characters stripped, and every article marker
        (``มาตรา N`` followed by two spaces) rewritten as a level-5
        Markdown heading on its own line.

    Raises:
        AttributeError: If ``page`` is neither a string nor has ``get_text``.
    """
    # Dispatch on the type explicitly instead of a bare ``except``, so a real
    # failure inside ``get_text()`` is reported rather than silently hidden.
    raw = page if isinstance(page, str) else page.get_text()
    text = unicodedata.normalize("NFC", raw)

    # Watermark text, in the mis-encoded spelling and in the correct spelling.
    for watermark in (
        "ส ำนักงำนคณะกรรมกำรกฤษฎีกำ",
        "สำนักงานคณะกรรมการกฤษฎีกา",
    ):
        text = text.replace(watermark, "").strip()

    # Thai digits U+0E50..U+0E59 map one-to-one onto ASCII 0..9.
    text = text.translate({0x0E50 + value: str(value) for value in range(10)})

    # Ignore page numbers for now, then drop anything before the first Thai
    # character (Thai block U+0E00..U+0E7F) at the very start of the text.
    text = re.sub(r"- (\d+) -", "", text)
    text = re.sub(r"^[^\u0E00-\u0E7F]*", "", text)

    # Rewrite both spellings of the article marker in ONE pass. Re-running a
    # substitution per match (as two separate loops) re-matches headings that
    # were already produced and yields "##### ##### มาตรา N".
    text = re.sub(
        r"(?:มำตรำ|มาตรา) (\d+)  ",
        lambda match: f"##### มาตรา {match.group(1)}  \n",
        text,
    )

    return text

In [4]:
# Shared Gemini chat model; `correct` pipes its prompt into this instance.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,  # presumably chosen to keep corrections as repeatable as possible
    max_retries=2,
    max_tokens=None,
    timeout=None,
)


def correct(body):
    """Ask the shared ``llm`` to repair encoding-induced misspellings in Thai text.

    Args:
        body: Text substituted for the ``{text}`` placeholder of the prompt.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    # NOTE(review): the prompt mentions "the example" but contains none, and
    # spells "mispellings"; both are kept verbatim because changing the prompt
    # changes what the model receives.
    instructions = """
    Below is a Thai text. Follow the instructions and the example.
    1. Correct mispellings from encoding errors.
    Do not do anything else. Do not remove #'s.

    {text}
    """
    pipeline = PromptTemplate.from_template(instructions) | llm
    reply = pipeline.invoke(body)
    return reply.content

In [5]:
def batch_correct(PATH, corr=True, batch_size=5):
    """Extract a PDF page by page and append the cleaned text to a ``.md`` file.

    Every page goes through ``process``; pages are accumulated and written in
    groups of ``batch_size``. When ``corr`` is true each group is first sent
    through ``correct``; otherwise it is written unchanged.

    The output file sits next to the PDF with the extension swapped for
    ``.md`` and is opened in append mode, so calling this twice for the same
    PDF writes the content twice.

    Args:
        PATH: Path of the PDF to read.
        corr: Whether to pass each group through ``correct`` before writing.
        batch_size: Number of pages per group (default 5).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # splitext swaps only the final extension. A plain ``.replace(".pdf", ...)``
    # returns the input path unchanged for e.g. "x.PDF", which would make this
    # function append text to the PDF itself.
    out_path = os.path.splitext(PATH)[0] + ".md"

    def write_batch(chunk, out):
        # Single place that decides between corrected and raw output.
        out.write((correct(chunk) if corr else chunk) + "\n\n")

    pending = ""
    # Both handles are closed on exit, including when ``correct`` raises.
    with pymu.open(PATH) as pdf, open(out_path, "a", encoding="utf-8") as f:
        for page_number, page in enumerate(pdf, start=1):
            pending += process(page) + "\n\n"

            if page_number % batch_size == 0:
                write_batch(pending, f)
                pending = ""

        # Trailing pages that did not fill a complete group.
        if pending.strip():
            write_batch(pending, f)

In [6]:
def format_md(PATH):
    """Promote structural titles in an extracted ``.md`` file to Markdown headings.

    A title is recognised when a blank line is followed by one of the keywords
    below, a space, a number, and then the title text on the following
    line(s). It is rewritten as a single heading line:

        บรรพ     -> "#"
        ส่วนที่    -> "##"
        ลักษณะ   -> "###"
        หมวด     -> "####"

    The result is written to ``PATH`` with every occurrence of ``"data"``
    replaced by ``"docs"``.

    Args:
        PATH: Path of the Markdown file to read.
    """
    # The extraction step writes UTF-8, so read it back explicitly as UTF-8
    # rather than relying on the platform's default encoding.
    with open(PATH, "r", encoding="utf-8") as f:
        text = f.read()

    heading_prefixes = {
        "บรรพ": "# บรรพ",
        "ส่วนที่": "## ส่วนที่",
        "ลักษณะ": "### ลักษณะ",
        "หมวด": "#### หมวด",
    }

    # One pass per level, in order: each heading ends with a blank line, which
    # is what lets a title of the next level directly below it match.
    for keyword, heading in heading_prefixes.items():
        pattern = r"\n\s*\n" + keyword + r" (\d+)\s*\n([\u0E00-\u0E7F\s]*)\s*\n"

        def to_heading(match, heading=heading):
            # The title may span several lines; fold it onto the heading line.
            title = match.group(2).replace("\n", " ")
            return f"\n \n{heading} {match.group(1)} {title} \n\n"

        # A callable replacement rewrites each match exactly once and avoids
        # reusing matched text as a regular expression.
        text = re.sub(pattern, to_heading, text)

    # NOTE(review): this replaces "data" anywhere in the path, including the
    # file name, and the target directory must already exist.
    with open(PATH.replace("data", "docs"), "w", encoding="utf-8") as f:
        f.write(text)
    

In [12]:
# Source PDF and the Markdown extracted from it (same name, different extension).
PATH = "../data/ประมวลกฎหมายแพ่งและพาณิชย์ (ฉบับอัพเดทล่าสุด).pdf"
MD_PATH = "../data/ประมวลกฎหมายแพ่งและพาณิชย์ (ฉบับอัพเดทล่าสุด).md"

# Extraction is left disabled here; only the heading formatting is re-run.
# batch_correct(PATH)
format_md(MD_PATH)

In [7]:
# Extract without the LLM correction pass, then format the headings.
# NOTE: batch_correct opens its output in append mode, so re-running this
# cell adds the extracted text to the existing .md file a second time.
PATH = "../data/สำนักงานคณะกรรมการกฤษฎีกา.pdf"
MD_PATH = "../data/สำนักงานคณะกรรมการกฤษฎีกา.md"

batch_correct(PATH, corr=False)
format_md(MD_PATH)

## DB: build and inspect the Chroma vector store

In [4]:
import sys
import chromadb
# Put ../src/ on the import path so that the local `utils` module resolves.
sys.path.append("../src/")
from utils import get_retriever

In [5]:
# Formatted Markdown produced by format_md (note the docs/ directory).
PATH = "../docs/สำนักงานคณะกรรมการกฤษฎีกา.md"

# get_retriever lives in ../src/utils.py; presumably it builds or loads the
# "CPC" collection from this file — confirm against that module.
rt = get_retriever(md_path=PATH, collection_name="CPC")

[32m2025-01-25 07:30:31.673[0m | [1mINFO    [0m | [36mutils[0m:[36mget_retriever[0m:[36m21[0m - [1mCreating database[0m


In [7]:
# Open the on-disk Chroma store; the last expression displays its collections.
client = chromadb.PersistentClient(path="../chroma_db/")
client.list_collections()

[Collection(name=CCC), Collection(name=CPC)]