In [1]:
%pip install -r requirements.txt
%pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Optional one-time setup, kept commented out.
# It downloads the source PDF from Google Drive as 'file.pdf', keeps the pages
# at indices 3-17 and 39-61 (as passed to `pdf.pages[...]`), and writes them to
# 'file_subset.pdf'. Uncomment and run once if 'file_subset.pdf' is missing.
# import gdown
# from PyPDF2 import PdfReader, PdfWriter

# file_id = '13nSk1WIIRIH4Uh8C6Nj9KJEJzJsMTDaW'
# pdf_file_path = f'https://drive.google.com/uc?id={file_id}'
# file_base_name = 'file'
# gdown.download(pdf_file_path, f'{file_base_name}.pdf', quiet=False)
# pdf = PdfReader('file.pdf')
# pages = list(range(3, 18)) + list(range(39, 62))
# pdfWriter = PdfWriter()

# for page_num in pages:
#     pdfWriter.add_page(pdf.pages[page_num])

# with open(f'{file_base_name}_subset.pdf', 'wb') as f:
#     pdfWriter.write(f)

In [3]:
import joblib
import os
import nest_asyncio
from enum import Enum
from typing import List, Tuple
import nltk

from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

import warnings
import openai

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above, so API removals will not be announced before they break.
warnings.filterwarnings("ignore")
# Patch asyncio so that an already-running event loop can be re-entered.
# Presumably needed because LlamaParse runs async code inside Jupyter's own
# event loop -- confirm.
nest_asyncio.apply()

In [4]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. Set LLAMA_CLOUD_API_KEY and TYPHOON_API_KEY before starting the
# kernel; a missing variable raises KeyError naming the variable.
# NOTE: any key that was previously committed in this cell must be treated as
# compromised and rotated.
llamaparse_api_key = os.environ["LLAMA_CLOUD_API_KEY"]

# OpenAI-compatible client pointed at the OpenTyphoon endpoint.
client = openai.OpenAI(
    api_key=os.environ["TYPHOON_API_KEY"],
    base_url='https://api.opentyphoon.ai/v1'
)

In [5]:
# Directory that holds every generated artifact (parse cache, markdown dump,
# FAISS index).
DATA_DIR = "./data"
# joblib cache of the LlamaParse result; when this file exists the PDF is not
# parsed again.
PARSED_DATA_FILE = os.path.join(DATA_DIR, "parsed_data.pkl")
# Input PDF, resolved relative to the notebook's working directory.
PDF_FILE = r"file_subset.pdf"
# Free-text instruction handed to LlamaParse as `parsing_instruction`.
PARSING_INSTRUCTIONS = """
The provided document is a statistical report from the National Statistical Office of Thailand.
It contains information about various industries, including employment, revenue.
The report is in Thai language.
The document is structured with tables and text sections.
Try to extract information accurately and answer questions concisely.
"""

class Language(Enum):
    """Language codes whose ``.value`` is passed to LlamaParse as ``language``."""
    THAI = "th"
    ENGLISH = "en"

def load_or_parse_data(data_file: str, pdf_file: str, parsing_instructions: str,
                      llamaparse_api_key: str, language: Language = Language.THAI) -> List:
    """Return the parsed documents for ``pdf_file``, caching them in ``data_file``.

    If ``data_file`` already exists it is loaded with joblib and returned
    without contacting LlamaParse. Otherwise the PDF is parsed to markdown with
    LlamaParse and a non-empty result is written to ``data_file``.

    Args:
        data_file: Path of the joblib cache file.
        pdf_file: Path of the PDF to parse when no cache exists.
        parsing_instructions: Text passed to LlamaParse as ``parsing_instruction``.
        llamaparse_api_key: API key for LlamaParse.
        language: Document language; its ``.value`` is passed to LlamaParse.

    Returns:
        The cached or freshly parsed list of documents. An empty list is
        returned when parsing fails or yields nothing; the failure is logged
        and nothing is cached, so the next call tries again.
    """
    # Local import so the function does not depend on a change to the import cell.
    import logging

    logger = logging.getLogger(__name__)

    if os.path.exists(data_file):
        # NOTE: joblib.load unpickles arbitrary objects and can execute code;
        # only load cache files that this notebook produced itself.
        return joblib.load(data_file)

    try:
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsing_instructions,
            max_timeout=5000,
            language=language.value,
        )
        parsed_data = parser.load_data(pdf_file)
    except Exception:
        # Stay best-effort (return []), but record why parsing failed instead
        # of discarding the error silently.
        logger.exception("LlamaParse failed to parse %r", pdf_file)
        return []

    if not parsed_data:
        logger.warning("LlamaParse returned no documents for %r", pdf_file)
        return []

    # Make sure the cache directory exists so that the dump cannot fail on a
    # missing folder when the caller has not created it.
    cache_dir = os.path.dirname(data_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    joblib.dump(parsed_data, data_file)
    return parsed_data

In [6]:
# Fetch the NLTK tokenizer and POS-tagger data (skipped when already present).
# Presumably required by the `unstructured` markdown loader used later --
# confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kongl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\kongl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
def create_vector_database(llamaparse_api_key: str, pdf_file: str = PDF_FILE, data_file: str = PARSED_DATA_FILE) -> Tuple:
    """Parse the PDF, chunk and embed it, and build a FAISS index under ``DATA_DIR``.

    The parsed documents (from cache or LlamaParse) are concatenated into
    ``DATA_DIR/output.md``, reloaded through ``UnstructuredMarkdownLoader``,
    split into 512-character chunks with 128 characters of overlap, embedded,
    and indexed with FAISS. The index is saved to ``DATA_DIR/faiss_index``.

    Args:
        llamaparse_api_key: API key forwarded to ``load_or_parse_data``.
        pdf_file: PDF to parse when no cached parse exists.
        data_file: Path of the joblib parse cache.

    Returns:
        A ``(vector_store, embed_model)`` tuple.

    Raises:
        ValueError: If parsing produced no documents, or splitting produced no
            chunks, so there is nothing to index.
    """
    os.makedirs(DATA_DIR, exist_ok=True)
    parsed_documents = load_or_parse_data(
        data_file=data_file,
        pdf_file=pdf_file,
        parsing_instructions=PARSING_INSTRUCTIONS,
        llamaparse_api_key=llamaparse_api_key
    )
    # load_or_parse_data returns [] on any parsing failure; stop here with a
    # clear message instead of failing later inside the index build.
    if not parsed_documents:
        raise ValueError(
            f"No parsed content available for {pdf_file!r}; "
            "check the LlamaParse API key and that the PDF exists."
        )

    markdown_output = os.path.join(DATA_DIR, "output.md")
    with open(markdown_output, 'w', encoding='utf-8') as f:
        for doc in parsed_documents:
            f.write(doc.text + '\n')

    loader = UnstructuredMarkdownLoader(markdown_output)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
    doc_chunks = text_splitter.split_documents(documents)
    if not doc_chunks:
        raise ValueError(f"No text chunks could be extracted from {markdown_output!r}.")

    # NOTE(review): "bge-base-en-v1.5" looks like an English model while the
    # parsing instructions say the report is in Thai; a multilingual embedding
    # model may retrieve better -- confirm before changing.
    embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
    vector_store = FAISS.from_documents(documents=doc_chunks, embedding=embed_model)
    faiss_index_path = os.path.join(DATA_DIR, "faiss_index")
    vector_store.save_local(faiss_index_path)

    return vector_store, embed_model


In [8]:
from openai import OpenAI

# OpenAI-compatible client pointed at the OpenTyphoon endpoint. The API key is
# read from the TYPHOON_API_KEY environment variable so that no secret is
# stored in the notebook; a missing variable raises KeyError.
# NOTE: the key that was previously committed here must be treated as
# compromised and rotated.
client = OpenAI(
    api_key=os.environ["TYPHOON_API_KEY"],
    base_url='https://api.opentyphoon.ai/v1'
)

def summarize_text(text, max_tokens=3000):
    """Truncate ``text`` to its first ``max_tokens`` whitespace-separated words.

    Despite the name, no summarisation happens. A text with at most
    ``max_tokens`` words is returned unchanged. A longer text is cut to its
    first ``max_tokens`` words, re-joined with single spaces (so the original
    whitespace is lost), and suffixed with ``'...'``.

    NOTE(review): words are found with ``str.split()``, so text written
    without spaces between words counts as very few "tokens" and the limit
    may rarely trigger -- confirm this is acceptable for Thai input.
    """
    words = text.split()
    if len(words) <= max_tokens:
        return text
    kept_words = words[:max_tokens]
    return ' '.join(kept_words) + '...'

def generate_response(prompt):
    """Send ``prompt`` as a single user message and return the model's reply text.

    Uses the module-level ``client`` and the ``typhoon-v1.5x-70b-instruct``
    model; the content of the first returned choice is the result.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="typhoon-v1.5x-70b-instruct",
        messages=[user_message]
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def retrieve_documents(query, retriever):
    """Return the documents ``retriever`` considers relevant to ``query``.

    Prefers the retriever's ``invoke`` method, because
    ``get_relevant_documents`` is deprecated in LangChain. Retrievers that
    expose no ``invoke`` fall back to the legacy method.

    Args:
        query: The search string.
        retriever: An object exposing ``invoke(query)`` or
            ``get_relevant_documents(query)``.

    Returns:
        Whatever the retriever returns for ``query``.
    """
    invoke = getattr(retriever, "invoke", None)
    if callable(invoke):
        return invoke(query)
    return retriever.get_relevant_documents(query)

def ask_question(retriever, question):
    """Answer ``question`` using documents fetched through ``retriever``.

    The page contents of the retrieved documents are joined with newlines,
    passed through ``summarize_text`` with a 3000-word limit, embedded in a
    fixed prompt together with the question, and sent to
    ``generate_response``, whose result is returned.
    """
    page_texts = []
    for doc in retrieve_documents(question, retriever):
        page_texts.append(doc.page_content)
    joined_text = "\n".join(page_texts)
    summarized_data = summarize_text(joined_text, max_tokens=3000)
    prompt = f"Based on the following information about Pathum Thani development: {summarized_data}, answer this question: {question}"
    return generate_response(prompt)

In [9]:
# Read the LlamaParse key from the environment instead of embedding it in the
# notebook; a missing LLAMA_CLOUD_API_KEY raises KeyError.
# NOTE: the key that was previously committed here must be treated as
# compromised and rotated.
llamaparse_api_key = os.environ["LLAMA_CLOUD_API_KEY"]
# Build the FAISS index and keep the embedding model used to create it.
vector_db, embedding_model = create_vector_database(llamaparse_api_key)

In [17]:
# Retrieve the 3 most similar chunks for each query.
retriever = vector_db.as_retriever(search_kwargs={'k': 3})
# Blocks until the user types a question, so this cell cannot run unattended.
question = input("Enter your question: ")
response = ask_question(retriever, question)

print(response)

ข้อมูลนี้เกี่ยวกับการพัฒนาในจังหวัดปทุมธานี ซึ่งรวมถึง:

1. สัดส่วนของการผลิตต่างๆ เช่น อาหาร, เครื่องจักร, เคมีภัณฑ์, เสื้อผ้า, ยางและพลาสติก, เฟอร์นิเจอร์, ผลิตภัณฑ์จากอลูมิเนียม, การพิมพ์และการผลิตซ้ำสื่อบันทึก, ยานยนต์ และหมวดอื่นๆ
2. ความต้องการของสถานประกอบการในการได้รับความช่วยเหลือจากหน่วยงานของรัฐ
3. ประเภทของความช่วยเหลือที่สถานประกอบการต้องการมากที่สุด เช่น ลดต้นทุนการผลิต, ลดภาษี, ส่งเสริมการลงทุน, จัดหาแหล่งเงินกู้ดอกเบี้ยต่ำ, พักเงินต้นและดอกเบี้ย, และสนับสนุนเทคโนโลยีและเครื่องจักรทันสมัย
