In [1]:
import os
import json
import joblib
import warnings
from enum import Enum
from collections import deque
from typing import List, Dict, Tuple

import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from llama_parse import LlamaParse
from openai import OpenAI

nltk.download('punkt')
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kongl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
DATA_DIR = "./data"
PARSED_DATA_FILE = os.path.join(DATA_DIR, "parsed_data.pkl")
PDF_FILE = r"/content/file_subset.pdf"
PARSING_INSTRUCTIONS = """
The provided document is a statistical report from the National Statistical Office of Thailand.
It contains information about various industries, including employment and revenue.
The report is in the Thai language.
The document is structured with tables and text sections.
Try to extract information accurately and answer questions concisely.
"""

class Language(Enum):
    THAI = "th"
    ENGLISH = "en"

def load_or_parse_data(data_file: str, pdf_file: str, parsing_instructions: str,
                      llamaparse_api_key: str, language: Language = Language.THAI) -> List:
    if os.path.exists(data_file):
        return joblib.load(data_file)

    try:
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsing_instructions,
            max_timeout=5000,
            language=language.value,
        )
        parsed_data = parser.load_data(pdf_file)
        if not parsed_data:
            return []
    except Exception as e:
        return []

    joblib.dump(parsed_data, data_file)
    return parsed_data


In [3]:
#Function to create a vector database
def create_vector_database(llamaparse_api_key: str, pdf_file: str = PDF_FILE, data_file: str = PARSED_DATA_FILE) -> Tuple:
    os.makedirs(DATA_DIR, exist_ok=True)
    parsed_documents = load_or_parse_data(
        data_file=data_file,
        pdf_file=pdf_file,
        parsing_instructions=PARSING_INSTRUCTIONS,
        llamaparse_api_key=llamaparse_api_key
    )

    markdown_output = os.path.join(DATA_DIR, "output.md")
    with open(markdown_output, 'w') as f:
        for doc in parsed_documents:
            f.write(doc.text + '\n')

    loader = UnstructuredMarkdownLoader(markdown_output)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
    doc_chunks = text_splitter.split_documents(documents)
    embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
    vector_store = FAISS.from_documents(documents=doc_chunks, embedding=embed_model)
    faiss_index_path = os.path.join(DATA_DIR, "faiss_index")
    vector_store.save_local(faiss_index_path)

    return vector_store, embed_model

In [4]:
# Conversational memory class
class ConversationalMemory:
    def __init__(self, max_length=10):
        self.history = deque(maxlen=max_length)

    def add_to_memory(self, question: str, response: str | None):
        if response is not None:
            self.history.append({"question": question, "response": response})

    def get_memory(self) -> List[Dict[str, str]]:
        return list(self.history)

    def save_memory_to_file(self, file_path: str):
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(self.get_memory(), f, ensure_ascii=False, indent=4)

    def load_memory_from_file(self, file_path: str):
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                self.history = deque(json.load(f), maxlen=self.history.maxlen)
        except FileNotFoundError:
            print(f"No existing memory file found at {file_path}. Starting fresh.")

def summarize_text(text, max_tokens=3000):
    tokens = text.split()
    return ' '.join(tokens[:max_tokens]) + '...' if len(tokens) > max_tokens else text

def generate_response(prompt, client):
    chat_completion = client.chat.completions.create(
        model="typhoon-v1.5x-70b-instruct",
        messages=[{"role": "user", "content": prompt}]
    )
    return chat_completion.choices[0].message.content

def retrieve_documents(query, retriever):
    return retriever.get_relevant_documents(query)

def ask_question(retriever, question, client):
    retrieved_docs = retrieve_documents(question, retriever)
    summarized_data = summarize_text("\n".join([doc.page_content for doc in retrieved_docs]), max_tokens=3000)
    prompt = f"Based on the following information: {summarized_data}, answer this question: {question}"
    return generate_response(prompt, client)

In [5]:
llamaparse_api_key = "llx-bMf1NAZ0TS6EgfsYfXAZADVHk9VHwx79fdoU6E3pwkzBFRqD"
vector_db, embedding_model = create_vector_database(llamaparse_api_key)

IndexError: list index out of range

In [None]:
if __name__ == "__main__":
    client = OpenAI(
        api_key='sk-GqA4Uj6iZXaykbOzIlFGtmdJr6VqiX94NhhjPZaf81kylRzh',
        base_url='https://api.opentyphoon.ai/v1'
    )
    retriever = vector_db.as_retriever(search_kwargs={'k': 10})
    question = input("Enter your question: ")
    response = ask_question(retriever, question, client)

    print(response)