In [6]:
# requirements = [
#     "langchain",
#     "langchain-community",
#     "llama-parse",
#     "fastembed",
#     "python-dotenv",
#     "langchain-groq",
#     "chainlit",
#     "sentence-transformers",
#     "openai",
#     "langchain-openai",
#     "nltk",
#     "joblib",
#     "gdown",
#     "PyPDF2",
#     "faiss-cpu",
#     "nest-asyncio",
#     "unstructured[md]"
# ]

# file_path = "requirements.txt"
# with open(file_path, "w") as f:
#     for package in requirements:
#         f.write(f"{package}\n")

In [7]:
import os
from typing import Tuple, List, Dict
from pypdf import PdfReader
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from openai import OpenAI

from PyPDF2 import PdfReader
from collections import deque
import json

In [8]:
# import gdown
# from PyPDF2 import PdfReader

# file_ids = [
#     '1ohQ7aQCiY4pKqkKl0_FssgYkkHmeYDWN',
#     '1SjDi9aY8_jQtDfe5wix-aFYuS5TP0pDG'
# ]

# for file_id in file_ids:
#     file_url = f'https://drive.google.com/uc?id={file_id}'
#     output_pdf = f'/content/{file_id}.pdf'
#     gdown.download(file_url, output_pdf, quiet=False)
#     reader = PdfReader(output_pdf)

#     for page_num in range(len(reader.pages)):
#         page = reader.pages[page_num]
#         print(f"Text from page {page_num + 1}:\n{page.extract_text()}\n")


In [9]:
# Source PDFs that are parsed and indexed (paths relative to the working directory).
PDF_FILE = ['DatasourcePDF/Merged_SplitDocument.pdf']

# Output directory for generated artifacts, and a pickle path derived from it.
DATA_DIR = "./data"
PARSED_DATA_FILE = os.path.join(DATA_DIR, "parsed_data.pkl")

def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the text of every page of ``pdf_file`` concatenated in page order.

    Args:
        pdf_file: Path to the PDF; passed straight to ``PdfReader``.

    Returns:
        The extracted text of all pages joined with no separator. A page for
        which the reader yields no text contributes an empty string.
    """
    reader = PdfReader(pdf_file)
    # Guard against a falsy result (None / "") from ``extract_text()`` so a
    # page without extractable text cannot raise TypeError, and join once
    # instead of growing a string with ``+=`` on every page.
    return "".join((page.extract_text() or "") for page in reader.pages)

def create_vector_database(
    llamaparse_api_key: str,
    pdf_files: list = PDF_FILE,
    data_file: str = PARSED_DATA_FILE,
    chunk_size: int = 256,
    chunk_overlap: int = 128,
    embedding_model_name: str = "BAAI/bge-m3",
) -> Tuple:
    """Extract text from PDFs, chunk it, embed it and build a FAISS index.

    Side effects: creates ``DATA_DIR`` if needed, writes the raw extracted
    text to ``DATA_DIR/extracted_text.txt`` and saves the index under
    ``DATA_DIR/faiss_index``.

    Args:
        llamaparse_api_key: Accepted for interface compatibility; this
            function does not currently use it.
        pdf_files: Paths of the PDFs to index. The default is the shared
            module-level list and is only read, never mutated.
        data_file: Accepted for interface compatibility; this function does
            not currently use it.
        chunk_size: Maximum characters per chunk given to the text splitter.
        chunk_overlap: Characters shared between consecutive chunks.
        embedding_model_name: Hugging Face model used for the embeddings
            (an alternative noted by the original author:
            ``BAAI/bge-base-en-v1.5``).

    Returns:
        ``(vector_store, embed_model)``: the FAISS store and the embedding
        model that produced it.

    Raises:
        ValueError: If no text chunk could be produced from ``pdf_files``.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    all_text = "".join(extract_text_from_pdf(pdf_file) for pdf_file in pdf_files)

    # Keep a plain-text copy of what was extracted, for inspection.
    text_output = os.path.join(DATA_DIR, "extracted_text.txt")
    with open(text_output, "w", encoding="utf-8") as f:
        f.write(all_text)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(all_text)
    if not chunks:
        # Fail early with a clear message, before the embedding model is
        # loaded and before FAISS is handed an empty document list.
        raise ValueError(
            f"No text could be extracted from {list(pdf_files)!r}; "
            "cannot build a vector database."
        )

    documents = [Document(page_content=chunk) for chunk in chunks]
    embed_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    vector_store = FAISS.from_documents(documents=documents, embedding=embed_model)

    faiss_index_path = os.path.join(DATA_DIR, "faiss_index")
    vector_store.save_local(faiss_index_path)

    return vector_store, embed_model

In [None]:
# The API key must not be committed with the notebook: read it from the
# environment instead. Any key that was previously pasted here should be
# treated as leaked and revoked.
_typhoon_api_key = os.environ.get("TYPHOON_API_KEY")
if not _typhoon_api_key:
    raise RuntimeError(
        "Set the TYPHOON_API_KEY environment variable before running this cell."
    )

# OpenAI-compatible client pointed at the OpenTyphoon endpoint.
client = OpenAI(
    api_key=_typhoon_api_key,
    base_url='https://api.opentyphoon.ai/v1'
)

def summarize_text(text, max_tokens=5000):
    """Cap ``text`` at ``max_tokens`` whitespace-separated words.

    Text within the limit is returned untouched. Longer text is cut to its
    first ``max_tokens`` words, re-joined with single spaces, and suffixed
    with ``'...'``.
    """
    words = text.split()
    if len(words) <= max_tokens:
        return text
    truncated = ' '.join(words[:max_tokens])
    return truncated + '...'

class ConversationalMemory:
    def __init__(self, max_length=10):
        self.history = deque(maxlen=max_length)
    def add_to_memory(self, question: str, response: str | None):
        if response is not None :
            self.history.append({"question": question, "response": response})
    def get_memory(self) -> List[Dict[str, str]]:
        return list(self.history)
    def save_memory_to_file(self, file_path: str):
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(self.get_memory(), f, indent=4)
    def load_memory_from_file(self, file_path: str):
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                self.history = deque(json.load(f), maxlen=self.history.maxlen)
        except FileNotFoundError:
            print(f"No existing memory file found at {file_path}. Starting fresh.")

def generate_response(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client`` and returns the message content of the
    first choice in the completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="typhoon-v1.5x-70b-instruct",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def retrieve_documents(query, retriever):
    """Return the documents ``retriever`` considers relevant to ``query``.

    Prefers the retriever's ``invoke`` method and falls back to the legacy
    ``get_relevant_documents`` for retrievers that do not provide ``invoke``.
    """
    # ``get_relevant_documents`` is deprecated in recent LangChain releases
    # in favour of ``invoke``; keep it only as a fallback.
    invoke = getattr(retriever, "invoke", None)
    if callable(invoke):
        return invoke(query)
    return retriever.get_relevant_documents(query)

def ask_question_with_memory(retriever, question, memory: ConversationalMemory):
    """Answer ``question`` using retrieved context plus prior conversation.

    Retrieves documents for the question, truncates their joined text with
    ``summarize_text``, builds a prompt containing the stored Q/A history,
    that context and the new question, sends it through
    ``generate_response``, records the exchange in ``memory`` and returns
    the response.
    """
    docs = retrieve_documents(question, retriever)
    joined_pages = "\n".join(doc.page_content for doc in docs)
    summarized_data = summarize_text(joined_pages, max_tokens=5000)

    # One "Q: ... / A: ..." pair per remembered exchange, oldest first.
    history_lines = []
    for entry in memory.get_memory():
        history_lines.append(f"Q: {entry['question']}\nA: {entry['response']}")
    history_context = "\n".join(history_lines)

    prompt_sections = [
        f"Conversation history:\n{history_context}\n\n",
        f"Context for Pathum Thani development:\n{summarized_data}\n\n",
        f"New question: {question}",
    ]
    answer = generate_response("".join(prompt_sections))
    memory.add_to_memory(question, answer)

    return answer

# Read the LlamaParse key from the environment rather than embedding it in
# the notebook; any key previously pasted here should be treated as leaked
# and revoked. An empty default is acceptable because
# create_vector_database accepts this argument but does not use it.
llamaparse_api_key = os.environ.get("LLAMA_CLOUD_API_KEY", "")
vector_db, embed_model = create_vector_database(llamaparse_api_key)

# Retriever that returns the 3 best-matching chunks per query.
retriever = vector_db.as_retriever(search_kwargs={'k': 3})

NameError: name 'vector_db' is not defined

In [None]:
if __name__ == "__main__":
    # Interactive question loop; type "exit" (or press Ctrl-C / Ctrl-D) to stop.
    memory = ConversationalMemory(max_length=10)
    memory_file = "conversation_memory.json"
    memory.load_memory_from_file(memory_file)

    try:
        while True:
            try:
                question = input("Enter your question: ")
            except (EOFError, KeyboardInterrupt):
                # Treat Ctrl-D / Ctrl-C at the prompt like an explicit exit.
                print("\nGoodbye!")
                break
            if question.strip().lower() == "exit":
                print("Goodbye!")
                break
            if not question.strip():
                # Do not spend a model call on a blank line.
                continue
            response = ask_question_with_memory(retriever, question, memory)
            print(f"Answer: {response}")
    finally:
        # Persist the conversation even when the loop ends with an error,
        # so earlier exchanges from this session are not lost.
        memory.save_memory_to_file(memory_file)

No existing memory file found at conversation_memory.json. Starting fresh.


NameError: name 'retriever' is not defined