In [42]:
from langchain_community.document_loaders import PyPDFLoader
from PIL import Image
import pytesseract
import fitz

def extract_text_from_pdf_with_ocr(pdf_path, dpi=300, lang="ind"):
    """OCR every page of a PDF and return the recognised text per page.

    Each page is rasterised with PyMuPDF, converted to a PIL image and passed
    to Tesseract.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Resolution used when rendering each page to an image.
        lang: Tesseract language code handed to ``image_to_string``.

    Returns:
        A list of dicts, one per page in document order, shaped as
        ``{"page": <1-based page number>, "content": <OCR text>}``.
    """
    all_docs = []

    # The context manager closes the document (and its file handle) even if
    # rendering or OCR fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Render as RGB without alpha so the raw sample buffer matches the
            # "RGB" mode that Image.frombytes is told to expect.
            pix = page.get_pixmap(dpi=dpi, colorspace=fitz.csRGB, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            text = pytesseract.image_to_string(img, lang=lang)
            all_docs.append({"page": page_number, "content": text})

    return all_docs

# Run OCR over the source report; `docs` holds one {"page", "content"} dict per page.
docs = extract_text_from_pdf_with_ocr(
    "../Data/e-1693_Laporan Hasil Reviu Atas Perhitungan Unit Price Volume Pekerjaan Revitalisasi Halte .pdf"
)


In [43]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=520; every other option is left at the
# library default.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=520)

# Flatten the per-page OCR text into chunk records. Each record keeps the page
# number and the chunk's index within that page.
final_chunks = [
    {"text": piece, "metadata": {"page": page_doc["page"], "chunk": idx}}
    for page_doc in docs
    for idx, piece in enumerate(text_splitter.split_text(page_doc["content"]))
]


In [44]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.schema import Document


# Sentence-embedding model used to vectorise the chunks.
# NOTE(review): the OCR step uses Tesseract language "ind", while this model
# appears to be English-centric -- confirm retrieval quality or consider a
# multilingual embedding model.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Wrap every chunk record in a LangChain Document, carrying its metadata along.
lc_docs = [
    Document(page_content=record["text"], metadata=record["metadata"])
    for record in final_chunks
]

# Build the FAISS vector index from the embedded documents.
db = FAISS.from_documents(documents=lc_docs, embedding=embeddings)

In [45]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

In [46]:

# Load environment variables from the project's .env file; the return value is
# shown as the cell output. The GOOGLE_API_KEY read later in this notebook is
# presumably defined there -- verify.
load_dotenv(dotenv_path="../config/.env")

True

In [47]:
# Read the Gemini API key from the environment and fail fast when it is
# missing, rather than passing None to the SDK and failing later at request time.
google_api_key = os.getenv('GOOGLE_API_KEY')
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in ../config/.env or the environment."
    )
genai.configure(api_key=google_api_key)

In [48]:
# Handle to the Gemini model through the google.generativeai SDK.
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [49]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA


# Chat model that generates the final answer.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

# Retriever over the FAISS index, configured with k=3.
retriever = db.as_retriever(search_kwargs=dict(k=3))

# Retrieval-augmented QA chain; source documents are returned alongside the
# answer so results can be traced back to page/chunk metadata.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)


In [57]:
# Ask the chain a question about the document and print only the answer text.
response = qa_chain.invoke(
    {"query": "Siapa Pemeriksa dokumen ini?"}
)
print(response["result"])

Pemeriksa dokumen ini adalah Alvian Fitriadi, Senior Spesialis Auditor.
