In [2]:
# document_processor.ipynb
# File ini untuk dijalankan di Google Colab

# Instalasi dependensi
!pip install google-generativeai>=0.3.0 pypdf>=3.15.1 faiss-cpu>=1.7.4 python-dotenv>=1.0.0

import os
import glob
import zipfile
import pickle
import numpy as np
import faiss
import google.generativeai as genai
from typing import List, Dict, Any
from google.colab import files
from pypdf import PdfReader
from tqdm.notebook import tqdm

# API key configuration.
# The key is read from the GOOGLE_API_KEY environment variable; when that is
# not set, the user is prompted for it (input hidden) so the secret never lives
# in the notebook source. A key that was previously committed in this cell
# should be treated as leaked and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    from getpass import getpass  # local import: only needed for the prompt

    GOOGLE_API_KEY = getpass("Masukkan Google API key: ")
genai.configure(api_key=GOOGLE_API_KEY)

# Create the folder that holds the source documents.
os.makedirs("data/sample_docs", exist_ok=True)

# Ask the user to upload files, then route each one by extension:
# PDF/TXT payloads are written into the document folder, ZIP archives are
# unpacked into it, anything else is ignored.
uploaded = files.upload()
for filename, payload in uploaded.items():
    if filename.endswith((".pdf", ".txt")):
        target = os.path.join("data/sample_docs", filename)
        with open(target, "wb") as f:
            f.write(payload)
        continue
    if filename.endswith(".zip"):
        # The archive is opened by name from the current working directory.
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall("data/sample_docs")

available = glob.glob('data/sample_docs/*')
print(f"Dokumen yang tersedia: {len(available)}")

class DocumentProcessor:
    """Load PDF/TXT documents, embed them chunk by chunk and save a FAISS index.

    Typical use is ``load_documents()`` -> ``create_embeddings()`` ->
    ``save_to_vector_store()``; the later steps trigger the earlier ones
    themselves when their input is still empty.
    """

    def __init__(self, data_dir: str = "data/sample_docs"):
        """Set up an empty processor.

        Args:
            data_dir: Directory that is scanned for ``*.pdf`` and ``*.txt`` files.
        """
        self.data_dir = data_dir
        # One dict per loaded file: {"id", "source", "text"} (+ "chunks_count"
        # once embeddings have been created).
        self.documents = []
        # One embedding vector per successfully embedded chunk, all documents.
        self.embeddings = []
        # Metadata for each entry of ``self.embeddings``, in the same order.
        self.chunks_info = []
        self.embedding_model = "models/text-embedding-004"

    def load_documents(self) -> List[Dict[str, Any]]:
        """Load every PDF and TXT file found directly inside ``data_dir``.

        Returns:
            The list of document dicts, PDFs first and then text files. The
            same list is stored on ``self.documents``.
        """
        documents = []

        pdf_files = glob.glob(os.path.join(self.data_dir, "*.pdf"))
        for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
            documents.append(self._process_pdf(pdf_file))

        txt_files = glob.glob(os.path.join(self.data_dir, "*.txt"))
        for txt_file in tqdm(txt_files, desc="Processing TXTs"):
            documents.append(self._process_txt(txt_file))

        self.documents = documents
        print(f"Total dokumen diproses: {len(documents)}")
        return documents

    def _process_pdf(self, file_path: str) -> Dict[str, Any]:
        """Extract the text of every page of a PDF file.

        Args:
            file_path: Path of the PDF to read.

        Returns:
            A dict with the file name as ``id``, the path as ``source`` and the
            concatenated page texts (newline after each page) as ``text``.
        """
        reader = PdfReader(file_path)
        # ``or ""`` keeps a page without extractable text from breaking the
        # concatenation; join avoids repeated string re-allocation.
        text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

        return {
            "id": os.path.basename(file_path),
            "source": file_path,
            "text": text
        }

    def _process_txt(self, file_path: str) -> Dict[str, Any]:
        """Read a text file as UTF-8, dropping undecodable bytes.

        Args:
            file_path: Path of the text file to read.

        Returns:
            A dict with the file name as ``id``, the path as ``source`` and the
            file content as ``text``.
        """
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        return {
            "id": os.path.basename(file_path),
            "source": file_path,
            "text": text
        }

    def _chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk; must be positive.

        Returns:
            The chunks in order; an empty list for empty text.

        Raises:
            ValueError: If ``chunk_size`` is not positive (a negative size
                would otherwise silently yield no chunks at all).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    def create_embeddings(self) -> List[List[float]]:
        """Embed every chunk of every loaded document.

        Documents are loaded first when none are present. A chunk whose
        embedding request fails is reported and skipped, so ``self.embeddings``
        and ``self.chunks_info`` only contain successfully embedded chunks and
        always line up index by index.

        Returns:
            The embedding vectors of all documents, also stored on
            ``self.embeddings``.
        """
        if not self.documents:
            self.load_documents()

        embeddings = []
        chunks_info = []  # per-chunk metadata, parallel to ``embeddings``

        for doc_idx, doc in enumerate(tqdm(self.documents, desc="Creating embeddings")):
            chunks = self._chunk_text(doc["text"])

            for chunk_idx, chunk in enumerate(chunks):
                try:
                    response = genai.embed_content(
                        model=self.embedding_model,
                        content=chunk,
                        task_type="retrieval_document"
                    )
                    vector = response["embedding"]
                except Exception as e:
                    # Best effort: one failing chunk must not abort the run.
                    print(f"Error pada dokumen {doc['id']}, chunk {chunk_idx}: {str(e)}")
                    continue

                # Append vector and metadata together so both lists stay aligned.
                embeddings.append(vector)
                chunks_info.append({
                    "doc_idx": doc_idx,
                    "doc_id": doc["id"],
                    "chunk_idx": chunk_idx,
                    "text": chunk
                })

            # Document metadata: number of chunks the text was split into.
            doc["chunks_count"] = len(chunks)

        # Keep the vectors of ALL documents, not just those of the last one.
        self.embeddings = embeddings
        self.chunks_info = chunks_info

        print(f"Total chunks dengan embeddings: {len(self.embeddings)}")
        return self.embeddings

    def save_to_vector_store(self, vector_store_path: str = "data/vector_store"):
        """Write the embeddings to a FAISS index plus a pickle of the metadata.

        Creates ``<vector_store_path>.index`` (a flat L2 index) and
        ``<vector_store_path>.pkl`` (documents and chunk info), then hands both
        files to ``files.download``. Embeddings are created first when none
        exist yet.

        Args:
            vector_store_path: Output path without extension.

        Raises:
            ValueError: If there are still no embeddings after trying to
                create them.
        """
        import faiss
        import numpy as np
        import pickle

        if not self.embeddings:
            print("No embeddings found. Creating embeddings first...")
            self.create_embeddings()

        # Still nothing: there were no documents, or every chunk failed.
        if not self.embeddings:
            raise ValueError("No documents or embeddings found. Please add documents to the data directory.")

        # Make sure the target directory exists before writing into it.
        parent_dir = os.path.dirname(vector_store_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # FAISS expects a 2-D float32 array, one row per vector.
        embeddings_array = np.array(self.embeddings).astype('float32')

        dimension = len(self.embeddings[0])
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings_array)

        faiss.write_index(index, f"{vector_store_path}.index")

        # Store the documents and chunk metadata next to the index.
        with open(f"{vector_store_path}.pkl", "wb") as f:
            data = {
                "documents": self.documents,
                "chunks_info": self.chunks_info
            }
            pickle.dump(data, f)

        print(f"Vector store saved to {vector_store_path}")

        # Download both files so they can be used locally.
        files.download(f"{vector_store_path}.index")
        files.download(f"{vector_store_path}.pkl")

# Run the pipeline: load the documents, embed them, then save and download
# the vector store. The steps are executed strictly in this order.
processor = DocumentProcessor()
pipeline = (
    processor.load_documents,
    processor.create_embeddings,
    processor.save_to_vector_store,
)
for step in pipeline:
    step()

print("Proses selesai! File vector store telah tersedia untuk didownload.")

Saving Laporan KP - Hardcover (1).pdf to Laporan KP - Hardcover (1).pdf
Dokumen yang tersedia: 1


Processing PDFs:   0%|          | 0/1 [00:00<?, ?it/s]

Processing TXTs: 0it [00:00, ?it/s]

Total dokumen diproses: 1


Creating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Total chunks dengan embeddings: 60
Vector store saved to data/vector_store


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Proses selesai! File vector store telah tersedia untuk didownload.
