In [None]:
import os
import re
import numpy as np
import chromadb
import nltk
import pandas as pd 
from PyPDF2 import PdfReader
from langchain_community.embeddings import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings

# Silence UserWarning messages raised from the PyPDF2 module.
warnings.filterwarnings("ignore", category=UserWarning, module='PyPDF2')


# Directory that is scanned for input PDF files.
KNOWLEDGE_BASE_DIR = "knowledge-base1"
# Path handed to chromadb.PersistentClient for the on-disk database.
DB_NAME = "chroma_db_Semantic3"
# Fraction of each page's media-box height subtracted when computing the cropped top edge.
HEADER_CROP_PERCENTAGE = 0.15 
# Embedding model identifier, used by both the HuggingFace embedder and the Chroma collection.
MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
# Keyword arguments forwarded to HuggingFaceEmbeddings (model construction / encoding).
MODEL_KWARGS = {'device': 'cpu'}
ENCODE_KWARGS = {'normalize_embeddings': False}

# Excel workbook that receives the dump of the whole collection.
EXCEL_OUTPUT_FILENAME = "Hasil_CHunk.xlsx"

def preprocess_text(text: str) -> str:
    """Normalise whitespace in *text*.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space and leading/trailing whitespace is removed.
    """
    # str.split() with no separator splits on whitespace runs and drops the
    # empty leading/trailing pieces, so re-joining yields the cleaned string.
    words = text.split()
    return " ".join(words)

def extract_text_with_metadata(pdf_path: str) -> list:
    """Extract the text of a PDF and split it into source-tagged passages.

    The text of all pages is joined and then split on source markers of the
    form ``<word> <digits>:<digits>: `` (a word, whitespace, two colon-separated
    numbers, a colon and whitespace). The text that follows each marker, up to
    the next marker, becomes one record. Text before the first marker is
    discarded.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``{"text": ..., "source": ...}`` dicts in document order,
        where ``text`` has been passed through ``preprocess_text`` and
        ``source`` is the stripped marker. Records whose text is empty are
        skipped. An empty list is returned when the PDF has no pages or cannot
        be read; errors are reported on stdout rather than raised.
    """
    file_name = os.path.basename(pdf_path)
    data = []
    try:
        reader = PdfReader(pdf_path)
        if not reader.pages:
            print(f"  [PERINGATAN] PDF '{file_name}' kosong atau tidak dapat dibaca.")
            return []
        print(f"  Mengekstrak teks dari {file_name} (semua halaman)...")
        page_texts = []
        for page in reader.pages:
            # float() guards against a Decimal-typed height, which cannot be
            # multiplied by a Python float.
            original_height = float(page.mediabox.height)
            new_top = original_height * (1 - HEADER_CROP_PERCENTAGE)
            # NOTE(review): `upper_y` does not look like a documented
            # RectangleObject attribute, and extract_text() is not documented
            # to honour the crop box, so header text is probably still
            # extracted -- confirm against the PyPDF2 documentation.
            page.cropbox.upper_y = new_top
            page_texts.append(page.extract_text() or "")
        # Join with a newline so the last word of one page is not glued to the
        # first word (or source marker) of the next page. Whitespace is
        # normalised later, so the separator never reaches the output verbatim.
        raw_text = "\n".join(page_texts)
    except Exception as e:
        print(f"  [ERROR] Gagal membaca file {file_name}: {e}")
        return []

    try:
        # With one capture group, re.split returns
        # [preamble, marker, body, marker, body, ...].
        parts = re.split(r'(\w+\s\d+:\d+:\s)', raw_text)
        for marker, body in zip(parts[1::2], parts[2::2]):
            body = body.strip()
            if body:
                data.append({"text": preprocess_text(body), "source": marker.strip()})
    except Exception as e:
        print(f"  [ERROR] Gagal saat memecah teks dari {file_name}: {e}")
    return data

def process_data_normal_semantic_chunking(data, hf_embeddings, similarity_threshold=0.7):
    """Group consecutive passages into chunks by embedding similarity.

    Each passage is embedded with ``hf_embeddings.embed_documents``. Walking
    the passages in order, a passage joins the current chunk when the cosine
    similarity between its embedding and the embedding of the passage
    immediately before it is at least ``similarity_threshold``; otherwise the
    current chunk is closed and a new one is started.

    Args:
        data: Sequence of dicts with ``'text'`` and ``'source'`` keys.
        hf_embeddings: Object exposing ``embed_documents(list_of_texts)``.
        similarity_threshold: Minimum cosine similarity for merging.

    Returns:
        A list of ``{"document": str, "metadata": {"source_range": str}}``
        dicts, where ``document`` is the space-joined text of the chunk and
        ``source_range`` is ``"<first source>-<last source>"``. Returns an
        empty list when ``data`` is empty.
    """
    if not data:
        return []

    texts = [item['text'] for item in data]
    sources = [item['source'] for item in data]
    embeddings = np.array(hf_embeddings.embed_documents(texts))

    def build_chunk(member_texts, member_sources):
        # Package one finished group in the output format.
        first, last = member_sources[0], member_sources[-1]
        return {
            "document": " ".join(member_texts),
            "metadata": {"source_range": f"{first}-{last}"},
        }

    chunks = []
    group_texts, group_sources = [texts[0]], [sources[0]]
    for idx in range(1, len(texts)):
        previous_vec = embeddings[idx - 1].reshape(1, -1)
        current_vec = embeddings[idx].reshape(1, -1)
        similarity = cosine_similarity(previous_vec, current_vec)[0][0]

        if similarity < similarity_threshold:
            # Too dissimilar: close the running group and start a new one.
            chunks.append(build_chunk(group_texts, group_sources))
            group_texts, group_sources = [], []

        group_texts.append(texts[idx])
        group_sources.append(sources[idx])

    # The running group always holds at least one passage at this point.
    chunks.append(build_chunk(group_texts, group_sources))
    return chunks

if __name__ == "__main__":
    # Script entry point:
    #   1. ingest every PDF found in KNOWLEDGE_BASE_DIR into a persistent
    #      Chroma collection (files already present in the collection are skipped),
    #   2. export the whole collection to EXCEL_OUTPUT_FILENAME.

    # isdir (not exists): a plain file at this path would make os.listdir fail.
    if not os.path.isdir(KNOWLEDGE_BASE_DIR):
        print(f"Error: Direktori '{KNOWLEDGE_BASE_DIR}' tidak ditemukan.")
        # SystemExit instead of the site-provided exit() helper; non-zero
        # status because this is an error condition.
        raise SystemExit(1)

    # nltk.data.find() raises LookupError when the resource is missing.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    hf_embeddings = HuggingFaceEmbeddings(
        model_name=MODEL_NAME, model_kwargs=MODEL_KWARGS, encode_kwargs=ENCODE_KWARGS
    )

    print(f"Menghubungkan atau membuat database '{DB_NAME}'...")
    client = chromadb.PersistentClient(path=DB_NAME)
    collection = client.get_or_create_collection(
        name="semantic_chunks",
        embedding_function=chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(model_name=MODEL_NAME)
    )

    # Case-insensitive extension match so "*.PDF" files are not silently
    # skipped; sorted for a deterministic processing order.
    pdf_files = sorted(
        f for f in os.listdir(KNOWLEDGE_BASE_DIR) if f.lower().endswith('.pdf')
    )
    if not pdf_files:
        print(f"  Tidak ada file PDF yang ditemukan di folder '{KNOWLEDGE_BASE_DIR}'.")
        raise SystemExit(0)

    print("\nMemulai penambahan data ke database...")
    for pdf_name in tqdm(pdf_files, desc="Memproses semua PDF"):
        # A file counts as already ingested when at least one stored chunk
        # carries its name in the "source_file" metadata field.
        existing_docs = collection.get(where={"source_file": pdf_name}, limit=1)
        if existing_docs['ids']:
            print(f"\n  [INFO] Melewati {pdf_name}, data sudah ada di database.")
            continue

        print(f"\nMemproses file baru: {pdf_name}")
        pdf_path = os.path.join(KNOWLEDGE_BASE_DIR, pdf_name)

        extracted_data = extract_text_with_metadata(pdf_path)
        if not extracted_data:
            print(f"  Tidak ada data yang diekstrak dari {pdf_name}.")
            continue

        final_chunks = process_data_normal_semantic_chunking(
            extracted_data, hf_embeddings, similarity_threshold=0.7
        )
        if not final_chunks:
            continue

        # Chunk ids are "<file name>_<chunk index>".
        ids = [f"{pdf_name}_{i}" for i in range(len(final_chunks))]
        documents = [chunk['document'] for chunk in final_chunks]
        # Build fresh metadata dicts instead of mutating the chunk dicts.
        metadatas = [
            {**chunk['metadata'], 'source_file': pdf_name} for chunk in final_chunks
        ]

        collection.add(documents=documents, metadatas=metadatas, ids=ids)
        print(f"  Berhasil! {len(final_chunks)} potongan (chunks) dari {pdf_name} ditambahkan.")

    print("\n" + "="*50)
    print("PROSES PENAMBAHAN DATA SELESAI")
    print("="*50)
    print(f"\nMemulai ekspor keseluruhan database ke file Excel: {EXCEL_OUTPUT_FILENAME}")

    # Best-effort export: any failure is reported and the script still ends normally.
    try:
        total_docs_in_db = collection.count()
        if total_docs_in_db == 0:
            print("Database kosong. Tidak ada data untuk diekspor.")
        else:
            print(f"Mengambil {total_docs_in_db} dokumen dari database...")
            all_data = collection.get(
                limit=total_docs_in_db,
                include=["documents", "metadatas"]
            )

            data_for_excel = []
            rows = zip(all_data['metadatas'], all_data['documents'])
            for number, (metadata, document) in enumerate(rows, start=1):
                # Tolerate entries stored without metadata.
                metadata = metadata or {}
                data_for_excel.append({
                    "Nomor": number,
                    "source_file": metadata.get('source_file', 'Tidak Diketahui'),
                    "source_range": metadata.get('source_range', 'Tidak Diketahui'),
                    "isi_ayat": document
                })

            df = pd.DataFrame(data_for_excel)

            if os.path.exists(EXCEL_OUTPUT_FILENAME):
                os.remove(EXCEL_OUTPUT_FILENAME)
                print(f"File '{EXCEL_OUTPUT_FILENAME}' yang sudah ada telah dihapus.")

            df.to_excel(EXCEL_OUTPUT_FILENAME, index=False)
            print(f"\nBerhasil! Seluruh data dari database telah diekspor ke '{EXCEL_OUTPUT_FILENAME}'.")

    except Exception as e:
        print(f"\n[ERROR] Terjadi kesalahan saat mengekspor data ke Excel: {e}")

    print("\n--- Program Selesai ---")