In [None]:
import os
import re
import numpy as np
import chromadb
import nltk
from PyPDF2 import PdfReader
from langchain_community.embeddings import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings


# Silence UserWarning messages emitted from the PyPDF2 module while parsing.
warnings.filterwarnings("ignore", category=UserWarning, module='PyPDF2')

# Directory that is scanned for input PDF files.
KNOWLEDGE_BASE_DIR = "knowledge-base"
# Filesystem path of the persistent Chroma store; its existence decides
# whether the build step runs at all.
DB_NAME = "chroma_db_Semantic"
# NOTE(review): not referenced anywhere in the visible code — confirm whether
# it is still needed.
HEADER_CROP_PERCENTAGE = 0.15 
# Embedding model identifier, used both for chunking and for the collection's
# embedding function.
MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
# Keyword arguments forwarded to HuggingFaceEmbeddings (model construction).
MODEL_KWARGS = {'device': 'cpu'}
# Keyword arguments forwarded to HuggingFaceEmbeddings (encoding step).
ENCODE_KWARGS = {'normalize_embeddings': False}

def preprocess_text(text: str) -> str:
    """Return *text* with every whitespace run collapsed to one space.

    Newlines count as whitespace, and leading/trailing whitespace is removed.
    """
    # Splitting on whitespace and re-joining collapses runs and trims the
    # ends in a single pass.
    return " ".join(text.split())

def extract_text_with_metadata(pdf_path: str) -> list:
    """Extract reference-tagged text segments from the inner pages of a PDF.

    The first and last pages are skipped. The remaining text is split on
    markers of the form ``<word> <digits>:<digits>: `` and every non-empty
    segment is paired with the marker that precedes it.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``{"text": ..., "source": ...}`` dicts in document order.
        The list is empty when the PDF has two pages or fewer. If processing
        fails, the error is printed and whatever was collected so far
        (possibly nothing) is returned.
    """
    data = []
    try:
        reader = PdfReader(pdf_path)
        if len(reader.pages) <= 2:
            print(f"  [INFO] PDF '{os.path.basename(pdf_path)}' memiliki <= 2 halaman, dilewati.")
            return []

        # Join once instead of growing a string page by page. The `or ""`
        # keeps a page that yields no text from aborting the whole file.
        raw_text = "".join(
            page.extract_text() or "" for page in reader.pages[1:-1]
        )

        # With one capture group, re.split alternates
        # [prefix, marker, body, marker, body, ...]; the prefix is dropped.
        parts = re.split(r'(\w+\s\d+:\d+:\s)', raw_text)
        for marker, body in zip(parts[1::2], parts[2::2]):
            text = body.strip()
            if text:
                data.append({"text": text, "source": marker.strip()})
    except Exception as e:
        # Deliberate best-effort: one unreadable PDF must not stop the batch.
        print(f"  [ERROR] Gagal memproses file {os.path.basename(pdf_path)}: {e}")
    return data
def sigmoid(x):
    """Return the logistic function ``1 / (1 + e^(-x))`` of *x*."""
    decay = np.exp(-x)
    return 1 / (1 + decay)
def process_data(data, hf_embeddings, fixed_threshold=0.4, c=0.7, init_constant=2.0):
    """Group consecutive text segments into chunks by embedding similarity.

    Segments are visited in order. Each one either joins the current chunk or
    closes it and starts a new one, depending on its cosine similarity to the
    segments already in the chunk.

    Args:
        data: List of dicts with ``'text'`` and ``'source'`` keys.
        hf_embeddings: Object exposing ``embed_documents(texts)``.
        fixed_threshold: Lower bound on the score needed to join a chunk.
        c: Scale factor applied to the adaptive threshold.
        init_constant: Multiplier applied to the similarity when the current
            chunk holds a single segment.

    Returns:
        A list of ``{"document": str, "metadata": {"source_range": str}}``
        dicts; empty when *data* is empty.
    """
    if not data:
        return []

    texts = [item['text'] for item in data]
    sources = [item['source'] for item in data]
    embeddings = np.array(hf_embeddings.embed_documents(texts))

    def build_chunk(first, stop):
        # The current chunk is always the contiguous slice [first, stop).
        return {
            "document": " ".join(texts[first:stop]),
            "metadata": {"source_range": f"{sources[first]}-{sources[stop - 1]}"}
        }

    chunks = []
    cluster_start, cluster_end = 0, 1
    pairwise_min = -float('inf')
    for i in range(1, len(texts)):
        cluster_size = cluster_end - cluster_start
        similarities = cosine_similarity(
            embeddings[i].reshape(1, -1),
            embeddings[cluster_start:cluster_end],
        )[0]

        if cluster_size > 1:
            # The threshold uses the minimum seen *before* this candidate.
            adjusted_threshold = pairwise_min * c * sigmoid(cluster_size - 1)
            score = np.max(similarities)
            pairwise_min = min(np.min(similarities), pairwise_min)
        else:
            # Single-segment chunk: only the fixed threshold applies, and the
            # lone similarity is boosted by init_constant.
            adjusted_threshold = 0
            pairwise_min = similarities[0]
            score = init_constant * pairwise_min

        if score > max(adjusted_threshold, fixed_threshold):
            cluster_end += 1
            continue

        chunks.append(build_chunk(cluster_start, cluster_end))
        cluster_start, cluster_end = i, i + 1
        pairwise_min = -float('inf')

    # Flush the chunk that is still open after the last segment.
    chunks.append(build_chunk(cluster_start, cluster_end))
    return chunks
if __name__ == "__main__":
    # Step 1: build the Chroma database, but only when its directory does not
    # exist yet. An existing directory is treated as "already built".
    if os.path.exists(DB_NAME):
        print(f"Database '{DB_NAME}' sudah ada. Proses pembuatan database baru dibatalkan.")
    else:
        print(f"Database '{DB_NAME}' tidak ditemukan. Memulai proses pembuatan database...")

        # Discover the input PDFs before anything touches DB_NAME. Opening the
        # persistent client first would leave an empty database behind when
        # there is nothing to ingest, and the existence check above would
        # then skip the build on every later run.
        pdf_files = [f for f in os.listdir(KNOWLEDGE_BASE_DIR) if f.endswith('.pdf')]
        if not pdf_files:
            print(f"  [PERINGATAN] Tidak ada file PDF yang ditemukan di folder '{KNOWLEDGE_BASE_DIR}'.")
            # `exit()` is an interactive helper injected by `site`; raising
            # SystemExit works in every interpreter mode.
            raise SystemExit

        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            # nltk.data.find signals a missing resource with LookupError.
            print("  Mengunduh tokenizer NLTK (punkt)...")
            nltk.download('punkt')

        print("  Menginisialisasi model embedding (Qwen)...")
        hf_embeddings = HuggingFaceEmbeddings(
            model_name=MODEL_NAME,
            model_kwargs=MODEL_KWARGS,
            encode_kwargs=ENCODE_KWARGS
        )
        client = chromadb.PersistentClient(path=DB_NAME)
        collection = client.create_collection(
            name="semantic_chunks",
            embedding_function=chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(model_name=MODEL_NAME)
        )

        for pdf_name in tqdm(pdf_files, desc="Memproses semua PDF"):
            pdf_path = os.path.join(KNOWLEDGE_BASE_DIR, pdf_name)
            extracted_data = extract_text_with_metadata(pdf_path)
            if not extracted_data:
                continue
            final_chunks = process_data(extracted_data, hf_embeddings)
            if final_chunks:
                # IDs are namespaced by file name so chunks from different
                # PDFs cannot collide.
                ids = [f"{pdf_name}_{i}" for i in range(len(final_chunks))]
                documents = [c['document'] for c in final_chunks]
                metadatas = [c['metadata'] for c in final_chunks]
                collection.add(
                    documents=documents,
                    metadatas=metadatas,
                    ids=ids
                )
                print(f"  Berhasil! {len(final_chunks)} potongan (chunks) ditambahkan dari {pdf_name}.")

    # Step 2: verification — print the first ten chunks and the last one.
    print("\n--- Verifikasi: Menampilkan contoh chunk ---")
    if not os.path.exists(DB_NAME):
        print("Database belum dibuat. Jalankan kembali script untuk membuatnya.")
    else:
        client = chromadb.PersistentClient(path=DB_NAME)
        try:
            collection = client.get_collection(name="semantic_chunks")
            count = collection.count()
            print(f"Total dokumen dalam database: {count}")
            if count > 0:
                results_first = collection.get(limit=10, include=["documents", "metadatas"])
                print("\n--- 10 Chunk Pertama ---")
                for i, doc in enumerate(results_first['documents']):
                    source = results_first['metadatas'][i].get('source_range', 'Sumber tidak diketahui')
                    print(f"\n--- Chunk {i+1} ---")
                    print(f"Sumber: {source}")
                    print(f"Teks: {doc[:200]}...")
                    print("-" * 60)
                # Fetch only the final record by offsetting to count - 1.
                results_last = collection.get(limit=1, offset=count-1, include=["documents", "metadatas"])
                if results_last['documents']:
                    print("\n--- Chunk Terakhir ---")
                    doc = results_last['documents'][0]
                    source = results_last['metadatas'][0].get('source_range', 'Sumber tidak diketahui')
                    print(f"Sumber: {source}")
                    print(f"Teks: {doc[:200]}...")
                    print("-" * 60)
            else:
                print("Database kosong.")
        except Exception as e:
            # Verification is informational only; report and carry on.
            print(f"Gagal mengambil data dari database: {e}")

Database 'chroma_db_Semantic' sudah ada. Proses pembuatan database baru dibatalkan.

--- Verifikasi: Menampilkan contoh chunk ---
Total dokumen dalam database: 150

--- 10 Chunk Pertama ---

--- Chunk 1 ---
Sumber: Kejadian 1:1:-Kejadian 1:1:
Teks: Pada mulanya Allah 
menciptakan langit dan bumi....
------------------------------------------------------------

--- Chunk 2 ---
Sumber: Kejadian 1:2:-Kejadian 1:2:
Teks: Bumi belum 
berbentuk dan kosong; gelap gulita 
menutupi samudera raya, dan Roh 
Allah melayang -layang di atas 
permukaan air....
------------------------------------------------------------

--- Chunk 3 ---
Sumber: Kejadian 1:3:-Kejadian 1:3:
Teks: Berfirmanlah Allah: 
"Jadilah terang." Lalu terang itu jadi....
------------------------------------------------------------

--- Chunk 4 ---
Sumber: Kejadian 1:4:-Kejadian 1:4:
Teks: Allah melihat bahwa 
terang itu baik, lalu dipisahkan -
Nyalah terang itu dari gelap....
-------------------------------------------------------

In [1]:
# Scratch cell: prints a fixed greeting.
print("hello")

hello
