# Pre-Pipeline: Import and combine dataset

In [10]:
import pandas as pd
df1 = pd.read_csv('original_dataset/hellosehat_dataset_10.csv', sep=';', encoding='utf-8')
df2 = pd.read_csv('original_dataset/alodokter_dataset_10.csv', sep=';', encoding='utf-8')
df3 = pd.read_csv('original_dataset/doktersehat_gizi_final_10.csv', sep=';', encoding='utf-8')

In [11]:
df_combine = pd.concat([df1, df2, df3], ignore_index=True)
print(f"Total dokumen setelah concat: {len(df_combine)}")
print(f"Index range: {df_combine.index.min()} - {df_combine.index.max()}")
print(f"Kolom: {df_combine.columns.tolist()}")

Total dokumen setelah concat: 2391
Index range: 0 - 2390
Kolom: ['URL', 'Judul', 'Konten']


In [13]:
df_combine.head()

Unnamed: 0,URL,Judul,Konten
0,https://hellosehat.com/nutrisi/berat-badan-tur...,Kenali 9 Penyebab Perut Buncit dan Cara Mengat...,Perut buncit memang mampu memengaruhi penampil...
1,https://hellosehat.com/nutrisi/tips-makan-seha...,"8 Merk Oven Gas Terbaik, Cocok untuk Bisnis Kue","Bagi Anda yang gemar bikin kue, oven gas menja..."
2,https://hellosehat.com/nutrisi/fakta-gizi/mere...,10 Merek Oatmeal yang Bergizi dan Cocok untuk ...,Butuh menu sarapan yang cepat? Berbagai merek ...
3,https://hellosehat.com/nutrisi/resep-sehat/jus...,4 Resep Jus untuk Bantu Meningkatkan Sistem Im...,"Setiap harinya, sistem imunitas pada tubuh bek..."
4,https://hellosehat.com/nutrisi/berat-badan-tur...,Apakah Sering Buang Air Bisa Menurunkan Berat ...,"Setelah diolah, dicerna, dan diambil semua giz..."


In [12]:
# FIX that thang!
# df_combine.to_csv('output_dataset/combined_nutrition_dataset_2.csv', index=False, encoding='utf-8', sep=";")

In [25]:
df = pd.read_csv('output_dataset/combined_nutrition_dataset.csv', sep=';', encoding='utf-8')
df.head()

Unnamed: 0,URL,Judul,Konten
0,https://hellosehat.com/nutrisi/berat-badan-tur...,Kenali 9 Penyebab Perut Buncit dan Cara Mengat...,Perut buncit memang mampu memengaruhi penampil...
1,https://hellosehat.com/nutrisi/tips-makan-seha...,"8 Merk Oven Gas Terbaik, Cocok untuk Bisnis Kue","Bagi Anda yang gemar bikin kue, oven gas menja..."
2,https://hellosehat.com/nutrisi/fakta-gizi/mere...,10 Merek Oatmeal yang Bergizi dan Cocok untuk ...,Butuh menu sarapan yang cepat? Berbagai merek ...
3,https://hellosehat.com/nutrisi/resep-sehat/jus...,4 Resep Jus untuk Bantu Meningkatkan Sistem Im...,"Setiap harinya, sistem imunitas pada tubuh bek..."
4,https://hellosehat.com/nutrisi/berat-badan-tur...,Apakah Sering Buang Air Bisa Menurunkan Berat ...,"Setelah diolah, dicerna, dan diambil semua giz..."


In [26]:
row = df.loc[2150]
print(row)

URL       https://doktersehat.com/gaya-hidup/gizi-dan-nu...
Judul        Sumber, Manfaat, dan Dampak Kekurangan Omega-3
Konten     Kita selalu menganggap kalau omega-3 adalah n...
Name: 2150, dtype: object


# Pre-Pipeline: Intent Tagging (Health Goals)

In [27]:
import pandas as pd
import csv
import sys

In [28]:
# --- 1. KONFIGURASI INPUT & OUTPUT ---
INPUT_FILE = 'output_dataset/combined_nutrition_dataset.csv'
OUTPUT_FILE = 'output_dataset/tagged_combined_nutrition_dataset.csv'


In [29]:

# --- 2. KAMUS KEYWORD INTENT ---
INTENT_KEYWORDS = {
    'diabetes': ['diabetes', 'gula darah', 'kencing manis', 'insulin', 'glukosa', 'hiperglikemia'],
    'anemia': ['anemia', 'kurang darah', 'zat besi', 'hemoglobin', 'pucat', 'lelah'],
    'kesehatan_ibu': ['hamil', 'menyusui', 'bumil', 'busui', 'asi', 'janin', 'kandungan', 'kehamilan'],
    'kesehatan_anak': ['anak', 'bayi', 'balita', 'si kecil', 'tumbuh kembang', 'imunisasi'],
    'berat_badan': ['berat badan', 'diet', 'kurus', 'gemuk', 'langsing', 'turun berat', 'buncit', 'lemak', 'kalori', 'obesitas'],
    'pembentukan_tubuh': ['otot', 'gym', 'fitness', 'binaraga', 'sixpack', 'latihan beban', 'workout', 'massa otot'],
    'kesehatan_pencernaan': ['pencernaan', 'usus', 'lambung', 'maag', 'gerd', 'sembelit', 'diare', 'serat'],
    'resep_sehat': ['resep', 'cara membuat', 'bahan-bahan', 'menu masakan', 'cara masak', 'hidangan'],
    'diet_khusus': ['keto', 'vegan', 'vegetarian', 'gluten free', 'rendah garam', 'dash diet', 'intermittent'],
    'pencegahan': ['mencegah', 'risiko', 'hindari', 'bahaya', 'waspada', 'gejala', 'tanda-tanda'],
    'fakta_gizi': ['kandungan gizi', 'nutrisi', 'protein', 'karbohidrat', 'vitamin', 'mineral', 'takaran saji'],
    'makanan_sehat': ['buah', 'sayur', 'organik', 'superfood', 'makanan sehat', 'bijian', 'kacang'],
    # Fallback
    'Kesehatan_umum': ['manfaat', 'khasiat', 'sehat', 'bugar', 'stamina', 'daya tahan', 'imun', 'kesehatan']
}



In [30]:
def get_top_3_intents(text):
    """
    Menghitung frekuensi keyword dan mengembalikan maksimal 3 intent teratas.
    """
    text = str(text).lower()
    scores = {}
    
    for intent, keywords in INTENT_KEYWORDS.items():
        count = 0
        for kw in keywords:
            count += text.count(kw)
        if count > 0:
            scores[intent] = count
    
    # Jika tidak ada match, return default
    if not scores:
        return "Kesehatan_umum"
    
    # Urutkan score tertinggi -> terendah
    sorted_intents = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    # Ambil 3 teratas
    top_3 = [item[0] for item in sorted_intents[:3]]
    return ", ".join(top_3)



In [31]:
# --- 3. EKSEKUSI UTAMA ---
def process_tagging():
    print(f"üìÇ Membaca file input: {INPUT_FILE} ...")
    
    try:
        # Membaca file dengan separator ';'
        # Kita asumsikan baris pertama adalah header [URL;Judul;Konten]
        df = pd.read_csv(INPUT_FILE, sep=';', on_bad_lines='skip')
        
        # Validasi kolom dasar (Sesuaikan nama kolom jika di file Anda berbeda)
        # Kita rename standar agar mudah diproses
        # Asumsi urutan kolom: 1. URL, 2. Judul, 3. Konten
        if len(df.columns) >= 3:
            df.columns = ['url', 'title', 'content'] + list(df.columns[3:])
        else:
            print("‚ùå Error: File input harus memiliki minimal 3 kolom (URL;Judul;Konten)")
            return

        print(f"üìä Total Data: {len(df)} baris.")
        
        # Bersihkan data (isi yang kosong dengan string kosong)
        df['title'] = df['title'].fillna('').astype(str)
        df['content'] = df['content'].fillna('').astype(str)
        
        print("üîç Melakukan Auto-Tagging Intent...")
        # Gabungkan Judul + Konten untuk pencarian keyword yang lebih akurat
        df['full_text_scan'] = df['title'] + " " + df['content']
        df['intent'] = df['full_text_scan'].apply(get_top_3_intents)
        
        # Hapus kolom bantuan
        df.drop(columns=['full_text_scan'], inplace=True)
        
        print(f"üíæ Menyimpan ke format final (Comma Separated + Quoted)...")
        
        # --- TEKNIK PENYIMPANAN PENTING ---
        # quoting=csv.QUOTE_ALL : Memaksa SEMUA kolom dibungkus tanda kutip "..."
        # Ini menjamin koma di dalam teks TIDAK akan dianggap sebagai pemisah kolom baru.
        df.to_csv(
            OUTPUT_FILE, 
            sep=',',              # Separator Koma
            quotechar='"',        # Pembungkus Tanda Kutip Ganda
            quoting=csv.QUOTE_ALL, # MODE AMAN: Bungkus semua data dengan kutip
            index=False,
            encoding='utf-8'
        )
        
        print(f"‚úÖ SUKSES! File tersimpan: {OUTPUT_FILE}")
        print("   Format: \"URL\",\"Title\",\"Content\",\"Intent\" (Comma Separated)")

    except FileNotFoundError:
        print(f"‚ùå File {INPUT_FILE} tidak ditemukan.")
    except Exception as e:
        print(f"‚ùå Terjadi kesalahan: {e}")

if __name__ == "__main__":
    process_tagging()

üìÇ Membaca file input: output_dataset/combined_nutrition_dataset.csv ...
üìä Total Data: 2425 baris.
üîç Melakukan Auto-Tagging Intent...
üíæ Menyimpan ke format final (Comma Separated + Quoted)...
‚úÖ SUKSES! File tersimpan: output_dataset/tagged_combined_nutrition_dataset.csv
   Format: "URL","Title","Content","Intent" (Comma Separated)


# Tahap 1: Preprocessing and chunking 

In [1]:
import pandas as pd
import re
from transformers import BertTokenizer
import json


KeyboardInterrupt



In [None]:
# Load dataset
df = pd.read_csv('output_dataset/tagged_combined_nutrition_dataset.csv')
print(f"Total dokumen: {len(df)}")
print(f"Kolom dataset: {df.columns.tolist()}")

In [None]:
# Load IndoBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')

# Fungsi preprocessing
def preprocess_text(text):
    """Normalisasi teks"""
    if pd.isna(text):
        return ""
    
    text = text.lower()
    text = re.sub(r'[^\w\s.,!?%-]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    return text

# Fungsi chunking dengan sliding window
def chunk_text_with_overlap(text, max_tokens=384, overlap=50):
    """
    Chunking dengan overlap sliding window
    max_tokens: 512 - 128 (buffer untuk question) = 384
    overlap: 50 tokens
    """
    chunks = []
    tokens = tokenizer.tokenize(text)
    
    if len(tokens) <= max_tokens:
        return [text]
    
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
        
        if end == len(tokens):
            break
        
        start = end - overlap
    
    return chunks

In [None]:
# Proses semua dokumen
processed_data = []

for idx, row in df.iterrows():
    title = row.get('title', '')
    content = row.get('content', '')
    url = row.get('url', '')
    intent = row.get('intent', '')
    
    full_text = f"{title}. {content}".strip()
    cleaned_text = preprocess_text(full_text)
    
    if not cleaned_text:
        continue
    
    chunks = chunk_text_with_overlap(cleaned_text)
    
    for chunk_idx, chunk in enumerate(chunks):
        processed_data.append({
            'doc_id': idx,
            'chunk_id': chunk_idx,
            'title': title,
            'text': chunk,
            'url': url,
            'intent': intent,
            'token_count': len(tokenizer.tokenize(chunk))
        })
    
    if (idx + 1) % 100 == 0:
        print(f"Processed {idx + 1}/{len(df)} documents")

# Simpan hasil
df_processed = pd.DataFrame(processed_data)
df_processed.to_csv('output_dataset/processed_chunks.csv', index=False)

print(f"\nTotal chunks: {len(df_processed)}")
print(f"Rata-rata tokens per chunk: {df_processed['token_count'].mean():.2f}")
print(f"Max tokens: {df_processed['token_count'].max()}")
print(f"Min tokens: {df_processed['token_count'].min()}")

# Simpan corpus untuk MLM
all_texts = df_processed['text'].tolist()
with open('corpus_for_mlm.json', 'w', encoding='utf-8') as f:
    json.dump(all_texts, f, ensure_ascii=False, indent=2)

print("\nFile tersimpan:")
print("- processed_chunks.csv")
print("- corpus_for_mlm.json")

# Tahap 2: Domain ~~EXPANSION~~ Adaptation

In [None]:
import json
import torch
import re
import os
import gc
from transformers import (
    BertTokenizer, 
    BertForMaskedLM,
    DataCollatorForWholeWordMask,
    TrainingArguments,
    Trainer
)
from torch.utils.data import Dataset
from tqdm.auto import tqdm

In [2]:
# --- 1. SETUP GPU & CLEAR MEMORY ---
# Cek apakah CUDA (GPU) tersedia
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"‚úÖ GPU Ditemukan: {torch.cuda.get_device_name(0)}")
    print(f"   VRAM Total: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    
    # Bersihkan cache memori sebelumnya (Penting jika menggunakan Notebook)
    torch.cuda.empty_cache()
    gc.collect()
else:
    print("GPU tidak ditemukan! Menggunakan CPU")
    device = torch.device("cpu")

‚úÖ GPU Ditemukan: NVIDIA GeForce RTX 4050 Laptop GPU
   VRAM Total: 6.44 GB


In [3]:
# --- 2. LOAD DATA ---
# Load corpus
try:
    with open('corpus_for_mlm.json', 'r', encoding='utf-8') as f:
        all_texts = json.load(f)
    print(f"Total texts: {len(all_texts)}")
except FileNotFoundError:
    print("File corpus tidak ditemukan, menggunakan dummy data.")
    all_texts = ["1", "2"]

# SPLIT DATA (90% Train, 10% Validation)
# Kita butuh ini agar model bisa diuji (Val Loss muncul)
split_idx = int(0.9 * len(all_texts))
train_texts = all_texts[:split_idx]
val_texts = all_texts[split_idx:]

print(f"Data Training: {len(train_texts)}")
print(f"Data Validasi: {len(val_texts)}")

# Load model dan tokenizer
model_name = 'indobenchmark/indobert-base-p2'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

Total texts: 7661
Data Training: 6894
Data Validasi: 767


Some weights of BertForMaskedLM were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# --- 3. PINDAHKAN MODEL KE GPU ---
model.to(device)

# Custom Dataset untuk MLM
class MLMDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Preprocess dengan progress bar
        print("Preprocessing texts...")
        self.encodings = []
        for text in tqdm(texts, desc="Tokenizing"):
            encoding = self.tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding='max_length',
                return_tensors='pt'
            )
            self.encodings.append({
                'input_ids': encoding['input_ids'].squeeze(),
                'attention_mask': encoding['attention_mask'].squeeze()
            })
    
    def __len__(self):
        return len(self.encodings)
    
    def __getitem__(self, idx):
        return self.encodings[idx]

# Custom Data Collator
class CustomWWMDataCollator(DataCollatorForWholeWordMask):
    def __init__(self, tokenizer, mlm_probability=0.15):
        super().__init__(
            tokenizer=tokenizer,
            mlm=True,
            mlm_probability=mlm_probability
        )
    
    def torch_mask_tokens(self, inputs, special_tokens_mask=None):
        labels = inputs.clone()
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
                for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()
        
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        
        for i, input_ids in enumerate(inputs):
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
            for j, token in enumerate(tokens):
                if re.search(r'\d', token) or token in ['mg', 'gram', 'kg', 'ml', 'kkal', 'kalori', '%']:
                    probability_matrix[i, j] = 0.0
        
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]
        return inputs, labels

# Buat 2 Dataset Terpisah
print("\nMempersiapkan Training Set...")
train_dataset = MLMDataset(train_texts, tokenizer)

print("Mempersiapkan Validation Set...")
val_dataset = MLMDataset(val_texts, tokenizer)

# Data collator
data_collator = CustomWWMDataCollator(
    tokenizer=tokenizer,
    mlm_probability=0.15
)


Mempersiapkan Training Set...
Preprocessing texts...


Tokenizing:   0%|          | 0/6894 [00:00<?, ?it/s]

Mempersiapkan Validation Set...
Preprocessing texts...


Tokenizing:   0%|          | 0/767 [00:00<?, ?it/s]



In [6]:
# --- 4. TRAINING ARGUMENTS ---
training_args = TrainingArguments(
    output_dir='./indobert-gizi-mlm',
    overwrite_output_dir=True,
      
    # === HYPERPARAMETERS ===
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,              
    learning_rate=2e-5,
    weight_decay=0.01,

    # === EVALUATION SETTINGS (BARU) ===
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    
    # === LOGGING ===
    logging_dir='./logs',
    logging_steps=100,
    
    # === SYSTEM ===
    fp16=torch.cuda.is_available(),
    no_cuda=False,
    dataloader_pin_memory=True,
    dataloader_num_workers=0,       
    disable_tqdm=False,
    report_to='none',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("\nMulai Training...")
trainer.train()

# Simpan Model Akhir
print("\nMenyimpan model...")
model.save_pretrained("./indobert-gizi-mlm-final")
tokenizer.save_pretrained("./indobert-gizi-mlm-final")
print("Selesai!")


Mulai Training...


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss,Validation Loss
100,5.5637,4.428384
200,4.0442,3.764034
300,3.597,3.424848
400,3.3615,3.217364
500,3.1649,3.068305
600,3.0134,2.934664
700,2.8415,2.784796
800,2.7314,2.662641
900,2.6279,2.553934
1000,2.576,2.442628


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].



Menyimpan model...
Selesai!


## Clear Garbage

In [7]:
import torch
import gc 

def clear_gpu_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

clear_gpu_memory()

# Tahap 3: Membuat Pasangan QA

In [9]:
import pandas as pd
import json
import uuid
import time
import os
from tqdm import tqdm
from groq import Groq, RateLimitError

In [None]:
# --- 1. KONFIGURASI ---
API_KEY = "key " 
client = Groq(api_key=API_KEY)

# Model
PRIMARY_MODEL = "llama-3.3-70b-versatile"

# File Paths (Sesuaikan dengan notebook Anda)
INPUT_FILE = 'output_dataset/dataset_filtered_processed_chunks.csv'
OUTPUT_FILE = '3_qa_dataset_strict_gold.jsonl'

In [11]:
# --- 2. FUNGSI GENERATOR STRICT ---
def generate_qa_strict(context):
    prompt = f"""
    You are a strict Data Annotator for a SQuAD dataset.
    
    TASK: Extract 3 QA pairs from the provided text.
    
    CRITICAL RULES:
    1. The "answer" MUST be an EXACT SUBSTRING from the Context. Copy-paste ONLY. No paraphrasing.
    2. Provide 2 answerable questions and 1 unanswerable question.
    3. Language: Indonesian.
    
    CONTEXT:
    "{context}"
    
    OUTPUT FORMAT (JSON ONLY):
    [
      {{ "question": "Pertanyaan?", "answer": "exact substring", "is_impossible": false }},
      {{ "question": "Pertanyaan sulit?", "answer": "", "is_impossible": true }}
    ]
    """
    
    max_retries = 5
    retry_count = 0

    while retry_count < max_retries:
        try:
            completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You output JSON only."},
                    {"role": "user", "content": prompt}
                ],
                model=PRIMARY_MODEL,
                temperature=0.0, # Wajib 0 agar tidak kreatif (konsisten)
                response_format={"type": "json_object"}
            )
            return json.loads(completion.choices[0].message.content)

        except RateLimitError as e:
            # --- LOGIKA MENANGANI RATE LIMIT (429) ---
            print(f"‚ö†Ô∏è Rate Limit Hit! (Percobaan {retry_count + 1}/{max_retries})")
            
            # Coba ambil informasi waktu tunggu dari header response
            retry_after = 0
            try:
                headers = e.response.headers
                
                # Coba header standar 'retry-after'
                if 'retry-after' in headers:
                    retry_after = float(headers['retry-after'])
                
                # Coba header spesifik 'x-ratelimit-reset-tokens' (biasanya format "12.34s")
                elif 'x-ratelimit-reset-tokens' in headers:
                    reset_tokens = headers['x-ratelimit-reset-tokens']
                    retry_after = float(reset_tokens.replace('s', ''))
                
                # Coba header spesifik 'x-ratelimit-reset-requests'
                elif 'x-ratelimit-reset-requests' in headers:
                    reset_reqs = headers['x-ratelimit-reset-requests']
                    # Parse format "12m30s" atau "12.34s" jika perlu, tapi float simpel biasanya cukup untuk seconds
                    if 'm' in reset_reqs: 
                        # Sederhana: jika ada menit, tunggu default 60 detik saja biar aman
                        retry_after = 60 
                    else:
                        retry_after = float(reset_reqs.replace('s', ''))
            
            except Exception as header_err:
                # Jika gagal parsing header, gunakan default
                print(f"   (Gagal baca header: {header_err})")
                retry_after = 20 # Default aman

            # Pastikan retry_after minimal 1 detik & tambahkan buffer
            wait_time = max(1, retry_after) + 2 
            
            print(f"‚è≥ Tidur selama {wait_time:.2f} detik sebelum mencoba lagi...")
            time.sleep(wait_time)
            
            retry_count += 1
            
        except Exception as e:
            # Error lain (misal koneksi putus, JSON malformed dari server, dll)
            print(f"‚ùå Error non-limit: {e}")
            return None

    print("‚ùå Gagal setelah max retries.")
    return None

In [12]:
def run_strict_pipeline():
    print(f"‚öôÔ∏è Menyiapkan Pipeline Strict Mode dengan Model: {PRIMARY_MODEL}")
    
    # Baca Data Input
    try:
        df = pd.read_csv(INPUT_FILE)
        # Ambil kolom text, pastikan string
        texts = df['text'].fillna('').astype(str).unique()
        total_awal = len(texts)
    except Exception as e:
        print(f"‚ùå Gagal baca input: {e}")
        return

    # Logika Resume (Cek data yg sudah ada)
    processed_contexts = set()
    if os.path.exists(OUTPUT_FILE):
        print(f"üìÇ Membaca checkpoint dari {OUTPUT_FILE}...")
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    data = json.loads(line)
                    processed_contexts.add(data['context'])
                except: continue
    
    # Filter data yang belum diproses
    texts_to_process = [t for t in texts if t not in processed_contexts]
    print(f"üìä Sisa data: {len(texts_to_process)} dari {total_awal} total chunk unik.")
    
    if not texts_to_process:
        print("üéâ Semua data sudah selesai!")
        return

    valid_count = 0
    dropped_count = 0
    
    # Loop Utama
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        for text in tqdm(texts_to_process, desc="Generating QA"):
            if len(text) < 50: continue
            
            # Request ke LLM (Pakai fungsi SMART)
            qa_list = generate_qa_strict(text)
            
            # Parsing response
            if qa_list:
                # Handle variasi struktur JSON dari LLM
                if isinstance(qa_list, dict): 
                    qa_list = qa_list.get('qas', qa_list.get('qa_pairs', []))
                
                squad_qas = []
                # Pastikan qa_list adalah list sebelum iterasi
                if isinstance(qa_list, list):
                    for item in qa_list:
                        # Validasi Format
                        if not isinstance(item, dict): continue
                        
                        question = item.get('question')
                        answer_text = item.get('answer', '')
                        is_impossible = item.get('is_impossible', False)
                        
                        if is_impossible:
                            squad_qas.append({
                                "id": str(uuid.uuid4()),
                                "question": question,
                                "answers": [],
                                "is_impossible": True
                            })
                        else:
                            # --- STRICT CHECK (Validasi Substring) ---
                            if not answer_text: continue
                            
                            # Case insensitive search untuk fleksibilitas sedikit
                            # Tapi kita simpan text ASLI dari konteks
                            start_idx = text.find(answer_text)
                            
                            if start_idx != -1:
                                # LOLOS VALIDASI
                                squad_qas.append({
                                    "id": str(uuid.uuid4()),
                                    "question": question,
                                    "answers": [{
                                        "text": answer_text,
                                        "answer_start": start_idx
                                    }],
                                    "is_impossible": False
                                })
                                valid_count += 1
                            else:
                                # GAGAL VALIDASI (Halusinasi LLM) -> BUANG
                                dropped_count += 1
                
                # Simpan jika ada QA valid
                if squad_qas:
                    entry = {"context": text, "qas": squad_qas}
                    f.write(json.dumps(entry) + "\n")
                    f.flush()
            

    print("\n" + "="*50)
    print("üèÅ PROSES SELESAI")
    print(f"‚úÖ QA Valid Tersimpan: {valid_count}")
    print(f"üóëÔ∏è QA Halusinasi Dibuang: {dropped_count}")
    print(f"üìÇ Output: {OUTPUT_FILE}")
    print("="*50)

In [13]:
# Jalankan
if __name__ == "__main__":
    run_strict_pipeline()

‚öôÔ∏è Menyiapkan Pipeline Strict Mode dengan Model: llama-3.3-70b-versatile
üìÇ Membaca checkpoint dari 3_qa_dataset_strict_gold.jsonl...
üìä Sisa data: 699 dari 752 total chunk unik.


Generating QA:   0%|                                                                           | 0/699 [00:00<?, ?it/s]

‚ö†Ô∏è Rate Limit Hit! (Percobaan 1/5)
‚è≥ Tidur selama 541.00 detik sebelum mencoba lagi...


Generating QA:   0%|                                                              | 1/699 [09:02<105:05:31, 542.02s/it]

‚ö†Ô∏è Rate Limit Hit! (Percobaan 1/5)
‚è≥ Tidur selama 692.00 detik sebelum mencoba lagi...


Generating QA:   0%|‚ñè                                                             | 2/699 [20:34<122:07:03, 630.74s/it]

‚ö†Ô∏è Rate Limit Hit! (Percobaan 1/5)
‚è≥ Tidur selama 797.00 detik sebelum mencoba lagi...


Generating QA:   0%|‚ñé                                                             | 3/699 [33:52<136:42:46, 707.14s/it]

‚ö†Ô∏è Rate Limit Hit! (Percobaan 1/5)
‚è≥ Tidur selama 802.00 detik sebelum mencoba lagi...


Generating QA:   1%|‚ñé                                                             | 4/699 [47:15<143:48:31, 744.91s/it]

‚ö†Ô∏è Rate Limit Hit! (Percobaan 1/5)
‚è≥ Tidur selama 661.00 detik sebelum mencoba lagi...


Generating QA:   1%|‚ñç                                                             | 5/699 [58:17<137:49:15, 714.92s/it]

‚ö†Ô∏è Rate Limit Hit! (Percobaan 1/5)
‚è≥ Tidur selama 744.00 detik sebelum mencoba lagi...


Generating QA:   1%|‚ñå                                                           | 6/699 [1:10:42<139:34:49, 725.09s/it]

‚ö†Ô∏è Rate Limit Hit! (Percobaan 1/5)
‚è≥ Tidur selama 746.00 detik sebelum mencoba lagi...


Generating QA:   1%|‚ñå                                                           | 6/699 [1:23:08<160:02:46, 831.41s/it]


KeyboardInterrupt: 

# Tahap 4: Fine-Tuning QA

In [1]:
import json
import torch
import gc
from transformers import (
    BertTokenizerFast,
    BertForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
from torch.utils.data import Dataset
from tqdm.auto import tqdm
from transformers import BertModel, BertConfig 





In [2]:

# Clear GPU memory
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")
    print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("‚ö†Ô∏è GPU tidak tersedia, menggunakan CPU")


‚úÖ GPU: NVIDIA GeForce RTX 4050 Laptop GPU
   VRAM: 6.44 GB


In [3]:
qa_pairs = []
with open('qa_dataset_final.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        qa_pairs.append(json.loads(line))

print(f"Total paragraf: {len(qa_pairs)}")

Total paragraf: 580


In [4]:
# Load model dari Domain Adaptation
model_path = './indobert-gizi-mlm-final'
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Load base BERT dari MLM model, lalu inisialisasi QA head baru
from transformers import BertConfig, BertForQuestionAnswering

print("\nüîß Loading BERT encoder dari Domain Adaptation...")

# === PERBAIKAN DI SINI ===
# Kita paksa num_labels=2 agar output layer QA hanya 2 (Start & End)
config = BertConfig.from_pretrained(model_path, num_labels=2)

# Gunakan .from_pretrained langsung dengan ignore_mismatched_sizes=True
# Ini akan memuat body BERT yang sudah di-training (MLM), tapi mereset head QA menjadi benar
model = BertForQuestionAnswering.from_pretrained(
    model_path, 
    config=config,
    ignore_mismatched_sizes=True
)

print("‚úÖ Model QA berhasil diinisialisasi.")
print(f"üîç Cek QA Outputs Layer: {model.qa_outputs}") 
# Pastikan output print di atas tertulis out_features=2

model.to('cuda' if torch.cuda.is_available() else 'cpu')


üîß Loading BERT encoder dari Domain Adaptation...


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ./indobert-gizi-mlm-final and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model QA berhasil diinisialisasi.
üîç Cek QA Outputs Layer: Linear(in_features=768, out_features=2, bias=True)


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [5]:
# Parse data ke format training
train_examples = []

for para in qa_pairs:
    context = para['context']
    for qa in para['qas']:
        example = {
            'id': qa['id'],
            'question': qa['question'],
            'context': context,
        }
        
        if qa['is_impossible']:
            example['answers'] = {'answer_start': [], 'text': []}
        else:
            example['answers'] = {
                'answer_start': [qa['answers'][0]['answer_start']],
                'text': [qa['answers'][0]['text']]
            }
        
        train_examples.append(example)

print(f"Total QA pairs: {len(train_examples)}")



Total QA pairs: 1137


In [6]:
# Split train/validation (90/10)
split_idx = int(0.9 * len(train_examples))
train_data = train_examples[:split_idx]
val_data = train_examples[split_idx:]

print(f"Training: {len(train_data)}")
print(f"Validation: {len(val_data)}")


Training: 1023
Validation: 114


In [7]:
# Tokenisasi dengan batch processing
def tokenize_dataset(examples):
    questions = [ex['question'] for ex in examples]
    contexts = [ex['context'] for ex in examples]
    
    tokenized = tokenizer(
        questions,
        contexts,
        truncation='only_second',
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length'
    )
    
    sample_mapping = tokenized.pop('overflow_to_sample_mapping')
    offset_mapping = tokenized['offset_mapping']
    
    start_positions = []
    end_positions = []
    
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized['input_ids'][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples[sample_index]['answers']
        
        # Unanswerable question
        if len(answers['answer_start']) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue
        
        start_char = answers['answer_start'][0]
        end_char = start_char + len(answers['text'][0])
        
        # Find token start position
        token_start_index = 0
        while token_start_index < len(sequence_ids) and sequence_ids[token_start_index] != 1:
            token_start_index += 1
        
        # Find token end position
        token_end_index = len(input_ids) - 1
        while token_end_index >= 0 and sequence_ids[token_end_index] != 1:
            token_end_index -= 1
        
        # Check if answer is in this chunk
        if not (token_start_index < len(offsets) and 
                token_end_index < len(offsets) and
                offsets[token_start_index][0] <= start_char and 
                offsets[token_end_index][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue
        
        # Find exact token positions
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)
        
        while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)
    
    tokenized['start_positions'] = start_positions
    tokenized['end_positions'] = end_positions
    tokenized.pop('offset_mapping')
    
    return tokenized

print("\nüîÑ Tokenizing training data...")
tokenized_train = tokenize_dataset(train_data)

print("üîÑ Tokenizing validation data...")
tokenized_val = tokenize_dataset(val_data)



üîÑ Tokenizing training data...
üîÑ Tokenizing validation data...


In [8]:
# Dataset class
class QADataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __len__(self):
        return len(self.encodings['input_ids'])
    
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'start_positions': torch.tensor(self.encodings['start_positions'][idx]),
            'end_positions': torch.tensor(self.encodings['end_positions'][idx])
        }

train_dataset = QADataset(tokenized_train)
val_dataset = QADataset(tokenized_val)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Val dataset size: {len(val_dataset)}")

Train dataset size: 1023
Val dataset size: 114


In [13]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./indobert-gizi-qa',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    save_steps=50,
    save_total_limit=2,
    eval_strategy='steps',
    eval_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    disable_tqdm=False,
    report_to='none',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
)


In [14]:
# Training
print("\n" + "="*50)
print("Memulai Fine-Tuning QA...")
print("="*50)
trainer.train()

# Simpan model
print("\nüíæ Menyimpan model...")
model.save_pretrained('./indobert-gizi-qa-final')
tokenizer.save_pretrained('./indobert-gizi-qa-final')

print("\n" + "="*50)
print("Fine-Tuning QA selesai!")
print("Model tersimpan di: ./indobert-gizi-qa-final")
print("="*50)


Memulai Fine-Tuning QA...


Step,Training Loss,Validation Loss
50,1.1089,3.265762
100,1.6924,3.037668
150,1.2174,3.643459
200,0.8783,3.207864
250,1.3528,3.005722
300,0.8493,3.371591
350,0.6917,3.491086



üíæ Menyimpan model...

Fine-Tuning QA selesai!
Model tersimpan di: ./indobert-gizi-qa-final


# Test Model

In [17]:
from transformers import BertTokenizerFast, BertForQuestionAnswering, pipeline

# 1. Load Model Final
model_path = "./indobert-gizi-qa-final" # Pastikan path ini sesuai output training
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForQuestionAnswering.from_pretrained(model_path)

# 2. Buat Pipeline QA
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=0 # Gunakan GPU (0) atau CPU (-1)
)

# 3. Tes Manual
context_text = """
Diabetes melitus adalah penyakit kronis yang ditandai dengan kadar gula darah yang tinggi. 
Penyebab utamanya adalah kurangnya produksi insulin oleh pankreas atau ketidakmampuan tubuh menggunakan insulin.
Gejala umum meliputi sering haus, sering buang air kecil, dan penurunan berat badan drastis.
Pencegahan dapat dilakukan dengan menjaga pola makan sehat dan rutin berolahraga.
"""

questions = [
    "Apa itu diabetes melitus?",
    "Apa penyebab utama diabetes?",
    "Bagaimana cara mencegah ?",
    "Apa gejala Diabetes melitus?"
]

print("="*50)
print("ü§ñ HASIL TES MODEL QA")
print("="*50)

for q in questions:
    result = qa_pipeline(question=q, context=context_text)
    print(f"Tanya : {q}")
    print(f"Jawab : {result['answer']}")
    print(f"Score : {result['score']:.4f}")
    print("-" * 30)

Device set to use cuda:0


ü§ñ HASIL TES MODEL QA
Tanya : Apa itu diabetes melitus?
Jawab : kurangnya produksi insulin oleh pankreas atau ketidakmampuan tubuh menggunakan insulin
Score : 0.0543
------------------------------
Tanya : Apa penyebab utama diabetes?
Jawab : kurangnya produksi insulin oleh pankreas atau ketidakmampuan tubuh menggunakan insulin
Score : 0.0533
------------------------------
Tanya : Bagaimana cara mencegah ?
Jawab : menjaga pola makan sehat dan rutin berolahraga.
Score : 0.0310
------------------------------
Tanya : Apa gejala Diabetes melitus?
Jawab : kurangnya produksi insulin oleh pankreas atau ketidakmampuan tubuh menggunakan insulin
Score : 0.0368
------------------------------


# Tahap 5: Search FUnction (_or Search Engine, whaatever you called it_)

In [12]:
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import chromadb
from chromadb.utils import embedding_functions
import numpy as np
import os

In [13]:
# Load processed chunks
print("üìÇ Loading dataset...")
df = pd.read_csv('output_dataset/processed_chunks.csv')
print(f"Total chunks: {len(df)}")


üìÇ Loading dataset...
Total chunks: 7661


In [14]:
# Load QA Model (Reader)
print("\n Loading QA Model...")
qa_model_path = './indobert-gizi-qa-final'
tokenizer = BertTokenizerFast.from_pretrained(qa_model_path)
qa_model = BertForQuestionAnswering.from_pretrained(qa_model_path)
qa_model.to('cuda' if torch.cuda.is_available() else 'cpu')
qa_model.eval()
print(" QA Model loaded")


 Loading QA Model...
 QA Model loaded


In [15]:
# ============================================
# RETRIEVER: Hybrid Search (Dense + BM25)
# ============================================

# Gunakan SentenceTransformer untuk embedding
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

def custom_embedding_function(texts):
    return embedding_model.encode(texts, show_progress_bar=False).tolist()

In [16]:
# Buat collection
CHROMA_DATA_PATH = "./chroma_db_store" # Folder tempat data akan disimpan selamanya

print("\nüîç Setting up Retriever...")

# 1. Gunakan PERSISTENT Client (Bukan Client biasa)
print(f"  ‚Üí Initializing ChromaDB from: {CHROMA_DATA_PATH}")
chroma_client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

# Setup Embedding Function
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
)

# 2. Get or Create Collection
# Fungsi ini akan meload collection jika ada, atau membuat baru jika belum ada
collection = chroma_client.get_or_create_collection(
    name="gizi_knowledge",
    embedding_function=embedding_func
)


üîç Setting up Retriever...
  ‚Üí Initializing ChromaDB from: ./chroma_db_store


In [17]:
# ============================================
# LOGIKA PENGISIAN DATA CERDAS
# ============================================

# Cek apakah database sudah berisi data?
if collection.count() == 0:
    print("  ‚ö†Ô∏è Database kosong. Memulai proses Indexing Dokumen (Hanya sekali)...")
    
    # --- PROSES ADD DOCUMENT (Sama seperti kode lama Anda) ---
    batch_size = 100
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]
        collection.add(
            documents=batch['text'].tolist(),
            ids=[f"chunk_{j}" for j in range(i, min(i+batch_size, len(df)))],
            metadatas=[
                {
                    'title': str(row['title']),
                    'url': str(row['url']),
                    'intent': str(row['intent']),
                    'doc_id': int(row['doc_id']),
                    'chunk_id': int(row['chunk_id'])
                }
                for _, row in batch.iterrows()
            ]
        )
        if (i + batch_size) % 500 == 0:
            print(f"    Processed {min(i+batch_size, len(df))}/{len(df)} chunks")
            
    print("  ‚úÖ Indexing Selesai & Tersimpan Otomatis!")

else:
    print(f"  ‚úÖ Database ditemukan berisi {collection.count()} dokumen.")
    print("  ‚è© Skip indexing. Langsung siap dipakai.")

  ‚ö†Ô∏è Database kosong. Memulai proses Indexing Dokumen (Hanya sekali)...
    Processed 500/7661 chunks
    Processed 1000/7661 chunks
    Processed 1500/7661 chunks
    Processed 2000/7661 chunks
    Processed 2500/7661 chunks
    Processed 3000/7661 chunks
    Processed 3500/7661 chunks
    Processed 4000/7661 chunks
    Processed 4500/7661 chunks
    Processed 5000/7661 chunks
    Processed 5500/7661 chunks
    Processed 6000/7661 chunks
    Processed 6500/7661 chunks
    Processed 7000/7661 chunks
    Processed 7500/7661 chunks
  ‚úÖ Indexing Selesai & Tersimpan Otomatis!


In [18]:
# 2. BM25 (Sparse Retrieval)
print("  ‚Üí Initializing BM25...")
tokenized_corpus = [doc.lower().split() for doc in df['text'].tolist()]
bm25 = BM25Okapi(tokenized_corpus)
print("‚úÖ BM25 ready")

  ‚Üí Initializing BM25...
‚úÖ BM25 ready


In [19]:
# ============================================
# ENSEMBLE RETRIEVER
# ============================================

def hybrid_retrieve(query, top_k=5, dense_weight=0.6, bm25_weight=0.4):
    """
    Hybrid retrieval: Dense (ChromaDB) + Sparse (BM25)
    """
    # Dense retrieval
    dense_results = collection.query(
        query_texts=[query],
        n_results=top_k * 2
    )
    
    dense_docs = []
    for i, doc_id in enumerate(dense_results['ids'][0]):
        idx = int(doc_id.split('_')[1])
        dense_docs.append({
            'idx': idx,
            'text': dense_results['documents'][0][i],
            'metadata': dense_results['metadatas'][0][i],
            'distance': dense_results['distances'][0][i],
            'dense_score': 1 / (1 + dense_results['distances'][0][i])  # Convert distance to score
        })
    
    # BM25 retrieval
    tokenized_query = query.lower().split()
    bm25_scores = bm25.get_scores(tokenized_query)
    top_bm25_indices = np.argsort(bm25_scores)[::-1][:top_k * 2]
    
    bm25_docs = []
    for idx in top_bm25_indices:
        bm25_docs.append({
            'idx': int(idx),
            'bm25_score': float(bm25_scores[idx])
        })
    
    # Combine scores
    combined = {}
    for doc in dense_docs:
        idx = doc['idx']
        combined[idx] = {
            'text': doc['text'],
            'metadata': doc['metadata'],
            'score': dense_weight * doc['dense_score']
        }
    
    for doc in bm25_docs:
        idx = doc['idx']
        if idx in combined:
            combined[idx]['score'] += bm25_weight * doc['bm25_score']
        else:
            combined[idx] = {
                'text': df.iloc[idx]['text'],
                'metadata': {
                    'title': df.iloc[idx]['title'],
                    'url': df.iloc[idx]['url'],
                    'intent': df.iloc[idx]['intent']
                },
                'score': bm25_weight * doc['bm25_score']
            }
    
    # Sort by combined score
    sorted_results = sorted(combined.items(), key=lambda x: x[1]['score'], reverse=True)
    
    return [
        {
            'text': item[1]['text'],
            'metadata': item[1]['metadata'],
            'score': item[1]['score']
        }
        for item in sorted_results[:top_k]
    ]

In [23]:
def answer_question(question, context, confidence_threshold=0.0): # Set 0.0 biar kita lihat SEMUA hasil
    inputs = tokenizer(
        question,
        context,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    ).to(qa_model.device)
    
    with torch.no_grad():
        outputs = qa_model(**inputs)
    
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]
    
    # Ambil posisi terbaik
    start_idx = torch.argmax(start_logits).item()
    end_idx = torch.argmax(end_logits).item()
    
    # Hitung probabilitas
    start_probs = torch.softmax(start_logits, dim=0)
    end_probs = torch.softmax(end_logits, dim=0)
    confidence = (start_probs[start_idx] * end_probs[end_idx]).item()
    
    # --- BAGIAN DEBUGGING ---
    print(f"\n   üïµÔ∏è [DEBUG] Q: {question}")
    print(f"   üìç Start Index: {start_idx} | End Index: {end_idx}")
    print(f"   üìä Confidence: {confidence:.4f}")
    
    # Cek apakah dia menunjuk [CLS] (Index 0)
    if start_idx == 0:
        print("   ‚ùå Model memprediksi: [CLS] (Unanswerable / Menyerah)")
        return None, confidence

    # Cek apakah start > end (Mustahil)
    if start_idx > end_idx:
        print("   ‚ùå Model memprediksi: Posisi Terbalik (Start > End)")
        return None, confidence

    # Extract jawaban
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    answer_tokens = tokens[start_idx:end_idx + 1]
    answer = tokenizer.convert_tokens_to_string(answer_tokens).replace('##', '')
    
    print(f"   ‚úÖ Calon Jawaban: {answer}")
    return answer, confidence

In [26]:
# ============================================
# CHATBOT PIPELINE (MODE BYPASS / SEARCH ENGINE)
# ============================================

def chatbot_pipeline(question, top_k=3):
    print(f"\n Question: {question}")
    print("="*60)
    
    # 1. Cari Dokumen (Retriever tetap bekerja!)
    print(" Retrieving relevant documents...")
    retrieved_docs = hybrid_retrieve(question, top_k=top_k)
    
    # Fallback jika tidak ada dokumen
    if not retrieved_docs:
        return {
            'answer': 'Maaf, saya belum menemukan informasi yang relevan di database.',
            'source': None,
            'confidence': 0.0,
            'intent': 'Unknown'
        }
        
    print(f"‚úÖ Found {len(retrieved_docs)} relevant documents")
    
    # 2. BYPASS QA MODEL
    # Alih-alih menyuruh model QA mikir, kita langsung ambil dokumen terbaik (Top 1)
    best_doc = retrieved_docs[0] 
    
    # Ambil snippet (misal 3 kalimat pertama atau 300 karakter)
    full_text = best_doc['text']
    # Potong biar gak kepanjangan
    snippet = full_text[:350] + "..." if len(full_text) > 350 else full_text
    
    # Format jawaban ala Search Engine
    response = (
        f"Berikut informasi yang saya temukan:\n\n"
        f"\"{snippet}\"\n\n"
        f"(Sumber: {best_doc['metadata']['title']})"
    )
    
    return {
        'answer': response,
        'source': best_doc['metadata']['url'],
        'confidence': best_doc['score'], # Pakai skor relevansi pencarian
        'intent': best_doc['metadata']['intent']
    }

In [27]:
# ============================================
# TESTING
# ============================================
# Test queries
test_questions = [
    "Apa penyebab perut buncit?",
    "Bagaimana cara menurunkan berat badan?",
    "Makanan apa yang baik untuk diet?"
]

print("\n Testing pipeline...")
for q in test_questions:
    result = chatbot_pipeline(q)
    print(f"\n Answer: {result['answer']}")
    print(f" Source: {result['source']}")
    print(f" Confidence: {result['confidence']:.2%}")
    print(f"  Intent: {result['intent']}")


 Testing pipeline...

‚ùì Question: Apa penyebab perut buncit?
üîç Retrieving relevant documents...
‚úÖ Found 3 relevant documents

 Answer: Berikut informasi yang saya temukan:

"10 makanan penyebab perut buncit no . 7 sering dikonsumsi . makanan penyebab perut buncit bisa menjadi penyebab penumpukan lemak di perut . kondisi ini bisa meningkatkan berbagai risiko penyakit , namun masih bisa diatasi dengan asupan nutrisi yang tepat , meningkatkan aktivitas fisik , hingga mengurangi stres . lalu , apa saja makanan yang harus d..."

(Sumber: 10 Makanan Penyebab Perut Buncit (No. 7 Sering Dikonsumsi))
 Source: https://doktersehat.com/gaya-hidup/gizi-dan-nutrisi/makanan-penyebab-perut-buncit/
 Confidence: 399.97%
  Intent: berat_badan, Kesehatan_umum, kesehatan_ibu

‚ùì Question: Bagaimana cara menurunkan berat badan?
üîç Retrieving relevant documents...
‚úÖ Found 3 relevant documents

 Answer: Berikut informasi yang saya temukan:

"alasan ketidaksuburan pada perempuan . selain itu , beb