# Import and combine dataset

In [10]:
import pandas as pd
df1 = pd.read_csv('original_dataset/hellosehat_dataset_10.csv', sep=';', encoding='utf-8')
df2 = pd.read_csv('original_dataset/alodokter_dataset_10.csv', sep=';', encoding='utf-8')
df3 = pd.read_csv('original_dataset/doktersehat_gizi_final_10.csv', sep=';', encoding='utf-8')

In [11]:
df_combine = pd.concat([df1, df2, df3], ignore_index=True)
print(f"Total dokumen setelah concat: {len(df_combine)}")
print(f"Index range: {df_combine.index.min()} - {df_combine.index.max()}")
print(f"Kolom: {df_combine.columns.tolist()}")

Total dokumen setelah concat: 2391
Index range: 0 - 2390
Kolom: ['URL', 'Judul', 'Konten']


In [13]:
df_combine.head()

Unnamed: 0,URL,Judul,Konten
0,https://hellosehat.com/nutrisi/berat-badan-tur...,Kenali 9 Penyebab Perut Buncit dan Cara Mengat...,Perut buncit memang mampu memengaruhi penampil...
1,https://hellosehat.com/nutrisi/tips-makan-seha...,"8 Merk Oven Gas Terbaik, Cocok untuk Bisnis Kue","Bagi Anda yang gemar bikin kue, oven gas menja..."
2,https://hellosehat.com/nutrisi/fakta-gizi/mere...,10 Merek Oatmeal yang Bergizi dan Cocok untuk ...,Butuh menu sarapan yang cepat? Berbagai merek ...
3,https://hellosehat.com/nutrisi/resep-sehat/jus...,4 Resep Jus untuk Bantu Meningkatkan Sistem Im...,"Setiap harinya, sistem imunitas pada tubuh bek..."
4,https://hellosehat.com/nutrisi/berat-badan-tur...,Apakah Sering Buang Air Bisa Menurunkan Berat ...,"Setelah diolah, dicerna, dan diambil semua giz..."


In [12]:
# FIX that thang!
# df_combine.to_csv('output_dataset/combined_nutrition_dataset_2.csv', index=False, encoding='utf-8', sep=";")

In [25]:
df = pd.read_csv('output_dataset/combined_nutrition_dataset.csv', sep=';', encoding='utf-8')
df.head()

Unnamed: 0,URL,Judul,Konten
0,https://hellosehat.com/nutrisi/berat-badan-tur...,Kenali 9 Penyebab Perut Buncit dan Cara Mengat...,Perut buncit memang mampu memengaruhi penampil...
1,https://hellosehat.com/nutrisi/tips-makan-seha...,"8 Merk Oven Gas Terbaik, Cocok untuk Bisnis Kue","Bagi Anda yang gemar bikin kue, oven gas menja..."
2,https://hellosehat.com/nutrisi/fakta-gizi/mere...,10 Merek Oatmeal yang Bergizi dan Cocok untuk ...,Butuh menu sarapan yang cepat? Berbagai merek ...
3,https://hellosehat.com/nutrisi/resep-sehat/jus...,4 Resep Jus untuk Bantu Meningkatkan Sistem Im...,"Setiap harinya, sistem imunitas pada tubuh bek..."
4,https://hellosehat.com/nutrisi/berat-badan-tur...,Apakah Sering Buang Air Bisa Menurunkan Berat ...,"Setelah diolah, dicerna, dan diambil semua giz..."


In [26]:
row = df.loc[2150]
print(row)

URL       https://doktersehat.com/gaya-hidup/gizi-dan-nu...
Judul        Sumber, Manfaat, dan Dampak Kekurangan Omega-3
Konten     Kita selalu menganggap kalau omega-3 adalah n...
Name: 2150, dtype: object


# Intent Tagging (Health Goals)

In [27]:
import pandas as pd
import csv
import sys

In [28]:
# --- 1. KONFIGURASI INPUT & OUTPUT ---
INPUT_FILE = 'output_dataset/combined_nutrition_dataset.csv'
OUTPUT_FILE = 'output_dataset/tagged_combined_nutrition_dataset.csv'


In [29]:

# --- 2. KAMUS KEYWORD INTENT ---
INTENT_KEYWORDS = {
    'diabetes': ['diabetes', 'gula darah', 'kencing manis', 'insulin', 'glukosa', 'hiperglikemia'],
    'anemia': ['anemia', 'kurang darah', 'zat besi', 'hemoglobin', 'pucat', 'lelah'],
    'kesehatan_ibu': ['hamil', 'menyusui', 'bumil', 'busui', 'asi', 'janin', 'kandungan', 'kehamilan'],
    'kesehatan_anak': ['anak', 'bayi', 'balita', 'si kecil', 'tumbuh kembang', 'imunisasi'],
    'berat_badan': ['berat badan', 'diet', 'kurus', 'gemuk', 'langsing', 'turun berat', 'buncit', 'lemak', 'kalori', 'obesitas'],
    'pembentukan_tubuh': ['otot', 'gym', 'fitness', 'binaraga', 'sixpack', 'latihan beban', 'workout', 'massa otot'],
    'kesehatan_pencernaan': ['pencernaan', 'usus', 'lambung', 'maag', 'gerd', 'sembelit', 'diare', 'serat'],
    'resep_sehat': ['resep', 'cara membuat', 'bahan-bahan', 'menu masakan', 'cara masak', 'hidangan'],
    'diet_khusus': ['keto', 'vegan', 'vegetarian', 'gluten free', 'rendah garam', 'dash diet', 'intermittent'],
    'pencegahan': ['mencegah', 'risiko', 'hindari', 'bahaya', 'waspada', 'gejala', 'tanda-tanda'],
    'fakta_gizi': ['kandungan gizi', 'nutrisi', 'protein', 'karbohidrat', 'vitamin', 'mineral', 'takaran saji'],
    'makanan_sehat': ['buah', 'sayur', 'organik', 'superfood', 'makanan sehat', 'bijian', 'kacang'],
    # Fallback
    'Kesehatan_umum': ['manfaat', 'khasiat', 'sehat', 'bugar', 'stamina', 'daya tahan', 'imun', 'kesehatan']
}



In [30]:
def get_top_3_intents(text):
    """
    Menghitung frekuensi keyword dan mengembalikan maksimal 3 intent teratas.
    """
    text = str(text).lower()
    scores = {}
    
    for intent, keywords in INTENT_KEYWORDS.items():
        count = 0
        for kw in keywords:
            count += text.count(kw)
        if count > 0:
            scores[intent] = count
    
    # Jika tidak ada match, return default
    if not scores:
        return "Kesehatan_umum"
    
    # Urutkan score tertinggi -> terendah
    sorted_intents = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    # Ambil 3 teratas
    top_3 = [item[0] for item in sorted_intents[:3]]
    return ", ".join(top_3)



In [31]:
# --- 3. EKSEKUSI UTAMA ---
def process_tagging():
    print(f"üìÇ Membaca file input: {INPUT_FILE} ...")
    
    try:
        # Membaca file dengan separator ';'
        # Kita asumsikan baris pertama adalah header [URL;Judul;Konten]
        df = pd.read_csv(INPUT_FILE, sep=';', on_bad_lines='skip')
        
        # Validasi kolom dasar (Sesuaikan nama kolom jika di file Anda berbeda)
        # Kita rename standar agar mudah diproses
        # Asumsi urutan kolom: 1. URL, 2. Judul, 3. Konten
        if len(df.columns) >= 3:
            df.columns = ['url', 'title', 'content'] + list(df.columns[3:])
        else:
            print("‚ùå Error: File input harus memiliki minimal 3 kolom (URL;Judul;Konten)")
            return

        print(f"üìä Total Data: {len(df)} baris.")
        
        # Bersihkan data (isi yang kosong dengan string kosong)
        df['title'] = df['title'].fillna('').astype(str)
        df['content'] = df['content'].fillna('').astype(str)
        
        print("üîç Melakukan Auto-Tagging Intent...")
        # Gabungkan Judul + Konten untuk pencarian keyword yang lebih akurat
        df['full_text_scan'] = df['title'] + " " + df['content']
        df['intent'] = df['full_text_scan'].apply(get_top_3_intents)
        
        # Hapus kolom bantuan
        df.drop(columns=['full_text_scan'], inplace=True)
        
        print(f"üíæ Menyimpan ke format final (Comma Separated + Quoted)...")
        
        # --- TEKNIK PENYIMPANAN PENTING ---
        # quoting=csv.QUOTE_ALL : Memaksa SEMUA kolom dibungkus tanda kutip "..."
        # Ini menjamin koma di dalam teks TIDAK akan dianggap sebagai pemisah kolom baru.
        df.to_csv(
            OUTPUT_FILE, 
            sep=',',              # Separator Koma
            quotechar='"',        # Pembungkus Tanda Kutip Ganda
            quoting=csv.QUOTE_ALL, # MODE AMAN: Bungkus semua data dengan kutip
            index=False,
            encoding='utf-8'
        )
        
        print(f"‚úÖ SUKSES! File tersimpan: {OUTPUT_FILE}")
        print("   Format: \"URL\",\"Title\",\"Content\",\"Intent\" (Comma Separated)")

    except FileNotFoundError:
        print(f"‚ùå File {INPUT_FILE} tidak ditemukan.")
    except Exception as e:
        print(f"‚ùå Terjadi kesalahan: {e}")

if __name__ == "__main__":
    process_tagging()

üìÇ Membaca file input: output_dataset/combined_nutrition_dataset.csv ...
üìä Total Data: 2425 baris.
üîç Melakukan Auto-Tagging Intent...
üíæ Menyimpan ke format final (Comma Separated + Quoted)...
‚úÖ SUKSES! File tersimpan: output_dataset/tagged_combined_nutrition_dataset.csv
   Format: "URL","Title","Content","Intent" (Comma Separated)


# Preprocessing and chunking 

In [32]:
import pandas as pd
import re
from transformers import BertTokenizer
import json

In [2]:
# Load dataset
df = pd.read_csv('tagged_combined_nutrition_dataset.csv')
print(f"Total dokumen: {len(df)}")
print(f"Kolom dataset: {df.columns.tolist()}")

Total dokumen: 2390
Kolom dataset: ['URL', 'Judul', 'Konten']


In [3]:
# Load IndoBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')

# Fungsi preprocessing
def preprocess_text(text):
    """Normalisasi teks"""
    if pd.isna(text):
        return ""
    
    text = text.lower()
    text = re.sub(r'[^\w\s.,!?%-]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    return text

# Fungsi chunking dengan sliding window
def chunk_text_with_overlap(text, max_tokens=384, overlap=50):
    """
    Chunking dengan overlap sliding window
    max_tokens: 512 - 128 (buffer untuk question) = 384
    overlap: 50 tokens
    """
    chunks = []
    tokens = tokenizer.tokenize(text)
    
    if len(tokens) <= max_tokens:
        return [text]
    
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
        
        if end == len(tokens):
            break
        
        start = end - overlap
    
    return chunks

In [4]:
# Proses semua dokumen
processed_data = []

for idx, row in df.iterrows():
    title = row.get('Judul', '') if 'Judul' in df.columns else ''
    content = row.get('Konten', '') if 'Konten' in df.columns else row.get('text', '')
    url = row.get('URL', '') if 'URL' in df.columns else ''
    
    full_text = f"{title}. {content}".strip()
    cleaned_text = preprocess_text(full_text)
    
    if not cleaned_text:
        continue
    
    chunks = chunk_text_with_overlap(cleaned_text)
    
    for chunk_idx, chunk in enumerate(chunks):
        processed_data.append({
            'doc_id': idx,
            'chunk_id': chunk_idx,
            'title': title,
            'text': chunk,
            'url': url,
            'token_count': len(tokenizer.tokenize(chunk))
        })
    
    if (idx + 1) % 100 == 0:
        print(f"Processed {idx + 1}/{len(df)} documents")

# Simpan hasil
df_processed = pd.DataFrame(processed_data)
df_processed.to_csv('processed_chunks.csv', index=False)

print(f"\nTotal chunks: {len(df_processed)}")
print(f"Rata-rata tokens per chunk: {df_processed['token_count'].mean():.2f}")
print(f"Max tokens: {df_processed['token_count'].max()}")
print(f"Min tokens: {df_processed['token_count'].min()}")

Processed 100/2390 documents
Processed 200/2390 documents
Processed 300/2390 documents
Processed 400/2390 documents
Processed 500/2390 documents
Processed 600/2390 documents
Processed 700/2390 documents
Processed 800/2390 documents
Processed 900/2390 documents
Processed 1000/2390 documents
Processed 1100/2390 documents
Processed 1200/2390 documents
Processed 1300/2390 documents
Processed 1400/2390 documents
Processed 1500/2390 documents
Processed 1600/2390 documents
Processed 1700/2390 documents
Processed 1800/2390 documents
Processed 1900/2390 documents
Processed 2000/2390 documents
Processed 2100/2390 documents
Processed 2200/2390 documents
Processed 2300/2390 documents

Total chunks: 7661
Rata-rata tokens per chunk: 334.99
Max tokens: 388
Min tokens: 33


In [5]:
# Simpan corpus untuk MLM
all_texts = df_processed['text'].tolist()
with open('corpus_for_mlm.json', 'w', encoding='utf-8') as f:
    json.dump(all_texts, f, ensure_ascii=False, indent=2)

print("\nFile tersimpan:")
print("- processed_chunks.csv")
print("- corpus_for_mlm.json")


File tersimpan:
- processed_chunks.csv
- corpus_for_mlm.json


# Domain ~~EXPANSION~~ Adaptation

In [1]:
import json
import torch
import re
from transformers import (
    BertTokenizer, 
    BertForMaskedLM,
    DataCollatorForWholeWordMask,
    TrainingArguments,
    Trainer
)
from torch.utils.data import Dataset
from tqdm.auto import tqdm

2026-01-22 07:01:24.452778: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load corpus
with open('corpus_for_mlm.json', 'r', encoding='utf-8') as f:
    texts = json.load(f)

print(f"Total texts: {len(texts)}")

Total texts: 7661


In [3]:
# Load model dan tokenizer
model_name = 'indobenchmark/indobert-base-p2'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

# Custom Dataset untuk MLM
class MLMDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Preprocess dengan progress bar
        print("Preprocessing texts...")
        self.encodings = []
        for text in tqdm(texts, desc="Tokenizing"):
            encoding = self.tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding='max_length',
                return_tensors='pt'
            )
            self.encodings.append({
                'input_ids': encoding['input_ids'].squeeze(),
                'attention_mask': encoding['attention_mask'].squeeze()
            })
    
    def __len__(self):
        return len(self.encodings)
    
    def __getitem__(self, idx):
        return self.encodings[idx]

Some weights of BertForMaskedLM were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Custom Data Collator untuk WWM tanpa masking angka & satuan
class CustomWWMDataCollator(DataCollatorForWholeWordMask):
    def __init__(self, tokenizer, mlm_probability=0.15):
        super().__init__(
            tokenizer=tokenizer,
            mlm=True,
            mlm_probability=mlm_probability
        )
    
    def torch_mask_tokens(self, inputs, special_tokens_mask=None):
        """Override untuk hindari masking angka & satuan"""
        labels = inputs.clone()
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
                for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()
        
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        
        # Hindari masking angka & satuan
        for i, input_ids in enumerate(inputs):
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
            for j, token in enumerate(tokens):
                # Skip jika token mengandung angka atau satuan umum
                if re.search(r'\d', token) or token in ['mg', 'gram', 'kg', 'ml', 'kkal', 'kalori', '%']:
                    probability_matrix[i, j] = 0.0
        
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100
        
        # 80% replaced with [MASK]
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
        
        # 10% replaced with random token
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]
        
        # 10% unchanged
        return inputs, labels

In [5]:
# Buat dataset
print("\nMempersiapkan dataset...")
train_dataset = MLMDataset(texts, tokenizer)

# Data collator dengan WWM
data_collator = CustomWWMDataCollator(
    tokenizer=tokenizer,
    mlm_probability=0.15
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./indobert-gizi-mlm',
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    disable_tqdm=False,
    report_to='none',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)


Mempersiapkan dataset...
Preprocessing texts...


Tokenizing:   0%|          | 0/7661 [00:00<?, ?it/s]



In [6]:
# Training
print("\n" + "="*50)
print("Memulai Domain Adaptation...")
print("="*50)
trainer.train()

# Simpan model
print("\nMenyimpan model...")
model.save_pretrained('./indobert-gizi-mlm-final')
tokenizer.save_pretrained('./indobert-gizi-mlm-final')

print("\n" + "="*50)
print("Domain Adaptation selesai!")
print("Model tersimpan di: ./indobert-gizi-mlm-final")
print("="*50)


Memulai Domain Adaptation...


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
100,5.7468
200,4.1589
300,3.6366
400,3.3511
500,3.2187
600,2.9721
700,2.903
800,2.7924
900,2.7614
1000,2.6742



Menyimpan model...

Domain Adaptation selesai!
Model tersimpan di: ./indobert-gizi-mlm-final
