# Import

In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Preprocessing function

In [6]:
class TextPreprocessor:
    def __init__(self):
        # Inisialisasi Stemmer dan Stopword Remover
        self.stemmer = StemmerFactory().create_stemmer()
        self.stopword_remover = StopWordRemoverFactory().create_stop_word_remover()
        
        # Stopwords tambahan khusus domain nutrisi
        self.custom_stopwords = set([
            'sub', 'topik', 'embed', 'health', 'tool', 'bmi',
            'hellosehat', 'com', 'https', 'www'
        ])
        
    def case_folding(self, text):
        """Convert text to lowercase"""
        return text.lower()
    
    def remove_noise(self, text):
        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    
    def tokenize(self, text):
        return word_tokenize(text)
    
    def remove_stopwords(self, text):
        text = self.stopword_remover.remove(text)
        tokens = text.split()
        tokens = [t for t in tokens if t not in self.custom_stopwords]
        return ' '.join(tokens)
    
    def stem_text(self, text):
        return self.stemmer.stem(text)
    
    def preprocess(self, text):
        text = self.case_folding(text)
        text = self.remove_noise(text)
        text = self.remove_stopwords(text)
        text = self.stem_text(text)
        return text

preprocessor = TextPreprocessor()

# Preprocessing Data

In [7]:
df1 = pd.read_csv('hellosehat_dataset_10.csv', sep=';', quotechar='"', encoding='utf-8')

print(f"Jumlah artikel: {len(df1)}")
print(f"Kolom: {df1.columns.tolist()}")
print(f"\nContoh data:")
print(df1.head(2))

Jumlah artikel: 614
Kolom: ['URL', 'Judul', 'Konten']

Contoh data:
                                                 URL  \
0  https://hellosehat.com/nutrisi/berat-badan-tur...   
1  https://hellosehat.com/nutrisi/tips-makan-seha...   

                                               Judul  \
0  Kenali 9 Penyebab Perut Buncit dan Cara Mengat...   
1    8 Merk Oven Gas Terbaik, Cocok untuk Bisnis Kue   

                                              Konten  
0  Perut buncit memang mampu memengaruhi penampil...  
1  Bagi Anda yang gemar bikin kue, oven gas menja...  


In [8]:
df2 = pd.read_csv('alodokter_dataset_10.csv', sep=';', quotechar='"', encoding='utf-8')

print(f"Jumlah artikel: {len(df2)}")
print(f"Kolom: {df2.columns.tolist()}")
print(f"\nContoh data:")
print(df2.head(2))

Jumlah artikel: 931
Kolom: ['URL', 'Judul', 'Konten']

Contoh data:
                                                 URL  \
0  https://www.alodokter.com/6-makanan-dengan-ind...   
1  https://www.alodokter.com/cayenne-pepper-kenal...   

                                               Judul  \
0  6 Makanan dengan Indeks Glikemik Tinggi yang P...   
1  Cayenne Pepper, Kenali Manfaat di Balik Rasa P...   

                                              Konten  
0  Penderita diabetes sebaiknya membatasi konsums...  
1  Cayenne pepper adalah salah satu jenis cabai t...  


In [9]:
df_combine = pd.concat([df1, df2], ignore_index=True)
print(f"Total dokumen setelah concat: {len(df_combine)}")
print(f"Index range: {df_combine.index.min()} - {df_combine.index.max()}")
print(f"Kolom: {df_combine.columns.tolist()}")

Total dokumen setelah concat: 1545
Index range: 0 - 1544
Kolom: ['URL', 'Judul', 'Konten']


# Saving Files (skip if this step file exist)

In [13]:
# Preprocessing untuk judul dan konten
df_combine['judul_clean'] = df_combine['Judul'].apply(preprocessor.preprocess)
df_combine['konten_clean'] = df_combine['Konten'].apply(preprocessor.preprocess)

# Gabungkan judul dan konten untuk representasi dokumen
df_combine['document'] = df_combine['judul_clean'] + ' ' + df_combine['konten_clean']
# Simpan semua hasil preprocessing
df_combine[['judul_clean', 'konten_clean', 'document']].to_csv('preprocessing_results.csv', index=False, encoding='utf-8')

print("Preprocessing selesai")
print(f"\nContoh hasil preprocessing:")
print(f"Original: {df_combine['Judul'].iloc[0][:100]}...")
print(f"Cleaned: {df_combine['judul_clean'].iloc[0][:100]}...")

Preprocessing selesai

Contoh hasil preprocessing:
Original: Kenali 9 Penyebab Perut Buncit dan Cara Mengatasinya...
Cleaned: kenal sebab perut buncit cara atas...


# Test Query

In [10]:
df_prepro = pd.read_csv('preprocessing_results.csv', encoding='utf-8')

print(f"Kolom: {df_prepro.columns.tolist()}")

print("File preprocessing")
print(f"Index range: {df_prepro.index.min()} - {df_prepro.index.max()}")
print(f"Jumlah artikel: {len(df_prepro)}")

Kolom: ['judul_clean', 'konten_clean', 'document']
File preprocessing
Index range: 0 - 1544
Jumlah artikel: 1545


In [11]:
HEALTH_GOALS = {
    'menurunkan_berat_badan': {
        'keywords': ['kurus', 'langsing', 'diet', 'turun berat', 'menurunkan berat badan', 
                    'obesitas', 'gemuk', 'lemak'],
        'related_concepts': ['kalori', 'olahraga', 'metabolisme', 'lemak', 'berat badan']
    },
    'kesehatan_anak': {
        'keywords': ['anak', 'balita', 'tumbuh kembang', 'pertumbuhan anak', 'nutrisi anak',
                    'obesitas anak', 'gizi anak'],
        'related_concepts': ['anak', 'pertumbuhan', 'obesitas', 'gizi', 'vitamin']
    },
    'detoksifikasi': {
        'keywords': ['detoks', 'detoksifikasi', 'racun', 'membersihkan tubuh', 'juice cleanse'],
        'related_concepts': ['jus', 'detoks', 'racun', 'enzim', 'vitamin']
    },
    'diet_khusus': {
        'keywords': ['gluten free', 'bebas gluten', 'celiac', 'alergi', 'sensitivitas'],
        'related_concepts': ['gluten', 'celiac', 'tepung', 'alergi', 'protein']
    },
    'memasak_sehat': {
        'keywords': ['resep', 'memasak', 'cara masak', 'teknik memasak', 'blanching',
                    'olahan', 'makanan sehat'],
        'related_concepts': ['masak', 'teknik', 'nutrisi', 'sayur', 'vitamin']
    }
}


In [12]:
class QueryProcessor:
    def __init__(self, health_goals):
        self.health_goals = health_goals
        self.preprocessor = TextPreprocessor()
        
    def extract_intent(self, query):
        """Ekstrak tujuan kesehatan dari query"""
        query_clean = self.preprocessor.preprocess(query)
        query_lower = query.lower()
        
        # Hitung skor untuk setiap tujuan kesehatan
        scores = {}
        for goal, info in self.health_goals.items():
            score = 0
            # Cek keyword match
            for keyword in info['keywords']:
                if keyword in query_lower:
                    score += 2  # Bobot lebih tinggi untuk keyword match
            # Cek related concept match
            for concept in info['related_concepts']:
                if concept in query_clean:
                    score += 1
            scores[goal] = score
        
        # Ambil tujuan dengan skor tertinggi
        if max(scores.values()) > 0:
            best_goal = max(scores, key=scores.get)
            return best_goal, scores[best_goal]
        return None, 0
    
    def expand_query(self, query, intent):
        """Expand query dengan konsep terkait"""
        if intent and intent in self.health_goals:
            related_concepts = self.health_goals[intent]['related_concepts']
            expanded = query + ' ' + ' '.join(related_concepts)
            return self.preprocessor.preprocess(expanded)
        return self.preprocessor.preprocess(query)

query_processor = QueryProcessor(HEALTH_GOALS)


In [13]:
# Test intent extraction
test_queries = [
    "Bagaimana cara menurunkan berat badan dengan sehat?",
    "Resep makanan untuk anak obesitas",
    "Apakah cold pressed juice bagus untuk detoks?",
    "Makanan bebas gluten untuk penderita celiac",
    "Cara memasak sayuran agar nutrisinya tidak hilang"
]

print("\nContoh Intent Extraction:")
for query in test_queries:
    intent, score = query_processor.extract_intent(query)
    expanded = query_processor.expand_query(query, intent)
    print(f"\nQuery: {query}")
    print(f"Intent: {intent} (score: {score})")
    print(f"Expanded: {expanded[:80]}...")


Contoh Intent Extraction:

Query: Bagaimana cara menurunkan berat badan dengan sehat?
Intent: menurunkan_berat_badan (score: 3)
Expanded: bagaimana cara turun berat badan sehat kalori olahraga metabolisme lemak berat b...

Query: Resep makanan untuk anak obesitas
Intent: kesehatan_anak (score: 4)
Expanded: resep makan anak obesitas anak tumbuh obesitas gizi vitamin...

Query: Apakah cold pressed juice bagus untuk detoks?
Intent: detoksifikasi (score: 3)
Expanded: cold pressed juice bagus detoks jus detoks racun enzim vitamin...

Query: Makanan bebas gluten untuk penderita celiac
Intent: diet_khusus (score: 6)
Expanded: makan bebas gluten derita celiac gluten celiac tepung alergi protein...

Query: Cara memasak sayuran agar nutrisinya tidak hilang
Intent: memasak_sehat (score: 5)
Expanded: cara masak sayur nutrisi hilang masak teknik nutrisi sayur vitamin...


In [14]:
# TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=500,  # Batasi jumlah fitur
    min_df=1,  # Minimal muncul di 1 dokumen
    max_df=0.8,  # Maksimal muncul di 80% dokumen
    ngram_range=(1, 2),  # Unigram dan bigram
    sublinear_tf=True  # Gunakan log scaling untuk TF
)

In [15]:
# Fit dan transform dokumen
tfidf_matrix = tfidf_vectorizer.fit_transform(df_prepro['document'])

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"  - Jumlah dokumen: {tfidf_matrix.shape[0]}")
print(f"  - Jumlah fitur (terms): {tfidf_matrix.shape[1]}")

# Top terms berdasarkan TF-IDF
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1
top_terms_idx = tfidf_scores.argsort()[-20:][::-1]
top_terms = [(feature_names[i], tfidf_scores[i]) for i in top_terms_idx]

print(f"\nTop 20 terms berdasarkan TF-IDF score:")
for i, (term, score) in enumerate(top_terms, 1):
    print(f"{i:2d}. {term:20s} : {score:.2f}")

TF-IDF matrix shape: (1545, 500)
  - Jumlah dokumen: 1545
  - Jumlah fitur (terms): 500

Top 20 terms berdasarkan TF-IDF score:
 1. manfaat              : 91.91
 2. buah                 : 79.87
 3. vitamin              : 76.41
 4. darah                : 72.59
 5. lemak                : 71.17
 6. tinggi               : 71.08
 7. hari                 : 70.92
 8. sakit                : 70.61
 9. jaga                 : 70.57
10. nutrisi              : 70.13
11. bantu                : 70.09
12. turun                : 69.70
13. buat                 : 69.12
14. berat                : 68.60
15. kurang               : 68.41
16. badan                : 67.73
17. gula                 : 67.22
18. dalam                : 66.95
19. berat badan          : 66.53
20. anda                 : 65.69


In [16]:
class RetrievalEngine:
    def __init__(self, tfidf_vectorizer, tfidf_matrix, df_combine, query_processor):
        self.vectorizer = tfidf_vectorizer
        self.tfidf_matrix = tfidf_matrix
        self.df_combine = df_combine
        self.query_processor = query_processor
        
    def search(self, query, top_k=5, use_intent=True):
        """Search dokumen berdasarkan query"""
        intent, intent_score = self.query_processor.extract_intent(query)
        
        if use_intent and intent:
            query_processed = self.query_processor.expand_query(query, intent)
        else:
            query_processed = self.query_processor.preprocessor.preprocess(query)
        
        query_vector = self.vectorizer.transform([query_processed])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        top_indices = similarities.argsort()[-top_k:][::-1]
        
        results = []
        for rank, idx in enumerate(top_indices, 1):  # ← PERBAIKAN: enumerate dengan start=1
            results.append({
                'rank': rank,
                'doc_id': int(idx),  # ← KUNCI: gunakan idx langsung (bukan df.index[idx])
                'title': self.df_combine.iloc[idx]['Judul'],
                'url': self.df_combine.iloc[idx]['URL'],
                'similarity_score': float(similarities[idx]),
                'snippet': self.df_combine.iloc[idx]['Konten'][:200] + '...'
            })
        
        return {
            'query': query,
            'intent': intent,
            'intent_score': intent_score,
            'query_processed': query_processed,
            'results': results,
            # 'snippet': snippet,
        }

# Inisialisasi ulang retrieval engine
retrieval_engine = RetrievalEngine(tfidf_vectorizer, tfidf_matrix, df_combine, query_processor)


In [17]:
test_query = "cara menurunkan berat badan"
result = retrieval_engine.search(test_query, top_k=5)

print(f"\nQuery: {test_query}")
print(f"Intent: {result['intent']} (score: {result['intent_score']})")
print(f"Query processed: {result['query_processed'][:100]}...")
print(f"\nTop 5 Results:")
for r in result['results']:
    print(f"{r['rank']}. [ID:{r['doc_id']}] {r['title'][:60]}...")
    print(f"   Score: {r['similarity_score']:.4f}")
    print(f"   Snippet: {r['snippet']}")


Query: cara menurunkan berat badan
Intent: menurunkan_berat_badan (score: 3)
Query processed: cara turun berat badan kalori olahraga metabolisme lemak berat badan...

Top 5 Results:
1. [ID:281] Berapa Kg Berat Badan yang Boleh Turun dalam Seminggu?...
   Score: 0.4971
   Snippet: Belakangan ini, banyak metode diet bermunculan dengan klaim dapat menurunkan berat badan dalam seminggu. Selain mengurangi asupan makan dan memperbanyak olahraga, diet tersebut menganjurkan penggunaan...
2. [ID:312] Bagaimana Penurunan Berat Badan yang Baik?...
   Score: 0.4322
   Snippet: Banyak yang mengira menurunkan berat badan merupakan pertanda lemak tubuh hilang atau berkurang. Faktanya tidak demikian. Bbanyak hal yang memengaruhi penurunan berat badan. Simak penjelasan kaitan be...
3. [ID:1202] Cara Menurunkan Berat Badan Dalam 1 Minggu Secara Alami...
   Score: 0.4224
   Snippet: Meski terkesan memaksa, menurunkan berat badan dalam 1 minggu secara alami sangat mungkin terjadi. Hal ini bahkan sudah di

# Evaluation for Queries

EVALUATION_QUERIES = [
    {
        'query': 'cara menurunkan berat badan dengan jus',
        'relevant_docs': [0, 2, 3, 4, 5],
        'intent': 'menurunkan_berat_badan'
    },
    {
        'query': 'obesitas pada anak dan cara mengatasinya',
        'relevant_docs': [0, 5, 10, 11, 12],
        'intent': 'kesehatan_anak'
    },
    {
        'query': 'teknik memasak yang mempertahankan nutrisi',
        'relevant_docs': [2, 5, 6, 7, 8],
        'intent': 'memasak_sehat'
    },
    {
        'query': 'resep makanan tanpa gluten',
        'relevant_docs': [0, 5, 14, 16, 22],
        'intent': 'diet_khusus'
    },
    {
        'query': 'diet detoks dengan cold pressed juice',
        'relevant_docs': [0, 3, 4, 7, 12],
        'intent': 'detoksifikasi'
    },
    {
        'query': 'makanan sehat untuk anak gemuk',
        'relevant_docs': [0, 1, 2, 3, 4],
        'intent': 'kesehatan_anak'
    },
    
]


In [23]:
# Fungsi bantuan untuk membuat Ground Truth secara otomatis
# Mencari dokumen yang mengandung kata-kata dari query
def generate_ground_truth(queries, df, preprocessor):
    updated_queries = []
    
    print("Generating Ground Truth (mencari ID dokumen yang relevan)...")
    
    for item in queries:
        query_text = item['query']
        # Preprocess query untuk mendapatkan kata dasar
        clean_query = preprocessor.preprocess(query_text)
        query_terms = clean_query.split()
        
        relevant_ids = []
        
        # Scan semua dokumen
        for idx, row in df.iterrows():
            # Gabung judul dan konten clean untuk pengecekan
            doc_content = (str(row['judul_clean']) + " " + str(row['konten_clean']))
            
            # Cek apakah SEMUA atau SEBAGIAN BESAR kata kunci muncul di dokumen
            # Kita pakai logika: Minimal 75% kata kunci harus ada
            matches = sum(1 for term in query_terms if term in doc_content)
            threshold = len(query_terms) * 0.75 
            
            if matches >= threshold:
                relevant_ids.append(idx)
        
        # Update item dengan relevant_docs yang baru ditemukan
        item['relevant_docs'] = relevant_ids
        updated_queries.append(item)
        print(f"Query: '{query_text}' -> Found {len(relevant_ids)} relevant docs.")
        
    return updated_queries

# Definisi Query Awal
RAW_QUERIES = [
    {
        'query': 'cara menurunkan berat badan dengan jus',
        'intent': 'menurunkan_berat_badan'
    },
    {
        'query': 'obesitas pada anak dan cara mengatasinya',
        'intent': 'kesehatan_anak'
    },
    {
        'query': 'teknik memasak yang mempertahankan nutrisi',
        'intent': 'memasak_sehat'
    },
    {
        'query': 'resep makanan tanpa gluten',
        'intent': 'diet_khusus'
    },
    {
        'query': 'diet detoks dengan cold pressed juice',
        'intent': 'detoksifikasi'
    },
    {
        'query': 'makanan sehat untuk anak gemuk',
        'intent': 'kesehatan_anak'
    }
]

# Generate Ground Truth yang Valid menggunakan df_prepro (hasil preprocessing)
# Pastikan df_prepro sudah ter-load (dari Cell 14)
EVALUATION_QUERIES = generate_ground_truth(RAW_QUERIES, df_prepro, preprocessor)

Generating Ground Truth (mencari ID dokumen yang relevan)...
Query: 'cara menurunkan berat badan dengan jus' -> Found 650 relevant docs.
Query: 'obesitas pada anak dan cara mengatasinya' -> Found 319 relevant docs.
Query: 'teknik memasak yang mempertahankan nutrisi' -> Found 170 relevant docs.
Query: 'resep makanan tanpa gluten' -> Found 10 relevant docs.
Query: 'diet detoks dengan cold pressed juice' -> Found 3 relevant docs.
Query: 'makanan sehat untuk anak gemuk' -> Found 379 relevant docs.


In [24]:
print("Verifikasi EVALUATION_QUERIES:")
for i, eq in enumerate(EVALUATION_QUERIES, 1):
    print(f"\n{i}. Query: {eq['query']}")
    print(f"   Relevant docs: {eq['relevant_docs']}")
    
    for doc_id in eq['relevant_docs']:
        if doc_id < len(df_combine):
            print(f"   ✓ Doc {doc_id}: {df_combine.iloc[doc_id]['Judul'][:60]}...")
        else:
            print(f"   ✗ Doc {doc_id}: INDEX OUT OF RANGE! (max: {len(df)-1})")

Verifikasi EVALUATION_QUERIES:

1. Query: cara menurunkan berat badan dengan jus
   Relevant docs: [0, 4, 5, 7, 9, 11, 12, 13, 14, 15, 16, 23, 24, 25, 26, 28, 29, 30, 31, 34, 36, 39, 42, 44, 45, 46, 50, 53, 56, 59, 60, 63, 64, 65, 67, 68, 69, 73, 74, 77, 79, 80, 81, 82, 83, 84, 86, 87, 88, 91, 92, 93, 96, 97, 98, 99, 100, 104, 105, 106, 108, 109, 111, 112, 113, 118, 119, 120, 122, 123, 124, 125, 127, 128, 129, 131, 132, 133, 136, 138, 139, 140, 141, 144, 145, 149, 150, 152, 153, 155, 157, 158, 160, 162, 164, 166, 167, 171, 174, 175, 177, 178, 180, 187, 188, 189, 191, 192, 194, 195, 197, 199, 201, 203, 204, 206, 209, 213, 218, 219, 224, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 241, 242, 244, 245, 246, 248, 250, 251, 254, 257, 260, 261, 266, 268, 269, 271, 272, 273, 276, 277, 279, 281, 283, 286, 287, 288, 295, 296, 297, 303, 304, 306, 309, 310, 312, 315, 317, 318, 319, 320, 321, 322, 323, 325, 326, 327, 330, 339, 341, 342, 344, 345, 346, 348, 349, 350, 351, 355, 356, 357, 359, 3

In [25]:
class Evaluator:
    def __init__(self, retrieval_engine):
        self.engine = retrieval_engine
        
    def precision_at_k(self, retrieved_docs, relevant_docs, k):
        """Hitung Precision@K"""
        retrieved_k = retrieved_docs[:k]
        relevant_retrieved = len(set(retrieved_k) & set(relevant_docs))
        return relevant_retrieved / k if k > 0 else 0
    
    def recall_at_k(self, retrieved_docs, relevant_docs, k):
        """Hitung Recall@K"""
        retrieved_k = retrieved_docs[:k]
        relevant_retrieved = len(set(retrieved_k) & set(relevant_docs))
        return relevant_retrieved / len(relevant_docs) if len(relevant_docs) > 0 else 0
    
    def f1_score(self, precision, recall):
        """Hitung F1-Score"""
        if precision + recall == 0:
            return 0
        return 2 * (precision * recall) / (precision + recall)
    
    def average_precision(self, retrieved_docs, relevant_docs):
        """Hitung Average Precision"""
        if len(relevant_docs) == 0:
            return 0
        
        precision_sum = 0
        relevant_count = 0
        
        for k, doc_id in enumerate(retrieved_docs, 1):
            if doc_id in relevant_docs:
                relevant_count += 1
                precision_sum += relevant_count / k
        
        return precision_sum / len(relevant_docs)
    
    def evaluate(self, eval_queries, k_values=[1, 3, 5]):
        """Evaluasi sistem dengan multiple queries"""
        results = {
            'queries': [],
            'precision': {f'P@{k}': [] for k in k_values},  
            'recall': {f'R@{k}': [] for k in k_values},    
            'f1': {f'F1@{k}': [] for k in k_values},       
            'ap': []                                       
        }
        
        for eval_query in eval_queries:
            query = eval_query['query']
            relevant_docs = eval_query['relevant_docs']
            
            # Retrieve dokumen
            search_result = self.engine.search(query, top_k=max(k_values))
            retrieved_docs = [int(r['doc_id']) for r in search_result['results']]
            
            # Hitung metrik untuk setiap k
            query_metrics = {'query': query}
            
            for k in k_values:
                p_at_k = self.precision_at_k(retrieved_docs, relevant_docs, k)
                r_at_k = self.recall_at_k(retrieved_docs, relevant_docs, k)
                f1 = self.f1_score(p_at_k, r_at_k)
                
                results['precision'][f'P@{k}'].append(p_at_k)
                results['recall'][f'R@{k}'].append(r_at_k)
                results['f1'][f'F1@{k}'].append(f1)
                
                query_metrics[f'P@{k}'] = p_at_k
                query_metrics[f'R@{k}'] = r_at_k
                query_metrics[f'F1@{k}'] = f1
            
            # Average Precision
            ap = self.average_precision(retrieved_docs, relevant_docs)
            results['ap'].append(ap)
            query_metrics['AP'] = ap
            
            results['queries'].append(query_metrics)
        
        # Hitung rata-rata (lebih simple: 'mean' bukan 'mean_metrics')
        results['mean'] = {
            **{k: np.mean(v) for k, v in results['precision'].items()},
            **{k: np.mean(v) for k, v in results['recall'].items()},
            **{k: np.mean(v) for k, v in results['f1'].items()},
            'MAP': np.mean(results['ap'])
        }
        
        return results

# Inisialisasi
evaluator = Evaluator(retrieval_engine)
eval_results = evaluator.evaluate(EVALUATION_QUERIES, k_values=[1, 3, 5])

In [32]:
# INI SUDAH BENAR! ✅
eval_results = evaluator.evaluate(EVALUATION_QUERIES, k_values=[1, 3, 5])

print("\nMetrik Rata-rata:")
for metric, value in eval_results['mean'].items():
    print(f"{metric:10s}: {value:.4f}")

print("\nDetail per Query:")
for i, q_res in enumerate(eval_results['queries'][:5]):
    print(f"\nQuery: {q_res['query']}")
    print(f"  P@5 : {q_res['P@5']:.4f}")
    print(f"  AP  : {q_res['AP']:.4f}")


Metrik Rata-rata:
P@1       : 0.6667
P@3       : 0.6667
P@5       : 0.6000
R@1       : 0.0022
R@3       : 0.0612
R@5       : 0.0643
F1@1      : 0.0044
F1@3      : 0.0667
F1@5      : 0.0589
MAP       : 0.0265

Detail per Query:

Query: cara menurunkan berat badan dengan jus
  P@5 : 0.8000
  AP  : 0.0058

Query: obesitas pada anak dan cara mengatasinya
  P@5 : 1.0000
  AP  : 0.0157

Query: teknik memasak yang mempertahankan nutrisi
  P@5 : 0.6000
  AP  : 0.0133

Query: resep makanan tanpa gluten
  P@5 : 0.0000
  AP  : 0.0000

Query: diet detoks dengan cold pressed juice
  P@5 : 0.2000
  AP  : 0.1111
