# Hoax Detection System - Complete Notebook

This notebook contains a complete hoax detection system that includes:
1. **Data Scraping** - Collecting news articles from various sources
2. **Data Preprocessing** - Cleaning and normalizing text data
3. **Model Training** - Training machine learning models for hoax detection
4. **Prediction** - Using trained models to predict whether news is hoax or valid

## Overview
- **Sources**: Kompas.com (valid news) and TurnBackHoax.id (hoax articles)
- **Models**: Naive Bayes and Random Forest classifiers
- **Features**: TF-IDF vectorization with text preprocessing
- **Language**: Indonesian text processing using Sastrawi and NLTK

## 1. Import Required Libraries

First, let's import all the necessary libraries for our hoax detection system.

In [None]:
# Web scraping and data handling
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import sys
import re
import string
import joblib
from typing import List, Dict
from multiprocessing import Pool

# Natural Language Processing
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
import nltk

# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download required NLTK data
nltk.download('stopwords', quiet=True)

print("All libraries imported successfully!")

## 2. Configuration and Constants

Define constants and configuration for our scraping and processing pipeline.

In [None]:
# Web scraping configuration
# Sent with every HTTP request made by the scrapers (browser-like User-Agent).
HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36")
}

# Kompas scraping config
# Page 1 is INDEX_URL itself; later pages are built with PAGE_URL.format(page).
INDEX_URL = "https://indeks.kompas.com/"
PAGE_URL = INDEX_URL + "?page={}"
MAX_PAGES = 10  # Default page count for scrape_kompas_index (reduced for demo purposes)

# TurnBackHoax scraping config
# Listing pages are addressed as TURNBACK_BASE_URL + "<page>/".
TURNBACK_BASE_URL = "https://turnbackhoax.id/page/"
TURNBACK_MAX_PAGES = 10  # Default page count for scrape_turnbackhoax (reduced for demo purposes)

# Classification tags
# Both tuples are matched as substrings of the upper-cased article title by
# make_label, so every tag must be written in upper case.
HOAX_TAGS = ("[SALAH]", "[PENIPUAN]", "[FITNAH]", "[DISINFORMASI]", "[HOAX]")
NON_HOAX_TAGS = ("[VALID]", "[BENAR]", "[FAKTA]", "[KLARIFIKASI]")

# Directory paths (relative to the current working directory)
DATA_DIR = "data"              # raw scraped CSV files
CLEAN_DATA_DIR = "data_clean"  # CSV files with the cleaned text column added
MODEL_DIR = "model"            # serialized vectorizer and classifiers

# Create directories if they don't exist
for directory in [DATA_DIR, CLEAN_DATA_DIR, MODEL_DIR]:
    os.makedirs(directory, exist_ok=True)

print("Configuration set up successfully!")

## 3. Data Scraping Functions

### 3.1 Kompas.com Scraper (Valid News)

In [None]:
def scrape_kompas_index(pages: int = MAX_PAGES) -> List[Dict]:
    """Scrape article listings from the Kompas.com index pages.

    Every article found is treated as legitimate news: its title is prefixed
    with ``[VALID] `` so that the labelling step can recognise it later.

    Args:
        pages: Number of index pages to visit, starting from page 1.

    Returns:
        One dict per article with the keys ``Judul``, ``Link``, ``Tanggal``,
        ``Author`` and ``Isi Ringkas``. Pages that fail to download and
        listing entries without a usable link are skipped.
    """
    records: List[Dict] = []

    for page in range(1, pages + 1):
        url = INDEX_URL if page == 1 else PAGE_URL.format(page)
        print(f"Scraping Kompas page {page}: {url}")

        try:
            res = requests.get(url, headers=HEADERS, timeout=30)
            # Treat HTTP error statuses as failures instead of parsing an
            # error page as if it were an article listing.
            res.raise_for_status()
        except Exception as e:
            print(f"Failed to fetch {url}: {e}")
            continue

        soup = BeautifulSoup(res.text, "html.parser")

        # Two listing layouts are supported; fall back to the second one
        # when the first selector matches nothing.
        items = soup.select("div.article__list article")
        if not items:
            items = soup.select("div.articleList.-list div.articleItem")

        for art in items:
            title_tag = art.select_one("h3.article__title a, h2.articleTitle")
            if title_tag is None:
                continue

            # The title element is either the anchor itself or a heading
            # whose link lives in a sibling <a class="article-link">.
            link_tag = title_tag if title_tag.name == "a" else art.select_one("a.article-link")
            link = link_tag.get("href") if link_tag is not None else None
            if not link:
                # Without a link there is nothing to fetch the excerpt from.
                continue

            # Add [VALID] tag to mark as legitimate news
            title = "[VALID] " + title_tag.get_text(strip=True)

            # Extract date and author if available
            date_tag = art.select_one("div.article__date, div.articlePost-date")
            author_tag = art.select_one("div.article__author")
            tanggal = date_tag.get_text(strip=True) if date_tag else ""
            author = author_tag.get_text(strip=True) if author_tag else ""

            # One extra HTTP request per article to read its first paragraph.
            excerpt = get_article_excerpt(link)
            records.append({
                "Judul": title,
                "Link": link,
                "Tanggal": tanggal,
                "Author": author,
                "Isi Ringkas": excerpt
            })

    return records

def get_article_excerpt(url: str) -> str:
    """Return the text of the first paragraph of the article at *url*.

    The paragraph is looked up with the ``div.read__content p`` selector.
    This is a best-effort helper: any failure (network error, HTTP error
    status, parsing problem) or a page without such a paragraph yields an
    empty string instead of an exception.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")
        first_paragraph = page.select_one("div.read__content p")
        if first_paragraph:
            return first_paragraph.get_text(strip=True)
        return ""
    except Exception:
        return ""

print("Kompas scraper functions defined!")

### 3.2 TurnBackHoax.id Scraper (Hoax Articles)

In [None]:
def scrape_turnbackhoax(max_pages: int = TURNBACK_MAX_PAGES) -> List[Dict]:
    """Collect article listings from the TurnBackHoax.id archive pages.

    Args:
        max_pages: Number of listing pages to visit, starting from page 1.

    Returns:
        One dict per listed article with the keys ``Judul``, ``Link``,
        ``Tanggal``, ``Author`` and ``Isi Ringkas``. A field whose element is
        missing from the listing is stored as an empty string. Pages that
        cannot be downloaded are reported and skipped.
    """
    def _stripped_text(tag) -> str:
        # Text content of an optional element, or "" when it is absent.
        return tag.text.strip() if tag else ""

    collected = []

    for page_number in range(1, max_pages + 1):
        page_url = f"{TURNBACK_BASE_URL}{page_number}/"
        print(f"Scraping TurnBackHoax page {page_number}: {page_url}")

        try:
            response = requests.get(page_url, headers=HEADERS, timeout=30)
            response.raise_for_status()
        except Exception as error:
            print(f"Failed to fetch page {page_number}: {error}")
            continue

        listing = BeautifulSoup(response.text, "html.parser")

        for entry in listing.select("article.mh-loop-item"):
            heading_link = entry.select_one("h3.entry-title a")
            collected.append({
                "Judul": _stripped_text(heading_link),
                "Link": heading_link["href"] if heading_link else "",
                "Tanggal": _stripped_text(entry.select_one("span.mh-meta-date")),
                "Author": _stripped_text(entry.select_one("span.mh-meta-author a")),
                "Isi Ringkas": _stripped_text(entry.select_one("div.mh-excerpt p"))
            })

    return collected

print("TurnBackHoax scraper function defined!")

### 3.3 Run Scrapers and Save Data

In [None]:
def save_to_csv(data: List[Dict], filename: str, directory: str = DATA_DIR):
    """Write *data* to ``directory/filename`` as CSV and return the DataFrame.

    The file is written without the index column and encoded as
    ``utf-8-sig``. The destination path and row count are printed.
    """
    frame = pd.DataFrame(data)
    destination = os.path.join(directory, filename)
    frame.to_csv(destination, index=False, encoding="utf-8-sig")
    row_count = len(frame)
    print(f"Data saved to {destination} ({row_count} rows)")
    return frame

# Scrape Kompas data (valid news)
# pages=3 overrides the MAX_PAGES default; the result is written to
# DATA_DIR/kompas.csv (save_to_csv's default directory).
print("=== Scraping Kompas.com (Valid News) ===")
kompas_data = scrape_kompas_index(pages=3)  # Reduced for demo
kompas_df = save_to_csv(kompas_data, "kompas.csv")

# Scrape TurnBackHoax data (hoax articles); written to DATA_DIR/turnbackhoax.csv
print("\n=== Scraping TurnBackHoax.id (Hoax Articles) ===")
turnback_data = scrape_turnbackhoax(max_pages=3)  # Reduced for demo
turnback_df = save_to_csv(turnback_data, "turnbackhoax.csv")

# Report how many records each scraper returned
print("\n=== Scraping Complete ===")
print(f"Kompas articles: {len(kompas_data)}")
print(f"TurnBackHoax articles: {len(turnback_data)}")
print(f"Total articles: {len(kompas_data) + len(turnback_data)}")

## 4. Data Preprocessing and Text Normalization

### 4.1 Text Cleaning Functions

In [None]:
# Initialize stemmer and stopwords
stemmer = StemmerFactory().create_stemmer()
stop_words = set(stopwords.words('indonesian'))

def clean_text(text: str) -> str:
    """Normalize a piece of Indonesian text for vectorization.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation,
    drop stopwords and tokens of two characters or fewer, stem each remaining
    token. Missing values (``None``/NaN) produce an empty string.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""

    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized = re.sub(r'\d+', '', text.lower()).translate(punctuation_remover)

    stemmed_words = []
    for word in normalized.split():
        # Skip very short tokens and stopwords before paying for stemming.
        if len(word) <= 2 or word in stop_words:
            continue
        stemmed_words.append(stemmer.stem(word))

    return ' '.join(stemmed_words)

def process_batch(df_chunk):
    """Add the cleaned-text column to one chunk of the dataset.

    ``clean_text`` is applied to every value of the ``Isi Ringkas`` column
    and the result is stored in ``Isi Ringkas Clean``. The chunk is modified
    in place and also returned.
    """
    raw_column = df_chunk["Isi Ringkas"]
    df_chunk["Isi Ringkas Clean"] = raw_column.apply(clean_text)
    return df_chunk

print("Text cleaning functions defined!")

### 4.2 Process and Clean All Data

In [None]:
def process_data_files():
    """Clean every raw CSV in ``DATA_DIR`` and write it to ``CLEAN_DATA_DIR``.

    For each ``*.csv`` file that has an ``Isi Ringkas`` column, a cleaned
    ``Isi Ringkas Clean`` column is added and the frame is saved under the
    same name with ``.csv`` replaced by ``_cleaned.csv``. Files without the
    column are reported and skipped.
    """
    csv_names = [name for name in os.listdir(DATA_DIR) if name.endswith(".csv")]

    for csv_name in csv_names:
        print(f"\nProcessing {csv_name}...")
        frame = pd.read_csv(os.path.join(DATA_DIR, csv_name))

        # Nothing to clean when the excerpt column is absent
        if "Isi Ringkas" not in frame.columns:
            print(f"Column 'Isi Ringkas' not found in {csv_name}")
            continue

        print(f"Cleaning text data for {len(frame)} articles...")
        frame["Isi Ringkas Clean"] = frame["Isi Ringkas"].apply(clean_text)

        cleaned_name = csv_name.replace(".csv", "_cleaned.csv")
        out_path = os.path.join(CLEAN_DATA_DIR, cleaned_name)
        frame.to_csv(out_path, index=False, encoding="utf-8-sig")
        print(f"Cleaned data saved to: {out_path}")

        # Preview the first cleaned excerpt, when there is one
        if len(frame):
            first_cleaned = frame['Isi Ringkas Clean'].iloc[0]
            print(f"Sample cleaned text: {first_cleaned[:100]}...")

# Process all data files
process_data_files()
print("\n=== Data preprocessing complete! ===")

## 5. Model Training

### 5.1 Data Preparation and Labeling

In [None]:
def make_label(title: str) -> int:
    """Derive the training label from the tags in an article title.

    Matching is case-insensitive and checked in this order:

    1. any tag from ``HOAX_TAGS``      -> 1 (HOAX)
    2. any tag from ``NON_HOAX_TAGS``  -> 0 (VALID)
    3. the word "kompas" in the title  -> 0 (VALID)
    4. anything else                   -> 1 (HOAX)

    Args:
        title: Article title. Missing values (``None``, NaN or any other
            non-string, as produced when an empty title is read back from
            CSV) and blank titles are accepted.

    Returns:
        1 for HOAX, 0 for VALID. Missing or blank titles fall through to the
        default and are labelled 1.
    """
    # An empty title written to CSV comes back as NaN (a float), which has no
    # .upper(); such titles carry no tag, so they get the default label.
    if not isinstance(title, str) or not title.strip():
        return 1  # HOAX (default)

    upper_title = title.upper()

    # Check for HOAX tags
    if any(tag in upper_title for tag in HOAX_TAGS):
        return 1  # HOAX

    # Check for VALID tags
    if any(tag in upper_title for tag in NON_HOAX_TAGS):
        return 0  # VALID

    # For Kompas articles without explicit tags, assume valid
    if "kompas" in upper_title.lower():
        return 0  # VALID

    # Default to HOAX for TurnBackHoax articles
    return 1  # HOAX

def load_training_data(clean_dir: str = CLEAN_DATA_DIR):
    """Load every ``*_cleaned.csv`` file in *clean_dir* as one training set.

    The files are concatenated, each row is labelled from its ``Judul``
    (title) column via ``make_label``, and rows whose cleaned text is missing
    or empty are dropped. Dataset statistics are printed.

    Args:
        clean_dir: Directory containing the cleaned CSV files.

    Returns:
        A tuple ``(texts, labels, df)``: the ``Isi Ringkas Clean`` column,
        the integer labels (0 = VALID, 1 = HOAX), and the filtered DataFrame
        they were taken from.

    Raises:
        FileNotFoundError: If *clean_dir* contains no ``*_cleaned.csv`` file.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # (and therefore any seeded train/test split built on it) reproducible.
    frames = [
        pd.read_csv(os.path.join(clean_dir, filename))
        for filename in sorted(os.listdir(clean_dir))
        if filename.endswith("_cleaned.csv")
    ]

    if not frames:
        raise FileNotFoundError(f"No cleaned CSV files found in {clean_dir}/")

    # Combine all data
    df = pd.concat(frames, ignore_index=True)

    # Create labels. Empty titles are read back from CSV as NaN, so anything
    # that is not a string is labelled as an empty title.
    df["label"] = df["Judul"].apply(
        lambda title: make_label(title if isinstance(title, str) else "")
    )

    # Remove rows with missing text or labels
    df = df.dropna(subset=["Isi Ringkas Clean", "label"])
    # astype(str) keeps the .str accessor usable even when the column was
    # parsed with a non-string dtype (e.g. every cell was empty).
    df = df[df["Isi Ringkas Clean"].astype(str).str.len() > 0]

    texts = df["Isi Ringkas Clean"]
    labels = df["label"].astype(int)

    print("Dataset Statistics:")
    print(f"Total articles: {len(df)}")
    print("Label distribution:")
    label_counts = labels.value_counts().rename({0: "VALID", 1: "HOAX"})
    print(label_counts)
    print(f"Class balance: {label_counts.min() / label_counts.max():.2f}")

    return texts, labels, df

# Load training data
texts, labels, full_df = load_training_data()
print("\nTraining data loaded successfully!")

### 5.2 Feature Extraction and Model Training

In [None]:
def train_models(texts, labels, ngram_range=(1, 2)):
    """Train and evaluate Naive Bayes and Random Forest hoax classifiers.

    The data is split 80/20 (stratified, ``random_state=42``) *before* the
    TF-IDF vectorizer is fitted, so the vocabulary and IDF weights are learnt
    from the training portion only and the reported test metrics are not
    inflated by information from the held-out texts.

    Args:
        texts: Cleaned article texts.
        labels: Integer labels aligned with *texts* (0 = VALID, 1 = HOAX).
        ngram_range: n-gram range passed to ``TfidfVectorizer``.

    Returns:
        A tuple ``(vectorizer, nb_model, rf_model)`` of the fitted TF-IDF
        vectorizer, ``MultinomialNB`` and ``RandomForestClassifier``.
    """
    print("\n=== Training Models ===")
    print(f"TF-IDF n-gram range: {ngram_range}")

    # Split the raw texts first; fitting TF-IDF on everything would leak the
    # test set's vocabulary and document frequencies into training.
    texts_train, texts_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=ngram_range,
        sublinear_tf=True,
        stop_words=None  # We already removed stopwords
    )

    # Fit on the training texts only, then reuse the fitted vocabulary
    X_train = vectorizer.fit_transform(texts_train)
    X_test = vectorizer.transform(texts_test)
    print(f"Feature matrix shape: {X_train.shape}")

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")

    # Train Naive Bayes
    print("\nTraining Naive Bayes...")
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)

    # Train Random Forest
    print("Training Random Forest...")
    rf_model = RandomForestClassifier(
        n_estimators=100,  # Reduced for faster training
        n_jobs=-1,
        random_state=42
    )
    rf_model.fit(X_train, y_train)

    # Evaluate models
    print("\n=== Model Evaluation ===")

    # Pin the label set so the report and matrix always have both classes,
    # even when one of them is absent from the test fold or the predictions.
    class_ids = [0, 1]
    class_names = ["VALID", "HOAX"]

    for name, model in [("Naive Bayes", nb_model), ("Random Forest", rf_model)]:
        print(f"\n{name} Results:")
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {accuracy:.3f}")
        print("\nClassification Report:")
        print(classification_report(
            y_test, y_pred, labels=class_ids, target_names=class_names, zero_division=0
        ))
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, y_pred, labels=class_ids))

    return vectorizer, nb_model, rf_model

# Train the models
vectorizer, nb_model, rf_model = train_models(texts, labels)
print("\n=== Model training complete! ===")

### 5.3 Save Trained Models

In [None]:
def save_models(vectorizer, nb_model, rf_model, model_dir: str = MODEL_DIR):
    """Persist the vectorizer and both classifiers into *model_dir*.

    Each object is written with ``joblib.dump`` to ``vectorizer.pkl``,
    ``nb.pkl`` and ``rf.pkl`` respectively, and every saved path is printed.
    """
    print(f"\n=== Saving Models to {model_dir}/ ===")

    # (description used in the log line, file name, object to serialize)
    artifacts = (
        ("Vectorizer", "vectorizer.pkl", vectorizer),
        ("Naive Bayes model", "nb.pkl", nb_model),
        ("Random Forest model", "rf.pkl", rf_model),
    )

    for description, file_name, artifact in artifacts:
        destination = os.path.join(model_dir, file_name)
        joblib.dump(artifact, destination)
        print(f"{description} saved: {destination}")

    print("\nAll models saved successfully!")

# Save the trained models
save_models(vectorizer, nb_model, rf_model)

## 6. Prediction System

### 6.1 Load Trained Models and Setup Predictor

In [None]:
def load_trained_models(model_dir: str = MODEL_DIR):
    """Load the saved vectorizer and classifiers from *model_dir*.

    Returns:
        A tuple ``(vectorizer, naive_bayes_model, random_forest_model)``
        read from ``vectorizer.pkl``, ``nb.pkl`` and ``rf.pkl``.

    Raises:
        FileNotFoundError: If any of the three files is missing; the message
            lists every missing path.
    """
    artifact_paths = [
        os.path.join(model_dir, file_name)
        for file_name in ("vectorizer.pkl", "nb.pkl", "rf.pkl")
    ]

    # Verify everything is present before loading anything
    absent = [path for path in artifact_paths if not os.path.exists(path)]
    if absent:
        raise FileNotFoundError(f"Missing model files: {absent}")

    print("Loading trained models...")
    # NOTE: joblib.load unpickles the files, so only load artifacts from a
    # trusted source.
    loaded_vectorizer, loaded_nb, loaded_rf = (
        joblib.load(path) for path in artifact_paths
    )

    print("Models loaded successfully!")
    return loaded_vectorizer, loaded_nb, loaded_rf

# Load the trained models
pred_vectorizer, pred_nb, pred_rf = load_trained_models()

# Label mapping
LABEL_MAP = {0: "TIDAK HOAX", 1: "HOAX"}

### 6.2 Prediction Functions

In [None]:
def _hoax_probability(model, features) -> float:
    """Return, as a percentage, the model's probability that the sample is HOAX.

    The HOAX column of ``predict_proba`` is located through ``model.classes_``
    rather than assumed to be index 1, so a model that was fitted on a single
    class does not raise ``IndexError``. If the model never saw class 1 the
    probability is 0.0.
    """
    probabilities = model.predict_proba(features)[0]
    known_classes = list(model.classes_)
    if 1 not in known_classes:
        return 0.0
    return probabilities[known_classes.index(1)] * 100


def predict_hoax(text: str, show_details: bool = True):
    """Classify *text* as hoax or not with both models and their average.

    The text is cleaned with ``clean_text``, vectorized with the loaded
    TF-IDF vectorizer and scored by the Naive Bayes and Random Forest models.
    The ensemble verdict is "HOAX" when the mean of the two HOAX
    probabilities is at least 50%.

    Args:
        text: Raw news text.
        show_details: When true, print a human-readable report including a
            HIGH/MEDIUM/LOW confidence band (distance of the ensemble
            probability from 50%).

    Returns:
        A dict with the keys ``text``, ``cleaned_text`` (both truncated to
        100 characters), ``naive_bayes``, ``random_forest`` and ``ensemble``;
        the last three each hold ``prediction`` and ``hoax_probability``
        (percentage rounded to two decimals). If nothing is left of the text
        after cleaning, the string ``"Error: Text is empty after cleaning"``
        is returned instead of a dict.
    """
    # Clean the input text
    cleaned_text = clean_text(text)

    if not cleaned_text.strip():
        return "Error: Text is empty after cleaning"

    # Vectorize the text
    X = pred_vectorizer.transform([cleaned_text])

    # Probability of being HOAX (class 1), in percent, from both models
    nb_hoax_prob = _hoax_probability(pred_nb, X)
    rf_hoax_prob = _hoax_probability(pred_rf, X)

    # Get predictions
    nb_prediction = LABEL_MAP[pred_nb.predict(X)[0]]
    rf_prediction = LABEL_MAP[pred_rf.predict(X)[0]]

    # Ensemble prediction (average probability)
    avg_hoax_prob = (nb_hoax_prob + rf_hoax_prob) / 2
    ensemble_prediction = "HOAX" if avg_hoax_prob >= 50 else "TIDAK HOAX"

    results = {
        "text": text[:100] + "..." if len(text) > 100 else text,
        "cleaned_text": cleaned_text[:100] + "..." if len(cleaned_text) > 100 else cleaned_text,
        "naive_bayes": {
            "prediction": nb_prediction,
            "hoax_probability": round(nb_hoax_prob, 2)
        },
        "random_forest": {
            "prediction": rf_prediction,
            "hoax_probability": round(rf_hoax_prob, 2)
        },
        "ensemble": {
            "prediction": ensemble_prediction,
            "hoax_probability": round(avg_hoax_prob, 2)
        }
    }

    if show_details:
        print("\n=== Hoax Detection Results ===")
        print(f"Text: {results['text']}")
        print(f"\nNaive Bayes: {results['naive_bayes']['prediction']} (HOAX prob: {results['naive_bayes']['hoax_probability']}%)")
        print(f"Random Forest: {results['random_forest']['prediction']} (HOAX prob: {results['random_forest']['hoax_probability']}%)")
        print(f"\n🤖 Final Prediction: {results['ensemble']['prediction']} (HOAX prob: {results['ensemble']['hoax_probability']}%)")

        # Confidence level: how far the ensemble probability is from the
        # 50% decision threshold (0 = coin flip, 50 = certain).
        confidence = abs(results['ensemble']['hoax_probability'] - 50)
        if confidence > 30:
            print(f"Confidence: HIGH ({confidence:.1f}%)")
        elif confidence > 15:
            print(f"Confidence: MEDIUM ({confidence:.1f}%)")
        else:
            print(f"Confidence: LOW ({confidence:.1f}%)")

    return results

print("Prediction functions ready!")

### 6.3 Test the Prediction System

In [None]:
# Test with sample texts
# Hand-written Indonesian headlines used as a quick smoke test of predict_hoax.
test_texts = [
    "Vaksin COVID-19 menyebabkan magnetisme pada tubuh manusia",
    "Pemerintah mengumumkan program bantuan sosial untuk keluarga kurang mampu",
    "Minum air putih hangat dapat menyembuhkan kanker dalam 3 hari",
    "Bank Indonesia mengumumkan kebijakan suku bunga terbaru",
    "Temuan ilmuwan: Makan nasi dapat menyebabkan kematian mendadak"
]

# predict_hoax prints its own report (show_details defaults to True), so the
# returned value is not used here.
print("=== Testing Prediction System ===")
for i, text in enumerate(test_texts, 1):
    print(f"\n--- Test {i} ---")
    result = predict_hoax(text)
    print("-" * 50)

## 7. Batch Prediction and Performance Analysis

### 7.1 Batch Prediction Function

In [None]:
def predict_batch(texts: List[str]) -> pd.DataFrame:
    """Run ``predict_hoax`` on every text and tabulate the outcomes.

    Returns:
        A DataFrame with one row per input and the columns ``Text`` (first
        50 characters), ``NB_Prediction``, ``NB_Prob``, ``RF_Prediction``,
        ``RF_Prob``, ``Final_Prediction`` and ``Final_Prob``. When the
        prediction for a text fails, its prediction columns read ``"Error"``
        and its probability columns are 0.
    """
    def _preview(raw):
        # Shortened form of the input shown in the "Text" column.
        return raw[:50] + "..." if len(raw) > 50 else raw

    rows = []

    for text in texts:
        try:
            outcome = predict_hoax(text, show_details=False)
            row = {
                "Text": _preview(text),
                "NB_Prediction": outcome['naive_bayes']['prediction'],
                "NB_Prob": outcome['naive_bayes']['hoax_probability'],
                "RF_Prediction": outcome['random_forest']['prediction'],
                "RF_Prob": outcome['random_forest']['hoax_probability'],
                "Final_Prediction": outcome['ensemble']['prediction'],
                "Final_Prob": outcome['ensemble']['hoax_probability']
            }
        except Exception:
            # Keep the batch going: record a placeholder row for this text.
            row = {
                "Text": _preview(text),
                "NB_Prediction": "Error",
                "NB_Prob": 0,
                "RF_Prediction": "Error",
                "RF_Prob": 0,
                "Final_Prediction": "Error",
                "Final_Prob": 0
            }
        rows.append(row)

    return pd.DataFrame(rows)

# Test batch prediction
batch_results = predict_batch(test_texts)
print("\n=== Batch Prediction Results ===")
print(batch_results.to_string(index=False))

### 7.2 Model Performance Analysis

In [None]:
def analyze_model_performance():
    """Score the loaded models on the full cleaned dataset.

    The dataset returned by ``load_training_data`` is vectorized with the
    loaded vectorizer and both models' accuracies on it are printed. Because
    this data includes the samples the models were trained on, the figures
    are not a held-out estimate. When the Random Forest exposes
    ``feature_importances_``, its ten most important features are listed.

    Returns:
        A tuple ``(nb_accuracy, rf_accuracy)``.
    """
    print("=== Model Performance Analysis ===")

    # Reload the cleaned dataset; the DataFrame itself is not needed here
    corpus, targets, _ = load_training_data()
    features = pred_vectorizer.transform(corpus)

    nb_accuracy = accuracy_score(targets, pred_nb.predict(features))
    rf_accuracy = accuracy_score(targets, pred_rf.predict(features))

    print("\nOverall Performance on Training Data:")
    print(f"Naive Bayes Accuracy: {nb_accuracy:.3f}")
    print(f"Random Forest Accuracy: {rf_accuracy:.3f}")

    if hasattr(pred_rf, 'feature_importances_'):
        vocabulary = pred_vectorizer.get_feature_names_out()
        weights = pred_rf.feature_importances_

        # Indices of the ten largest importances, highest first
        ranked = weights.argsort()[-10:][::-1]

        print("\nTop 10 Most Important Features (Random Forest):")
        for rank, feature_idx in enumerate(ranked, 1):
            print(f"{rank:2d}. {vocabulary[feature_idx]:15s} (importance: {weights[feature_idx]:.4f})")

    return nb_accuracy, rf_accuracy

# Analyze performance
nb_acc, rf_acc = analyze_model_performance()

## 8. Interactive Prediction Interface

### 8.1 Custom Text Prediction

In [None]:
def interactive_prediction():
    """Prompt for news texts in a loop and print a hoax prediction for each.

    The loop ends when the user types ``quit``, ``exit`` or ``q``
    (case-insensitive), presses Ctrl+C, or when standard input is exhausted.
    Blank input is rejected with a hint. Any other error raised while
    handling an entry is printed and the loop continues.
    """
    print("\n=== Interactive Hoax Detection ===")
    print("Enter news text to check if it's potentially hoax or not.")
    print("Type 'quit' to exit.\n")

    while True:
        try:
            user_input = input("Enter news text: ").strip()

            if user_input.lower() in ['quit', 'exit', 'q']:
                print("Goodbye!")
                break

            if not user_input:
                print("Please enter some text.")
                continue

            # predict_hoax prints its own report
            predict_hoax(user_input)

        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin is closed/exhausted: every further input()
            # would fail the same way, so stop instead of looping forever.
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"Error: {e}")

# Uncomment the line below to run interactive mode
# interactive_prediction()

print("Interactive prediction function is ready!")
print("Uncomment the line above to run interactive mode.")

### 8.2 Sample Article Analysis (from the Scraped Dataset)

In [None]:
# Example: Analyze recent articles from our dataset
def analyze_sample_articles(n_samples: int = 5):
    """Run the hoax predictor on a few rows of one raw data file.

    The alphabetically first ``*.csv`` file in ``DATA_DIR`` is loaded and up
    to *n_samples* randomly chosen rows are analysed: the title is printed
    and the ``Isi Ringkas`` excerpt is passed to ``predict_hoax``. Rows
    without an excerpt are reported and skipped.

    Args:
        n_samples: Maximum number of rows to analyse.

    Returns:
        None. Results are printed; any unexpected error is reported with a
        message instead of being raised.
    """
    try:
        # Sorted so the chosen file does not depend on directory listing order
        sample_files = sorted(
            filename for filename in os.listdir(DATA_DIR) if filename.endswith(".csv")
        )

        if not sample_files:
            print("No data files found for analysis.")
            return

        # Load a sample file
        sample_df = pd.read_csv(os.path.join(DATA_DIR, sample_files[0]))

        # Get random samples
        if len(sample_df) > n_samples:
            samples = sample_df.sample(n=n_samples)
        else:
            samples = sample_df

        print(f"\n=== Analyzing {len(samples)} Sample Articles ===")

        for idx, row in samples.iterrows():
            title = row.get('Judul', 'No Title')
            content = row.get('Isi Ringkas', 'No Content')

            # Empty CSV cells are read back as NaN (a float); normalise both
            # fields to strings so slicing and .strip() cannot fail.
            title = 'No Title' if pd.isna(title) else str(title)
            content = '' if pd.isna(content) else str(content)

            if not content.strip():
                print(f"\nArticle {idx}: {title[:60]}...")
                print("No content available for analysis.")
                continue

            print(f"\n--- Article {idx} ---")
            print(f"Title: {title}")
            predict_hoax(content)
            print("-" * 50)

    except Exception as e:
        print(f"Error analyzing articles: {e}")

# Analyze sample articles
analyze_sample_articles(3)

## 9. Summary and Usage Guide

### 9.1 System Summary

In [None]:
def print_system_summary():
    """Print an overview of the pipeline: sources, preprocessing, models,
    generated files, usage hints and, when available, the accuracies.

    The accuracy lines are shown only if the module-level names ``nb_acc``
    and ``rf_acc`` exist; otherwise a placeholder line is printed.
    """
    rule = "="*60

    overview_lines = [
        "\n" + rule,
        "           HOAX DETECTION SYSTEM SUMMARY",
        rule,
        "\n📊 DATA SOURCES:",
        "   • Kompas.com (Valid news articles)",
        "   • TurnBackHoax.id (Hoax articles)",
        "\n🔧 PREPROCESSING:",
        "   • Text normalization (lowercase, remove punctuation)",
        "   • Stopword removal (Indonesian)",
        "   • Stemming (Sastrawi)",
        "   • TF-IDF vectorization",
        "\n🤖 MODELS:",
        "   • Naive Bayes Classifier",
        "   • Random Forest Classifier",
        "   • Ensemble prediction (average probability)",
        "\n📁 FILES CREATED:",
        f"   • {DATA_DIR}/kompas.csv - Raw Kompas articles",
        f"   • {DATA_DIR}/turnbackhoax.csv - Raw TurnBackHoax articles",
        f"   • {CLEAN_DATA_DIR}/*_cleaned.csv - Processed articles",
        f"   • {MODEL_DIR}/vectorizer.pkl - TF-IDF vectorizer",
        f"   • {MODEL_DIR}/nb.pkl - Naive Bayes model",
        f"   • {MODEL_DIR}/rf.pkl - Random Forest model",
        "\n🎯 USAGE:",
        "   • Use predict_hoax(text) for single predictions",
        "   • Use predict_batch(texts) for multiple predictions",
        "   • Run interactive_prediction() for testing",
        "\n📈 PERFORMANCE:",
    ]
    for line in overview_lines:
        print(line)

    # Accuracies exist only after the performance analysis cell has run
    metrics_available = 'nb_acc' in globals() and 'rf_acc' in globals()
    if not metrics_available:
        print("   • Performance metrics available after training")
    else:
        print(f"   • Naive Bayes Accuracy: {nb_acc:.3f}")
        print(f"   • Random Forest Accuracy: {rf_acc:.3f}")

    for line in ("\n" + rule, "         System ready for hoax detection!", rule):
        print(line)

# Print system summary
print_system_summary()

### 9.2 Quick Usage Examples

In [None]:
print("\n=== QUICK USAGE EXAMPLES ===")

# Example 1: Single prediction
print("\n1. Single Text Prediction:")
example_text = "Penelitian terbaru menunjukkan bahwa minum kopi dapat mencegah diabetes."
print(f"Text: {example_text}")
result = predict_hoax(example_text, show_details=False)
# The ensemble figure is the averaged HOAX probability, not a confidence
# score, so it is labelled the same way predict_hoax labels it.
print(f"Prediction: {result['ensemble']['prediction']} (HOAX prob: {result['ensemble']['hoax_probability']}%)")

# Example 2: Batch prediction
print("\n2. Batch Prediction:")
batch_texts = [
    "Pemerintah mengumumkan kebijakan ekonomi baru",
    "Minum air lemon dapat menyembuhkan semua penyakit",
    "Universitas Indonesia membuka program beasiswa"
]

# Only the input preview and the ensemble columns are shown
batch_df = predict_batch(batch_texts)
print(batch_df[['Text', 'Final_Prediction', 'Final_Prob']].to_string(index=False))

print("\n3. How to use interactively:")
print("   # Uncomment and run this line for interactive mode:")
print("   # interactive_prediction()")

print("\n=== System is ready for use! ===")