<a href="https://colab.research.google.com/github/Pra1hamCodes/Chat-anaylzer/blob/main/NLP_MAIN1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
#!/usr/bin/env python3
"""
Fact-Check Web Scraper - PolitiFact
====================================
This script scrapes 500 fact-checking articles from PolitiFact and saves them
into a single CSV dataset.

Requirements:
- requests
- beautifulsoup4
- pandas
- tqdm
- lxml (optional but recommended for faster parsing)

Usage:
    python fact_check_scraper.py
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import logging
from typing import List, Dict, Optional
from urllib.parse import urljoin
import re

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
POLITIFACT_BASE_URL = "https://www.politifact.com"
POLITIFACT_FACTCHECKS_URL = "https://www.politifact.com/factchecks/"

# Request headers to avoid being blocked
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Delay between requests to be respectful to the servers
REQUEST_DELAY = 1  # seconds


def make_request(url: str, max_retries: int = 3) -> Optional[requests.Response]:
    """
    Make a GET request with retry logic.

    Args:
        url: The URL to request
        max_retries: Maximum number of retry attempts

    Returns:
        Response object if successful, None otherwise
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=30)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            logger.warning(f"Request failed for {url} (attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                time.sleep(REQUEST_DELAY * (attempt + 1))  # Exponential backoff
            else:
                logger.error(f"Failed to retrieve {url} after {max_retries} attempts")
    return None


def scrape_politifact(target_count: int = 500) -> List[Dict[str, str]]:
    """
    Scrape fact-check articles from PolitiFact.

    Args:
        target_count: Number of articles to scrape

    Returns:
        List of dictionaries containing article data
    """
    logger.info(f"Starting to scrape PolitiFact for {target_count} articles...")
    articles = []
    page = 1

    with tqdm(total=target_count, desc="Scraping PolitiFact") as pbar:
        while len(articles) < target_count:
            # Construct URL for current page
            url = f"{POLITIFACT_FACTCHECKS_URL}?page={page}"
            logger.debug(f"Scraping page {page}: {url}")

            response = make_request(url)
            if not response:
                logger.error(f"Failed to retrieve page {page}, stopping scrape.")
                break

            try:
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find all fact-check items on the page
                fact_checks = soup.find_all('div', class_='m-statement__quote')

                if not fact_checks:
                    logger.warning(f"No fact-checks found on page {page} with primary selector, trying alternative...")
                    # Try alternative selectors if the main one fails
                    fact_checks = soup.find_all('article', class_='m-statement')

                if not fact_checks:
                    logger.info(f"No more fact-checks found. Scraped {len(articles)} articles total.")
                    break

                for item in fact_checks:
                    if len(articles) >= target_count:
                        break

                    try:
                        article_data = extract_politifact_article(item)
                        if article_data:
                            articles.append(article_data)
                            pbar.update(1)
                    except Exception as e:
                        logger.error(f"Error extracting article data: {e}")
                        continue

                # Add delay between page requests
                time.sleep(REQUEST_DELAY)
                page += 1

            except Exception as e:
                logger.error(f"Error parsing page {page}: {e}")
                page += 1
                continue

    logger.info(f"Finished scraping. Scraped {len(articles)} articles from PolitiFact")
    return articles


def extract_politifact_article(item: BeautifulSoup) -> Optional[Dict[str, str]]:
    """
    Extract article data from a PolitiFact fact-check item.

    Args:
        item: BeautifulSoup element containing the fact-check

    Returns:
        Dictionary with article data or None if extraction fails
    """
    try:
        # Find the parent article element for better context
        article = item.find_parent('article') or item

        # Extract claim
        claim_elem = article.find('div', class_='m-statement__quote')
        claim = claim_elem.get_text(strip=True) if claim_elem else "N/A"

        # Extract article URL
        link_elem = claim_elem.find('a', href=True) if claim_elem else article.find('a', href=True)
        if not (link_elem and link_elem.get('href')):
            return None # Skip if no URL found
        article_url = urljoin(POLITIFACT_BASE_URL, link_elem['href'])

        # Extract date
        date_elem = article.find('div', class_='m-statement__body')
        date_text = "N/A"
        if date_elem:
            date_match = re.search(r'(\w+ \d+, \d{4})', date_elem.get_text())
            if date_match:
                date_text = date_match.group(1)

        # --- NEW, MORE ROBUST VERDICT EXTRACTION LOGIC ---
        verdict = "N/A"
        # First, find the specific 'meter' container div
        meter_container = article.find('div', class_='m-statement__meter')
        if meter_container:
            # Then, find the image with an 'alt' tag inside that container
            img_elem = meter_container.find('img', alt=True)
            if img_elem and img_elem.get('alt'):
                # Takes "Truth-O-Meter: Pants on Fire" and extracts "Pants on Fire"
                verdict = img_elem['alt'].split(':')[-1].strip()
        # --- END OF NEW LOGIC ---

        return {
            'claim': claim,
            'source_site': 'PolitiFact',
            'date': date_text,
            'verdict': verdict,
            'article_url': article_url
        }

    except Exception as e:
        logger.error(f"Error extracting PolitiFact article details: {e}")
        return None


def main():
    """
    Main function to orchestrate the scraping process.
    """
    logger.info("Starting PolitiFact scraping script...")

    # Scrape PolitiFact for 500 articles
    politifact_articles = scrape_politifact(target_count=500)

    if not politifact_articles:
        logger.error("No articles were scraped. Exiting.")
        return

    logger.info(f"Total articles scraped: {len(politifact_articles)}")

    # Create DataFrame
    df = pd.DataFrame(politifact_articles)

    # Save to CSV
    output_file = 'fact_check_dataset.csv'
    df.to_csv(output_file, index=False, encoding='utf-8')

    # Print confirmation
    print(f"\nScraping complete. Saved {len(df)} articles to {output_file}")

    # Display sample of the data
    print("\nSample of scraped data:")
    print(df.head())

    # Display data statistics
    print(f"\nDataset statistics:")
    print(f"Total articles: {len(df)}")
    if 'verdict' in df.columns:
        print(f"\nPolitiFact verdict distribution:")
        print(df['verdict'].value_counts())


if __name__ == "__main__":
    main()

Scraping PolitiFact: 100%|██████████| 500/500 [00:20<00:00, 24.89it/s]


Scraping complete. Saved 500 articles to fact_check_dataset.csv

Sample of scraped data:
                                               claim source_site  \
0               “Portland is burning to the ground.”  PolitiFact   
1  “Los demócratas están amenazando con cerrar to...  PolitiFact   
2  Thirty percent of U.S. medical residents are i...  PolitiFact   
3  “You never heard Biden say” the U.S. has “the ...  PolitiFact   
4  “Democrats are threatening to shut down the en...  PolitiFact   

              date     verdict  \
0  October 7, 2025       false   
1  October 2, 2025       false   
2  October 2, 2025   half-true   
3  October 2, 2025  pants-fire   
4  October 1, 2025       false   

                                         article_url  
0  https://www.politifact.com/factchecks/2025/oct...  
1  https://www.politifact.com/factchecks/2025/oct...  
2  https://www.politifact.com/factchecks/2025/oct...  
3  https://www.politifact.com/factchecks/2025/oct...  
4  https://www.politi




In [3]:
!pip install transformers datasets sentence-transformers pandas scikit-learn torch



In [74]:
#!/usr/bin/env python3
"""
Misinformation Detection Pipeline - Phase 2
===========================================
This script implements a four-stage NLP pipeline for fact-checking:
1. Claim Identification (Summarization) - Pre-trained
2. Named Entity Recognition (NER) - Pre-trained
3. Evidence Retrieval (Sentence Embedding) - Pre-trained
4. Veracity Classification - FINE-TUNED on scraped PolitiFact data

REAL USE VERSION: Fetches actual article content from URLs in scraped data.
NO SAMPLE DATA USED ANYWHERE.

Requirements:
- transformers
- datasets
- sentence-transformers
- pandas
- scikit-learn
- torch
- numpy
- requests
- beautifulsoup4
"""

import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional
import logging
from pathlib import Path
import warnings
import requests
from bs4 import BeautifulSoup
import time
warnings.filterwarnings('ignore')

# Hugging Face imports
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset, DatasetDict
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
DATA_FILE = 'fact_check_dataset.csv'
MODEL_SAVE_PATH = './fine-tuned-classifier'
BASE_MODEL_NAME = 'distilbert-base-uncased'

# Request headers for fetching articles
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}


# ============================================================================
# UTILITY: FETCH ARTICLE CONTENT FROM URL
# ============================================================================

def fetch_article_content(url: str, max_retries: int = 3) -> Optional[str]:
    """
    Fetch actual article content from URL in scraped dataset.

    Args:
        url: Article URL from the scraped data
        max_retries: Maximum number of retry attempts

    Returns:
        Article text content or None if fetch fails
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract article body text
            article_body = None

            # For PolitiFact
            if 'politifact.com' in url:
                article_body = soup.find('article', class_='m-textblock')
                if not article_body:
                    article_body = soup.find('div', class_='short-on-time')

            # Generic fallback
            if not article_body:
                article_body = soup.find('article')
            if not article_body:
                article_body = soup.find('div', class_='content')

            if article_body:
                # Extract text and clean it
                text = article_body.get_text(separator=' ', strip=True)
                text = ' '.join(text.split())
                return text
            else:
                logger.warning(f"Could not find article body in {url}")
                return None

        except Exception as e:
            logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 * (attempt + 1))
            else:
                logger.error(f"Failed to fetch {url} after {max_retries} attempts")

    return None


def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences for evidence retrieval.

    Args:
        text: Input text

    Returns:
        List of sentences
    """
    sentences = []
    for sent in text.split('.'):
        sent = sent.strip()
        if len(sent) > 20:  # Ignore very short fragments
            sentences.append(sent)
    return sentences


# ============================================================================
# MODEL 1: CLAIM IDENTIFICATION (SUMMARIZATION)
# ============================================================================

def load_summarization_model():
    """
    Load pre-trained summarization model for extracting core claims.
    """
    logger.info("Loading summarization model (Model 1)...")
    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device=-1  # Use CPU; set to 0 for GPU
    )
    return summarizer


def get_claim_summary(text: str, summarizer) -> str:
    """
    Extract core claim from a longer text using summarization.

    Args:
        text: Input text to analyze
        summarizer: Pre-loaded summarization pipeline

    Returns:
        Concise summary of the core claim
    """
    try:
        max_input_length = 1024
        text = text[:max_input_length]
        summary = summarizer(
            text,
            max_length=100,
            min_length=20,
            do_sample=False
        )
        return summary[0]['summary_text']
    except Exception as e:
        logger.error(f"Summarization error: {e}")
        return text[:200]  # Fallback to truncation


# ============================================================================
# MODEL 2: NAMED ENTITY RECOGNITION (NER)
# ============================================================================

def load_ner_model():
    """
    Load pre-trained NER model for entity extraction.
    """
    logger.info("Loading NER model (Model 2)...")
    ner = pipeline(
        "ner",
        model="dslim/bert-base-NER",
        aggregation_strategy="simple",
        device=-1
    )
    return ner


def extract_entities(text: str, ner_pipeline) -> List[Dict[str, str]]:
    """
    Extract named entities from text.

    Args:
        text: Input text to analyze
        ner_pipeline: Pre-loaded NER pipeline

    Returns:
        List of dictionaries containing entity information
    """
    try:
        entities = ner_pipeline(text)
        return [
            {
                'entity': ent['entity_group'],
                'word': ent['word'],
                'score': round(ent['score'], 3)
            }
            for ent in entities
        ]
    except Exception as e:
        logger.error(f"NER error: {e}")
        return []


# ============================================================================
# MODEL 3: EVIDENCE RETRIEVAL (SENTENCE EMBEDDING)
# ============================================================================

def load_sentence_transformer():
    """
    Load pre-trained sentence transformer for semantic similarity.
    """
    logger.info("Loading sentence transformer (Model 3)...")
    return SentenceTransformer('all-MiniLM-L6-v2')


def find_best_evidence(
    claim: str,
    article_sentences: List[str],
    sentence_model
) -> Tuple[str, float]:
    """
    Find the most relevant sentence from an article that relates to the claim.

    Args:
        claim: The claim to verify
        article_sentences: List of sentences from the article
        sentence_model: Pre-loaded sentence transformer model

    Returns:
        Tuple of (best_sentence, similarity_score)
    """
    try:
        if not article_sentences:
            return "", 0.0

        claim_embedding = sentence_model.encode(claim, convert_to_tensor=True)
        sentence_embeddings = sentence_model.encode(
            article_sentences,
            convert_to_tensor=True
        )

        similarities = util.cos_sim(claim_embedding, sentence_embeddings)[0]
        best_idx = similarities.argmax().item()
        best_score = similarities[best_idx].item()

        return article_sentences[best_idx], round(best_score, 3)

    except Exception as e:
        logger.error(f"Evidence retrieval error: {e}")
        return (article_sentences[0] if article_sentences else ""), 0.0


# ============================================================================
# MODEL 4: VERACITY CLASSIFICATION (FINE-TUNING)
# ============================================================================

def load_and_prepare_data(data_file: str) -> pd.DataFrame:
    """
    Load scraped PolitiFact data, map verdicts to simplified labels, and prepare for training.

    Args:
        data_file: Path to fact_check_dataset.csv

    Returns:
        Filtered and cleaned DataFrame ready for training
    """
    logger.info(f"Loading data from {data_file}...")
    df = pd.read_csv(data_file)
    logger.info(f"Loaded {len(df)} total articles")

    df = df[df['source_site'] == 'PolitiFact'].copy()
    logger.info(f"Filtered to {len(df)} PolitiFact articles")

    # --- START: CORRECTED MAPPING LOGIC ---
    logger.info("Mapping detailed verdicts to simplified labels (TRUE, FALSE, PANTS ON FIRE)...")

    # This map now EXACTLY matches the unique verdicts found in your CSV file.
    # We are intentionally ignoring 'full-flop' and 'half-flip' as they aren't direct truth ratings.
    label_map = {
        'true': 'TRUE',
        'mostly-true': 'TRUE',
        'half-true': 'TRUE',
        'barely-true': 'TRUE', # Added new category
        'false': 'FALSE',
        # Note: 'mostly-false' was not in your output, so it's not included.
        'pants-fire': 'PANTS ON FIRE'
    }
    df['verdict'] = df['verdict'].map(label_map)

    # Drop any rows that didn't map correctly (this will now correctly drop 'full-flop' and 'half-flip')
    initial_rows = len(df)
    df.dropna(subset=['verdict'], inplace=True)
    if len(df) < initial_rows:
        logger.warning(f"Dropped {initial_rows - len(df)} rows with unmappable or excluded verdicts (like 'full-flop').")
    # --- END: CORRECTED MAPPING LOGIC ---

    logger.info("Verdict distribution after mapping:")
    logger.info(f"\n{df['verdict'].value_counts()}")

    if len(df) == 0:
        raise ValueError("No valid training data remains after mapping labels. Check the dataset.")

    return df

def create_label_mappings(df: pd.DataFrame) -> Tuple[Dict, Dict]:
    """
    Create label-to-ID and ID-to-label mappings from the data.

    Args:
        df: DataFrame with the 'verdict' column

    Returns:
        Tuple of (label2id, id2label) dictionaries
    """
    unique_labels = sorted(df['verdict'].astype(str).unique())
    label2id = {label: idx for idx, label in enumerate(unique_labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    logger.info(f"Final label mappings created: {label2id}")
    return label2id, id2label


def prepare_dataset_for_training(
    df: pd.DataFrame,
    label2id: Dict,
    test_size: float = 0.2
) -> DatasetDict:
    """
    Convert DataFrame to Hugging Face DatasetDict with a train/validation split.

    Args:
        df: DataFrame with 'claim' and 'verdict' columns
        label2id: Label to ID mapping
        test_size: Fraction of data for validation

    Returns:
        DatasetDict with 'train' and 'validation' splits
    """
    df['label'] = df['verdict'].map(label2id)
    df.dropna(subset=['label'], inplace=True)
    df['label'] = df['label'].astype(int)

    texts = df['claim'].tolist()
    labels = df['label'].tolist()

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels,
        test_size=test_size,
        random_state=42,
        stratify=labels
    )
    logger.info(f"Training samples: {len(train_texts)}")
    logger.info(f"Validation samples: {len(val_texts)}")

    train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
    val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})

    return DatasetDict({'train': train_dataset, 'validation': val_dataset})


def tokenize_dataset(dataset_dict: DatasetDict, tokenizer):
    """Tokenize the dataset using the model's tokenizer."""
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=128
        )
    logger.info("Tokenizing dataset...")
    return dataset_dict.map(tokenize_function, batched=True, remove_columns=['text'])


def compute_metrics(eval_pred):
    """Compute accuracy and F1 score for model evaluation."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {'accuracy': accuracy, 'f1': f1}


def fine_tune_classifier(
    dataset_dict: DatasetDict,
    label2id: Dict,
    id2label: Dict,
    model_name: str = BASE_MODEL_NAME,
    output_dir: str = MODEL_SAVE_PATH
):
    """
    Fine-tune a classification model on the scraped data.
    """
    logger.info(f"Fine-tuning {model_name} for veracity classification...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized_datasets = tokenize_dataset(dataset_dict, tokenizer)

    num_labels = len(label2id)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir=f'{output_dir}/logs',
        logging_steps=10,
        save_total_limit=2,
        seed=42
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    logger.info("Starting training...")
    trainer.train()

    logger.info("Evaluating model...")
    eval_results = trainer.evaluate()
    logger.info(f"Evaluation results: {eval_results}")

    predictions = trainer.predict(tokenized_datasets['validation'])
    pred_labels = np.argmax(predictions.predictions, axis=-1)
    true_labels = predictions.label_ids

    target_names = [id2label[i] for i in sorted(id2label.keys())]
    report = classification_report(true_labels, pred_labels, target_names=target_names)
    logger.info(f"\nClassification Report:\n{report}")

    logger.info(f"Saving fine-tuned model to {output_dir}...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info("Fine-tuning complete!")


# ============================================================================
# COMPLETE PIPELINE DEMONSTRATION
# ============================================================================

def predict_verdict(
    claim: str,
    article_body: str,
    ner_pipeline,
    sentence_model,
    classifier_path: str = MODEL_SAVE_PATH
) -> Dict:
    """
    Complete end-to-end pipeline for fact-checking a claim.
    """
    logger.info("Running complete fact-checking pipeline...")

    entities = extract_entities(claim, ner_pipeline)
    logger.info(f"Entities extracted: {len(entities)} entities")

    sentences = split_into_sentences(article_body)
    evidence_sentence, similarity_score = find_best_evidence(
        claim, sentences, sentence_model
    )
    logger.info(f"Best evidence found (similarity: {similarity_score})")

    classifier = pipeline(
        "text-classification",
        model=classifier_path,
        device=-1
    )
    classifier_input = f"{claim} [SEP] {evidence_sentence}"
    prediction = classifier(classifier_input)[0]

    return {
        'claim': claim,
        'entities': entities,
        'evidence_sentence': evidence_sentence,
        'evidence_similarity': similarity_score,
        'predicted_verdict': prediction['label'],
        'confidence': round(prediction['score'], 3)
    }


def print_analysis(results: Dict):
    """Print formatted analysis results."""
    print("\n" + "="*70)
    print("FACT-CHECKING ANALYSIS")
    print("="*70)
    print(f"\nCLAIM:\n  {results['claim']}")
    print(f"\nENTITIES IDENTIFIED:")
    if results['entities']:
        for ent in results['entities']:
            print(f"  - {ent['word']} ({ent['entity']}) [confidence: {ent['score']}]")
    else:
        print("  No entities found")
    print(f"\nBEST EVIDENCE (similarity: {results['evidence_similarity']}):\n  {results['evidence_sentence']}")
    print(f"\nPREDICTED VERDICT: {results['predicted_verdict']}")
    print(f"CONFIDENCE: {results['confidence']}")
    print("="*70 + "\n")


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """
    Main function to execute the complete pipeline.
    """
    logger.info("="*70)
    logger.info("Phase 2: Model Training and Pipeline Setup")
    logger.info("REAL USE VERSION - Using actual scraped data and article URLs")
    logger.info("="*70)

    # STEP 1: Load, clean, and map data
    df = load_and_prepare_data(DATA_FILE)
    label2id, id2label = create_label_mappings(df)
    dataset_dict = prepare_dataset_for_training(df, label2id)

    # STEP 2: Fine-tune the veracity classification model
    fine_tune_classifier(dataset_dict, label2id, id2label)

    # STEP 3: Load all pre-trained models for the pipeline
    logger.info("\n" + "="*70)
    logger.info("Loading pre-trained models for pipeline...")
    logger.info("="*70)
    summarizer = load_summarization_model()
    ner = load_ner_model()
    sentence_model = load_sentence_transformer()

    # STEP 4: Demonstrate pipeline on a sample from the dataset
    logger.info("\n" + "="*70)
    logger.info("COMPLETE PIPELINE DEMONSTRATION ON SCRAPED DATA")
    logger.info("="*70)

    # Use an article from the validation set for a fair test
    val_df = df[~df['claim'].isin(dataset_dict['train']['text'])]
    if val_df.empty:
        val_df = df # Fallback if split is weird

    sample_article = val_df.sample(n=1, random_state=42).iloc[0]
    test_claim = sample_article['claim']
    test_url = sample_article['article_url']
    test_actual_verdict = sample_article['verdict']

    logger.info(f"\nTesting with sample article:")
    logger.info(f"Claim: {test_claim[:100]}...")
    logger.info(f"URL: {test_url}")
    logger.info(f"Actual verdict: {test_actual_verdict}")

    logger.info("\nFetching article content...")
    test_article_content = fetch_article_content(test_url)

    if test_article_content:
        logger.info(f"Article fetched ({len(test_article_content)} chars)")
        results = predict_verdict(
            test_claim,
            test_article_content,
            ner,
            sentence_model
        )
        print_analysis(results)

        logger.info(f"Actual verdict from scraped data: {test_actual_verdict}")
        logger.info(f"Predicted verdict: {results['predicted_verdict']}")
        logger.info(f"Match: {'✓' if results['predicted_verdict'] == test_actual_verdict else '✗'}")
    else:
        logger.warning("Could not fetch article content for complete pipeline demo.")

    # FINAL SUMMARY
    logger.info("\n" + "="*70)
    logger.info("PHASE 2 COMPLETE")
    logger.info("="*70)
    logger.info(f"Fine-tuned model saved to: {MODEL_SAVE_PATH}")
    logger.info(f"Training data source: {DATA_FILE}")
    logger.info(f"Total PolitiFact articles used: {len(df)}")
    logger.info(f"Unique verdict labels used for training: {list(label2id.keys())}")
    logger.info("\nAll training and demonstrations used only scraped data.")
    logger.info("="*70)


if __name__ == "__main__":
    main()



Map:   0%|          | 0/396 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7943,0.819766,0.666667,0.533333
2,0.7591,0.779091,0.69697,0.606313
3,0.6803,0.766828,0.686869,0.608124


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Device set to use cpu



FACT-CHECKING ANALYSIS

CLAIM:
  “The NFL admits to canceling Elon's DOGE Super Bowl ads.”

ENTITIES IDENTIFIED:
  - NFL (ORG) [confidence: 0.996999979019165]
  - Elon (ORG) [confidence: 0.9980000257492065]
  - Super Bowl (MISC) [confidence: 0.9980000257492065]

BEST EVIDENCE (similarity: 0.951):
  "The NFL admits to canceling Elon's DOGE Super Bowl ads," a Feb

PREDICTED VERDICT: FALSE
CONFIDENCE: 0.787



In [75]:
#!/usr/bin/env python3
"""
Misinformation Detection Pipeline - Phase 3: Inference Backend
==============================================================
This script provides the core analysis logic for fact-checking using:
1. Pre-trained NER model for entity extraction
2. Pre-trained sentence transformer for evidence retrieval
3. FINE-TUNED verdict classifier (trained on scraped PolitiFact data)

This pipeline loads the custom-trained model from Phase 2 and applies it
to new, unseen data for real-time fact-checking inference.

Usage:
    %%writefile pipeline.py

    from pipeline import load_analysis_models, run_full_analysis

    models = load_analysis_models('./fine-tuned-classifier')
    result = run_full_analysis(claim, article_body, models)
"""

import torch
import logging
from typing import Dict, List, Tuple, Optional
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def load_analysis_models(model_dir: str = './fine-tuned-classifier') -> Dict:
    """
    Load all three models required for the complete fact-checking pipeline.

    This function loads:
    1. NER model (pre-trained) - for entity extraction
    2. Sentence Transformer (pre-trained) - for evidence retrieval
    3. Verdict Classifier (FINE-TUNED) - our custom model from Phase 2

    Args:
        model_dir: Path to the fine-tuned classifier directory from Phase 2

    Returns:
        Dictionary containing all three loaded models
    """
    logger.info("="*70)
    logger.info("Loading Analysis Models for Phase 3 Inference Pipeline")
    logger.info("="*70)

    models = {}

    # 1. Load NER Model (Pre-trained)
    logger.info("Loading NER model (dslim/bert-base-NER)...")
    try:
        models['ner'] = pipeline(
            "ner",
            model="dslim/bert-base-NER",
            aggregation_strategy="simple",
            device=-1  # Use CPU; change to 0 for GPU
        )
        logger.info("✓ NER model loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load NER model: {e}")
        raise

    # 2. Load Sentence Transformer (Pre-trained)
    logger.info("Loading Sentence Transformer (all-MiniLM-L6-v2)...")
    try:
        models['sentence_model'] = SentenceTransformer('all-MiniLM-L6-v2')
        logger.info("✓ Sentence Transformer loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load Sentence Transformer: {e}")
        raise

    # 3. Load Fine-Tuned Verdict Classifier (CRITICAL - OUR CUSTOM MODEL)
    logger.info(f"Loading FINE-TUNED Verdict Classifier from {model_dir}...")
    logger.info("This model contains knowledge from the scraped PolitiFact dataset.")
    try:
        models['verdict_classifier'] = pipeline(
            "text-classification",
            model=model_dir,
            device=-1,  # Use CPU; change to 0 for GPU
            return_all_scores=False  # Return only the top prediction
        )
        logger.info("✓ Fine-tuned Verdict Classifier loaded successfully")
        logger.info(f"Model loaded from: {model_dir}")
    except Exception as e:
        logger.error(f"Failed to load fine-tuned classifier from {model_dir}: {e}")
        logger.error("Make sure Phase 2 training completed and model was saved.")
        raise

    logger.info("="*70)
    logger.info("All models loaded successfully!")
    logger.info("Pipeline ready for inference on new data.")
    logger.info("="*70)

    return models


def extract_entities_from_claim(claim: str, ner_model) -> List[Dict[str, any]]:
    """
    Extract named entities from a claim using the NER model.

    Args:
        claim: The claim text to analyze
        ner_model: Pre-loaded NER pipeline

    Returns:
        List of dictionaries containing entity information
    """
    try:
        entities = ner_model(claim)
        return [
            {
                'entity_type': ent['entity_group'],
                'text': ent['word'],
                'confidence': round(ent['score'], 3)
            }
            for ent in entities
        ]
    except Exception as e:
        logger.error(f"Error extracting entities: {e}")
        return []


def split_into_sentences(text: str) -> List[str]:
    """
    Split article text into individual sentences for evidence retrieval.

    Args:
        text: Article body text

    Returns:
        List of sentences
    """
    sentences = []

    # Simple sentence splitting by period
    for sent in text.split('.'):
        sent = sent.strip()
        # Only keep sentences with substantial content
        if len(sent) > 20:
            sentences.append(sent + '.')  # Add period back

    # If no sentences found, try splitting by newline
    if not sentences and text:
        sentences = [s.strip() for s in text.split('\n') if len(s.strip()) > 20]

    # Fallback: return the whole text as one sentence
    if not sentences and text:
        sentences = [text]

    return sentences


def find_best_evidence_sentence(
    claim: str,
    article_sentences: List[str],
    sentence_model
) -> Tuple[str, float]:
    """
    Find the most relevant sentence from the article that relates to the claim.
    Uses cosine similarity between sentence embeddings.

    Args:
        claim: The claim to verify
        article_sentences: List of sentences from the article
        sentence_model: Pre-loaded SentenceTransformer model

    Returns:
        Tuple of (best_sentence, similarity_score)
    """
    try:
        if not article_sentences:
            logger.warning("No sentences provided for evidence retrieval")
            return "", 0.0

        # Encode the claim
        claim_embedding = sentence_model.encode(
            claim,
            convert_to_tensor=True,
            show_progress_bar=False
        )

        # Encode all article sentences
        sentence_embeddings = sentence_model.encode(
            article_sentences,
            convert_to_tensor=True,
            show_progress_bar=False
        )

        # Calculate cosine similarities
        similarities = util.pytorch_cos_sim(claim_embedding, sentence_embeddings)[0]

        # Find the sentence with highest similarity
        best_idx = similarities.argmax().item()
        best_score = similarities[best_idx].item()
        best_sentence = article_sentences[best_idx]

        logger.info(f"Best evidence found with similarity score: {round(best_score, 3)}")

        return best_sentence, round(best_score, 3)

    except Exception as e:
        logger.error(f"Error in evidence retrieval: {e}")
        # Return first sentence as fallback
        return (article_sentences[0] if article_sentences else ""), 0.0


def run_full_analysis(
    claim: str,
    article_body: str,
    models: Dict
) -> Dict:
    """
    Execute the complete fact-checking analysis pipeline.

    This is the main inference function that:
    1. Extracts entities from the claim
    2. Finds the best evidence sentence from the article
    3. Uses the fine-tuned model to predict the verdict

    Args:
        claim: The claim statement to fact-check
        article_body: The full article text to analyze
        models: Dictionary of loaded models from load_analysis_models()

    Returns:
        Dictionary containing complete analysis results:
        - entities: List of extracted named entities
        - evidence_sentence: Most relevant sentence from article
        - evidence_similarity: Similarity score (0-1)
        - predicted_verdict: Final verdict (TRUE/FALSE/PANTS ON FIRE)
        - confidence: Model confidence score (0-1)
    """
    logger.info("\n" + "="*70)
    logger.info("Running Complete Fact-Checking Analysis Pipeline")
    logger.info("="*70)

    # STAGE 1: Entity Extraction
    logger.info("\n[STAGE 1/3] Extracting named entities from claim...")
    entities = extract_entities_from_claim(claim, models['ner'])
    logger.info(f"Found {len(entities)} entities")
    if entities:
        for ent in entities[:5]:  # Show first 5
            logger.info(f"  - {ent['text']} ({ent['entity_type']})")

    # STAGE 2: Evidence Retrieval
    logger.info("\n[STAGE 2/3] Finding best evidence from article...")
    sentences = split_into_sentences(article_body)
    logger.info(f"Article split into {len(sentences)} sentences")

    evidence_sentence, similarity_score = find_best_evidence_sentence(
        claim,
        sentences,
        models['sentence_model']
    )
    logger.info(f"Evidence similarity score: {similarity_score}")

    # STAGE 3: Verdict Classification (Using Fine-Tuned Model)
    logger.info("\n[STAGE 3/3] Predicting verdict using fine-tuned classifier...")
    logger.info("This prediction uses our custom model trained on PolitiFact data.")

    # Combine claim and evidence for classification
    # Format matches Phase 2 training: "claim [SEP] evidence"
    classifier_input = f"{claim} [SEP] {evidence_sentence}"

    try:
        prediction = models['verdict_classifier'](classifier_input)[0]
        predicted_verdict = prediction['label']
        confidence = round(prediction['score'], 3)

        logger.info(f"Predicted Verdict: {predicted_verdict}")
        logger.info(f"Confidence: {confidence}")
    except Exception as e:
        logger.error(f"Error during verdict classification: {e}")
        predicted_verdict = "ERROR"
        confidence = 0.0

    # Compile results
    results = {
        'claim': claim,
        'entities': entities,
        'evidence_sentence': evidence_sentence,
        'evidence_similarity': similarity_score,
        'predicted_verdict': predicted_verdict,
        'confidence': confidence,
        'num_sentences_analyzed': len(sentences)
    }

    logger.info("\n" + "="*70)
    logger.info("Analysis Complete!")
    logger.info("="*70)

    return results


def print_analysis_report(results: Dict):
    """
    Print a formatted, human-readable analysis report.

    Args:
        results: Dictionary returned from run_full_analysis()
    """
    print("\n" + "="*80)
    print("FACT-CHECKING ANALYSIS REPORT")
    print("="*80)

    print(f"\n📋 CLAIM:")
    print(f"   {results['claim']}")

    print(f"\n👥 ENTITIES IDENTIFIED ({len(results['entities'])}):")
    if results['entities']:
        for ent in results['entities']:
            print(f"   • {ent['text']} [{ent['entity_type']}] (confidence: {ent['confidence']})")
    else:
        print("   No entities found")

    print(f"\n🔍 BEST EVIDENCE (similarity: {results['evidence_similarity']}):")
    print(f"   {results['evidence_sentence'][:300]}{'...' if len(results['evidence_sentence']) > 300 else ''}")

    print(f"\n⚖️  VERDICT PREDICTION:")
    print(f"   {results['predicted_verdict']}")
    print(f"   Confidence: {results['confidence']*100:.1f}%")

    print(f"\n📊 METADATA:")
    print(f"   Sentences analyzed: {results['num_sentences_analyzed']}")

    print("\n" + "="*80)
    print()


# Example usage demonstration
if __name__ == "__main__":
    print("This is the pipeline.py module for Phase 3 inference.")
    print("Import this module in your notebook to use the fact-checking pipeline.")
    print("\nExample usage:")
    print("""
    from pipeline import load_analysis_models, run_full_analysis, print_analysis_report

    # Load models once
    models = load_analysis_models('./fine-tuned-classifier')

    # Analyze a claim
    claim = "The president announced a new policy yesterday"
    article = "Full article text here..."

    results = run_full_analysis(claim, article, models)
    print_analysis_report(results)
    """)

This is the pipeline.py module for Phase 3 inference.
Import this module in your notebook to use the fact-checking pipeline.

Example usage:

    from pipeline import load_analysis_models, run_full_analysis, print_analysis_report
    
    # Load models once
    models = load_analysis_models('./fine-tuned-classifier')
    
    # Analyze a claim
    claim = "The president announced a new policy yesterday"
    article = "Full article text here..."
    
    results = run_full_analysis(claim, article, models)
    print_analysis_report(results)
    


In [76]:
# Save files
%%writefile pipeline.py
# [paste pipeline.py code]

%%writefile generator.py
# [paste generator.py code]

# Use the pipeline
from pipeline import load_analysis_models, run_full_analysis, print_analysis_report
from generator import load_generator_model, generate_fact_check_article, format_complete_report

# Load models once
models = load_analysis_models('./fine-tuned-classifier')
generator = load_generator_model()

# Analyze new data
results = run_full_analysis(claim, article_body, models)
print_analysis_report(results)

# Generate article
article = generate_fact_check_article(
    results['claim'],
    results['predicted_verdict'],
    generator,
    results['evidence_sentence']
)

# Complete report
report = format_complete_report(
    results['claim'],
    results['predicted_verdict'],
    results['confidence'],
    results['entities'],
    results['evidence_sentence'],
    article
)
print(report)

Writing pipeline.py


In [89]:
# CELL 1: Install and create app.py
!pip install streamlit pandas pyngrok -q
# Then run %%writefile app.py with the code

# CELL 2: Configure ngrok
!ngrok config add-authtoken 33f8RN3JXog4sMRXRDUG7ft8OOw_4hLftc8EyjifXpiPrTQ5g

# CELL 3: Launch app
# Runs the provided Python script that:
# - Starts ngrok tunnel
# - Prints public URL
# - Launches Streamlit

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [94]:

!pip install transformers datasets sentence-transformers pandas scikit-learn torch requests beautifulsoup4 streamlit pyngrok -q

print("✅ All packages installed successfully!")



✅ All packages installed successfully!


In [95]:
%%writefile pipeline.py
#!/usr/bin/env python3
"""
Misinformation Detection Pipeline - Phase 3: Inference Backend
"""

import torch
import logging
from typing import Dict, List, Tuple, Optional
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import numpy as np

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def load_analysis_models(model_dir: str = './fine-tuned-classifier') -> Dict:
    """Load all three models required for the complete fact-checking pipeline."""
    logger.info("="*70)
    logger.info("Loading Analysis Models for Phase 3 Inference Pipeline")
    logger.info("="*70)

    models = {}

    # 1. Load NER Model (Pre-trained)
    logger.info("Loading NER model (dslim/bert-base-NER)...")
    try:
        models['ner'] = pipeline(
            "ner",
            model="dslim/bert-base-NER",
            aggregation_strategy="simple",
            device=-1
        )
        logger.info("✓ NER model loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load NER model: {e}")
        raise

    # 2. Load Sentence Transformer (Pre-trained)
    logger.info("Loading Sentence Transformer (all-MiniLM-L6-v2)...")
    try:
        models['sentence_model'] = SentenceTransformer('all-MiniLM-L6-v2')
        logger.info("✓ Sentence Transformer loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load Sentence Transformer: {e}")
        raise

    # 3. Load Fine-Tuned Verdict Classifier
    logger.info(f"Loading FINE-TUNED Verdict Classifier from {model_dir}...")
    try:
        models['verdict_classifier'] = pipeline(
            "text-classification",
            model=model_dir,
            device=-1,
            return_all_scores=False
        )
        logger.info("✓ Fine-tuned Verdict Classifier loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load fine-tuned classifier: {e}")
        raise

    logger.info("="*70)
    logger.info("All models loaded successfully!")
    logger.info("="*70)

    return models


def extract_entities_from_claim(claim: str, ner_model) -> List[Dict[str, any]]:
    """Extract named entities from a claim."""
    try:
        entities = ner_model(claim)
        return [
            {
                'entity_type': ent['entity_group'],
                'text': ent['word'],
                'confidence': round(ent['score'], 3)
            }
            for ent in entities
        ]
    except Exception as e:
        logger.error(f"Error extracting entities: {e}")
        return []


def split_into_sentences(text: str) -> List[str]:
    """Split article text into individual sentences."""
    sentences = []
    for sent in text.split('.'):
        sent = sent.strip()
        if len(sent) > 20:
            sentences.append(sent + '.')
    if not sentences and text:
        sentences = [s.strip() for s in text.split('\n') if len(s.strip()) > 20]
    if not sentences and text:
        sentences = [text]
    return sentences


def find_best_evidence_sentence(
    claim: str,
    article_sentences: List[str],
    sentence_model
) -> Tuple[str, float]:
    """Find the most relevant sentence from the article."""
    try:
        if not article_sentences:
            return "", 0.0

        claim_embedding = sentence_model.encode(
            claim,
            convert_to_tensor=True,
            show_progress_bar=False
        )

        sentence_embeddings = sentence_model.encode(
            article_sentences,
            convert_to_tensor=True,
            show_progress_bar=False
        )

        similarities = util.pytorch_cos_sim(claim_embedding, sentence_embeddings)[0]
        best_idx = similarities.argmax().item()
        best_score = similarities[best_idx].item()

        return article_sentences[best_idx], round(best_score, 3)

    except Exception as e:
        logger.error(f"Error in evidence retrieval: {e}")
        return (article_sentences[0] if article_sentences else ""), 0.0


def run_full_analysis(
    claim: str,
    article_body: str,
    models: Dict
) -> Dict:
    """Execute the complete fact-checking analysis pipeline."""
    logger.info("\n" + "="*70)
    logger.info("Running Complete Fact-Checking Analysis Pipeline")
    logger.info("="*70)

    # STAGE 1: Entity Extraction
    logger.info("\n[STAGE 1/3] Extracting entities...")
    entities = extract_entities_from_claim(claim, models['ner'])
    logger.info(f"Found {len(entities)} entities")

    # STAGE 2: Evidence Retrieval
    logger.info("\n[STAGE 2/3] Finding best evidence...")
    sentences = split_into_sentences(article_body)
    logger.info(f"Article split into {len(sentences)} sentences")

    evidence_sentence, similarity_score = find_best_evidence_sentence(
        claim,
        sentences,
        models['sentence_model']
    )
    logger.info(f"Evidence similarity: {similarity_score}")

    # STAGE 3: Verdict Classification
    logger.info("\n[STAGE 3/3] Predicting verdict...")
    classifier_input = f"{claim} [SEP] {evidence_sentence}"

    try:
        prediction = models['verdict_classifier'](classifier_input)[0]
        predicted_verdict = prediction['label']
        confidence = round(prediction['score'], 3)
        logger.info(f"Verdict: {predicted_verdict} ({confidence})")
    except Exception as e:
        logger.error(f"Error during classification: {e}")
        predicted_verdict = "ERROR"
        confidence = 0.0

    results = {
        'claim': claim,
        'entities': entities,
        'evidence_sentence': evidence_sentence,
        'evidence_similarity': similarity_score,
        'predicted_verdict': predicted_verdict,
        'confidence': confidence,
        'num_sentences_analyzed': len(sentences)
    }

    logger.info("\n" + "="*70)
    logger.info("Analysis Complete!")
    logger.info("="*70)

    return results


def print_analysis_report(results: Dict):
    """Print a formatted analysis report."""
    print("\n" + "="*80)
    print("FACT-CHECKING ANALYSIS REPORT")
    print("="*80)

    print(f"\n📋 CLAIM:")
    print(f"   {results['claim']}")

    print(f"\n👥 ENTITIES IDENTIFIED ({len(results['entities'])}):")
    if results['entities']:
        for ent in results['entities']:
            print(f"   • {ent['text']} [{ent['entity_type']}] (confidence: {ent['confidence']})")
    else:
        print("   No entities found")

    print(f"\n🔍 BEST EVIDENCE (similarity: {results['evidence_similarity']}):")
    print(f"   {results['evidence_sentence'][:300]}{'...' if len(results['evidence_sentence']) > 300 else ''}")

    print(f"\n⚖️  VERDICT PREDICTION:")
    print(f"   {results['predicted_verdict']}")
    print(f"   Confidence: {results['confidence']*100:.1f}%")

    print(f"\n📊 METADATA:")
    print(f"   Sentences analyzed: {results['num_sentences_analyzed']}")
    print("\n" + "="*80 + "\n")

Overwriting pipeline.py


In [96]:
%%writefile generator.py
#!/usr/bin/env python3
"""
AI Article Generator - Phase 3: Generative Component
"""

import logging
from typing import Optional
from transformers import pipeline
import warnings
warnings.filterwarnings('ignore')

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def load_generator_model(model_name: str = "google/flan-t5-base"):
    """Load a pre-trained text generation model."""
    logger.info("="*70)
    logger.info(f"Loading Text Generation Model: {model_name}")
    logger.info("="*70)

    try:
        generator = pipeline(
            "text2text-generation",
            model=model_name,
            device=-1,
            max_length=512
        )
        logger.info("✓ Text generation model loaded successfully")
        return generator
    except Exception as e:
        logger.error(f"Failed to load generator model: {e}")
        raise


def create_generation_prompt(claim: str, predicted_verdict: str, evidence: Optional[str] = None) -> str:
    """Create a prompt for AI article generation."""
    verdict_context = {
        'TRUE': 'accurate and supported by evidence',
        'FALSE': 'inaccurate and contradicted by evidence',
        'PANTS ON FIRE': 'completely false and has no basis in fact'
    }

    verdict_desc = verdict_context.get(predicted_verdict, 'requires further investigation')

    prompt = f"""You are a professional fact-checker writing a brief analysis.

CLAIM: "{claim}"

Write a short fact-checking article (3-4 sentences) that:
1. States the claim being evaluated
2. Provides factual context
3. Concludes that the claim is {verdict_desc}
4. Maintains a neutral tone

VERDICT: {predicted_verdict}

Article:"""

    if evidence and len(evidence) > 20:
        prompt += f"\n\nCONTEXT: {evidence[:200]}"

    return prompt


def generate_fact_check_article(
    claim: str,
    predicted_verdict: str,
    generator_model,
    evidence: Optional[str] = None,
    max_length: int = 300,
    num_beams: int = 4,
    temperature: float = 0.7
) -> str:
    """Generate a fact-checking article using AI."""
    logger.info("\n" + "="*70)
    logger.info("Generating AI Fact-Check Article")
    logger.info("="*70)

    try:
        prompt = create_generation_prompt(claim, predicted_verdict, evidence)

        generated = generator_model(
            prompt,
            max_length=max_length,
            min_length=50,
            num_beams=num_beams,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            no_repeat_ngram_size=3,
            early_stopping=True
        )

        article_text = generated[0]['generated_text'].strip()
        logger.info("✓ Article generated successfully")
        return article_text

    except Exception as e:
        logger.error(f"Error generating article: {e}")
        return f"[Error: {e}]"


def format_complete_report(
    claim: str,
    predicted_verdict: str,
    confidence: float,
    entities: list,
    evidence_sentence: str,
    generated_article: str
) -> str:
    """Create a complete formatted report."""
    report = f"""
╔══════════════════════════════════════════════════════════════════════════════╗
║                        COMPLETE FACT-CHECK REPORT                            ║
╚══════════════════════════════════════════════════════════════════════════════╝

📋 CLAIM: {claim}

⚖️  VERDICT: {predicted_verdict}
   Confidence: {confidence*100:.1f}%

───────────────────────────────────────────────────────────────────────────────

🤖 AI-GENERATED ARTICLE:

{generated_article}

───────────────────────────────────────────────────────────────────────────────

🔍 Evidence: {evidence_sentence[:250]}

👥 Entities:
"""

    if entities:
        for ent in entities[:5]:
            report += f"   • {ent['text']} [{ent['entity_type']}]\n"
    else:
        report += "   None identified\n"

    report += "\n╚══════════════════════════════════════════════════════════════════════════════╝"
    return report

Writing generator.py


In [98]:
# CELL 4: Test imports
from pipeline import load_analysis_models, run_full_analysis, print_analysis_report
from generator import load_generator_model, generate_fact_check_article

print("✅ All imports successful!")

✅ All imports successful!


In [99]:
# Load models (once)
models = load_analysis_models('./fine-tuned-classifier')
generator = load_generator_model()

# Analyze a claim
claim = "Your claim here"
article = "Article text here"

results = run_full_analysis(claim, article, models)
print_analysis_report(results)

# Generate article
ai_article = generate_fact_check_article(
    results['claim'],
    results['predicted_verdict'],
    generator
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Device set to use cpu


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu



FACT-CHECKING ANALYSIS REPORT

📋 CLAIM:
   Your claim here

👥 ENTITIES IDENTIFIED (0):
   No entities found

🔍 BEST EVIDENCE (similarity: 0.245):
   Article text here

⚖️  VERDICT PREDICTION:
   FALSE
   Confidence: 62.7%

📊 METADATA:
   Sentences analyzed: 1




In [101]:
!pip install streamlit pandas -q
print("✅ Streamlit and dependencies installed successfully!")

✅ Streamlit and dependencies installed successfully!


In [104]:
%%writefile app.py
#!/usr/bin/env python3
"""
Misinformation Intelligence Platform - Frontend
================================================
Professional Streamlit interface for AI-powered fact-checking
Phase 4: Production-Ready UI with Custom Styling
"""

import streamlit as st
import pandas as pd
from typing import Dict
import os
import sys # Import sys

# Add the directory containing pipeline.py and generator.py to the sys path
# This is important if these files are in the same directory as app.py
# and Streamlit might be run from a different location (like the root /content/)
# The os.path.dirname(__file__) gets the directory of the current script (app.py)
sys.path.insert(0, os.path.dirname(__file__))


# Import backend modules
try:
    from pipeline import load_analysis_models, run_full_analysis
    from generator import load_generator_model, generate_fact_check_article, format_complete_report
    BACKEND_MODULES_LOADED = True
except ImportError as e:
    st.error(f"⚠️ Backend modules not found: {e}")
    st.info("Please ensure pipeline.py and generator.py were successfully created/updated in the notebook.")
    BACKEND_MODULES_LOADED = False


# ============================================================================
# CUSTOM CSS - PROFESSIONAL DARK THEME
# ============================================================================

CUSTOM_CSS = """
<style>
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

    /* CSS Variables for Color Palette */
    :root {
        --background-color: #0E1117;
        --primary-text-color: #FAFAFA;
        --secondary-text-color: #A0AEC0;
        --card-background-color: #1A202C;
        --card-hover-background: #252D3A;
        --accent-color: #3182CE;
        --accent-hover: #2C5AA0;
        --success-color: #48BB78;
        --warning-color: #ED8936;
        --danger-color: #F56565;
        --border-color: #2D3748;
        --shadow-color: rgba(0, 0, 0, 0.3);
    }

    /* Global Font */
    html, body, [class*="css"] {
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
        color: var(--primary-text-color);
    }

    /* Main Container */
    .main {
        background-color: var(--background-color);
        padding: 2rem 1rem;
    }

    /* Custom Card Component */
    .card {
        background-color: var(--card-background-color);
        border: 1px solid var(--border-color);
        border-radius: 12px;
        padding: 1.5rem;
        margin: 1rem 0;
        box-shadow: 0 4px 6px var(--shadow-color);
        transition: all 0.3s ease;
    }

    .card:hover {
        background-color: var(--card-hover-background);
        box-shadow: 0 8px 12px var(--shadow-color);
        transform: translateY(-2px);
    }

    /* Header Styling */
    .main-header {
        text-align: center;
        padding: 2rem 0;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 16px;
        margin-bottom: 2rem;
        box-shadow: 0 10px 20px rgba(102, 126, 234, 0.3);
    }

    .main-header h1 {
        color: white;
        font-size: 2.8rem;
        font-weight: 700;
        margin: 0;
        text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
    }

    .main-header p {
        color: rgba(255, 255, 255, 0.9);
        font-size: 1.1rem;
        margin-top: 0.5rem;
    }

    /* Info Boxes */
    .info-box {
        background: linear-gradient(135deg, #667eea15 0%, #764ba215 100%);
        border-left: 4px solid var(--accent-color);
        border-radius: 8px;
        padding: 1rem 1.5rem;
        margin: 1rem 0;
    }

    .success-box {
        background: linear-gradient(135deg, #48BB7815 0%, #38A16915 100%);
        border-left: 4px solid var(--success-color);
        border-radius: 8px;
        padding: 1rem 1.5rem;
        margin: 1rem 0;
    }

    .warning-box {
        background: linear-gradient(135deg, #ED893615 0%, #DD6B2015 100%);
        border-left: 4px solid var(--warning-color);
        border-radius: 8px;
        padding: 1rem 1.5rem;
        margin: 1rem 0;
    }

    .danger-box {
        background: linear-gradient(135deg, #F5656515 0%, #C5333315 100%);
        border-left: 4px solid var(--danger-color);
        border-radius: 8px;
        padding: 1rem 1.5rem;
        margin: 1rem 0;
    }

    /* Verdict Badge */
    .verdict-badge {
        display: inline-block;
        padding: 0.5rem 1rem;
        border-radius: 20px;
        font-weight: 600;
        font-size: 0.9rem;
        text-transform: uppercase;
        letter-spacing: 0.5px;
    }

    .verdict-true {
        background-color: var(--success-color);
        color: white;
    }

    .verdict-false {
        background-color: var(--danger-color);
        color: white;
    }

    .verdict-pants {
        background: linear-gradient(135deg, #FC4445 0%, #C51E3A 100%);
        color: white;
    }

    /* Button Styling */
    .stButton > button {
        background: linear-gradient(135deg, var(--accent-color) 0%, var(--accent-hover) 100%);
        color: white;
        border: none;
        border-radius: 8px;
        padding: 0.75rem 2rem;
        font-weight: 600;
        font-size: 1rem;
        transition: all 0.3s ease;
        box-shadow: 0 4px 6px rgba(49, 130, 206, 0.3);
    }

    .stButton > button:hover {
        transform: translateY(-2px);
        box-shadow: 0 6px 12px rgba(49, 130, 206, 0.4);
    }

    /* Text Area Styling */
    .stTextArea > div > div > textarea {
        background-color: var(--card-background-color);
        color: var(--primary-text-color);
        border: 1px solid var(--border-color);
        border-radius: 8px;
        padding: 1rem;
        font-size: 1rem;
    }

    /* Tab Styling */
    .stTabs [data-baseweb="tab-list"] {
        gap: 0.5rem;
        background-color: transparent;
    }

    .stTabs [data-baseweb="tab"] {
        background-color: var(--card-background-color);
        border-radius: 8px 8px 0 0;
        padding: 1rem 1.5rem;
        color: var(--secondary-text-color);
        border: 1px solid var(--border-color);
        font-weight: 500;
    }

    .stTabs [aria-selected="true"] {
        background-color: var(--accent-color);
        color: white;
    }

    /* Entity Badge */
    .entity-badge {
        display: inline-block;
        background-color: #2D3748;
        color: #63B3ED;
        padding: 0.3rem 0.8rem;
        border-radius: 12px;
        margin: 0.2rem;
        font-size: 0.85rem;
        font-weight: 500;
    }

    /* Confidence Meter */
    .confidence-meter {
        width: 100%;
        height: 8px;
        background-color: #2D3748;
        border-radius: 10px;
        overflow: hidden;
        margin: 0.5rem 0;
    }

    .confidence-fill {
        height: 100%;
        background: linear-gradient(90deg, var(--success-color) 0%, var(--accent-color) 100%);
        border-radius: 10px;
        transition: width 0.5s ease;
    }

    /* Source Link */
    .source-link {
        display: inline-flex;
        align-items: center;
        color: var(--accent-color);
        text-decoration: none;
        font-weight: 500;
        transition: color 0.3s ease;
    }

    .source-link:hover {
        color: #63B3ED;
    }

    /* Data Stats */
    .stat-box {
        text-align: center;
        padding: 1.5rem;
        background: linear-gradient(135deg, var(--card-background-color) 0%, #252D3A 100%);
        border-radius: 12px;
        border: 1px solid var(--border-color);
    }

    .stat-number {
        font-size: 2.5rem;
        font-weight: 700;
        color: var(--accent-color);
        margin: 0;
    }

    .stat-label {
        font-size: 0.9rem;
        color: var(--secondary-text-color);
        margin-top: 0.5rem;
    }

    /* Loader Animation */
    .loader {
        border: 4px solid var(--border-color);
        border-top: 4px solid var(--accent-color);
        border-radius: 50%;
        width: 40px;
        height: 40px;
        animation: spin 1s linear infinite;
        margin: 2rem auto;
    }

    @keyframes spin {
        0% { transform: rotate(0deg); }
        100% { transform: rotate(360deg); }
    }

    /* Article Preview */
    .article-preview {
        background-color: var(--card-background-color);
        border: 1px solid var(--border-color);
        border-radius: 12px;
        padding: 2rem;
        margin: 1rem 0;
        box-shadow: 0 4px 6px var(--shadow-color);
    }

    .article-preview h3 {
        color: var(--primary-text-color);
        font-size: 1.5rem;
        font-weight: 600;
        margin-bottom: 1rem;
        border-bottom: 2px solid var(--accent-color);
        padding-bottom: 0.5rem;
    }

    .article-preview p {
        color: var(--secondary-text-color);
        line-height: 1.8;
        font-size: 1rem;
    }

    /* Hide Streamlit Branding */
    #MainMenu {visibility: hidden;}
    footer {visibility: hidden;}

    /* Responsive Design */
    @media (max-width: 768px) {
        .main-header h1 {
            font-size: 2rem;
        }

        .card {
            padding: 1rem;
        }
    }
</style>
"""

# ============================================================================
# PAGE CONFIGURATION
# ============================================================================

st.set_page_config(
    page_title="Misinformation Intelligence Platform",
    page_icon="🔎",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Inject Custom CSS
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

# ============================================================================
# CACHED RESOURCE LOADING
# ============================================================================

# Use st.session_state to store models across reruns
if 'analysis_models' not in st.session_state or 'generator_model' not in st.session_state:
     st.session_state.analysis_models = None
     st.session_state.generator_model = None
     st.session_state.models_loaded = False
     st.session_state.load_error = None


# Only attempt to load models if backend modules are loaded AND they haven't been loaded yet
if BACKEND_MODULES_LOADED and not st.session_state.models_loaded:
    try:
        st.info("Attempting to load AI models...")
        # Load analysis models (dict of NER, SentenceTransformer, Classifier)
        analysis_models_dict = load_analysis_models('./fine-tuned-classifier')
        st.session_state.analysis_models = analysis_models_dict # Store the dictionary

        # Load generator model
        st.session_state.generator_model = load_generator_model()

        st.session_state.models_loaded = True
        st.success("✅ AI models loaded successfully!")

    except Exception as model_load_error:
        st.session_state.models_loaded = False
        st.session_state.load_error = str(model_load_error)
        st.error(f"❌ Failed to load AI models: {st.session_state.load_error}")
        st.info("💡 Make sure Phase 2 training is complete and models are saved in './fine-tuned-classifier'")


@st.cache_data
def load_dataset():
    """Load the fact-check dataset (cached)"""
    try:
        df = pd.read_csv('fact_check_dataset.csv')
        return df
    except FileNotFoundError:
        st.warning("⚠️ Dataset not found. Please run Phase 1 scraping first.")
        return pd.DataFrame()

# ============================================================================
# MAIN APPLICATION
# ============================================================================

def main():
    # Header
    st.markdown("""
        <div class="main-header">
            <h1>🔎 Misinformation Intelligence Platform</h1>
            <p>AI-Powered Fact-Checking & Verification System</p>
        </div>
    """, unsafe_allow_html=True)

    # Load dataset (cached)
    dataset = load_dataset()

    # Check if models are loaded
    if not st.session_state.models_loaded:
        st.error("🚫 Models not loaded. Please check the errors above and ensure backend modules are functional.")
        if st.session_state.load_error:
             st.error(f"Model loading error details: {st.session_state.load_error}")
        return # Stop main execution if models aren't loaded


    # Create tabs
    tab1, tab2, tab3 = st.tabs([
        "📚 Fact-Check Database",
        "🔬 Live Analysis Tool",
        "🤖 AI Fact-Check Generator"
    ])

    # ========================================================================
    # TAB 1: FACT-CHECK DATABASE
    # ========================================================================
    with tab1:
        st.markdown("### 📊 PolitiFact Dataset Overview")

        if not dataset.empty:
            # Statistics
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.markdown(f"""
                    <div class="stat-box">
                        <p class="stat-number">{len(dataset)}</p>
                        <p class="stat-label">Total Articles</p>
                    </div>
                """, unsafe_allow_html=True)

            # Use .isin() or regex for more robust matching, considering variations like 'mostly-true' etc.
            # Assuming simple exact match for demonstration based on previous data
            false_count = len(dataset[dataset['verdict'].str.lower() == 'false'])
            st.markdown(f"""
                <div class="stat-box">
                    <p class="stat-number">{false_count}</p>
                    <p class="stat-label">False Claims</p>
                </div>
            """, unsafe_allow_html=True)

            # This count might be less precise depending on exact verdict strings beyond 'true'
            true_count = len(dataset[dataset['verdict'].str.lower().str.contains('true', na=False)])
            st.markdown(f"""
                <div class="stat-box">
                    <p class="stat-number">{true_count}</p>
                    <p class="stat-label">True Claims</p>
                </div>
            """, unsafe_allow_html=True)

            pants_count = len(dataset[dataset['verdict'].str.lower().str.contains('pants', na=False)])
            st.markdown(f"""
                <div class="stat-box">
                    <p class="stat-number">{pants_count}</p>
                    <p class="stat-label">Pants on Fire</p>
                </div>
            """, unsafe_allow_html=True)

            st.markdown("---")

            # Search and Filter
            search_query = st.text_input("🔍 Search claims:", placeholder="Enter keywords...")

            if search_query:
                filtered_df = dataset[dataset['claim'].str.contains(search_query, case=False, na=False)]
            else:
                filtered_df = dataset

            # Display cards
            st.markdown(f"### Showing {min(len(filtered_df), 20)} results (showing up to 20)")

            # Iterate only up to 20 results to avoid performance issues with large datasets
            for idx, row in filtered_df.head(20).iterrows():
                verdict = row['verdict'].lower()

                # Determine verdict styling based on simplified categories
                if 'true' in verdict and 'false' not in verdict and 'half' not in verdict and 'barely' not in verdict:
                    verdict_class = "verdict-true"
                    verdict_icon = "✅"
                elif 'half-true' in verdict or 'mostly-true' in verdict or 'barely-true' in verdict:
                     verdict_class = "verdict-warning" # Use warning for shades of truth/falsehood
                     verdict_icon = "⚠️"
                elif 'pants' in verdict:
                    verdict_class = "verdict-pants"
                    verdict_icon = "🔥"
                elif 'false' in verdict or 'mostly-false' in verdict:
                     verdict_class = "verdict-danger" # Use danger for false
                     verdict_icon = "❌"
                else:
                    verdict_class = "verdict-info" # Default/catch-all
                    verdict_icon = "❓"


                # Ensure verdict string is safe for display
                display_verdict = row['verdict'].replace('<', '&lt;').replace('>', '&gt;')

                # Create card
                st.markdown(f"""
                    <div class="card">
                        <div style="display: flex; justify-content: space-between; align-items: start; margin-bottom: 1rem;">
                            <div style="flex: 1;">
                                <p style="color: var(--secondary-text-color); font-size: 0.85rem; margin: 0;">
                                    🏛️ {row['source_site']} • {row['date']}
                                </p>
                            </div>
                            <span class="verdict-badge {verdict_class}">
                                {verdict_icon} {display_verdict.upper()}
                            </span>
                        </div>
                        <p style="font-size: 1.1rem; font-weight: 500; color: var(--primary-text-color); margin: 0.5rem 0;">
                            "{row['claim'][:200]}{'...' if len(row['claim']) > 200 else ''}"
                        </p>
                        <a href="{row['article_url']}" target="_blank" class="source-link">
                            🔗 View Original Article →
                        </a>
                    </div>
                """, unsafe_allow_html=True)
        else:
            st.info("📂 No dataset available. Run Phase 1 scraping first.")

    # ========================================================================
    # TAB 2: LIVE ANALYSIS TOOL
    # ========================================================================
    with tab2:
        st.markdown("### 🔬 Real-Time Fact-Checking Analysis")

        # Check if models are loaded before displaying analysis tool
        if not st.session_state.models_loaded:
             st.warning("AI models are not loaded. Please check the 'Home' tab for loading status.")
        else:
            st.markdown("""
                <div class="info-box">
                    <p style="margin: 0;"><strong>📝 Instructions:</strong></p>
                    <p style="margin: 0.5rem 0 0 0;">Enter a claim and supporting article text. Our AI will analyze entities, retrieve evidence, and predict veracity.</p>
                </div>
            """, unsafe_allow_html=True)

            # Input fields
            col1, col2 = st.columns([1, 1])

            with col1:
                claim_input = st.text_area(
                    "Claim to verify:",
                    height=150,
                    placeholder="E.g., 'The president announced a new climate policy yesterday.'"
                )

            with col2:
                article_input = st.text_area(
                    "Supporting article/context:",
                    height=150,
                    placeholder="Paste the article text or context for verification..."
                )

            # Analyze button
            if st.button("🚀 Run Analysis", type="primary"):
                if not claim_input or not article_input:
                    st.warning("⚠️ Please enter both a claim and article text.")
                else:
                    with st.spinner("🔄 Analyzing claim..."):
                        try:
                            # Pass the loaded analysis models dictionary
                            results = run_full_analysis(
                                claim_input,
                                article_input,
                                st.session_state.analysis_models # Use the dictionary from session state
                            )

                            # Display results in beautiful cards
                            st.markdown("---")
                            st.markdown("### 📊 Analysis Results")

                            # Verdict card
                            verdict = results['predicted_verdict']
                            confidence = results['confidence']

                            # Map predicted verdict to styling classes
                            verdict_lower = verdict.lower()
                            if 'true' in verdict_lower and 'false' not in verdict_lower:
                                box_class = "success-box"
                                icon = "✅"
                            elif 'pants' in verdict_lower:
                                box_class = "danger-box"
                                icon = "🔥"
                            elif 'false' in verdict_lower:
                                box_class = "danger-box" # False is also danger
                                icon = "❌"
                            else: # Catch any other potential labels
                                box_class = "warning-box"
                                icon = "⚠️"


                            st.markdown(f"""
                                <div class="{box_class}">
                                    <h3 style="margin: 0 0 0.5rem 0;">{icon} Predicted Verdict</h3>
                                    <p style="font-size: 1.5rem; font-weight: 700; margin: 0.5rem 0;">
                                        {verdict}
                                    </p>
                                    <p style="margin: 0.5rem 0 0 0; color: var(--secondary-text-color);">
                                        Confidence: {confidence * 100:.1f}%
                                    </p>
                                    <div class="confidence-meter">
                                        <div class="confidence-fill" style="width: {confidence * 100}%;"></div>
                                    </div>
                                </div>
                            """, unsafe_allow_html=True)

                            # Entities card
                            st.markdown("#### 👥 Key Entities Identified")
                            if results['entities']:
                                entities_html = " ".join([
                                    # Ensure entity text and type are safe for HTML
                                    f"<span class='entity-badge'>{ent.get('text', 'N/A').replace('<', '&lt;').replace('>', '&gt;')} ({ent.get('entity_type', 'N/A').replace('<', '&lt;').replace('>', '&gt;')})</span>"
                                    for ent in results['entities']
                                ])
                                st.markdown(f"""
                                    <div class="card">
                                        {entities_html}
                                    </div>
                                """, unsafe_allow_html=True)
                            else:
                                st.info("No named entities detected.")

                            # Evidence card
                            st.markdown("#### 🔍 Primary Evidence")
                            st.markdown(f"""
                                <div class="card">
                                    <p style="color: var(--secondary-text-color); font-size: 0.85rem; margin: 0 0 0.5rem 0;">
                                        Similarity Score: {results['evidence_similarity'] * 100:.1f}%
                                    </p>
                                    <p style="font-style: italic; color: var(--primary-text-color); margin: 0;">
                                        "{results['evidence_sentence'].replace('<', '&lt;').replace('>', '&gt;')}"
                                    </p>
                                </div>
                            """, unsafe_allow_html=True)

                        except Exception as e:
                            st.error(f"❌ Analysis failed: {e}")
                            logger.error(f"Analysis pipeline execution error: {e}", exc_info=True)


    # ========================================================================
    # TAB 3: AI FACT-CHECK GENERATOR
    # ========================================================================
    with tab3:
        st.markdown("### 🤖 AI-Powered Article Generator")

        # Check if generator model is loaded
        if not st.session_state.models_loaded or st.session_state.generator_model is None:
             st.warning("AI generator model is not loaded. Please check the 'Home' tab for loading status.")
        else:
            st.markdown("""
                <div class="info-box">
                    <p style="margin: 0;"><strong>✨ Feature:</strong></p>
                    <p style="margin: 0.5rem 0 0 0;">Generate professional fact-check articles using AI. Provide a claim and select a verdict.</p>
                </div>
            """, unsafe_allow_html=True)

            # Input fields
            gen_claim = st.text_area(
                "Enter the claim:",
                height=100,
                placeholder="E.g., 'Vaccines cause autism'"
            )

            gen_verdict = st.selectbox(
                "Select verdict:",
                ["TRUE", "FALSE", "PANTS ON FIRE"] # Use the simplified labels from training
            )

            gen_evidence = st.text_area(
                "Optional: Add evidence context",
                height=100,
                placeholder="Supporting evidence or context..."
            )

            # Generate button
            if st.button("✨ Generate Article", type="primary"):
                if not gen_claim:
                    st.warning("⚠️ Please enter a claim.")
                else:
                    with st.spinner("🤖 Generating fact-check article..."):
                        try:
                            article = generate_fact_check_article(
                                gen_claim,
                                gen_verdict,
                                st.session_state.generator_model, # Use the generator model from session state
                                gen_evidence if gen_evidence else None
                            )

                            # Display in article preview
                            st.markdown("---")
                            st.markdown("### 📄 Generated Article")

                            # Map selected verdict to styling class for preview
                            verdict_lower = gen_verdict.lower()
                            if 'true' in verdict_lower:
                                verdict_class = "verdict-true"
                            elif 'pants' in verdict_lower:
                                verdict_class = "verdict-pants"
                            elif 'false' in verdict_lower:
                                verdict_class = "verdict-danger"
                            else:
                                verdict_class = "verdict-info"


                            st.markdown(f"""
                                <div class="article-preview">
                                    <h3>Fact-Check: {gen_claim[:100]}{'...' if len(gen_claim) > 100 else ''}</h3>
                                    <p><strong>Verdict:</strong> <span class="verdict-badge {verdict_class}">
                                        {gen_verdict}
                                    </span></p>
                                    <hr style="border: none; border-top: 1px solid var(--border-color); margin: 1rem 0;">
                                    <p>{article.replace('<', '&lt;').replace('>', '&gt;')}</p>
                                </div>
                            """, unsafe_allow_html=True) # Sanitize generated article text

                            # Download option
                            st.download_button(
                                "📥 Download Article",
                                article,
                                file_name="fact_check_article.txt",
                                mime="text/plain"
                            )

                        except Exception as e:
                            st.error(f"❌ Generation failed: {e}")
                            logger.error(f"Article generation error: {e}", exc_info=True)


    # Footer
    st.markdown("---")
    st.markdown("""
        <div style="text-align: center; color: var(--secondary-text-color); padding: 2rem 0;">
            <p>🔎 Misinformation Intelligence Platform | Powered by AI</p>
            <p style="font-size: 0.85rem;">Built with Streamlit • Phase 4 Production Frontend</p>
        </div>
    """, unsafe_allow_html=True)

# Run the main function
if __name__ == "__main__":
    main()

Overwriting app.py


In [None]:
from pyngrok import ngrok
import os

# Kill any existing tunnels
ngrok.kill()

# Create tunnel
public_url = ngrok.connect(8501)

print("="*80)
print("🚀 MISINFORMATION INTELLIGENCE PLATFORM IS LIVE!")
print("="*80)
print(f"\n✅ Public URL: {public_url}")
print("\n📱 Click the link above to access your application")
print("="*80)

# Launch Streamlit
!streamlit run app.py --server.port 8501 --server.headless true

🚀 MISINFORMATION INTELLIGENCE PLATFORM IS LIVE!

✅ Public URL: NgrokTunnel: "https://skyward-unextracted-illa.ngrok-free.dev" -> "http://localhost:8501"

📱 Click the link above to access your application

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.187.170.55:8501[0m
[0m
2025-10-07 21:32:26.586750: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759872746.609698   47803 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759872746.616794   47803 cuda_blas.cc:1407] Unable to regis