In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import time
import nltk

# --- NLTK Setup ---
def _ensure_nltk_resource(probe, package):
    """Call `probe`; if it raises LookupError, download the NLTK `package`.

    `probe` is a zero-argument callable that touches the corpus in question,
    so the download is only attempted when that access fails.
    """
    try:
        probe()
    except LookupError:
        print(f"Downloading NLTK {package}...")
        nltk.download(package)


# Probe each corpus the script relies on: stop words first, then WordNet.
_ensure_nltk_resource(lambda: stopwords.words('english'), 'stopwords')
_ensure_nltk_resource(lambda: WordNetLemmatizer().lemmatize('test'), 'wordnet')

# --- Configuration ---
# CSV file read by the main script.
DATA_FILE = 'mega_fake_real_political_news.csv'
# Shared lemmatizer instance and English stop-word set used by clean_text().
LEMMATIZER = WordNetLemmatizer()
STOP_WORDS = set(stopwords.words('english'))

# --- Preprocessing Functions ---

def clean_text(text):
    """
    Reduce a piece of text to a space-separated string of lemmatized keywords.

    Anything that is not a `str` yields "". Otherwise the text is lowercased,
    every character other than a-z and whitespace is deleted (deleted, not
    replaced by a space, so "well-known" becomes "wellknown"), the remainder
    is split on whitespace, tokens found in STOP_WORDS are dropped, and each
    surviving token is passed through LEMMATIZER.lemmatize.
    """
    # Guard clause: NaN, None, numbers, etc. are treated as empty text.
    if not isinstance(text, str):
        return ""

    # Lowercase first so the a-z character class covers every letter we keep.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    # NOTE(review): lemmatize() is called without a part-of-speech argument;
    # per the NLTK docs that means noun lemmatization, so verb forms such as
    # "running" are presumably left unchanged - confirm if that matters.
    return " ".join(
        LEMMATIZER.lemmatize(token)
        for token in letters_only.split()
        if token not in STOP_WORDS
    )

def train_model(X_train, X_test, y_train, y_test, vectorizer):
    """
    Fit the TF-IDF vectorizer and a Logistic Regression classifier, then
    print an evaluation on the held-out split.

    Args:
        X_train: Training texts; used to fit `vectorizer` and the model.
        X_test: Held-out texts; only transformed, never used for fitting.
        y_train: Labels aligned with `X_train`.
        y_test: Labels aligned with `X_test`.
        vectorizer: Unfitted vectorizer exposing `fit_transform`, `transform`
            and `vocabulary_`; it is fitted in place.

    Returns:
        Tuple `(model, vectorizer)`: the fitted classifier and the same
        vectorizer object that was passed in, now fitted.
    """

    print("\n\n--- üöÄ Model Training and Evaluation ---")

    # Feature Engineering: TF-IDF
    # The vocabulary is learned from the training split only, so nothing
    # about the test texts leaks into the features.
    print("1. Fitting TF-IDF Vectorizer...")
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    print(f"   -> Vocabulary Size (Number of Features): {len(vectorizer.vocabulary_)}")

    # Classification Model: Logistic Regression
    print("2. Training Logistic Regression (Keyword Classifier)...")
    model = LogisticRegression(max_iter=1000, random_state=42)
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    model.fit(X_train_vec, y_train)
    training_time = time.perf_counter() - start_time
    print(f"   -> Training Complete in {training_time:.2f} seconds.")

    # Predict and Evaluate
    print("3. Evaluating Model Performance...")
    y_pred = model.predict(X_test_vec)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0: when a class is never predicted its precision is
    # undefined; report it as 0.0 explicitly instead of letting scikit-learn
    # emit a warning for every affected metric.
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"\n‚úÖ Model Accuracy on Test Set: {accuracy:.4f}")
    print("\nDetailed Classification Report:")
    print(report)

    print("--- ‚úÖ Training and Evaluation Complete ---")

    return model, vectorizer

def interactive_test(model, vectorizer):
    """
    Run a prompt loop that classifies headlines typed by the user.

    Each headline is cleaned with clean_text(), vectorized with the fitted
    `vectorizer`, and classified by `model`; the predicted label, the highest
    class probability and the cleaned keywords are printed.

    The loop ends when the user types 'quit' (case-insensitive, surrounding
    whitespace ignored), when input is exhausted (EOF, e.g. no interactive
    stdin), or on Ctrl-C.

    Args:
        model: Fitted classifier exposing `predict` and `predict_proba`.
        vectorizer: Fitted vectorizer exposing `transform`.

    Returns:
        None.
    """
    print("\n\n--- üî¨ Interactive Fake News Detector ---")
    print("Enter a news headline to classify (or type 'quit' to exit).")

    while True:
        try:
            headline = input("\n> Enter Headline: ")
        except (EOFError, KeyboardInterrupt):
            # No more input is available or the user interrupted the prompt:
            # end the session cleanly instead of surfacing a traceback.
            print()
            break

        # Strip so that " quit " is treated as the exit command rather than
        # being classified as a headline.
        if headline.strip().lower() == 'quit':
            break

        # Preprocess the input headline using the same function as training
        processed_headline = clean_text(headline)

        if not processed_headline:
            print("‚ùå Please enter a valid headline.")
            continue

        # Vectorize the processed headline using the fitted vectorizer
        headline_vec = vectorizer.transform([processed_headline])

        # Predict the label; confidence is the largest class probability
        prediction = model.predict(headline_vec)[0]
        probabilities = model.predict_proba(headline_vec)[0]
        confidence = max(probabilities) * 100

        # Display results (label 1 is rendered as REAL, anything else as FAKE)
        if prediction == 1:
            label = "REAL"
            color = "üü¢"
            flag_message = "This article seems to be legitimate."
        else:
            label = "FAKE"
            color = "üî¥"
            flag_message = "üö® POTENTIALLY MISLEADING OR FAKE NEWS DETECTED! (Keyword-based flag)"

        print("-" * 50)
        print(f"{color} CLASSIFICATION: {label}")
        print(f"   Confidence: {confidence:.2f}%")
        print(f"   Keywords Analyzed: {processed_headline}")
        print(f"   Recommendation: {flag_message}")
        print("-" * 50)

# --- Main Execution ---

if __name__ == "__main__":
    # End-to-end script: load the CSV, normalize labels, clean the titles,
    # split, train/evaluate, then start the interactive prompt.
    # Fatal problems terminate with `raise SystemExit(1)`: unlike `exit()`,
    # which is an optional helper injected by `site`/IPython, SystemExit is
    # always available and lets the process report a non-zero status.

    # 1. Data Loading
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"‚ùå Error: Dataset file '{DATA_FILE}' not found. Please ensure it's in the same directory.")
        raise SystemExit(1) from None
    print(f"‚úÖ Dataset '{DATA_FILE}' loaded successfully. Rows: {len(df)}")

    # Ensure necessary columns exist (title for text, label for target)
    if 'title' not in df.columns or 'label' not in df.columns:
        print("‚ùå Error: Dataset must contain 'title' and 'label' columns.")
        raise SystemExit(1)

    # Data Cleaning and Preparation
    print("--- ‚öôÔ∏è Data Preprocessing ---")

    # Standardize labels and convert them to numerical format: FAKE=0, REAL=1.
    # Whitespace is stripped so values like " real" are not silently dropped.
    df['label'] = (
        df['label']
        .astype(str)
        .str.strip()
        .str.upper()
        .replace({'VERIFIED': 'REAL', 'CLAIM': 'FAKE'})
    )
    df = df[df['label'].isin(['REAL', 'FAKE'])].copy()
    df['target'] = (df['label'] == 'REAL').astype(int)

    # Apply the cleaning function to the article titles
    print("1. Cleaning and processing article titles...")
    df['processed_title'] = df['title'].apply(clean_text)

    # clean_text always returns a string, so rows without usable text show up
    # as empty strings (never NaN); dropping those is sufficient.
    df = df[df['processed_title'].str.len() > 0]

    print(f"2. Final processed articles available for modeling: {len(df)}")

    if df.empty:
        print("‚ùå Error: After cleaning, no usable data remains. Check your data.")
        raise SystemExit(1)

    # 3. Split Data into Training and Testing sets (80/20 split)
    X = df['processed_title']
    y = df['target']

    class_counts = y.value_counts()
    if len(class_counts) < 2:
        # A classifier cannot be trained on a single class.
        print("Error: Dataset must contain both REAL and FAKE examples after cleaning.")
        raise SystemExit(1)

    # stratify=y keeps the real/fake ratio in both sets, but train_test_split
    # rejects it when a class has fewer than 2 samples; fall back to a plain
    # random split in that case instead of crashing.
    stratify_on = y if class_counts.min() >= 2 else None
    if stratify_on is None:
        print("Warning: a class has fewer than 2 samples; splitting without stratification.")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=stratify_on
    )
    print(f"3. Data split: Training samples={len(X_train)}, Testing samples={len(X_test)}")

    # 4. Initialize Vectorizer
    # max_features caps the vocabulary size.
    # ngram_range=(1, 2) includes single words (unigrams) and pairs of words (bigrams).
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

    # 5. Train and Test Model
    model, vectorizer = train_model(X_train, X_test, y_train, y_test, tfidf_vectorizer)

    # 6. Interactive Demonstration
    interactive_test(model, vectorizer)

Downloading NLTK wordnet...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rosamistica\AppData\Roaming\nltk_data...


‚úÖ Dataset 'mega_fake_real_political_news.csv' loaded successfully. Rows: 206
--- ‚öôÔ∏è Data Preprocessing ---
1. Cleaning and processing article titles...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


2. Final processed articles available for modeling: 191
3. Data split: Training samples=152, Testing samples=39


--- üöÄ Model Training and Evaluation ---
1. Fitting TF-IDF Vectorizer...
   -> Vocabulary Size (Number of Features): 525
2. Training Logistic Regression (Keyword Classifier)...
   -> Training Complete in 0.02 seconds.
3. Evaluating Model Performance...

‚úÖ Model Accuracy on Test Set: 0.9744

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        38
           1       0.00      0.00      0.00         1

    accuracy                           0.97        39
   macro avg       0.49      0.50      0.49        39
weighted avg       0.95      0.97      0.96        39

--- ‚úÖ Training and Evaluation Complete ---


--- üî¨ Interactive Fake News Detector ---
Enter a news headline to classify (or type 'quit' to exit).
‚ùå Please enter a valid headline.
---------------------------------------------