# In [None]:  (Jupyter cell marker left over from the notebook export)
# Fake News Detection System
# Complete implementation for ITM-360 AI Class Project

import pandas as pd
import numpy as np
import re
import string
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy

# Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
import torch

# Machine Learning Models
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Bidirectional, Input, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Explainability
import shap
import lime
from lime.lime_text import LimeTextExplainer

# Download the NLTK resources used by the preprocessing step.
# Newer NLTK releases make word_tokenize load the 'punkt_tab' resource instead
# of the pickled 'punkt' models, so both are requested; 'punkt' keeps older
# NLTK installations working.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

# ======================== 1. DATA LOADING & PREPROCESSING ========================

class DataPreprocessor:
    """Clean raw article text and load a labelled news dataset from CSV."""

    # Compiled once at class creation; clean_text runs once per dataset row.
    # The dot after "www" is escaped so only literal "www." prefixes match
    # (an unescaped dot also swallowed words such as "awww so").
    _URL_PATTERN = re.compile(r'http\S+|www\.\S+')
    _HTML_TAG_PATTERN = re.compile(r'<.*?>')
    _DIGIT_PATTERN = re.compile(r'\d+')
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Normalize one document into a space-separated string of lemmas.

        Steps, in order: lowercase, strip URLs, strip HTML tags, strip
        punctuation, strip digits, tokenize, then drop stopwords and tokens of
        two characters or fewer and lemmatize what remains.

        Args:
            text: The raw document. Missing values (None / NaN) are accepted;
                any other non-string value is converted with ``str()``.

        Returns:
            The cleaned text, or "" when ``text`` is missing.
        """
        if pd.isna(text):
            return ""

        # str() guards against numeric cells that pandas did not parse as text.
        text = str(text).lower()
        text = self._URL_PATTERN.sub('', text)
        text = self._HTML_TAG_PATTERN.sub('', text)
        text = text.translate(self._PUNCTUATION_TABLE)
        text = self._DIGIT_PATTERN.sub('', text)

        tokens = word_tokenize(text)
        kept = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words and len(token) > 2
        ]
        return ' '.join(kept)

    def load_and_preprocess_data(self, filepath):
        """Load a CSV with 'text' and 'label' columns and prepare it for training.

        Rows missing either column are dropped. Two columns are added:
        'cleaned_text' (output of clean_text) and 'label_encoded' (integer
        class codes).

        Args:
            filepath: Path passed to ``pandas.read_csv``.

        Returns:
            A ``(df, le)`` tuple: the prepared DataFrame and the fitted
            LabelEncoder whose ``classes_`` order matches 'label_encoded'.
        """
        df = pd.read_csv(filepath)

        # .copy() so the column assignments below write to an independent
        # frame rather than to the result of a filtering operation.
        df = df.dropna(subset=['text', 'label']).copy()

        df['cleaned_text'] = df['text'].apply(self.clean_text)

        # Encode labels (0: real, 1: fake).
        le = LabelEncoder()
        encoded = le.fit_transform(df['label'])

        # LabelEncoder numbers classes in sorted order, so string labels such
        # as 'FAKE'/'REAL' come out as FAKE=0, REAL=1 -- the opposite of the
        # convention above. Flip the codes and the encoder's class order
        # together so they stay consistent with each other.
        class_names = [str(c).strip().lower() for c in le.classes_]
        if class_names == ['fake', 'real']:
            encoded = 1 - encoded
            le.classes_ = le.classes_[::-1].copy()

        df['label_encoded'] = encoded

        return df, le

# ======================== 2. FEATURE EXTRACTION ========================

class FeatureExtractor:
    """Turn cleaned text into TF-IDF, averaged Word2Vec and BERT [CLS] features."""

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
        # Set by train_word2vec(); get_word2vec_features() requires it.
        self.word2vec_model = None
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

    def extract_tfidf_features(self, texts_train, texts_test):
        """Fit the TF-IDF vectorizer on the training texts and transform both sets.

        Returns:
            ``(X_train_tfidf, X_test_tfidf)``. The vectorizer is fitted on
            ``texts_train`` only, so test vocabulary never leaks into it.
        """
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(texts_train)
        X_test_tfidf = self.tfidf_vectorizer.transform(texts_test)
        return X_train_tfidf, X_test_tfidf

    def train_word2vec(self, texts, vector_size=100, window=5, min_count=2):
        """Train a Word2Vec model on whitespace-tokenized ``texts``.

        The model is stored on ``self.word2vec_model`` and also returned.
        """
        tokenized_texts = [text.split() for text in texts]
        self.word2vec_model = Word2Vec(tokenized_texts, vector_size=vector_size,
                                       window=window, min_count=min_count, workers=4)
        return self.word2vec_model

    def get_word2vec_features(self, texts):
        """Represent each text as the mean of its in-vocabulary word vectors.

        Args:
            texts: Iterable of whitespace-separated strings.

        Returns:
            A float array of shape ``(len(texts), vector_size)``. A text with
            no in-vocabulary word is left as an all-zero row.

        Raises:
            RuntimeError: If train_word2vec() has not been called yet.
        """
        if self.word2vec_model is None:
            raise RuntimeError(
                "Word2Vec model is not trained; call train_word2vec() first."
            )

        word_vectors = self.word2vec_model.wv
        texts = list(texts)
        # Pre-allocating keeps the result 2-D even when ``texts`` is empty.
        features = np.zeros((len(texts), word_vectors.vector_size))
        for row, text in enumerate(texts):
            known = [word_vectors[word] for word in text.split()
                     if word in word_vectors]
            if known:
                features[row] = np.mean(known, axis=0, dtype=np.float64)
        return features

    def get_bert_embeddings(self, texts, max_length=128, batch_size=32):
        """Embed each text as the final-layer [CLS] vector of the BERT model.

        Args:
            texts: Iterable of strings; it is materialized into a list so a
                pandas Series or generator can be passed directly.
            max_length: Every text is padded or truncated to this many tokens.
            batch_size: Number of texts encoded per forward pass.

        Returns:
            An array of shape ``(len(texts), hidden_size)``.
        """
        texts = list(texts)
        if not texts:
            # Nothing to encode; return an empty but correctly shaped matrix.
            return np.empty((0, self.bert_model.config.hidden_size), dtype=np.float32)

        batches = []
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            encoded = self.bert_tokenizer.batch_encode_plus(
                batch_texts,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            # Inference only: no gradients are needed for feature extraction.
            with torch.no_grad():
                outputs = self.bert_model(**encoded)
                # Position 0 of the sequence is the [CLS] token.
                batches.append(outputs.last_hidden_state[:, 0, :].numpy())

        return np.vstack(batches)

# ======================== 3. MODEL TRAINING ========================

class ModelTrainer:
    """Train, tune and score the classical and neural classifiers.

    Fitted models are kept in ``self.models`` under a short key
    ('naive_bayes', 'random_forest', 'lstm', 'bert_lstm'). Metric dictionaries
    produced by evaluate_model() are kept in ``self.results`` under the
    display name passed to it.
    """

    def __init__(self):
        self.models = {}
        self.results = {}

    def _tune_and_evaluate(self, estimator, param_grid, model_key, display_name,
                           X_train, y_train, X_test, y_test, **search_kwargs):
        """Grid-search ``estimator``, store the best fit and score it on the test set."""
        grid_search = GridSearchCV(estimator, param_grid, cv=5, scoring='f1',
                                   **search_kwargs)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)

        self.models[model_key] = best_model
        self.evaluate_model(display_name, y_test, y_pred)
        return best_model

    def train_naive_bayes(self, X_train, y_train, X_test, y_test):
        """Tune MultinomialNB's ``alpha`` with 5-fold CV and evaluate the best model.

        Returns:
            The best estimator found by the grid search.
        """
        param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0]}
        return self._tune_and_evaluate(
            MultinomialNB(), param_grid, 'naive_bayes', 'Naive Bayes',
            X_train, y_train, X_test, y_test,
        )

    def train_random_forest(self, X_train, y_train, X_test, y_test):
        """Tune a RandomForestClassifier with 5-fold CV and evaluate the best model.

        Returns:
            The best estimator found by the grid search.
        """
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5]
        }
        return self._tune_and_evaluate(
            RandomForestClassifier(random_state=42), param_grid,
            'random_forest', 'Random Forest',
            X_train, y_train, X_test, y_test,
            n_jobs=-1,
        )

    def build_lstm_model(self, vocab_size, embedding_dim=100, max_length=100):
        """Build and compile a bidirectional-LSTM binary classifier.

        Architecture: Embedding -> Bidirectional LSTM (128 units, returning the
        full sequence) -> global max pooling -> Dense(64) -> Dropout ->
        single sigmoid output.
        """
        model = Sequential([
            Embedding(vocab_size, embedding_dim, input_length=max_length),
            Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)),
            GlobalMaxPooling1D(),
            Dense(64, activation='relu'),
            Dropout(0.5),
            Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer=Adam(learning_rate=0.001),
                     loss='binary_crossentropy',
                     metrics=['accuracy'])

        return model

    def train_lstm(self, X_train, y_train, X_val, y_val, vocab_size, max_length=100):
        """Fit the LSTM model with early stopping, checkpointing and LR reduction.

        The best checkpoint is written to 'best_lstm_model.h5'.

        Returns:
            ``(model, history)`` where ``history`` is the object returned by
            ``model.fit``.
        """
        model = self.build_lstm_model(vocab_size, max_length=max_length)

        callbacks = [
            EarlyStopping(patience=3, restore_best_weights=True),
            ModelCheckpoint('best_lstm_model.h5', save_best_only=True),
            ReduceLROnPlateau(factor=0.5, patience=2),
        ]

        history = model.fit(
            X_train, y_train,
            batch_size=32,
            epochs=10,
            validation_data=(X_val, y_val),
            callbacks=callbacks,
            verbose=1
        )

        self.models['lstm'] = model

        return model, history

    def build_bert_lstm_model(self, bert_dim=768, max_length=100):
        """Build and compile a dense classifier over fixed-size BERT embeddings.

        Despite the method name, the network contains no LSTM layer: it is a
        stack of Dense(256/128/64) + Dropout(0.3) blocks followed by a single
        sigmoid output. ``max_length`` is not used by the network; it is kept
        only so existing callers keep working.
        """
        bert_input = Input(shape=(bert_dim,), name='bert_input')

        hidden = bert_input
        for units in (256, 128, 64):
            hidden = Dense(units, activation='relu')(hidden)
            hidden = Dropout(0.3)(hidden)

        output = Dense(1, activation='sigmoid')(hidden)

        model = Model(inputs=bert_input, outputs=output)

        model.compile(optimizer=Adam(learning_rate=0.001),
                     loss='binary_crossentropy',
                     metrics=['accuracy'])

        return model

    def train_bert_lstm(self, X_train_bert, y_train, X_val_bert, y_val):
        """Fit the dense-over-BERT model with early stopping and checkpointing.

        The best checkpoint is written to 'best_bert_lstm_model.h5'.

        Returns:
            ``(model, history)`` where ``history`` is the object returned by
            ``model.fit``.
        """
        model = self.build_bert_lstm_model()

        callbacks = [
            EarlyStopping(patience=3, restore_best_weights=True),
            ModelCheckpoint('best_bert_lstm_model.h5', save_best_only=True),
        ]

        history = model.fit(
            X_train_bert, y_train,
            batch_size=32,
            epochs=10,
            validation_data=(X_val_bert, y_val),
            callbacks=callbacks,
            verbose=1
        )

        self.models['bert_lstm'] = model

        return model, history

    def evaluate_model(self, model_name, y_true, y_pred):
        """Compute, record and print accuracy, precision, recall and F1.

        Precision, recall and F1 use scikit-learn's default binary averaging,
        i.e. they describe the class encoded as 1.

        Returns:
            The metrics dictionary that was stored in ``self.results[model_name]``.
        """
        # zero_division=0 states the degenerate case explicitly (a model that
        # never predicts the positive class scores 0) instead of relying on a
        # warning that this file silences globally.
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'f1_score': f1_score(y_true, y_pred, zero_division=0),
        }
        self.results[model_name] = metrics

        print(f"\n{model_name} Results:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"F1-Score: {metrics['f1_score']:.4f}")

        return self.results[model_name]

# ======================== 4. EXPLAINABILITY ========================

class ModelExplainer:
    """Produce LIME and SHAP explanations for a fitted classifier."""

    def __init__(self, model, vectorizer=None):
        self.model = model
        # Optional text vectorizer; when given, raw text is transformed with it
        # before being passed to ``model.predict_proba``.
        self.vectorizer = vectorizer

    @staticmethod
    def _as_two_class_probabilities(scores):
        """Expand single-column positive-class scores to ``[P(0), P(1)]`` columns.

        LIME needs one probability column per class. Arrays that already have
        more than one column are returned unchanged.
        """
        scores = np.asarray(scores, dtype=float)
        if scores.ndim == 1:
            scores = scores.reshape(-1, 1)
        if scores.shape[1] == 1:
            return np.hstack([1.0 - scores, scores])
        return scores

    def explain_with_lime(self, text, num_features=10):
        """Explain the prediction for one text with LIME.

        Args:
            text: The raw string to explain.
            num_features: Maximum number of features in the explanation.

        Returns:
            The explanation object returned by ``explain_instance``.
        """
        explainer = LimeTextExplainer(class_names=['Real', 'Fake'])

        def predict_proba(texts):
            if self.vectorizer is not None:
                X = self.vectorizer.transform(texts)
                return self.model.predict_proba(X)
            # No vectorizer: the model itself must accept raw strings.
            # A single sigmoid output is widened to two class columns.
            return self._as_two_class_probabilities(self.model.predict(texts))

        return explainer.explain_instance(text, predict_proba, num_features=num_features)

    def explain_with_shap(self, X_train_sample, X_test_sample):
        """Compute SHAP values for ``X_test_sample``.

        Models exposing ``predict_proba`` are explained through
        ``shap.Explainer`` with ``X_train_sample`` as background data; any
        other model is handed to ``shap.DeepExplainer``.
        """
        if hasattr(self.model, 'predict_proba'):
            explainer = shap.Explainer(self.model.predict_proba, X_train_sample)
            return explainer(X_test_sample)

        explainer = shap.DeepExplainer(self.model, X_train_sample)
        return explainer.shap_values(X_test_sample)

# ======================== 5. VISUALIZATION ========================

class Visualizer:
    """Stateless plotting helpers; every method draws a figure and shows it."""

    @staticmethod
    def plot_confusion_matrix(y_true, y_pred, title='Confusion Matrix'):
        """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap."""
        class_names = ['Real', 'Fake']
        matrix = confusion_matrix(y_true, y_pred)

        _, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names, ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        plt.show()

    @staticmethod
    def plot_roc_curve(y_true, y_scores, title='ROC Curve'):
        """Draw the ROC curve for ``y_scores`` with its AUC in the legend."""
        false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_scores)
        auc_value = roc_auc_score(y_true, y_scores)

        _, ax = plt.subplots(figsize=(8, 6))
        ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
                label=f'ROC curve (AUC = {auc_value:.2f})')
        # Diagonal reference line.
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(title)
        ax.legend(loc="lower right")
        plt.show()

    @staticmethod
    def plot_training_history(history):
        """Plot training vs. validation accuracy and loss side by side.

        ``history.history`` must contain the keys 'accuracy', 'val_accuracy',
        'loss' and 'val_loss'.
        """
        panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))
        _, axes = plt.subplots(1, 2, figsize=(12, 4))

        for ax, (metric, label) in zip(axes, panels):
            ax.plot(history.history[metric], label=f'Training {label}')
            ax.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
            ax.set_title(f'Model {label}')
            ax.set_xlabel('Epoch')
            ax.set_ylabel(label)
            ax.legend()

        plt.tight_layout()
        plt.show()

    @staticmethod
    def plot_word_clouds(real_texts, fake_texts):
        """Show one word cloud for the real texts and one for the fake texts."""
        _, axes = plt.subplots(1, 2, figsize=(15, 6))
        panels = (
            (real_texts, 'Real News Word Cloud'),
            (fake_texts, 'Fake News Word Cloud'),
        )

        for ax, (texts, heading) in zip(axes, panels):
            cloud = WordCloud(width=800, height=400,
                              background_color='white').generate(' '.join(texts))
            ax.imshow(cloud, interpolation='bilinear')
            ax.set_title(heading)
            ax.axis('off')

        plt.tight_layout()
        plt.show()

# ======================== 6. MAIN PIPELINE ========================

class FakeNewsDetectionPipeline:
    """End-to-end driver: preprocess, featurize, train, evaluate, explain, save."""

    # Shared by the Keras tokenizer and the embedding layer so the two cannot
    # drift apart.
    LSTM_VOCAB_SIZE = 5000
    LSTM_MAX_LENGTH = 100
    # BERT embedding extraction is slow, so only the first rows of each split
    # are embedded (demo-sized subsets).
    BERT_TRAIN_SAMPLES = 1000
    BERT_EVAL_SAMPLES = 200
    # Number of texts per class fed to the word clouds.
    WORD_CLOUD_TEXTS = 500

    def __init__(self):
        self.preprocessor = DataPreprocessor()
        self.feature_extractor = FeatureExtractor()
        self.model_trainer = ModelTrainer()
        self.visualizer = Visualizer()

    def run_pipeline(self, data_path):
        """Run the complete fake news detection pipeline.

        Args:
            data_path: CSV file with 'text' and 'label' columns.

        Returns:
            ``(models, results)``: the trainer's dictionary of fitted models
            and its dictionary of per-model metrics.
        """
        print("="*50)
        print("FAKE NEWS DETECTION SYSTEM")
        print("="*50)

        # 1. Load and preprocess data
        print("\n1. Loading and preprocessing data...")
        df, _ = self.preprocessor.load_and_preprocess_data(data_path)
        print(f"Data loaded: {len(df)} samples")
        print(f"Class distribution:\n{df['label'].value_counts()}")

        # 2. Split data
        print("\n2. Splitting data...")
        X_train, X_val, X_test, y_train, y_val, y_test = self._split_data(df)
        print(f"Training set: {len(X_train)} samples")
        print(f"Validation set: {len(X_val)} samples")
        print(f"Test set: {len(X_test)} samples")

        # 3. Feature extraction
        print("\n3. Extracting features...")
        extractor = self.feature_extractor

        print("   - Extracting TF-IDF features...")
        X_train_tfidf, X_test_tfidf = extractor.extract_tfidf_features(X_train, X_test)

        print("   - Training Word2Vec model...")
        extractor.train_word2vec(X_train)
        X_train_w2v = extractor.get_word2vec_features(X_train)
        X_test_w2v = extractor.get_word2vec_features(X_test)

        print("   - Extracting BERT embeddings (this may take a while)...")
        n_train_bert = min(self.BERT_TRAIN_SAMPLES, len(X_train))
        n_val_bert = min(self.BERT_EVAL_SAMPLES, len(X_val))
        n_test_bert = min(self.BERT_EVAL_SAMPLES, len(X_test))
        # .iloc makes the "first n rows" intent explicit: the split Series
        # keep their shuffled original index labels.
        X_train_bert = extractor.get_bert_embeddings(X_train.iloc[:n_train_bert].tolist())
        X_val_bert = extractor.get_bert_embeddings(X_val.iloc[:n_val_bert].tolist())
        X_test_bert = extractor.get_bert_embeddings(X_test.iloc[:n_test_bert].tolist())

        # 4. Train models
        print("\n4. Training models...")
        trainer = self.model_trainer

        print("\n   a) Training Naive Bayes...")
        nb_model = trainer.train_naive_bayes(X_train_tfidf, y_train, X_test_tfidf, y_test)

        print("\n   b) Training Random Forest...")
        rf_model = trainer.train_random_forest(X_train_w2v, y_train, X_test_w2v, y_test)

        print("\n   c) Training LSTM...")
        tokenizer, lstm_history = self._train_and_evaluate_lstm(
            X_train, X_val, X_test, y_train, y_val, y_test
        )

        print("\n   d) Training BERT + Dense Network...")
        bert_model, _ = trainer.train_bert_lstm(
            X_train_bert, y_train.iloc[:n_train_bert],
            X_val_bert, y_val.iloc[:n_val_bert]
        )
        bert_pred = (bert_model.predict(X_test_bert) > 0.5).astype(int).flatten()
        trainer.evaluate_model('BERT + Dense', y_test.iloc[:n_test_bert], bert_pred)

        # 5. Generate visualizations
        print("\n5. Generating visualizations...")
        self._plot_results(df, nb_model, X_test_tfidf, y_test, lstm_history)

        # 6. Model explainability
        print("\n6. Generating model explanations...")
        self._explain_sample(nb_model, X_test.iloc[0])

        # 7. Save models
        print("\n7. Saving models...")
        self._save_artifacts(nb_model, rf_model, tokenizer)
        print("\nModels saved successfully!")

        # 8. Print final results summary
        print("\n" + "="*50)
        print("FINAL RESULTS SUMMARY")
        print("="*50)

        results_df = pd.DataFrame(trainer.results).T
        print(results_df.to_string())

        return trainer.models, trainer.results

    def _split_data(self, df):
        """Make stratified train/validation/test splits of text and encoded label.

        20% of the rows become the test set; 20% of the remainder becomes the
        validation set.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            df['cleaned_text'], df['label_encoded'],
            test_size=0.2, random_state=42, stratify=df['label_encoded']
        )
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train,
            test_size=0.2, random_state=42, stratify=y_train
        )
        return X_train, X_val, X_test, y_train, y_val, y_test

    def _train_and_evaluate_lstm(self, X_train, X_val, X_test, y_train, y_val, y_test):
        """Tokenize and pad the texts, train the LSTM and score it on the test set.

        Returns:
            ``(tokenizer, history)``: the fitted Keras tokenizer and the
            training history.
        """
        tokenizer = Tokenizer(num_words=self.LSTM_VOCAB_SIZE)
        # Fit on training texts only so validation/test vocabulary is unseen.
        tokenizer.fit_on_texts(X_train)

        def to_padded(texts):
            return pad_sequences(tokenizer.texts_to_sequences(texts),
                                 maxlen=self.LSTM_MAX_LENGTH)

        X_train_pad = to_padded(X_train)
        X_val_pad = to_padded(X_val)
        X_test_pad = to_padded(X_test)

        lstm_model, lstm_history = self.model_trainer.train_lstm(
            X_train_pad, y_train, X_val_pad, y_val,
            vocab_size=self.LSTM_VOCAB_SIZE, max_length=self.LSTM_MAX_LENGTH
        )

        lstm_pred = (lstm_model.predict(X_test_pad) > 0.5).astype(int).flatten()
        self.model_trainer.evaluate_model('LSTM', y_test, lstm_pred)

        return tokenizer, lstm_history

    def _plot_results(self, df, nb_model, X_test_tfidf, y_test, lstm_history):
        """Show the Naive Bayes confusion matrix and ROC, LSTM history and word clouds."""
        self.visualizer.plot_confusion_matrix(
            y_test, nb_model.predict(X_test_tfidf), 'Naive Bayes Confusion Matrix'
        )

        if hasattr(nb_model, 'predict_proba'):
            # Column 1 holds the probability of the class encoded as 1.
            nb_scores = nb_model.predict_proba(X_test_tfidf)[:, 1]
            self.visualizer.plot_roc_curve(y_test, nb_scores, 'Naive Bayes ROC Curve')

        self.visualizer.plot_training_history(lstm_history)

        limit = self.WORD_CLOUD_TEXTS
        real_texts = df[df['label_encoded'] == 0]['cleaned_text'].tolist()[:limit]
        fake_texts = df[df['label_encoded'] == 1]['cleaned_text'].tolist()[:limit]
        self.visualizer.plot_word_clouds(real_texts, fake_texts)

    def _explain_sample(self, nb_model, sample_text):
        """Print the top five LIME features for ``sample_text`` under the NB model."""
        explainer = ModelExplainer(nb_model, self.feature_extractor.tfidf_vectorizer)
        lime_exp = explainer.explain_with_lime(sample_text)
        print("\nLIME Explanation for sample text:")
        print(lime_exp.as_list()[:5])  # Top 5 features

    def _save_artifacts(self, nb_model, rf_model, tokenizer):
        """Pickle the preprocessor, TF-IDF vectorizer, classical models and tokenizer.

        Files are written to the current working directory. The neural models
        are not saved here; their checkpoints are written during training.
        """
        artifacts = {
            'preprocessor.pkl': self.preprocessor,
            'tfidf_vectorizer.pkl': self.feature_extractor.tfidf_vectorizer,
            'naive_bayes_model.pkl': nb_model,
            'random_forest_model.pkl': rf_model,
            'lstm_tokenizer.pkl': tokenizer,
        }
        for filename, obj in artifacts.items():
            with open(filename, 'wb') as f:
                pickle.dump(obj, f)

# ======================== 7. USAGE EXAMPLE ========================

def build_sample_dataset(copies=50):
    """Return a small synthetic REAL/FAKE dataset for smoke-testing the pipeline.

    Six hand-written headlines are repeated ``copies`` times. The repetition
    is deliberate: six rows alone cannot be split into stratified
    train/validation/test sets or used for 5-fold cross-validation. Because
    the rows are duplicates, scores obtained on this data say nothing about
    real model quality; it only exercises the code path.

    Args:
        copies: How many times the six base rows are repeated.

    Returns:
        A DataFrame with 'text' and 'label' columns and ``6 * copies`` rows.
    """
    texts = [
        "The president announced new economic policies today...",
        "Scientists discover miracle cure that doctors hate...",
        "Stock market reaches new highs amid strong earnings...",
        "Aliens confirmed to be living among us, government admits...",
        "Climate change study shows concerning trends...",
        "Celebrity secretly a lizard person, insider reveals..."
    ]
    labels = ['REAL', 'FAKE', 'REAL', 'FAKE', 'REAL', 'FAKE']
    return pd.DataFrame({'text': texts * copies, 'label': labels * copies})


if __name__ == "__main__":
    # Initialize pipeline
    pipeline = FakeNewsDetectionPipeline()

    # Run the pipeline with your dataset
    # Replace 'fake_news_data.csv' with your actual dataset path
    # The dataset should have 'text' and 'label' columns

    try:
        models, results = pipeline.run_pipeline('fake_news_data.csv')
        print("\nPipeline completed successfully!")
    except FileNotFoundError:
        print("\nNote: Please ensure you have the fake news dataset.")
        print("You can download it from Kaggle:")
        print("https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset")
        print("\nExpected CSV format:")
        print("- 'text': The news article text")
        print("- 'label': 'REAL' or 'FAKE'")

        # Create a small sample dataset for demonstration
        print("\nCreating sample dataset for demonstration...")
        sample_data = build_sample_dataset()
        sample_data.to_csv('sample_fake_news_data.csv', index=False)
        print("Sample dataset created: 'sample_fake_news_data.csv'")

        # Run with sample data
        models, results = pipeline.run_pipeline('sample_fake_news_data.csv')