In [4]:
url = input("ENter")

In [5]:
from newspaper import Article
import requests
from newspaper.configuration import Configuration
import json

# URL of the article
# Custom configuration to set headers
config = Configuration()
config.browser_user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
)

# Create the Article object with the custom configuration
article = Article(url, config=config)

try:
    # Fetch and parse the article
    article.download()
    article.parse()
    
    # Extract content
    article_data = {
        "title": article.title,
        "authors": article.authors,
        "publication_date": str(article.publish_date) if article.publish_date else "Unknown",
        "content": article.text,
        "url": url
    }

    # Convert to JSON
    article_json = json.dumps(article_data, indent=4)
    print(article_json)

except Exception as e:
    print(f"An error occurred: {e}")

article_dict = json.loads(article_json)

# Extract the content part
content = article_dict.get("content", "Content not found")
print(content)

{
    "title": "ISRO Satellite Images Reveal Massive Damage Caused By Myanmar Earthquake",
    "authors": [
        "Mahima Joshi"
    ],
    "publication_date": "Unknown",
    "content": "ISRO Satellite Images Reveal Massive Damage Caused By Myanmar Earthquake | See Pics\n\nCurated By :\n\nNews18.com\n\nLast Updated: April 01, 2025, 07:19 IST\n\nISRO captured satellite images of the damage caused by the powerful earthquake measuring 7.7 in magnitude struck Myanmar on March 28, 2025.\n\nImages showing destruction caused by earthquake in Myanmar. (Image: News18)\n\nThe Indian Space Research Organisation (ISRO) on Monday released satellite images of the powerful earthquake caused by the 7.7 magnitude earthquake that struck Myanmar and rattled neighbouring countries on March 28, 2025.\n\nThe disaster caused widespread devastation, particularly near Mandalay, Myanmar\u2019s second-largest city killing over 1,700 people, according to the latest input.\n\nrelated stories\n\nCartosat-3, the s

In [4]:
test_text = content

In [6]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM, GRU, Bidirectional
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Initialize NLP tools
nltk.download('stopwords')
nltk.download('wordnet')

class MultiModelNewsDetector:
    def __init__(self):
        self.max_len = 200
        self.max_words = 10000
        self.tokenizer = Tokenizer(num_words=self.max_words)
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        
        # Initialize models
        self.cnn_model = self.build_cnn_model()
        self.lstm_gru_model = self.build_lstm_gru_model()
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
    
    def build_cnn_model(self):
        """1D CNN for text classification"""
        model = Sequential([
            Embedding(self.max_words, 128, input_length=self.max_len),
            Conv1D(128, 5, activation='relu'),
            GlobalMaxPooling1D(),
            Dense(128, activation='relu'),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def build_lstm_gru_model(self):
        """Hybrid LSTM-GRU model"""
        model = Sequential([
            Embedding(self.max_words, 128, input_length=self.max_len),
            Bidirectional(LSTM(64, return_sequences=True)),
            GRU(64),
            Dense(128, activation='relu'),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def clean_text(self, text):
        """Standard text preprocessing"""
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        words = text.split()
        words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        return ' '.join(words)
    
    def extract_article(self, url):
        """Universal news extractor"""
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Try multiple content selectors
            selectors = [
                {'class': ['article-content', 'article-body', 'content']},
                {'itemprop': 'articleBody'},
                {'class': re.compile('(post|entry)-content')},
                {'id': re.compile('content|body|main')}
            ]
            
            article_body = None
            for selector in selectors:
                article_body = soup.find('div', selector)
                if article_body: break
            
            if not article_body:
                article_body = soup  # Fallback to whole page
            
            paragraphs = article_body.find_all('p')
            article_text = ' '.join([p.get_text().strip() for p in paragraphs if p.get_text().strip()])
            
            if len(article_text) < 100:
                return None, "Insufficient article text"
                
            return self.clean_text(article_text), None
            
        except Exception as e:
            return None, f"Extraction error: {str(e)}"
    
    def prepare_input(self, text, model_type='cnn'):
        """Prepare input for different models"""
        if model_type == 'bert':
            return self.bert_tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=self.max_len,
                return_tensors='tf'
            )
        else:
            sequences = self.tokenizer.texts_to_sequences([text])
            return pad_sequences(sequences, maxlen=self.max_len)
    
    def predict_with_all_models(self, url):
        """Run prediction with all three models"""
        article_text, error = self.extract_article(url)
        if error:
            return {"error": error}
        
        # For demo, we'll fit tokenizer on the fly (in production, use pre-trained)
        self.tokenizer.fit_on_texts([article_text])
        
        # CNN Prediction
        cnn_input = self.prepare_input(article_text, 'cnn')
        cnn_pred = self.cnn_model.predict(cnn_input)[0][0]
        
        # LSTM-GRU Prediction
        lstm_gru_input = self.prepare_input(article_text, 'lstm_gru')
        lstm_gru_pred = self.lstm_gru_model.predict(lstm_gru_input)[0][0]
        
        # BERT Prediction
        bert_input = self.prepare_input(article_text, 'bert')
        bert_output = self.bert_model(bert_input)
        bert_pred = tf.sigmoid(bert_output.logits).numpy()[0][0]
        
        def format_pred(pred):
            label = 'Real' if pred > 0.5 else 'Fake'
            confidence = pred if pred > 0.5 else 1 - pred
            return {'label': label, 'confidence': float(confidence * 100)}
        
        return {
            'url': url,
            'models': {
                '1D_CNN': format_pred(cnn_pred),
                'LSTM_GRU': format_pred(lstm_gru_pred),
                'BERT': format_pred(bert_pred)
            },
            'ensemble_prediction': self.ensemble_prediction([cnn_pred, lstm_gru_pred, bert_pred])
        }
    
    def ensemble_prediction(self, predictions):
        """Combine predictions from all models"""
        avg_pred = np.mean(predictions)
        label = 'Real' if avg_pred > 0.5 else 'Fake'
        confidence = avg_pred if avg_pred > 0.5 else 1 - avg_pred
        return {'label': label, 'confidence': float(confidence * 100)}

# Example Usage
if __name__ == "__main__":
    detector = MultiModelNewsDetector()
    
    # Example news URL (replace with actual URL)
    test_url = url

    print("\nMulti-Model News Authenticity Analysis")
    print("===================================")
    print(f"Analyzing: {test_url}\n")
    
    result = detector.predict_with_all_models(test_url)
    
    if 'error' in result:
        print(f"Error: {result['error']}")
    else:
        print("Individual Model Results:")
        for model_name, pred in result['models'].items():
            print(f"{model_name:>8}: {pred['label']} ({pred['confidence']:.1f}%)")
        
        print(f"\nEnsemble Prediction: {result['ensemble_prediction']['label']} "
              f"({result['ensemble_prediction']['confidence']:.1f}% confidence)")
        

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\liyan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\liyan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Multi-Model News Authenticity Analysis
Analyzing: https://www.news18.com/world/isro-satellite-images-reveal-massive-damage-caused-by-myanmar-earthquake-see-pics-9282121.html

Individual Model Results:
  1D_CNN: Fake (52.6%)
LSTM_GRU: Real (50.1%)
    BERT: Fake (50.4%)

Ensemble Prediction: Fake (51.0% confidence)


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM, GRU, Bidirectional
from tensorflow.keras.models import Sequential, save_model
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize NLP tools
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

class FakeNewsDetectorSystem:
    def __init__(self, max_len=200, max_words=10000):
        self.max_len = max_len
        self.max_words = max_words
        self.tokenizer = Tokenizer(num_words=self.max_words)
        self.models = {
            'cnn': None,
            'lstm_gru': None,
            'bert': None
        }
    
    def clean_text(self, text):
        """Preprocess text for all models"""
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        words = text.split()
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        return ' '.join(words)
    
    def load_and_prepare_data(self, true_path, fake_path, test_size=0.2):
        """Load and preprocess dataset"""
        # Load data
        true_df = pd.read_csv(true_path)
        fake_df = pd.read_csv(fake_path)
        
        # Label and combine
        true_df['label'] = 1  # Real news
        fake_df['label'] = 0  # Fake news
        df = pd.concat([true_df, fake_df]).sample(frac=1).reset_index(drop=True)
        
        # Clean text
        df['clean_text'] = df['text'].apply(self.clean_text)
        
        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            df['clean_text'], df['label'], test_size=test_size, random_state=42
        )
        
        # Tokenize for CNN/LSTM-GRU
        self.tokenizer.fit_on_texts(X_train)
        X_train_seq = pad_sequences(self.tokenizer.texts_to_sequences(X_train), maxlen=self.max_len)
        X_test_seq = pad_sequences(self.tokenizer.texts_to_sequences(X_test), maxlen=self.max_len)
        
        return X_train_seq, X_test_seq, y_train, y_test, X_train, X_test
    
    def build_cnn_model(self):
        """1D CNN architecture"""
        model = Sequential([
            Embedding(self.max_words, 128, input_length=self.max_len),
            Conv1D(128, 5, activation='relu'),
            GlobalMaxPooling1D(),
            Dense(128, activation='relu'),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def build_lstm_gru_model(self):
        """Hybrid LSTM-GRU architecture"""
        model = Sequential([
            Embedding(self.max_words, 128, input_length=self.max_len),
            Bidirectional(LSTM(64, return_sequences=True)),
            GRU(64),
            Dense(128, activation='relu'),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def train_models(self, X_train, y_train, X_test, y_test, epochs=5, batch_size=64):
        """Train all three model architectures"""
        # Train CNN
        print("Training CNN model...")
        self.models['cnn'] = self.build_cnn_model()
        self.models['cnn'].fit(X_train, y_train, 
                              validation_data=(X_test, y_test),
                              epochs=epochs, batch_size=batch_size)
        
        # Train LSTM-GRU
        print("\nTraining LSTM-GRU model...")
        self.models['lstm_gru'] = self.build_lstm_gru_model()
        self.models['lstm_gru'].fit(X_train, y_train,
                                   validation_data=(X_test, y_test),
                                   epochs=epochs, batch_size=batch_size)
        
        # Initialize BERT components (fine-tuning requires separate setup)
        print("\nInitializing BERT model (requires separate fine-tuning script)")
        self.models['bert'] = {
            'tokenizer': BertTokenizer.from_pretrained('bert-base-uncased'),
            'model': TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
        }
    
    def save_models(self, save_dir='saved_models'):
        """Save all model components for production"""
        import os
        os.makedirs(save_dir, exist_ok=True)
        
        # Save CNN model
        save_model(self.models['cnn'], f'{save_dir}/cnn_model.h5')
        
        # Save LSTM-GRU model
        save_model(self.models['lstm_gru'], f'{save_dir}/lstm_gru_model.h5')
        
        # Save BERT components
        self.models['bert']['model'].save_pretrained(f'{save_dir}/bert_model')
        self.models['bert']['tokenizer'].save_pretrained(f'{save_dir}/bert_tokenizer')
        
        # Save tokenizer
        with open(f'{save_dir}/tokenizer.pkl', 'wb') as f:
            pickle.dump(self.tokenizer, f)
        
        print(f"All models saved to {save_dir} directory")
    
    def load_models(self, save_dir='saved_models'):
        """Load pre-trained models"""
        # Load CNN
        self.models['cnn'] = tf.keras.models.load_model(f'{save_dir}/cnn_model.h5')
        
        # Load LSTM-GRU
        self.models['lstm_gru'] = tf.keras.models.load_model(f'{save_dir}/lstm_gru_model.h5')
        
        # Load BERT
        self.models['bert'] = {
            'tokenizer': BertTokenizer.from_pretrained(f'{save_dir}/bert_tokenizer'),
            'model': TFBertForSequenceClassification.from_pretrained(f'{save_dir}/bert_model')
        }
        
        # Load tokenizer
        with open(f'{save_dir}/tokenizer.pkl', 'rb') as f:
            self.tokenizer = pickle.load(f)
        
        print("All models loaded successfully")

# Example Usage for Training
if __name__ == "__main__":
    # Initialize system
    detector = FakeNewsDetectorSystem()
    
    # Paths to your dataset (example using ISOT dataset)
    TRUE_DATA_PATH = "True.csv"
    FAKE_DATA_PATH = "Fake.csv"
    
    # Load and prepare data
    print("Loading and preprocessing data...")
    X_train_seq, X_test_seq, y_train, y_test, X_train_raw, X_test_raw = detector.load_and_prepare_data(
        TRUE_DATA_PATH, FAKE_DATA_PATH
    )
    
    # Train models
    print("\nTraining models...")
    detector.train_models(X_train_seq, y_train, X_test_seq, y_test, epochs=3)
    
    # Save models for production
    print("\nSaving trained models...")
    detector.save_models()
    
    # To load models later:
    # new_detector = FakeNewsDetectorSystem()
    # new_detector.load_models()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\liyan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\liyan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading and preprocessing data...

Training models...
Training CNN model...
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training LSTM-GRU model...
Epoch 1/3
Epoch 2/3
Epoch 3/3

In [1]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

def bert_fine_tuning(train_texts, train_labels, val_texts, val_labels):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    # Convert data to BERT format
    train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=200)
    val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=200)
    
    # Create TensorFlow datasets
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        train_labels
    ))
    val_dataset = tf.data.Dataset.from_tensor_slices((
        dict(val_encodings),
        val_labels
    ))
    
    # Load and fine-tune BERT
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                 metrics=['accuracy'])
    
    model.fit(train_dataset.shuffle(1000).batch(16),
              epochs=2,
              batch_size=16,
              validation_data=val_dataset.batch(16))
    
    return model, tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM, GRU, Bidirectional
from tensorflow.keras.models import Sequential, save_model
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os

# Initialize NLP tools
nltk.download('stopwords')
nltk.download('wordnet')

class FakeNewsDetector:
    def __init__(self, max_len=200, max_words=10000):
        self.max_len = max_len
        self.max_words = max_words
        self.tokenizer = Tokenizer(num_words=self.max_words)
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.models = {
            'cnn': None,
            'lstm_gru': None,
            'bert': None
        }
    
    def clean_text(self, text):
        """Preprocess text by removing special chars, numbers, and lemmatizing"""
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        words = text.split()
        words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        return ' '.join(words)
    
    def load_data(self, true_path, fake_path):
        """Load and combine true and fake news datasets"""
        true_df = pd.read_csv(true_path)
        fake_df = pd.read_csv(fake_path)
        
        true_df['label'] = 1  # 1 for real news
        fake_df['label'] = 0  # 0 for fake news
        
        df = pd.concat([true_df, fake_df]).sample(frac=1).reset_index(drop=True)
        df['clean_text'] = df['text'].apply(self.clean_text)
        
        return df
    
    def prepare_datasets(self, df, test_size=0.2):
        """Split data into train/test sets and tokenize"""
        X_train, X_test, y_train, y_test = train_test_split(
            df['clean_text'], df['label'], test_size=test_size, random_state=42
        )
        
        # Tokenize text
        self.tokenizer.fit_on_texts(X_train)
        X_train_seq = pad_sequences(self.tokenizer.texts_to_sequences(X_train), maxlen=self.max_len)
        X_test_seq = pad_sequences(self.tokenizer.texts_to_sequences(X_test), maxlen=self.max_len)
        
        return X_train_seq, X_test_seq, y_train, y_test
    
    def build_cnn_model(self):
        """Create 1D CNN model architecture"""
        model = Sequential([
            Embedding(self.max_words, 128, input_length=self.max_len),
            Conv1D(128, 3, activation='relu'),
            GlobalMaxPooling1D(),
            Dense(128, activation='relu'),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def build_lstm_gru_model(self):
        """Create hybrid LSTM-GRU model architecture"""
        model = Sequential([
            Embedding(self.max_words, 128, input_length=self.max_len),
            Bidirectional(LSTM(64, return_sequences=True)),
            GRU(64),
            Dense(128, activation='relu'),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def initialize_bert(self):
        """Initialize BERT model components"""
        self.models['bert'] = {
            'tokenizer': BertTokenizer.from_pretrained('bert-base-uncased'),
            'model': TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
        }
    
    def train_models(self, X_train, y_train, X_test, y_test, epochs=5, batch_size=64):
        """Train all three model architectures"""
        early_stopping = EarlyStopping(patience=2, restore_best_weights=True)
        
        # Train CNN
        print("Training CNN model...")
        self.models['cnn'] = self.build_cnn_model()
        self.models['cnn'].fit(
            X_train, y_train,
            validation_data=(X_test, y_test),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[early_stopping]
        )
        
        # Train LSTM-GRU
        print("\nTraining LSTM-GRU model...")
        self.models['lstm_gru'] = self.build_lstm_gru_model()
        self.models['lstm_gru'].fit(
            X_train, y_train,
            validation_data=(X_test, y_test),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[early_stopping]
        )
        
        # Initialize BERT (requires separate fine-tuning)
        print("\nInitializing BERT model...")
        self.initialize_bert()
    
    def save_models(self, save_dir='saved_models'):
        """Save all model components to disk"""
        os.makedirs(save_dir, exist_ok=True)
        
        # Save CNN model
        save_model(self.models['cnn'], f'{save_dir}/cnn_model.h5')
        
        # Save LSTM-GRU model
        save_model(self.models['lstm_gru'], f'{save_dir}/lstm_gru_model.h5')
        
        # Save BERT components
        self.models['bert']['model'].save_pretrained(f'{save_dir}/bert_model')
        self.models['bert']['tokenizer'].save_pretrained(f'{save_dir}/bert_tokenizer')
        
        # Save tokenizer
        with open(f'{save_dir}/tokenizer.pkl', 'wb') as f:
            pickle.dump(self.tokenizer, f)
        
        print(f"All models saved to {save_dir} directory")
    
    def load_models(self, save_dir='saved_models'):
        """Load pre-trained models from disk"""
        # Load CNN
        self.models['cnn'] = tf.keras.models.load_model(f'{save_dir}/cnn_model.h5')
        
        # Load LSTM-GRU
        self.models['lstm_gru'] = tf.keras.models.load_model(f'{save_dir}/lstm_gru_model.h5')
        
        # Load BERT
        self.models['bert'] = {
            'tokenizer': BertTokenizer.from_pretrained(f'{save_dir}/bert_tokenizer'),
            'model': TFBertForSequenceClassification.from_pretrained(f'{save_dir}/bert_model')
        }
        
        # Load tokenizer
        with open(f'{save_dir}/tokenizer.pkl', 'rb') as f:
            self.tokenizer = pickle.load(f)
        
        print("All models loaded successfully")
    
    def predict(self, text, model_type='cnn'):
        """Make prediction on new text"""
        cleaned_text = self.clean_text(text)
        
        if model_type == 'bert':
            inputs = self.models['bert']['tokenizer'](
                cleaned_text,
                return_tensors='tf',
                padding='max_length',
                truncation=True,
                max_length=self.max_len
            )
            outputs = self.models['bert']['model'](inputs)
            pred = tf.sigmoid(outputs.logits).numpy()[0][0]
        else:
            sequence = self.tokenizer.texts_to_sequences([cleaned_text])
            padded_seq = pad_sequences(sequence, maxlen=self.max_len)
            pred = self.models[model_type].predict(padded_seq)[0][0]
        
        return {
            'prediction': 'Real' if pred > 0.5 else 'Fake',
            'confidence': float(pred if pred > 0.5 else 1 - pred),
            'raw_score': float(pred)
        }

# Main execution
if __name__ == "__main__":
    # Initialize detector
    detector = FakeNewsDetector()
    
    # Paths to your dataset files
    TRUE_DATA_PATH = "True.csv"
    FAKE_DATA_PATH = "Fake.csv"
    
    # Load and prepare data
    print("Loading and preprocessing data...")
    df = detector.load_data(TRUE_DATA_PATH, FAKE_DATA_PATH)
    X_train, X_test, y_train, y_test = detector.prepare_datasets(df)
    
    # Train models
    print("\nTraining models...")
    detector.train_models(X_train, y_train, X_test, y_test, epochs=5)
    
    # Save models
    print("\nSaving models...")
    detector.save_models()
    
    # Example prediction

    print(f"\nTesting prediction now....\n")
    
    # Load models (simulating a fresh start)
    new_detector = FakeNewsDetector()
    new_detector.load_models()
    
    # Make predictions with different models
    print("\nModel Predictions:")
    for model_name in ['cnn', 'lstm_gru', 'bert']:
        result = new_detector.predict(test_text, model_type=model_name)
        print(f"{model_name.upper():<8}: {result['prediction']} (confidence: {result['confidence']*100:.1f}%)")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\liyan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\liyan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading and preprocessing data...

Training models...
Training CNN model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Training LSTM-GRU model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Initializing BERT model...


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Saving models...
All models saved to saved_models directory

Testing prediction now....



Some layers from the model checkpoint at saved_models/bert_model were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at saved_models/bert_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


All models loaded successfully

Model Predictions:
CNN     : Real (confidence: 100.0%)
LSTM_GRU: Real (confidence: 100.0%)
BERT    : Real (confidence: 52.5%)
