In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Pragyan
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Pragyan
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Pragyan
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:

# Load the two labelled CSV exports and stack them into a single frame.
# Label convention used throughout the notebook: 0 = fake, 1 = true.
try:
    fake_news = pd.read_csv('Fake.csv')
    true_news = pd.read_csv('True.csv')
    fake_news['label'] = 0
    true_news['label'] = 1
    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    # Without it each half keeps its own 0-based labels, so every index label
    # occurs twice and any later label-based alignment/reindexing on df is
    # ambiguous.
    df = pd.concat([fake_news, true_news], axis=0, ignore_index=True)

except FileNotFoundError:
    # Fallback so the rest of the notebook can still run end to end without
    # the real CSV files; the tiny sample is only good for a smoke test.
    print("Dataset files not found. Creating sample data for demonstration...")

    sample_data = {
        'text': [
            "Breaking: Scientists discover revolutionary cure for all diseases!",
            "The government announced new economic policies today.",
            "Aliens landed in New York City, officials confirm!",
            "The stock market showed moderate gains this quarter.",
            "Celebrity reveals secret to eternal youth - doctors hate this!",
            "New study shows benefits of regular exercise and balanced diet.",
            "You won't believe what this politician said about the moon landing!",
            "The weather forecast predicts rain for the weekend.",
            "Secret pyramid discovered under Antarctica ice!",
            "Local community organizes charity event for homeless shelter."
        ],
        'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # 0: fake, 1: true
    }
    df = pd.DataFrame(sample_data)


In [None]:
# --- First look at the combined dataset ---
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print("\nFirst few rows:")
print(df.head())
print("Missing values:")
print(df.isnull().sum())
# Rows with a missing 'text' value are dropped before any text processing.
df = df.dropna(subset=['text'])

# --- Class balance (0: fake, 1: true) ---
print("Class Distribution:")
print(df['label'].value_counts())
print("\nClass Proportions:")
print(df['label'].value_counts(normalize=True))

plt.figure(figsize=(8, 6))
sns.countplot(x='label', data=df)
plt.title('Class Distribution (0: Fake, 1: True)')
plt.xlabel('Label')
plt.ylabel('Count')
plt.show()


# Shared by preprocess_text below: built once at module level so every call
# reuses the same stop-word set and lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Normalise a raw document into a space-separated string of lemmas.

    Steps, in order:
    1. Lowercase the text
    2. Delete every character that is not an ASCII letter or whitespace
    3. Tokenize
    4. Discard tokens found in ``stop_words``
    5. Lemmatize the remaining tokens and join them with single spaces

    Any input that is not a ``str`` yields an empty string.
    """
    # Guard clause: non-string values are treated as "no text".
    if not isinstance(text, str):
        return ''

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    lemmas = []
    for word in word_tokenize(letters_only):
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)


In [None]:
# Clean every article once and keep the result alongside the raw text.
print("Preprocessing text data...")
df['cleaned_text'] = df['text'].map(preprocess_text)

# Show the first three documents before and after cleaning (first 100 chars).
print("\nOriginal vs Cleaned Text Examples:")
for position in range(3):
    raw_doc = df['text'].iloc[position]
    clean_doc = df['cleaned_text'].iloc[position]
    print(f"\nExample {position+1}:")
    print("Original:", raw_doc[:100] + "...")
    print("Cleaned:", clean_doc[:100] + "...")

# Simple length features of the cleaned text: characters and whitespace-split words.
df['text_length'] = df['cleaned_text'].map(len)
df['word_count'] = df['cleaned_text'].map(lambda doc: len(doc.split()))

# Left panel: length histogram per class; right panel: length boxplot per class.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(data=df, x='text_length', hue='label', bins=50, ax=ax_hist)
ax_hist.set_title('Text Length Distribution by Class')

sns.boxplot(data=df, x='label', y='text_length', ax=ax_box)
ax_box.set_title('Text Length by Class')

fig.tight_layout()
plt.show()


In [None]:
# Features are the cleaned documents; the target is the 0/1 label column.
X, y = df['cleaned_text'], df['label']

# 80/20 split, stratified on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Unigram + bigram TF-IDF limited to 5000 features. The vocabulary is learned
# from the training split only and then re-applied to the test split.
print("Creating TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
print(f"TF-IDF features shape: {X_train_tfidf.shape}")

# Baseline classifier: logistic regression on the TF-IDF matrix.
print("Training Logistic Regression model...")
lr_model = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

def evaluate_model(y_true, y_pred, model_name):
    """
    Report classification metrics for one model and plot its confusion matrix.

    Parameters:
    y_true: ground-truth labels
    y_pred: predicted labels, aligned with ``y_true``
    model_name (str): name used in the printed header and the plot title

    Returns:
    tuple: (accuracy, precision, recall, f1)
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1_val = f1_score(y_true, y_pred)

    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1_val:.4f}")

    # Confusion matrix as an annotated heatmap on its own figure.
    matrix = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

    return acc, prec, rec, f1_val
# Score the logistic regression predictions made on the test split.
lr_metrics = evaluate_model(y_test, y_pred_lr, "Logistic Regression")

print("Training additional models...")

# Multinomial Naive Bayes on the same TF-IDF features.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_metrics = evaluate_model(y_test, y_pred_nb, "Naive Bayes")

# Support vector classifier with default settings, again on TF-IDF features.
svm_model = SVC(random_state=42).fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)
svm_metrics = evaluate_model(y_test, y_pred_svm, "SVM")


In [None]:
# Cap on the number of distinct token ids the tokenizer may emit.
max_words = 5000

# Fit the tokenizer on the training split only; unseen or rare words map to
# the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Every sequence is padded/truncated to max_length ids (padding at the end).
max_length = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')
print(f"Padded sequences shape: {X_train_pad.shape}")

# tokenizer.word_index keeps every word seen while fitting, but with num_words
# set, texts_to_sequences only emits ids below num_words (anything rarer is
# replaced by the OOV id). Sizing the embedding from word_index alone would
# allocate rows that can never be looked up, so cap it at max_words; for a
# small corpus the real vocabulary size is used instead.
vocab_size = min(max_words, len(tokenizer.word_index) + 1)

# Two stacked bidirectional LSTM layers followed by a small dense head with a
# sigmoid output for the binary fake/true decision.
lstm_model = Sequential([
    Embedding(vocab_size, 100, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)
print("LSTM Model Summary:")
lstm_model.summary()

# 20% of the training data is held out by Keras as a validation set.
print("Training LSTM model...")
history = lstm_model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# The sigmoid output is thresholded at 0.5 to obtain hard 0/1 predictions.
y_pred_lstm_proba = lstm_model.predict(X_test_pad)
y_pred_lstm = (y_pred_lstm_proba > 0.5).astype(int).flatten()

lstm_metrics = evaluate_model(y_test, y_pred_lstm, "LSTM")


# Training vs validation curves from the LSTM fit: accuracy on the left,
# loss on the right. Each tuple is
# (train key, validation key, train label, validation label, title, y label).
history_panels = [
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (train_key, val_key, train_label, val_label, panel_title, y_label) in zip(axes, history_panels):
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(panel_title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

fig.tight_layout()
plt.show()


# One row per model; each *_metrics tuple is (accuracy, precision, recall, f1)
# as returned by evaluate_model.
models_comparison = pd.DataFrame(
    [
        ('Logistic Regression', *lr_metrics),
        ('Naive Bayes', *nb_metrics),
        ('SVM', *svm_metrics),
        ('LSTM', *lstm_metrics),
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

print("Model Comparison:")
print(models_comparison)

# 2x2 grid of bar charts, one panel per metric.
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for ax, metric in zip(axes.flat, metrics_to_plot):
    sns.barplot(x='Model', y=metric, data=models_comparison, ax=ax)
    ax.set_title(f'{metric} Comparison')
    ax.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()


# Misclassifications of the logistic regression model on the test split.
# X_test holds the cleaned text, so the examples below are shown cleaned.
fp_mask = (y_test == 0) & (y_pred_lr == 1)
fn_mask = (y_test == 1) & (y_pred_lr == 0)
false_positives = X_test[fp_mask]
false_negatives = X_test[fn_mask]

print(f"False Positives (Fake news predicted as real): {len(false_positives)}")
print(f"False Negatives (Real news predicted as fake): {len(false_negatives)}")

# Print up to three examples (first 100 characters) for each kind of error.
for heading, mistakes in (
    ("\nFalse Positive Examples:", false_positives),
    ("\nFalse Negative Examples:", false_negatives),
):
    if len(mistakes) == 0:
        continue
    print(heading)
    for rank, snippet in enumerate(mistakes.head(3), start=1):
        print(f"{rank}. {snippet[:100]}...")


In [None]:
import pickle
import joblib


def _pickle_to(path, obj):
    """Serialize ``obj`` to ``path`` with pickle, overwriting any existing file."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


print("Saving models...")

# Individual artefacts, written in this order.
_pickle_to('logistic_regression_model.pkl', lr_model)
_pickle_to('tfidf_vectorizer.pkl', tfidf_vectorizer)
lstm_model.save('lstm_model.h5')
_pickle_to('tokenizer.pkl', tokenizer)

# Bundle of the scikit-learn models and both text encoders in a single file
# (the LSTM is not part of this bundle; it is saved separately above).
models_dict = dict(
    logistic_regression=lr_model,
    naive_bayes=nb_model,
    svm=svm_model,
    tfidf_vectorizer=tfidf_vectorizer,
    tokenizer=tokenizer,
)
joblib.dump(models_dict, 'all_models.pkl')

print("All models saved successfully!")

def predict_news(text, model_type='logistic'):
    """
    Predict whether a news article is real or fake

    Parameters:
    text (str): News article text
    model_type (str): Type of model to use ('logistic', 'lstm')

    Returns:
    dict: Prediction results with keys 'prediction' ('Real News' or
          'Fake News'), 'confidence', 'fake_probability', 'real_probability'
          (plain Python floats) and 'text_preview' (first 100 characters of
          the input, with '...' appended when the input is longer)

    Raises:
    ValueError: if model_type is neither 'logistic' nor 'lstm'
    TypeError: if text is not a string
    """
    # Validate before doing any work. Previously a non-string input was
    # silently classified as empty text and only failed at the very end on
    # len(text), and an unknown model type was rejected only after the text
    # had already been preprocessed.
    if model_type not in ('logistic', 'lstm'):
        raise ValueError("Model type must be 'logistic' or 'lstm'")
    if not isinstance(text, str):
        raise TypeError(f"text must be a str, got {type(text).__name__}")

    cleaned_text = preprocess_text(text)

    if model_type == 'logistic':
        text_tfidf = tfidf_vectorizer.transform([cleaned_text])

        prediction = lr_model.predict(text_tfidf)[0]
        class_probabilities = lr_model.predict_proba(text_tfidf)[0]
        # NOTE(review): assumes lr_model.classes_ is [0, 1] (0 = fake, 1 = true),
        # so column 0 is the fake probability - confirm if labels ever change.
        fake_probability = float(class_probabilities[0])
        real_probability = float(class_probabilities[1])

    else:  # model_type == 'lstm'
        text_seq = tokenizer.texts_to_sequences([cleaned_text])
        text_pad = pad_sequences(text_seq, maxlen=max_length, padding='post')
        # The network has a single sigmoid output, read here as P(real).
        real_probability = float(lstm_model.predict(text_pad)[0][0])
        fake_probability = 1.0 - real_probability
        prediction = 1 if real_probability > 0.5 else 0

    # Probabilities are converted to built-in floats above so the result dict
    # contains no numpy scalars (easier to serialise or display downstream).
    result = {
        'prediction': 'Real News' if prediction == 1 else 'Fake News',
        'confidence': max(fake_probability, real_probability),
        'fake_probability': fake_probability,
        'real_probability': real_probability,
        'text_preview': text[:100] + '...' if len(text) > 100 else text
    }

    return result
# Hand-written headlines used as a quick smoke test of predict_news.
test_samples = [
    "Breaking: Scientists discover amazing cure that makes all diseases disappear instantly!",
    "The government announced new economic policies aimed at stabilizing the market.",
    "Aliens have been confirmed to be living among us according to secret documents!",
    "Local community raises funds for new park renovation project."
]

print("Testing Prediction Function:")
print("=" * 50)

# Uses the default model_type ('logistic') for every sample.
for i, text in enumerate(test_samples, 1):
    result = predict_news(text)
    print(f"\nSample {i}:")
    print(f"Text: {result['text_preview']}")
    print(f"Prediction: {result['prediction']}")
    print(f"Confidence: {result['confidence']:.2%}")
    print(f"Fake Probability: {result['fake_probability']:.2%}")
    print(f"Real Probability: {result['real_probability']:.2%}")
    print("-" * 30)


# The best model is read from the comparison table rather than hard-coded, so
# the summary stays truthful whichever model scores highest on this run.
# idxmax returns the first row on ties, i.e. the earliest model in the table.
best_row = models_comparison.loc[models_comparison['Accuracy'].idxmax()]

print("\n" + "="*50)
print("FAKE NEWS DETECTION SYSTEM - SUMMARY")
print("="*50)
print(f"Dataset size: {len(df)} articles")
print(f"Training set: {len(X_train)} articles")
print(f"Test set: {len(X_test)} articles")
print(f"Best model: {best_row['Model']}")
print(f"Best accuracy: {best_row['Accuracy']:.2%}")
print("Files created:")
print("  - logistic_regression_model.pkl (Logistic Regression model)")
print("  - tfidf_vectorizer.pkl (TF-IDF vectorizer)")
print("  - lstm_model.h5 (LSTM model)")
print("  - tokenizer.pkl (Tokenizer for LSTM)")
print("  - all_models.pkl (All models combined)")
# NOTE(review): no cell visible in this notebook writes fake_news_app.py -
# confirm the Streamlit app ships separately before relying on this line.
print("  - fake_news_app.py (Streamlit web app)")
print("\nTo run the web app: streamlit run fake_news_app.py")