In [36]:
import pandas as pd
import numpy as np
import os
import zipfile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from pgmpy.factors.discrete import TabularCPD

In [None]:

extract_path = "news_data"


In [40]:
true_file = [f for f in os.listdir(extract_path) if "true" in f.lower()][0]
fake_file = [f for f in os.listdir(extract_path) if "fake" in f.lower()][0]

In [41]:
# Load datasets
df_true = pd.read_csv(os.path.join(extract_path, true_file))
df_fake = pd.read_csv(os.path.join(extract_path, fake_file))

In [42]:
# Add labels: 0 = Real News, 1 = Fake News
df_true["label"] = 0
df_fake["label"] = 1

In [43]:
# Combine datasets
df = pd.concat([df_true, df_fake], axis=0).reset_index(drop=True)

In [44]:
# Step 2: Create features for Bayesian Network
# Convert subject to a new 'bias' column
# Simplifying by categorizing potential bias sources
df['bias'] = df['subject'].map(lambda x: 1 if x.lower() in ['politics', 'government', 'worldnews'] else 0)

In [45]:
# Create a feature for news type based on text
df['news_type'] = df['text'].apply(
    lambda x: 1 if isinstance(x, str) and any(word in x.lower() 
                                           for word in ['scandal', 'secret', 'shocking', 'breaking', 'conspiracy']) 
    else 0
)


In [46]:
# Step 3: Text Preprocessing for Naive Bayes
X = df['text']
y = df['label']
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

In [47]:
# Step 4: Train-Validation-Test Split (70% Train, 15% Validation, 15% Test)
X_train, X_temp, y_train, y_temp = train_test_split(X_tfidf, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [48]:
# Step 5: Train Naïve Bayes Classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [49]:
# Validate model on validation set
val_predictions = nb.predict(X_val)
print("Validation Set Evaluation:")
print(classification_report(y_val, val_predictions))

Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      3262
           1       0.93      0.94      0.93      3473

    accuracy                           0.93      6735
   macro avg       0.93      0.93      0.93      6735
weighted avg       0.93      0.93      0.93      6735



In [50]:
# Get prediction probabilities for test set
probabilities = nb.predict_proba(X_test)

In [51]:

# Bayesian Decision Theory: Introduce Risk-Based Classification
lambda_01 = 2  # Cost of misclassifying Fake as Real (False Negative)
lambda_10 = 1  # Cost of misclassifying Real as Fake (False Positive)

In [52]:
def bayesian_decision_rule(prob_fake, prob_real, lambda_01, lambda_10):
    """Applies Bayesian Decision Theory to classification."""
    risk_fake = prob_fake * lambda_01
    risk_real = prob_real * lambda_10
    return 1 if risk_fake > risk_real else 0  # 1 = Fake, 0 = Real

In [53]:
# Apply Bayesian Decision Rule
predictions = [bayesian_decision_rule(p[1], p[0], lambda_01, lambda_10) for p in probabilities]

In [54]:
# Evaluate Model on Test Set
print("Test Set Evaluation:")
print(classification_report(y_test, predictions))

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.95      0.87      0.91      3212
           1       0.89      0.96      0.92      3523

    accuracy                           0.92      6735
   macro avg       0.92      0.91      0.91      6735
weighted avg       0.92      0.92      0.92      6735



In [55]:
# Define Bayesian Network structure using the proper lowercase column names
model = BayesianNetwork([('bias', 'news_type'), ('news_type', 'label')])

In [56]:
# Prepare data for Bayesian Network - include only the relevant columns
df_bayes = df[['bias', 'news_type', 'label']]

In [57]:
# Fit Bayesian Network using Maximum Likelihood Estimation
model.fit(df_bayes, estimator=MaximumLikelihoodEstimator)

In [58]:
# Perform inference
infer = VariableElimination(model)

In [73]:
# Example query: Predict probability of fake news given bias and news_type
query_result = infer.query(variables=['label'], evidence={'bias': 1, 'news_type': 1})
print("\nProbability of news being fake given high bias source and suspicious content:")
print(query_result)


Probability of news being fake given high bias source and suspicious content:
+----------+--------------+
| label    |   phi(label) |
| label(0) |       0.4760 |
+----------+--------------+
| label(1) |       0.5240 |
+----------+--------------+


In [74]:
# Another query: Predict probability of fake news given only news_type
query_result = infer.query(variables=['label'], evidence={'news_type': 1})
print("\nProbability of news being fake given only suspicious content:")
print(query_result)


Probability of news being fake given only suspicious content:
+----------+--------------+
| label    |   phi(label) |
| label(0) |       0.4760 |
+----------+--------------+
| label(1) |       0.5240 |
+----------+--------------+
