# Fake News Detection with Naive Bayes
---
This notebook demonstrates how to build a **Naive Bayes classifier** to detect fake vs. real news using a dataset of news articles.
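We will cover:

1. Loading the dataset and Exploratory Data Analysis (EDA), including word clouds and article-length distributions
2. Text Preprocessing & Feature Engineering (title + text, TF-IDF)
3. Model Training (Naive Bayes)
4. Evaluation (Accuracy, Precision, Recall, F1, ROC Curve & AUC)
5. Visualization (Confusion Matrix, Top Words)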

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
import numpy as np

## 1. Load Dataset

In [None]:
# Read the labelled news articles from the CSV file; the path is relative,
# so the file is expected next to the notebook / in the working directory.
df = pd.read_csv(filepath_or_buffer='Assignment_Data_fake_or_real_news.csv')

# Preview the first five rows as a quick sanity check of the columns.
df.head(n=5)

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Column dtypes and non-null counts. DataFrame.info() prints its report
# itself and returns None, so it is called directly; wrapping it in print()
# would append a stray "None" line to the output.
df.info()

# Number of articles per class, to see how balanced the labels are.
print(df['label'].value_counts())

# The same class counts as a bar chart.
fig, ax = plt.subplots()
sns.countplot(data=df, x='label', ax=ax)
ax.set_title('Class Distribution')
plt.show()

## 3. Data Preparation (Title + Text)

In [None]:
# Combine headline and body into a single text field for the vectorizer.
# Missing titles/bodies are replaced with empty strings first: string
# concatenation with NaN yields NaN, and a NaN document would make the
# TF-IDF vectorizer fail later on.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

# Hold out 20% of the articles for testing. Stratifying on the label keeps
# the class proportions the same in both splits; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

## 4. Naive Bayes Model

In [None]:
# Two-stage text classifier:
#   'tfidf' - turns raw text into TF-IDF features, dropping English stop
#             words and terms that occur in more than 70% of the documents.
#   'nb'    - multinomial Naive Bayes fitted on those features.
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_df=0.7, stop_words='english')),
    ('nb', MultinomialNB()),
])

# fit() returns the fitted pipeline, so the held-out predictions can be
# taken in the same expression.
y_pred = model.fit(X_train, y_train).predict(X_test)

## 5. Evaluation

In [None]:
# Overall accuracy plus per-class precision / recall / F1 on the test split.
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
print(metrics.classification_report(y_test, y_pred))

# Take the class order from the fitted model and use it for BOTH the matrix
# and the tick labels, so the axis annotations can never drift out of sync
# with the rows/columns they describe.
class_names = list(model.classes_)
conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')     # rows of the matrix are the true labels
ax.set_xlabel('Predicted')  # columns are the predicted labels
plt.show()

## 6. Feature Importance (Top Words per Class)

In [None]:
# Pull the fitted steps out of the pipeline.
vectorizer = model.named_steps['tfidf']
nb = model.named_steps['nb']
feature_names = np.array(vectorizer.get_feature_names_out())
log_probs = nb.feature_log_prob_  # one row per class, one column per term

# Look the rows up by class name instead of assuming FAKE is row 0 and
# REAL is row 1.
class_order = list(nb.classes_)
fake_log_prob = log_probs[class_order.index('FAKE')]
real_log_prob = log_probs[class_order.index('REAL')]

# Ranking by the raw per-class log-probability mostly surfaces terms that
# carry a lot of weight in BOTH classes. The difference
#   log P(term | FAKE) - log P(term | REAL)
# instead measures how much more typical a term is of one class than the
# other: large positive -> FAKE, large negative -> REAL.
fake_vs_real = fake_log_prob - real_log_prob
ranking = np.argsort(fake_vs_real)  # ascending: most REAL-leaning first

top_n = 15
top_fake = feature_names[ranking[::-1][:top_n]]  # strongest FAKE indicators first
top_real = feature_names[ranking[:top_n]]        # strongest REAL indicators first

print('Top words indicating FAKE news:', top_fake)
print('Top words indicating REAL news:', top_real)

## 7. Extra EDA: Word Clouds & Text Length Distributions

In [None]:
from wordcloud import WordCloud


def plot_word_cloud(text, title):
    """Render a word cloud for ``text`` under ``title`` and return it.

    Parameters
    ----------
    text : str
        The text whose words are drawn.
    title : str
        Title shown above the figure.

    Returns
    -------
    WordCloud
        The generated word-cloud object.
    """
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')  # pixel axes carry no meaning for a word cloud
    ax.set_title(title)
    plt.show()
    return cloud


# Word Cloud for FAKE news
# dropna() guards the join: ' '.join raises TypeError on a NaN entry.
fake_text = ' '.join(df.loc[df['label'] == 'FAKE', 'content'].dropna())
wc_fake = plot_word_cloud(fake_text, 'Word Cloud - FAKE News')

# Word Cloud for REAL news
real_text = ' '.join(df.loc[df['label'] == 'REAL', 'content'].dropna())
wc_real = plot_word_cloud(real_text, 'Word Cloud - REAL News')

In [None]:
# Distribution of article length (in words)
# Whitespace-split each article and count the tokens. Missing content is
# treated as an empty article (length 0) instead of raising on NaN.
df['text_length'] = df['content'].fillna('').str.split().str.len()

# Overlaid histograms (with KDE curves) of the word counts, one per label.
fig, ax = plt.subplots()
sns.histplot(data=df, x='text_length', hue='label', bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Article Length by Label')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Count')
plt.show()

## 8. ROC Curve & AUC Score

In [None]:
# Column 1 of predict_proba holds the probability of model.classes_[1], so
# that same class is used as the positive label. This ties the binarized
# targets to the scores explicitly instead of relying on a separately
# fitted binarizer happening to pick the same class order.
positive_class = model.classes_[1]

# Binarize labels (positive class = 1, the other class = 0)
y_test_bin = (y_test == positive_class).astype(int)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# ROC Curve
fpr, tpr, _ = metrics.roc_curve(y_test_bin, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(7, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc='lower right')
plt.show()