In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/TrueFalse/fake_or_real_news.csv')
print(df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                                   text label Unnamed: 2  \
0     Daniel Greenfield, a Shillman Journalism Fello...  FAKE        NaN   
1     Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE        NaN   
2     U.S. Secretary of State John F. Kerry said Mon...  REAL        NaN   
3     — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE        NaN   
4     It's primary day in New York and front-runners...  REAL        NaN   
...                                                 ...   ...        ...   
7790  The State Department told the Republican Natio...  REAL        NaN   
7791  The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...  FAKE        NaN   
7792   Anti-Trump Protesters Are Tools of the Oligar...  FAKE        NaN   
7793  ADDIS ABABA, Ethiopia —President Obama convene...  REAL        NaN   
7794  Jeb Bush Is Suddenly Attackin

  df = pd.read_csv('/content/drive/MyDrive/TrueFalse/fake_or_real_news.csv')


In [6]:
# Part 2: Data Cleaning and Selecting Columns

# Usually, one column has the text and another has 'FAKE' or 'REAL' labels.
# Let's find which columns have those values.
print(df.nunique())   # Check unique value counts per column

# Identify columns manually if needed:
# Suppose column 0 = text, column 1 = label
# (Adjust these column names based on your dataset)
df = df.iloc[:, [0, 1]]
df.columns = ['text', 'label']

# Drop rows with missing values
df = df.dropna(subset=['text', 'label'])

# Normalize label values (remove spaces, make uppercase)
df['label'] = df['label'].astype(str).str.strip().str.upper()

# Keep only FAKE and REAL labels
df = df[df['label'].isin(['FAKE', 'REAL'])]

print("\nCleaned dataset shape:", df.shape)
print(df['label'].value_counts())


text            6644
label            437
Unnamed: 2       315
Unnamed: 3       241
Unnamed: 4       179
                ... 
Unnamed: 134       1
Unnamed: 135       1
Unnamed: 136       1
Unnamed: 137       1
Unnamed: 138       1
Length: 139, dtype: int64

Cleaned dataset shape: (6315, 2)
label
REAL    3161
FAKE    3154
Name: count, dtype: int64


In [7]:
# Part 3: Split Data

from sklearn.model_selection import train_test_split

# Split the data
X = df['text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 5052
Testing samples: 1263


In [8]:
# Part 4: Convert Text to Numerical Features (TF-IDF)

from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)

# Fit on training data and transform both train & test
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF shape (train):", X_train_tfidf.shape)
print("TF-IDF shape (test):", X_test_tfidf.shape)


TF-IDF shape (train): (5052, 35140)
TF-IDF shape (test): (1263, 35140)


In [10]:
# Part 5: Train Naive Bayes Model

from sklearn.naive_bayes import MultinomialNB

# Initialize and train model
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

print(" Model training complete!")


 Model training complete!


In [11]:
# Part 6: Evaluate Model Performance

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions
y_pred = nb.predict(X_test_tfidf)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8796516231195566

Confusion Matrix:
 [[506 125]
 [ 27 605]]

Classification Report:
               precision    recall  f1-score   support

        FAKE       0.95      0.80      0.87       631
        REAL       0.83      0.96      0.89       632

    accuracy                           0.88      1263
   macro avg       0.89      0.88      0.88      1263
weighted avg       0.89      0.88      0.88      1263



In [13]:
# Part 8: Interpret Results and Summarize Findings

import pandas as pd
import numpy as np

# 1️⃣ Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy:.4f}")

# 2️⃣ Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=nb.classes_, columns=nb.classes_)
print("\nConfusion Matrix:")
print(cm_df)

# 3️⃣ Classification Report
report = classification_report(y_test, y_pred, target_names=nb.classes_)
print("\nClassification Report:\n")
print(report)

# 4️⃣ Top Features for each class
feature_names = np.array(tfidf.get_feature_names_out())
class_labels = nb.classes_
topn = 10

for i, label in enumerate(class_labels):
    top_features = feature_names[np.argsort(nb.feature_log_prob_[i])[-topn:]]
    print(f"\nTop {topn} indicative words for class '{label}':")
    print(top_features)

# 5️⃣ Summary text
summary = f"""
Summary of Findings:

1. Model Accuracy: {accuracy:.2%} — high accuracy indicates the model correctly predicts most news articles.
2. Confusion Matrix:
{cm_df.to_string()}
   - True Positives and True Negatives show correct predictions.
   - False Positives and False Negatives indicate misclassifications.
3. Classification Report:
{report}
4. Top words for FAKE news: {', '.join(feature_names[np.argsort(nb.feature_log_prob_[0])[-topn:]])}
5. Top words for REAL news: {', '.join(feature_names[np.argsort(nb.feature_log_prob_[1])[-topn:]])}

Conclusion:
The Naive Bayes classifier effectively distinguishes between FAKE and REAL news articles.
TF-IDF vectorization captures important keywords, showing clear differences in language patterns.
Naive Bayes is a simple, fast, and interpretable model suitable for text classification tasks like fake news detection.
"""

print(summary)


Accuracy of the model: 0.8797

Confusion Matrix:
      FAKE  REAL
FAKE   506   125
REAL    27   605

Classification Report:

              precision    recall  f1-score   support

        FAKE       0.95      0.80      0.87       631
        REAL       0.83      0.96      0.89       632

    accuracy                           0.88      1263
   macro avg       0.89      0.88      0.88      1263
weighted avg       0.89      0.88      0.88      1263


Top 10 indicative words for class 'FAKE':
['just' 'said' 'russia' 'fbi' 'election' 'people' '2016' 'hillary'
 'clinton' 'trump']

Top 10 indicative words for class 'REAL':
['state' 'cruz' 'republican' 'president' 'campaign' 'sanders' 'obama'
 'clinton' 'said' 'trump']

Summary of Findings:

1. Model Accuracy: 87.97% — high accuracy indicates the model correctly predicts most news articles.
2. Confusion Matrix:
      FAKE  REAL
FAKE   506   125
REAL    27   605
   - True Positives and True Negatives show correct predictions.
   - False Positi