# In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import pickle

# Download necessary resources
# - "stopwords": the English stop-word list used by clean_text
# - "punkt": tokenizer models loaded by word_tokenize on older NLTK releases
# - "punkt_tab": the replacement resource that word_tokenize loads on recent
#   NLTK releases; without it tokenization raises LookupError there
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

# Load both sources and attach the target label as they are read
# (1 = real news, 0 = fake news)
true_df = pd.read_csv("True.csv").assign(label=1)
fake_df = pd.read_csv("Fake.csv").assign(label=0)

# Stack the two frames, discard the metadata columns if they are present,
# then shuffle the rows reproducibly and renumber the index from zero
df = (
    pd.concat([true_df, fake_df], axis=0, ignore_index=True)
    .drop(columns=["subject", "date"], errors='ignore')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Text Cleaning Function
# English stop words from the NLTK corpus, stored as a set because
# clean_text tests every token for membership in it.
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize one raw article body for vectorization.

    The text is lower-cased, every non-word character is replaced by a
    space, runs of whitespace are collapsed, and the result is tokenized
    with English stop words removed.

    Args:
        text: The raw text. Any value that is not a ``str`` (for example a
            missing value) is treated as empty.

    Returns:
        The remaining tokens joined by single spaces, or ``""`` when the
        input is not a string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Punctuation and symbols become spaces first, then repeated
        # whitespace is squeezed down to a single space.
        spaced = re.sub(r'\W', ' ', lowered)
        collapsed = re.sub(r'\s+', ' ', spaced)
        kept_tokens = (
            token for token in word_tokenize(collapsed) if token not in stop_words
        )
        return " ".join(kept_tokens)
    return ""

# Normalize every article body with clean_text
df["clean_text"] = df["text"].map(clean_text)

# Data Split: hold out 20% for evaluation, keeping the class balance
features = df["clean_text"]
labels = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# TF-IDF Vectorization over unigrams and bigrams, capped at 10,000 terms.
# The vocabulary is learned from the training split only and then reused
# unchanged on the held-out split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Model Training (fit returns the estimator itself)
model = LogisticRegression(C=1.0, max_iter=500).fit(X_train_tfidf, y_train)

# Predictions: hard labels plus the probability of the positive class
# (column 1), which the ROC analysis needs
y_pred = model.predict(X_test_tfidf)
y_prob = model.predict_proba(X_test_tfidf)[:, 1]

# Accuracy & Report on the held-out split
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

# Confusion Matrix, drawn as an annotated heatmap
class_names = ["Fake", "Real"]  # label 0 = fake, label 1 = real
conf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()

# ROC Curve and AUC Score, computed from the positive-class probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_score = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f"AUC = {auc_score:.2f}")
# Dashed diagonal: the curve a random classifier would produce
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve",
)
ax.legend()
ax.grid(True)
plt.show()

# Top TF-IDF Features
# The model's coefficients weight each feature toward the positive class
# (label 1 = real), so the most negative weights point to fake news and the
# most positive weights point to real news.
feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_[0]

# Sort once, ascending: the head holds the strongest fake indicators.
ranked_idx = np.argsort(coefs)
top_fake_idx = ranked_idx[:10]
# Reverse the tail so the strongest real indicator comes first, matching the
# strongest-first order of the fake-news list.
top_real_idx = ranked_idx[-10:][::-1]

top_fake_features = [(feature_names[i], coefs[i]) for i in top_fake_idx]
top_real_features = [(feature_names[i], coefs[i]) for i in top_real_idx]

print("\nTop Features for Fake News:")
for word, weight in top_fake_features:
    print(f"{word}: {weight:.4f}")

print("\nTop Features for Real News:")
for word, weight in top_real_features:
    print(f"{word}: {weight:.4f}")

# Save model and vectorizer as pickle files so both can be reloaded together
# (the vectorizer holds the vocabulary the model was trained against)
for artifact, out_path in (
    (model, "fake_news_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    with open(out_path, "wb") as out_file:
        pickle.dump(artifact, out_file)

# Load test dataset, replacing missing article bodies with empty strings
test_df = pd.read_csv("test.csv").fillna({"text": ""})

# Apply the same cleaning and the already-fitted vectorizer used in training
test_df["clean_text"] = test_df["text"].map(clean_text)
X_test_new = vectorizer.transform(test_df["clean_text"])

# Predict on new test data and keep the labels alongside the input rows
test_predictions = model.predict(X_test_new)
test_df = test_df.assign(prediction=test_predictions)

# Save predictions
test_df.to_csv("Test_Predictions.csv", index=False)

# Plot prediction distribution
plt.figure(figsize=(6, 4))
# Pin the category order so position 0 is always class 0 (fake) and position 1
# is always class 1 (real). Without it, a run in which only one class is
# predicted draws a single bar at position 0, and the fixed tick labels below
# would call it "Fake News" regardless of its actual class.
sns.countplot(x=test_df["prediction"], order=[0, 1], palette=["red", "blue"])
plt.xticks(ticks=[0, 1], labels=["Fake News", "Real News"])
plt.xlabel("News Type")
plt.ylabel("Count")
plt.title("Prediction Distribution on Test Data")
plt.show()

# Extract fake and real news from test data (prediction 0 = fake, 1 = real)
fake_news = test_df[test_df["prediction"].eq(0)]
real_news = test_df[test_df["prediction"].eq(1)]

# Write each subset to its own CSV file
for subset, out_path in (
    (fake_news, "Fake_News_Predictions.csv"),
    (real_news, "Real_News_Predictions.csv"),
):
    subset.to_csv(out_path, index=False)

print("\n✅ Fake and Real news predictions saved to CSV files.")
