# In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
nltk.download('punkt')     # Download punkt
nltk.download('stopwords') # Also download stopwords


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import joblib



# Step 2: Load Two Datasets
df1 = pd.read_csv('/content/Fake.csv')   # Replace with your path
df2 = pd.read_csv('/content/True.csv')   # Replace with your path

# Step 3: Preprocessing - Make Columns Consistent if Needed
# Example only if your second dataset has different column names
# df2.rename(columns={'headline': 'title', 'article': 'text', 'tag': 'label'}, inplace=True)

# The label-encoding step further down reads df['label'] and expects the
# strings 'FAKE' / 'REAL'. If a file does not ship a 'label' column, tag its
# rows here, before the frames are mixed and their origin is lost.
# NOTE(review): assumes each file holds a single class, as the file names
# suggest -- confirm against the actual data. Existing 'label' columns are
# left untouched.
if 'label' not in df1.columns:
    df1['label'] = 'FAKE'
if 'label' not in df2.columns:
    df2['label'] = 'REAL'

# Step 4: Combine Two Datasets
df = pd.concat([df1, df2], ignore_index=True)

# Step 5: Shuffle Combined Dataset
# frac=1 samples every row (a full permutation); the fixed seed keeps the
# order reproducible between runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("Combined dataset shape:", df.shape)
print(df.head())

# Step 6: Text Cleaning Function
# Build the stopword set once. Membership tests against a set are O(1),
# whereas testing against the list returned by stopwords.words() is a linear
# scan (and the original code rebuilt that list for every single token).
stop_words = set(stopwords.words('english'))

# Patterns are compiled once here instead of being looked up on every call.
# Raw strings avoid the invalid-escape warning that '\w' / '\d' trigger in a
# normal string literal.
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text(text):
    """Normalize a piece of text for vectorization.

    The input is converted with ``str()`` and lowercased, then, in order:
    HTML-like tags (``<...>`` on a single line) are removed, every character
    in ``string.punctuation`` is removed, any word containing a digit is
    removed, and the remainder is split on whitespace with English stopwords
    dropped.

    Args:
        text: Value to clean. Non-string values are accepted because the
            value is passed through ``str()`` first.

    Returns:
        The remaining words joined by single spaces (possibly an empty
        string).
    """
    text = str(text).lower()
    text = _HTML_TAG_RE.sub('', text)            # Remove HTML
    text = _PUNCTUATION_RE.sub('', text)         # Remove punctuation
    text = _WORD_WITH_DIGIT_RE.sub('', text)     # Remove words with numbers
    # Plain whitespace split (no word_tokenize); filter against the prebuilt set.
    words = [word for word in text.split() if word not in stop_words]
    return " ".join(words)

# Step 7: Apply Cleaning
df['clean_text'] = df['text'].apply(clean_text)

# Step 8: Encode Labels
# Series.map() turns every value missing from the mapping into NaN. Check for
# that right away and report the offending values, instead of letting NaN
# targets reach model.fit() and fail there with a less specific error.
encoded_label = df['label'].map({'REAL':1, 'FAKE':0})  # Map labels to 1 and 0
unmapped = encoded_label.isna()
if unmapped.any():
    raise ValueError(
        "Column 'label' must contain only 'REAL' or 'FAKE'; "
        "{} row(s) have other values, e.g. {}".format(
            int(unmapped.sum()), list(df.loc[unmapped, 'label'].unique()[:5])
        )
    )
df['label'] = encoded_label
print(df['label'].value_counts())

# Step 9: Split into Features and Target
X = df['clean_text']
y = df['label']

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 10: Feature Extraction - TF-IDF
# The vectorizer is fitted on the training split only and then reused to
# transform the test split, so no test-set vocabulary leaks into training.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 11: Build Logistic Regression Model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Step 12: Model Evaluation
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Plot Confusion Matrix
# Pin the row/column order to [0, 1] and name the ticks so the plot reads
# FAKE/REAL instead of bare 0/1 (the encoding above maps FAKE->0, REAL->1).
class_names = ['FAKE', 'REAL']
plt.figure(figsize=(6,4))
sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=[0, 1]),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Step 13: Save the Model and TF-IDF Vectorizer
# Both artifacts are needed at inference time: the vectorizer to build
# features and the model to score them.
joblib.dump(model, 'fake_news_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Step 14: Create a Prediction Function
def predict_news(news_text):
    """Classify one news text as 'REAL' or 'FAKE'.

    The text is cleaned with ``clean_text``, turned into features by the
    module-level ``tfidf_vectorizer``, and scored by the module-level
    ``model``.

    Args:
        news_text: The raw text to classify.

    Returns:
        'REAL' when the model predicts class 1, otherwise 'FAKE'.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([clean_text(news_text)])
    predicted_class = model.predict(features)[0]
    if predicted_class == 1:
        return 'REAL'
    return 'FAKE'

# Example prediction: run one hand-written sentence through the pipeline
# and print the label it receives.
sample_news = "NASA has confirmed a new planet discovered by James Webb Telescope."
sample_label = predict_news(sample_news)
print("\nPrediction:", sample_label)
