<a href="https://colab.research.google.com/github/Vijay240702/Naan-Mudhalvan/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install & Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import re
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample

# Make sure the NLTK stop-word corpus is available locally before importing it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Stored as a set: clean_text() tests every token of every article for
# membership, and a set makes each check O(1) instead of a scan over the
# whole stop-word list.
stop_words = set(stopwords.words('english'))

# Step 2: Load Dataset
print(os.listdir())  # optional, to see available files

fake = pd.read_csv("Fake[1].csv")
real = pd.read_csv("True[1].csv")

# Label convention used by the rest of the script: 0 = fake, 1 = real.
fake['label'] = 0
real['label'] = 1

# Combine both sources into one frame. No shuffling happens here; the rows
# are shuffled later, when the balanced dataset is built.
df = pd.concat([fake, real], axis=0).reset_index(drop=True)
df = df[['title', 'text', 'label']]
# Fill missing values first: NaN + str evaluates to NaN, which would discard
# the title of a row with no body text (and vice versa).
df['combined'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

# Step 3: Text Cleaning Function
def clean_text(text):
    """Normalise a raw news string into space-separated lowercase tokens.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs
    (``http...`` / ``www. ...``); drop HTML-like ``<...>`` tags; strip ASCII
    punctuation; turn newlines into spaces; drop every token that contains a
    digit; remove tokens found in the module-level ``stop_words``.

    Args:
        text: The raw text. Any non-``str`` value (e.g. NaN from pandas) is
            treated as missing.

    Returns:
        The cleaned text with tokens joined by single spaces, or ``""`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # The dot after "www" is escaped so only real "www." prefixes match;
    # unescaped, it also swallowed ordinary words such as "wwwest".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space (not ''), otherwise the last word of one
    # line is glued to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # Build a set once per call so each token lookup is O(1), whatever
    # container type the module-level stop_words happens to be.
    stop_set = set(stop_words)
    return ' '.join(word for word in text.split() if word not in stop_set)

df['clean_text'] = df['combined'].apply(clean_text)

# Step 4: Balance Dataset (Downsampling)
df_fake = df[df['label'] == 0]
df_real = df[df['label'] == 1]
# Downsample BOTH classes to the size of the smaller one, without replacement.
# Sampling with replacement creates duplicate rows, and because balancing
# happens before the train/test split those duplicates could end up on both
# sides of the split and inflate the reported accuracy.
n_per_class = min(len(df_fake), len(df_real))
df_fake_downsampled = resample(df_fake, replace=False, n_samples=n_per_class, random_state=42)
df_real_downsampled = resample(df_real, replace=False, n_samples=n_per_class, random_state=42)
# sample(frac=1) shuffles the rows so the two classes are interleaved.
df_balanced = (
    pd.concat([df_fake_downsampled, df_real_downsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Step 5: Train-Test Split & TF-IDF
X = df_balanced['clean_text']
y = df_balanced['label']

# stratify=y keeps the fake/real ratio identical in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The vectorizer is fitted on the training split only and then applied to the
# test split, so no test-set vocabulary/IDF statistics leak into training.
# max_df=0.7 drops terms present in more than 70% of documents; min_df=5 drops
# terms present in fewer than 5 documents.
vectorizer = TfidfVectorizer(max_df=0.7, min_df=5)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 6: Model Training
# random_state fixes the shuffling of the training data between epochs, making
# the fitted model reproducible like the seeded steps above.
model = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# Step 7: Evaluation
# Evaluation runs BEFORE the interactive prompt: the prompt below blocks on
# input() until the user types 'exit', so anything placed after it is not
# reached while the loop is running.
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 8: Confusion Matrix
# labels=[0, 1] pins the row/column order so it always matches the
# ['Fake', 'Real'] tick labels (0 = fake, 1 = real).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Fake', 'Real'], yticklabels=['Fake', 'Real'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


# Step 9: Real-time Prediction Function
def predict_news(text_input):
    """Classify one piece of news text with the trained model.

    The text goes through the same clean_text() preprocessing and the same
    fitted TF-IDF vectorizer that were used for training.

    Args:
        text_input: Raw news text (headline and/or body).

    Returns:
        "🟢 REAL News" when the model predicts label 1, otherwise
        "🔴 FAKE News".
    """
    cleaned_input = clean_text(text_input)
    tfidf_input = vectorizer.transform([cleaned_input])
    prediction = model.predict(tfidf_input)[0]
    return "🟢 REAL News" if prediction == 1 else "🔴 FAKE News"


# Step 10: Try Custom Inputs Here ⌨
while True:
    try:
        user_input = input("\n📰 Enter news text to classify (or type 'exit' to quit):\n")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted cell/terminal: leave the loop cleanly
        # instead of ending with a traceback.
        print("Exiting...")
        break
    # strip() so that "exit " or " exit" also quits.
    if user_input.strip().lower() == 'exit':
        print("Exiting...")
        break
    if not user_input.strip():
        # Blank input has no tokens, so a prediction for it would be meaningless.
        print("\n⚠ Please enter some text.")
        continue
    result = predict_news(user_input)
    print(f"\n📣 Prediction: {result}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['.config', 'Fake[1].csv', 'True[1].csv', 'sample_data']

📣 Prediction: 🔴 FAKE News

📣 Prediction: 🟢 REAL News
