In [3]:
import pandas as pd

# Load dataset
df = pd.read_csv("news.csv")

# View sample rows
# Only the last expression of a notebook cell is rendered automatically, so a
# bare `df.head()` in the middle of the cell is evaluated and thrown away.
# `display` (provided by the IPython/Jupyter kernel) renders it explicitly.
display(df.head())

# Check shape and nulls
# The trailing tuple is the cell's output: (rows, columns) plus the number of
# missing values found in each column.
df.shape, df.isnull().sum()


((6335, 4),
 Unnamed: 0    0
 title         0
 text          0
 label         0
 dtype: int64)

In [5]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download('stopwords')

# Stemmer that preprocess() applies to every token it keeps.
stemmer = PorterStemmer()
# English stop words, held in a set so preprocess() can test membership cheaply.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize raw article text into a space-separated string of stemmed tokens.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, and the result is split on whitespace. Tokens found
    in the module-level ``stop_words`` set are dropped; each remaining token
    is passed through the module-level ``stemmer``.

    Args:
        text: The raw text to clean. Must provide ``.lower()`` (i.e. a ``str``).

    Returns:
        The surviving stemmed tokens joined by single spaces; an empty string
        when no token survives.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text.lower())

    stems = []
    for word in letters_only.split():
        # Stop words are filtered on the un-stemmed token.
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))

    return " ".join(stems)

# Clean every article body and keep the result next to the raw text in a new
# 'cleaned_text' column.
df['cleaned_text'] = df['text'].map(preprocess)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Harshambica\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each cleaned article into a TF-IDF feature vector, keeping at most
# 5000 terms.
# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split performed later in the notebook, so the held-out articles
# also shape the vocabulary and IDF weights. Fitting on the training rows only
# would remove that leakage.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_text']).toarray()

# Labels
label_to_id = {'FAKE': 0, 'REAL': 1}  # adjust if your labels are 0/1 or strings
y = df['label'].map(label_to_id)

# Series.map() silently turns any label that is missing from the mapping into
# NaN. Fail fast here with the offending values instead of letting NaN targets
# surface later as a hard-to-read error during training or scoring.
unmapped_labels = df.loc[y.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'label' column: {unmapped_labels.tolist()}; "
        f"expected one of {sorted(label_to_id)}"
    )


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Hold back 20% of the samples for evaluation; the fixed random_state keeps
# the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Choose one model
# fit() returns the estimator itself, so creation and training are chained.
model = LogisticRegression().fit(X_train, y_train)  # or MultinomialNB()

# Predictions
y_pred = model.predict(X_test)

# Evaluation
# Headline scores first, then the per-class precision/recall breakdown.
for metric_label, metric_fn in (("Accuracy:", accuracy_score), ("F1 Score:", f1_score)):
    print(metric_label, metric_fn(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9163378058405682
F1 Score: 0.916403785488959
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       628
           1       0.92      0.91      0.92       639

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267



In [14]:
import pickle

# Persist the trained classifier and the fitted vectorizer, one pickle file each.
for artifact_path, artifact in (("model.pkl", model), ("tfidf.pkl", tfidf)):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)


In [20]:
import streamlit as st
import pickle
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# The stop-word corpus has to be present before stopwords.words() is called.
nltk.download('stopwords')

# Shared helpers for preprocess(): the stemmer applied to each kept token and
# the set of English stop words that are filtered out.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Load model and vectorizer
# Each file is opened in a `with` block so its handle is closed as soon as
# unpickling finishes, instead of staying open until garbage collection.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# artifacts that were produced by a trusted training run.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
with open("tfidf.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

def preprocess(text):
    """Clean user-supplied text into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, replace every character that is not
    an ASCII letter with a space, split on whitespace, discard tokens present
    in the module-level ``stop_words`` set, and run the remaining tokens
    through the module-level ``stemmer``.

    Args:
        text: The text to clean. Must provide ``.lower()`` (i.e. a ``str``).

    Returns:
        The kept, stemmed tokens joined by single spaces; an empty string when
        nothing is kept.
    """
    normalized = re.sub('[^a-zA-Z]', ' ', text.lower())
    return " ".join(
        stemmer.stem(token)
        for token in normalized.split()
        if token not in stop_words
    )

# Page layout: a title, a free-text input box, and a button that triggers
# classification of whatever is in the box.
st.title("📰 Fake News Detector")

user_input = st.text_area("Enter News Article Text")

if st.button("Classify"):
    cleaned = preprocess(user_input)
    if not cleaned:
        # preprocess() returns an empty string for blank input and for text
        # that contains only stop words or non-letter characters. With no
        # tokens left the vectorizer has nothing to score, so a verdict would
        # not reflect the input at all; ask for usable text instead.
        st.warning("Please enter some article text to classify.")
    else:
        vect_text = vectorizer.transform([cleaned])
        prediction = model.predict(vect_text)[0]
        # Class 1 is reported as real news and anything else as fake, in line
        # with the {'FAKE': 0, 'REAL': 1} label mapping used for training.
        result = "Real News ✅" if prediction == 1 else "Fake News ❌"
        st.subheader(result)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Harshambica\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
