In [4]:
#Importing data

import pandas as pd
import string
import nltk
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('wordnet')

#Text Cleaning Function

def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = ''.join([ch for ch in text if ch not in string.punctuation])
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return ' '.join(words)
    
#Load and Clean Dataset

fake = pd.read_csv("C:\\Users\\siril\\OneDrive\\Documents\\MSME\\NewsClassification\\Fake.csv")
real = pd.read_csv("C:\\Users\\siril\\OneDrive\\Documents\\MSME\\NewsClassification\\True.csv")
    
fake["label"] = 0
real["label"] = 1
    
data = pd.concat([fake, real], ignore_index=True)
data["text"] = data["title"] + " " + data["text"]
data["text"] = data["text"].apply(clean_text)
    
data = data[["text", "label"]]
data.head()
    
#Vectorize Text and Split Data

X = data['text']
y = data['label']

vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

#Train Models and Evaluate

# Logistic Regression modeling

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Naive Bayes modeling

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# Accuracy Scores

print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_pred))
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_pred))

#Saving Model &  the Vectorizer

joblib.dump(lr_model, "model.joblib")
joblib.dump(vectorizer, "vectorizer.joblib")

#Manual Prediction

def predict_news(text):
    cleaned = clean_text(text)
    vec = vectorizer.transform([cleaned])
    pred = lr_model.predict(vec)[0]
    return "Real ✅" if pred == 1 else "Fake ❌"
    
predict_news("Breaking news: Government announces big economic reform package.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\siril\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\siril\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Logistic Regression Accuracy: 0.9891982182628062
Naive Bayes Accuracy: 0.9359688195991092


'Fake ❌'