In [1]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
# Each CSV holds a single class, so the target is attached while loading:
# rows from Fake.csv get label 0, rows from True.csv get label 1.
fake = pd.read_csv("../data/Fake.csv").assign(label=0)
true = pd.read_csv("../data/True.csv").assign(label=1)


In [3]:
# Stack both classes, keep only the model input ('text') and the target ('label').
df = pd.concat([fake, true], axis=0)
df = df[['text', 'label']]

# Shuffle so the two classes are interleaved. The seed is fixed because the
# later train/test split depends on row order: without it, every
# Restart & Run All produces a different split and different metrics.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.head()


Unnamed: 0,text,label
0,NEW YORK (Reuters) - Billionaire investor Carl...,1
1,Johnson calls Obama s manufactured race war/wa...,0
2,"Earlier this week, Devin Nunes finally recused...",0
3,Milo Yiannopoulos will do anything to keep his...,0
4,BloombergIn the hours after the president is ...,0


In [4]:
# Patterns are compiled once instead of on every row.
# ``re.escape`` is required here: when ``string.punctuation`` is dropped into a
# character class unescaped, its ``\]`` pair is parsed as an escaped bracket,
# so the backslash itself is never part of the class and survives cleaning.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGITS_RE = re.compile(r"\d+")


def clean_text(text):
    """Normalise a raw article string before vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. delete every character listed in ``string.punctuation``
         (including the backslash);
      3. delete every run of digits.

    Whitespace is left untouched, so removing a token may leave adjacent
    spaces behind.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    text = _PUNCTUATION_RE.sub("", text)
    return _DIGITS_RE.sub("", text)

# Run the cleaner over every article and keep the result next to the raw text.
df['clean_text'] = df['text'].map(clean_text)


In [5]:
# Features are the cleaned article text; the target is the 0/1 label.
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# TF-IDF features: English stop words are dropped, and so is any term that
# appears in more than 70% of the training documents.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is only transformed with them.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [7]:
# Fit a logistic-regression classifier on the TF-IDF training matrix
# (``fit`` returns the estimator itself, so the calls can be chained).
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predicted labels for the held-out split, used by the evaluation cell.
y_pred = model.predict(X_test_tfidf)

In [8]:
# Compute the three held-out metrics first, then report them.
accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", matrix)
print("Classification Report:\n", report)


Accuracy: 0.9847438752783965
Confusion Matrix:
 [[4572   74]
 [  63 4271]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      4646
           1       0.98      0.99      0.98      4334

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980



In [9]:
# One coefficient per vocabulary term. Label 1 is "real", so the most negative
# weights push a prediction towards fake and the most positive towards real.
feature_names = tfidf.get_feature_names_out()
coefficients = model.coef_[0]

# Sort once (ascending) and take both ends of the ranking.
ranked = np.argsort(coefficients)
top_fake = ranked[:10]
top_real = ranked[-10:]

for heading, indices in (
    ("Top Fake News Words:", top_fake),
    ("\nTop Real News Words:", top_real),
):
    print(heading)
    for i in indices:
        print(feature_names[i])


Top Fake News Words:
image
just
gop
hillary
wire
america
images
mr
like
american

Top Real News Words:
minister
statement
republican
monday
friday
tuesday
thursday
washington
wednesday
reuters


In [11]:
def predict_news(text):
    """Classify a raw news text as ``"Real News"`` or ``"Fake News"``.

    The text is cleaned with ``clean_text``, vectorised with the already
    fitted ``tfidf`` and scored by ``model``. A predicted label of 1 maps to
    ``"Real News"``; any other label maps to ``"Fake News"``.
    """
    features = tfidf.transform([clean_text(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Real News"
    return "Fake News"

# Sanity check on a multi-sentence, article-style input.
sample_article = "The Ministry of Education on Tuesday announced a new education policy aimed at improving higher education standards across Indian universities. The policy focuses on curriculum modernization, digital learning, and faculty training programs."

predict_news(sample_article)


'Real News'

In [12]:
# Reuses the predict_news helper defined earlier in the notebook; the
# byte-identical copy that used to live in this cell was removed so the
# function has a single definition.
#
# NOTE(review): this input is a one-line headline rather than a full article,
# so the vectoriser has very few tokens to score. Presumably that is why the
# prediction differs from the longer example; treat short inputs with caution.
predict_news("Government announces new education policy for universities")


'Fake News'