In [16]:
import numpy as np
import pandas as pd
import string
import nltk

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Sample Dataset

In [23]:
data = {
"text": [
"Government passes new healthcare bill",
"Aliens landed in New York yesterday",
"Stock market hits record high",
"Fake cure for cancer discovered",
"Scientists discover new species",
"NASA launches new satellite",
"Celebrity adopts alien baby",
"New education reform introduced",
"Secret time travel machine found",
"Doctors develop new vaccine"
],
"label": [1,0,1,0,1,1,0,1,0,1]
}

df_fake = pd.DataFrame(data)
df_fake["clean"] = df_fake["text"].apply(clean_text)



In [24]:
# TF-IDF + Logistic Regression

In [25]:
X = df_fake["clean"]
y = df_fake["label"]

vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

model = LogisticRegression()
model.fit(X_vec, y)

y_pred = model.predict(X_vec)

print("Accuracy:", accuracy_score(y, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
print(classification_report(y, y_pred))


Accuracy: 1.0
Confusion Matrix:
 [[4 0]
 [0 6]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         6

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [26]:
# Top Important Words

In [27]:
feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_[0]

top_words = np.argsort(coefs)[-5:]
print("Important words for Real News:")
print([feature_names[i] for i in top_words])


Important words for Real News:
['reform', 'satellite', 'vaccine', 'species', 'new']
