In [5]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [6]:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [7]:
print("Loading datasets...")
df_fake = pd.read_csv("./dataset/Fake.csv")
df_true = pd.read_csv("./dataset/True.csv")

Loading datasets...


  df_fake = pd.read_csv("./dataset/Fake.csv")


In [8]:

df_fake['label'] = 1   # Fake
df_true['label'] = 0   # Real

In [9]:

df = pd.concat([df_fake, df_true]).sample(frac=1, random_state=42).reset_index(drop=True)
print("Dataset loaded:", df.shape)

Dataset loaded: (44919, 173)


In [12]:
df_true = df_true[df_true["text"].str.strip().str.len() > 50]
df_fake = df_fake[df_fake["text"].str.strip().str.len() > 50]

In [13]:
df_true["text"] = df_true["text"].astype(str)
df_true["title"] = df_true["title"].astype(str)

bad_rows = df_true[
    (df_true["text"].str.strip() == "") | 
    (df_true["text"].str.len() < 50)
]

bad_rows.tail(20)

Unnamed: 0,title,text,subject,date,label


In [11]:

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [15]:

def clean_text(text):
    text = str(text)
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

In [16]:
print("Cleaning text (may take time)...")
df["content"] = (df["title"] + " " + df["text"]).apply(clean_text)

Cleaning text (may take time)...


In [17]:
X = df["content"]
y = df["label"]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.8, min_df=5)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [20]:
print("Training Logistic Regression model...")
model = LogisticRegression(max_iter=4000, class_weight="balanced")
model.fit(X_train_vec, y_train)

Training Logistic Regression model...


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,4000


In [21]:
print("Evaluating model...")
pred = model.predict(X_test_vec)
acc = accuracy_score(y_test, pred)
print("\nAccuracy:", round(acc * 100, 2), "%")

Evaluating model...

Accuracy: 99.02 %


In [22]:
print("\nClassification Report:")
print(classification_report(y_test, pred, target_names=["Real", "Fake"]))


Classification Report:
              precision    recall  f1-score   support

        Real       0.99      0.99      0.99      4283
        Fake       0.99      0.99      0.99      4701

    accuracy                           0.99      8984
   macro avg       0.99      0.99      0.99      8984
weighted avg       0.99      0.99      0.99      8984



In [23]:
print("Saving model files...")
pickle.dump(model, open("model.pkl", "wb"))
pickle.dump(vectorizer, open("vector.pkl", "wb"))

print("\n New model & vector saved successfully!\n")

Saving model files...

 New model & vector saved successfully!

