In [1]:
import pandas as pd
import numpy as np
import re
import string
import pickle
import time

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Load the two labelled corpora (relative paths), true first, then fake.
true_df, fake_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/true.csv", "../data/fake.csv")
)

# Report (rows, columns) of each frame before any processing.
for caption, frame in (("TRUE SHAPE:", true_df), ("FAKE SHAPE:", fake_df)):
    print(caption, frame.shape)

# Preview both frames; this tuple is the cell's displayed value.
true_df.head(), fake_df.head()


TRUE SHAPE: (2500000, 3)
FAKE SHAPE: (2500000, 3)


(   id                                               text  label
 0   1  Mumbai is a classification algorithm data scie...   True
 1   2  Random Forest is a supervised learning method ...   True
 2   3  Pune is a supervised learning method neural ne...   True
 3   4  Python is a programming language used in data ...   True
 4   5  Python is a programming language used in data ...   True,
    id                                  text label
 0   1           Apple is located in Africa.  fake
 1   2  Amit Shah is the President of India.  fake
 2   3           Kolkata is located in Mars.  fake
 3   4              The Moon has Bangladesh.  fake
 4   5                Jaipur declared Delhi.  fake)

In [4]:
# Encode each corpus' class as an integer `target` column (true → 1, fake → 0)
# and keep only the text plus that target.
# `assign` returns a new frame, so the frames loaded from disk are not mutated.
KEEP_COLUMNS = ["text", "target"]

true_df = true_df.assign(target=1)[KEEP_COLUMNS]
fake_df = fake_df.assign(target=0)[KEEP_COLUMNS]

true_df.head(), fake_df.head()


(                                                text  target
 0  Mumbai is a classification algorithm data scie...       1
 1  Random Forest is a supervised learning method ...       1
 2  Pune is a supervised learning method neural ne...       1
 3  Python is a programming language used in data ...       1
 4  Python is a programming language used in data ...       1,
                                    text  target
 0           Apple is located in Africa.       0
 1  Amit Shah is the President of India.       0
 2           Kolkata is located in Mars.       0
 3              The Moon has Bangladesh.       0
 4                Jaipur declared Delhi.       0)

In [5]:
SAMPLE = 250_000  # rows drawn from each class

# Draw the same number of rows from both classes (fixed seed for repeatability).
true_sample, fake_sample = (
    frame.sample(n=SAMPLE, random_state=42) for frame in (true_df, fake_df)
)

# Stack the two samples, shuffle the rows so the classes are interleaved,
# and renumber the index from zero.
data = (
    pd.concat([true_sample, fake_sample], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Final training dataset shape:", data.shape)
data.head()


Final training dataset shape: (500000, 2)


Unnamed: 0,text,target
0,K-Nearest Neighbors is a supervised learning m...,1
1,The Moon is a part of machine learning.,1
2,Pune is the capital of regression tasks.,1
3,SVM is a supervised learning method used for c...,1
4,Light is the President of Mumbai.,0


In [6]:
# Translation table that deletes every ASCII punctuation character; built once
# here rather than on every call to `clean_text`.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text: str) -> str:
    """Normalise one raw document before vectorisation.

    Steps, in order: lowercase, drop anything starting with ``http`` up to the
    next whitespace, drop digit runs, drop ASCII punctuation. Whitespace is
    left untouched, so removed tokens can leave doubled or trailing spaces.

    Parameters
    ----------
    text : str
        The raw text. Non-string values are coerced with ``str()``; missing
        values (``None`` or a float NaN) yield an empty string instead of the
        literal tokens ``"none"`` / ``"nan"``.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # `text != text` is true only for NaN; guards against missing cells that
    # pandas represents as float NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)   # remove URLs
    text = re.sub(r"\d+", "", text)       # remove digits
    return text.translate(_PUNCT_TABLE)   # remove punctuation

# Clean every document and time the pass. The result goes into a new
# `clean_text` column; the raw `text` column is left as it was.
start = time.time()
data["clean_text"] = data["text"].map(clean_text)
end = time.time()

elapsed_minutes = (end - start) / 60
print(f"Cleaning took: {elapsed_minutes:.2f} minutes")

# Side-by-side preview of raw vs. cleaned text.
data[["text", "clean_text"]].head()


Cleaning took: 0.04 minutes


Unnamed: 0,text,clean_text
0,K-Nearest Neighbors is a supervised learning m...,knearest neighbors is a supervised learning me...
1,The Moon is a part of machine learning.,the moon is a part of machine learning
2,Pune is the capital of regression tasks.,pune is the capital of regression tasks
3,SVM is a supervised learning method used for c...,svm is a supervised learning method used for c...
4,Light is the President of Mumbai.,light is the president of mumbai


In [8]:
# Features are the cleaned sentences; labels are the 0/1 `target` column.
X = data["clean_text"]
y = data["target"]

# Hold out 10% of the rows for evaluation. `stratify=y` makes the held-out set
# carry the same class ratio as the full sample instead of leaving it to chance.
# (`train_test_split` is already imported in the notebook's first cell.)
# NOTE(review): the split is row-wise. If the same cleaned sentence occurs more
# than once in `data`, copies can land in both splits and inflate the test
# metrics — check `X.duplicated().mean()` before trusting the scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, shuffle=True, stratify=y
)

print("Train size:", len(X_train))
print("Test size :", len(X_test))


Train size: 450000
Test size : 50000


In [9]:
# ----------------------------------------------------------
# 7) TF-IDF vectorisation
# ----------------------------------------------------------
# `TfidfVectorizer` comes from the notebook's first (imports) cell.
tfidf_options = {
    "stop_words": "english",   # drop common English function words
    "max_features": 50000,     # upper bound on vocabulary size
    "ngram_range": (1, 2),     # unigrams + bigrams
    "sublinear_tf": True,      # log-scaled term frequency
}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary is learned from the training split only; the test split is
# merely projected onto it. Both steps are timed together.
start = time.time()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
end = time.time()

tfidf_minutes = (end - start) / 60
print("Train matrix shape:", X_train_vec.shape)
print("Test  matrix shape:", X_test_vec.shape)
print(f"TF-IDF took: {tfidf_minutes:.2f} minutes")


Train matrix shape: (450000, 1935)
Test  matrix shape: (50000, 1935)
TF-IDF took: 0.06 minutes


In [10]:
# -----------------------------
# 8) Train Logistic Regression
# -----------------------------
# `LogisticRegression` and `time` come from the notebook's first (imports) cell.
lr_params = dict(
    solver="lbfgs",
    C=3.0,          # inverse regularisation strength
    max_iter=300,   # iteration cap for the solver
    n_jobs=-1,
)
lr = LogisticRegression(**lr_params)

# Fit on the TF-IDF training matrix and report the wall-clock cost.
start = time.time()
lr.fit(X_train_vec, y_train)
end = time.time()

training_minutes = (end - start) / 60
print(f"Training took: {training_minutes:.2f} minutes")


Training took: 0.06 minutes


In [12]:
# -----------------------------
# 9) Evaluation
# -----------------------------
# Metric functions come from the notebook's first (imports) cell.
# Score the held-out split; label 1 is treated as the positive class.
y_pred = lr.predict(X_test_vec)

acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    score_fn(y_test, y_pred, pos_label=1)
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")


Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000
F1-score : 1.0000


In [13]:
# -----------------------------
# 10) Save Model & Vectorizer
# -----------------------------
# `pickle` comes from the notebook's first (imports) cell.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling raises; a bare `open(...)` passed straight to `pickle.dump`
# leaves the handle open until garbage collection.
# The fitted vectorizer is saved alongside the model because new text must be
# transformed with the same vocabulary before it can be scored.
# NOTE: only unpickle these files from a trusted location — loading a pickle
# can execute arbitrary code.
with open("../models/model.pkl", "wb") as model_file:
    pickle.dump(lr, model_file)

with open("../models/vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!
