In [1]:
# Fake News Detection using TF-IDF and Logistic Regression

# =========================================
# 1. IMPORT REQUIRED LIBRARIES
# =========================================

# pandas is used to read and handle CSV data
import pandas as pd

# train_test_split is used to split data into training and testing sets
from sklearn.model_selection import train_test_split

# TF-IDF converts text into numerical features
from sklearn.feature_extraction.text import TfidfVectorizer

# Logistic Regression is the classification model
from sklearn.linear_model import LogisticRegression

# accuracy_score is used to evaluate model performance
from sklearn.metrics import accuracy_score


# =========================================
# 2. LOAD DATASET
# =========================================

# File names of the two source CSVs, kept in one place
FAKE_CSV = "Fake.csv"
TRUE_CSV = "True.csv"

# Read each CSV into its own DataFrame: fake articles first, then real ones
fake = pd.read_csv(FAKE_CSV)
true = pd.read_csv(TRUE_CSV)


# =========================================
# 3. ADD LABELS
# =========================================

# Attach the target column to each frame: 0 marks fake news, 1 marks real news
fake = fake.assign(label=0)
true = true.assign(label=1)


# =========================================
# 4. COMBINE DATASETS
# =========================================

# Stack the fake and real frames into one dataset, then shuffle every row
# (fixed seed, so the order is reproducible) and renumber the index from zero
data = (
    pd.concat([fake, true])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


# =========================================
# 5. SPLIT INPUTS AND OUTPUTS
# =========================================

# X contains the news text, the only column used as an input feature.
# pandas loads blank CSV cells as NaN and TfidfVectorizer rejects NaN
# documents, so missing text is replaced with an empty string here.
X = data["text"].fillna("")

# y contains the labels (0 = fake, 1 = real)
y = data["label"]


# =========================================
# 6. TRAIN-TEST SPLIT
# =========================================

# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the partition identical on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# =========================================
# 7. TF-IDF VECTORIZATION
# =========================================

# Build the TF-IDF vectorizer; stop_words="english" drops common words
# such as 'the', 'is' and 'and' before weighting
tfidf = TfidfVectorizer(stop_words="english")

# Fit the vocabulary on the training text only and vectorize it
X_train_tfidf = tfidf.fit_transform(raw_documents=X_train)

# Vectorize the test text with the vocabulary learned from the training text
X_test_tfidf = tfidf.transform(raw_documents=X_test)


# =========================================
# 8. TRAIN LOGISTIC REGRESSION MODEL
# =========================================

# Build the Logistic Regression classifier and train it on the TF-IDF
# features in one step (fit returns the fitted estimator itself);
# max_iter is raised to 1000 to help the solver converge
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)


# =========================================
# 9. MAKE PREDICTIONS
# =========================================

# Run the trained model on the held-out TF-IDF features
y_pred = model.predict(X=X_test_tfidf)


# =========================================
# 10. EVALUATE MODEL
# =========================================

# Compare predicted labels with the true test labels to get the accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the result
print("Model Accuracy:", accuracy)


Model Accuracy: 0.9837416481069042
