In [23]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [24]:
# Load datasets
data_fake = pd.read_csv('../Fake.csv')
data_true = pd.read_csv('../True.csv')

# Add labels: 0 for fake, 1 for real
data_fake["class"] = 0
data_true["class"] = 1

# Reserve last 10 samples from each for manual testing
data_fake_manual_testing = data_fake.tail(10)
data_true_manual_testing = data_true.tail(10)

# Drop those last 10 samples from training data
data_fake = data_fake.iloc[:-10]
data_true = data_true.iloc[:-10]

# Combine the datasets and shuffle
data = pd.concat([data_fake, data_true], axis=0).reset_index(drop=True)
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle data

In [25]:
# Preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\\W', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

data['text'] = data['text'].apply(preprocess_text)

# Features and labels
x = data['text']
y = data['class']

In [26]:
# Split data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
xv_train = vectorizer.fit_transform(x_train)
xv_test = vectorizer.transform(x_test)

# Logistic Regression Model
model = LogisticRegression()
model.fit(xv_train, y_train)

# Predictions and Evaluation
y_pred = model.predict(xv_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9844028520499108
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      5910
           1       0.98      0.99      0.98      5310

    accuracy                           0.98     11220
   macro avg       0.98      0.98      0.98     11220
weighted avg       0.98      0.98      0.98     11220



In [32]:
# Function to test new samples
def predict_news(news):
    processed_news = preprocess_text(news)
    transformed_news = vectorizer.transform([processed_news])
    prediction = model.predict(transformed_news)[0]
    label = "Not A Fake News" if prediction == 1 else "Fake News"
    return label

# Test the function with user input
news = input("Enter a news text: ")
print("Prediction:", predict_news(news))

Enter a news text: When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense.
Prediction: Not A Fake News
