# Fake News Detection Model
## 1. Import Libraries and Load Data

In [15]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report

In [16]:
# Read the two article collections from the working directory:
# True.csv holds the genuine articles, Fake.csv the fabricated ones.
true = pd.read_csv('True.csv')
false = pd.read_csv('Fake.csv')

# Preview the first rows of each frame under its own heading
for heading, frame in (("True news sample:", true), ("\nFake news sample:", false)):
    print(heading)
    display(frame.head())

True news sample:


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"



Fake news sample:


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


## 2. Data Preprocessing

In [17]:
# Tag every row with its class: 1 for genuine articles, 0 for fake ones
false['label'] = 0
true['label'] = 1

# Stack both frames row-wise into a single dataset
news = pd.concat([true, false])

# Report the dataset size and the per-column count of missing values
null_counts = news.isnull().sum()
print(f"Total samples: {len(news)}")
print(f"\nNull values:\n{null_counts}")

Total samples: 44898

Null values:
title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [18]:
# Keep only the article body and its label, shuffle the rows with a fixed
# seed so the order is repeatable, then renumber the index from zero.
news = (
    news
    .drop(columns=['title', 'subject', 'date'])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

display(news.head())

Unnamed: 0,text,label
0,"Donald Trump s White House is in chaos, and th...",0
1,Now that Donald Trump is the presumptive GOP n...,0
2,Mike Pence is a huge homophobe. He supports ex...,0
3,SAN FRANCISCO (Reuters) - California Attorney ...,1
4,Twisted reasoning is all that comes from Pelos...,0


## 3. Text Cleaning Function

In [19]:
def wordopt(text):
    """Normalize a raw news article into lowercase, letters-only text.

    Steps, in order: lowercase, strip URLs, strip HTML-style tags, drop
    punctuation, drop digits, turn line breaks into spaces, and remove a
    few dateline tokens (``reuters``, ``washington``, ``london``,
    ``new york``).

    Parameters
    ----------
    text : str
        Raw article text. ``None`` and float NaN (the missing-value marker
        pandas uses in object columns) are treated as an empty article.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so runs of spaces
        may remain where tokens were removed.
    """
    # Missing values: None, or NaN (the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove URLs (must happen before punctuation is stripped, otherwise
    # the "://" and "." that identify a URL are already gone)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)

    # Replace line breaks with a space. Deleting them outright would fuse the
    # last word of one line with the first word of the next into a bogus token.
    text = re.sub(r'[\r\n]+', ' ', text)

    # Remove dateline tokens (news agency and city names), e.g. the
    # "WASHINGTON (Reuters) -" prefix visible in the True.csv sample
    text = re.sub(r'\b(reuters|washington|london|new york)\b', '', text)

    return text

# Pass every article body through wordopt and store the result back in place
news['text'] = news['text'].map(wordopt)
print("Text cleaning completed!")

Text cleaning completed!


## 4. Train-Test Split

In [20]:
# Features are the cleaned article bodies; targets are the 0/1 labels
x, y = news['text'], news['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

print(f"Training samples: {len(x_train)}")
print(f"Testing samples: {len(x_test)}")

Training samples: 31428
Testing samples: 13470


## 5. Text Vectorization

In [21]:
# Fit the TF-IDF vectorizer on the training articles only, then reuse the
# fitted vectorizer to encode the held-out articles with the same columns.
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

for split_name, matrix in (("Training", xv_train), ("Testing", xv_test)):
    print(f"{split_name} matrix shape: {matrix.shape}")

Training matrix shape: (31428, 175056)
Testing matrix shape: (13470, 175056)


## 6. Model Training and Evaluation
### 6.1 Logistic Regression

In [22]:
# Fit logistic regression on the TF-IDF training matrix and predict the test set
LR = LogisticRegression(solver='liblinear', C=0.1, max_iter=1000).fit(xv_train, y_train)
pred_lr = LR.predict(xv_test)

# Overall accuracy first, then the per-class precision/recall/F1 breakdown
lr_accuracy = LR.score(xv_test, y_test)
lr_report = classification_report(y_test, pred_lr)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
print("\nClassification Report:")
print(lr_report)

Logistic Regression Accuracy: 0.9684

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      7022
           1       0.96      0.98      0.97      6448

    accuracy                           0.97     13470
   macro avg       0.97      0.97      0.97     13470
weighted avg       0.97      0.97      0.97     13470



### 6.2 Decision Tree Classifier

In [23]:
# Fit a depth-limited decision tree on the TF-IDF training matrix and
# predict the test set; the seed makes the fitted tree repeatable.
DTC = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
).fit(xv_train, y_train)
pred_dtc = DTC.predict(xv_test)

# Overall accuracy first, then the per-class precision/recall/F1 breakdown
dtc_accuracy = DTC.score(xv_test, y_test)
dtc_report = classification_report(y_test, pred_dtc)
print(f"Decision Tree Accuracy: {dtc_accuracy:.4f}")
print("\nClassification Report:")
print(dtc_report)

Decision Tree Accuracy: 0.9342

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      7022
           1       0.93      0.93      0.93      6448

    accuracy                           0.93     13470
   macro avg       0.93      0.93      0.93     13470
weighted avg       0.93      0.93      0.93     13470



### 6.3 Random Forest Classifier

In [24]:
# Fit a 200-tree random forest (depth capped at 20) on the TF-IDF training
# matrix and predict the test set; the seed makes the ensemble repeatable.
RF = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    max_features='sqrt',
    random_state=42,
).fit(xv_train, y_train)
pred_rf = RF.predict(xv_test)

# Overall accuracy first, then the per-class precision/recall/F1 breakdown
rf_accuracy = RF.score(xv_test, y_test)
rf_report = classification_report(y_test, pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print("\nClassification Report:")
print(rf_report)

Random Forest Accuracy: 0.9712

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      7022
           1       0.97      0.97      0.97      6448

    accuracy                           0.97     13470
   macro avg       0.97      0.97      0.97     13470
weighted avg       0.97      0.97      0.97     13470



### 6.4 Gradient Boosting Classifier

In [25]:
# Fit gradient boosting with library-default hyperparameters (only the seed
# is set) on the TF-IDF training matrix and predict the test set.
GB = GradientBoostingClassifier(random_state=42).fit(xv_train, y_train)
pred_gb = GB.predict(xv_test)

# Overall accuracy first, then the per-class precision/recall/F1 breakdown
gb_accuracy = GB.score(xv_test, y_test)
gb_report = classification_report(y_test, pred_gb)
print(f"Gradient Boosting Accuracy: {gb_accuracy:.4f}")
print("\nClassification Report:")
print(gb_report)

Gradient Boosting Accuracy: 0.9685

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      7022
           1       0.96      0.97      0.97      6448

    accuracy                           0.97     13470
   macro avg       0.97      0.97      0.97     13470
weighted avg       0.97      0.97      0.97     13470



## 7. Model Comparison

In [26]:
# Score every fitted model on the same held-out matrix and rank by accuracy
fitted_models = {
    'Logistic Regression': LR,
    'Decision Tree': DTC,
    'Random Forest': RF,
    'Gradient Boosting': GB,
}

results = pd.DataFrame({
    'Model': list(fitted_models),
    'Accuracy': [clf.score(xv_test, y_test) for clf in fitted_models.values()],
}).sort_values('Accuracy', ascending=False)

display(results)

Unnamed: 0,Model,Accuracy
2,Random Forest,0.971195
3,Gradient Boosting,0.968523
0,Logistic Regression,0.968374
1,Decision Tree,0.934224


## 8. Prediction Function

In [27]:
def output_label(n):
    """Translate a numeric class label into its display string.

    Returns "Fake News" for 0, "Genuine News" for 1, and None for any
    other value.
    """
    return "Fake News" if n == 0 else ("Genuine News" if n == 1 else None)

def manual_testing(news):
    """Classify a single article with each of the four fitted models.

    The text is cleaned with ``wordopt``, encoded with the already-fitted
    ``vectorization`` object, and passed to ``LR``, ``DTC``, ``RF`` and
    ``GB`` (all notebook-level globals).

    Parameters
    ----------
    news : str
        Raw article text. Note that inside this function the parameter
        shadows the notebook-level ``news`` DataFrame.

    Returns
    -------
    dict
        Maps each model's display name to the string ``output_label``
        produces for that model's prediction.
    """
    cleaned_article = wordopt(news)
    # transform() expects an iterable of documents, hence the one-item list
    features = vectorization.transform([cleaned_article])

    classifiers = (
        ('Logistic Regression', LR),
        ('Decision Tree', DTC),
        ('Random Forest', RF),
        ('Gradient Boosting', GB),
    )
    return {
        display_name: output_label(clf.predict(features)[0])
        for display_name, clf in classifiers
    }

## 9. Test with Sample News

In [28]:
# Placeholder article: replace the string below with real text to classify
sample_news = "Enter your news article here"
predictions = manual_testing(sample_news)

# One line per model, in the order manual_testing returns them
print("Predictions from all models:")
for model_name, verdict in predictions.items():
    print(f"{model_name}: {verdict}")

Predictions from all models:
Logistic Regression: Fake News
Decision Tree: Fake News
Random Forest: Fake News
Gradient Boosting: Fake News
