In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [2]:
# Importing Dataset
dataframe_fake = pd.read_csv("/content/drive/MyDrive/Fake.csv")
dataframe_true = pd.read_csv("/content/drive/MyDrive/True.csv")

In [3]:
#checking records in fake news
dataframe_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
#checking records in True news
dataframe_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


Adding new column for classifying fake and true news

0-->True news

1-->Fake news

In [5]:
# Adding "class" column to differentiate between fake (1) and true (0) news
dataframe_true["class"] = 0
dataframe_fake["class"] = 1

In [6]:
# Sample a subset of fake news to match the size of true news
dataframe_fake = dataframe_fake.sample(n=dataframe_true.shape[0], random_state=42)

In [7]:
dataframe_fake.shape

(21417, 5)

In [8]:
dataframe_true.shape

(21417, 5)

In [9]:
#checking records in True news
dataframe_true.head()

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [10]:
#checking records in fake news
dataframe_fake.head()

Unnamed: 0,title,text,subject,date,class
13474,ABOUT HILLARY’S COUGH: We Discovered The Secre...,,politics,"Jul 20, 2016",1
11994,BREAKING: OBAMACARE REPEAL Clears First Hurdle...,The Senate voted 51-48 this afternoon to proce...,politics,"Jan 4, 2017",1
19179,‘SLEEPY’ JUSTICE GINSBURG: Excites Crowd By Sa...,So much for the SCOTUS not being political Che...,left-news,"Feb 7, 2017",1
501,WATCH: Kellyanne Conway Very Upset Hillary Cl...,White House counselor Kellyanne Conway crawled...,News,"August 24, 2017",1
3492,"GOP Gives Trump The Middle Finger, Prepares T...",Donald Trump may have decided that Russia is g...,News,"December 9, 2016",1


In [11]:
# Concatenating both datasets
dataframe_merge = pd.concat([dataframe_fake, dataframe_true], axis=0)

In [12]:
# Removing unnecessary columns
dataframe = dataframe_merge.drop(["title", "subject", "date"], axis=1)

In [13]:
# Shuffle the rows of the dataset to introduce randomness
dataframe = dataframe.sample(frac=1)

In [14]:
# Resetting index after shuffling
dataframe.reset_index(inplace=True)
dataframe.drop(["index"], axis=1, inplace=True)

In [15]:
# Function to preprocess text data
def wordopt(t):
    t = t.lower()
    t = re.sub('\[.*?\]', '', t)
    t = re.sub("\\W", " ", t)
    t = re.sub('https?://\S+|www\.\S+', '', t)
    t = re.sub('<.*?>+', '', t)
    t = re.sub('[%s]' % re.escape(string.punctuation), '', t)
    t = re.sub('\n', '', t)
    t = re.sub('\w*\d\w*', '', t)
    return t

In [16]:
# Applying text preprocessing function to the "text" column
dataframe["text"] = dataframe["text"].apply(wordopt)

In [17]:
# Defining independent and dependent variables
x = dataframe["text"]
y = dataframe["class"]

In [18]:
# Splitting the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [19]:
# Text vectorization using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

MODEL TRAINING

In [20]:
# 1. Logistic Regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(xv_train,y_train)

In [21]:
# 2. Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

In [22]:
# 3. Gradient Boost Classifier
from sklearn.ensemble import GradientBoostingClassifier
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

In [23]:
# 4. Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

Model Evaluation

In [24]:
# 1. Logistic Regression
pred_lr = LR.predict(xv_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr))
print("Logistic Regression Classification Report:")
print(classification_report(y_test, pred_lr))

Logistic Regression Accuracy: 0.9844990195162947
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      5303
           1       0.99      0.98      0.98      5406

    accuracy                           0.98     10709
   macro avg       0.98      0.98      0.98     10709
weighted avg       0.98      0.98      0.98     10709



In [25]:
# 2. Decision Tree Classifier
pred_dt = DT.predict(xv_test)
print("Decision Tree Classifier Accuracy:", accuracy_score(y_test, pred_dt))
print("Decision Tree Classifier Classification Report:")
print(classification_report(y_test, pred_dt))

Decision Tree Classifier Accuracy: 0.9952376505742833
Decision Tree Classifier Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5303
           1       1.00      1.00      1.00      5406

    accuracy                           1.00     10709
   macro avg       1.00      1.00      1.00     10709
weighted avg       1.00      1.00      1.00     10709



In [26]:
# 3. Gradient Boost Classifier
pred_gbc = GBC.predict(xv_test)
print("Gradient Boost Classifier Accuracy:", accuracy_score(y_test, pred_gbc))
print("Gradient Boost Classifier Classification Report:")
print(classification_report(y_test, pred_gbc))

Gradient Boost Classifier Accuracy: 0.9955177887757961
Gradient Boost Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5303
           1       1.00      0.99      1.00      5406

    accuracy                           1.00     10709
   macro avg       1.00      1.00      1.00     10709
weighted avg       1.00      1.00      1.00     10709



In [27]:
# 4. Random Forest Classifier
pred_rfc = RFC.predict(xv_test)
print("Random Forest Classifier Accuracy:", accuracy_score(y_test, pred_rfc))
print("Random Forest Classifier Classification Report:")
print(classification_report(y_test, pred_rfc))

Random Forest Classifier Accuracy: 0.986459986926884
Random Forest Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5303
           1       0.99      0.98      0.99      5406

    accuracy                           0.99     10709
   macro avg       0.99      0.99      0.99     10709
weighted avg       0.99      0.99      0.99     10709



In [28]:
# Manual Testing Function
def output_label(n):
    if n == 0:
        return "True News"
    elif n == 1:
        return "Fake News"

In [29]:
def manual_testing(news):
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt)
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)

    print("\nLogistic Regression Prediction:", output_label(pred_LR[0]))
    print("Decision Tree Prediction:", output_label(pred_DT[0]))
    print("Gradient Boost Prediction:", output_label(pred_GBC[0]))
    print("Random Forest Prediction:", output_label(pred_RFC[0]))

In [None]:
# Manual Testing
news_input = str(input("Enter the news text for manual testing: "))
manual_testing(news_input)