# Fake News Detection

## Suvodeep Das

#### Importing required library

In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

### Loading the data

In [40]:
df = pd.read_csv('news.csv')

In [41]:
df.head(3) 

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL


In [42]:
df.shape

(6298, 4)

# Data Preprocessing

In [43]:
df.isna().sum() #Checking for null values

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [44]:
df.label = df.label.replace({'REAL': 1, 'FAKE': 0}) #Replacing the labels with numbers
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1
...,...,...,...,...
6293,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,1
6294,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,0
6295,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,0
6296,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",1


In [45]:
df['News'] = df['title']+df['text'] #Combining the title and text column and putting them in a new column 'News'
df.head(3)

Unnamed: 0.1,Unnamed: 0,title,text,label,News
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,"You Can Smell Hillary’s FearDaniel Greenfield,..."
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,Kerry to go to Paris in gesture of sympathyU.S...


Dropping the feature columns that are not required

In [46]:
features_dropped = ['title','text']
df = df.drop(features_dropped, axis =1)
df.head(3)

Unnamed: 0.1,Unnamed: 0,label,News
0,8476,0,"You Can Smell Hillary’s FearDaniel Greenfield,..."
1,10294,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,1,Kerry to go to Paris in gesture of sympathyU.S...


# Text Processing

 Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [47]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [48]:
df["News"] = df["News"].apply(wordopt) 

In [49]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,News
0,8476,0,you can smell hillary s feardaniel greenfield ...
1,10294,0,watch the exact moment paul ryan committed pol...
2,3608,1,kerry to go to paris in gesture of sympathyu s...
3,10142,0,bernie supporters on twitter erupt in anger ag...
4,875,1,the battle of new york why this primary matte...


# Splitting dataset for training and testing

In [50]:
X = df['News']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

### Converting text data into vectors

In [51]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(X_train)
xv_test = vectorization.transform(X_test)

# Model Fitting

### 1. Logistic Regression

In [52]:
from sklearn.linear_model import LogisticRegression

In [53]:
LR = LogisticRegression()
LR.fit(xv_train, y_train) #Fitting training set to the model

LogisticRegression()

In [54]:
#Predicting test results based on our model
pred_lr=LR.predict(xv_test)

In [55]:
#Checking the accuracy score
LR.score(xv_test, y_test)

0.9092063492063492

In [56]:
#Evaluating the performance of the model
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.89      0.94      0.91       780
           1       0.93      0.88      0.91       795

    accuracy                           0.91      1575
   macro avg       0.91      0.91      0.91      1575
weighted avg       0.91      0.91      0.91      1575



### 2. Decision Tree Classification

In [57]:
from sklearn.tree import DecisionTreeClassifier

In [58]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train) #Fitting training set to the model

DecisionTreeClassifier()

In [59]:
#Predicting test results based on our model
pred_dt = DT.predict(xv_test)

In [60]:
#Checking the accuracy score
DT.score(xv_test, y_test)

0.7898412698412698

In [61]:
#Evaluating the performance of the model
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.77      0.81      0.79       780
           1       0.81      0.77      0.79       795

    accuracy                           0.79      1575
   macro avg       0.79      0.79      0.79      1575
weighted avg       0.79      0.79      0.79      1575



### 3. Gradient Boosting Classifier

In [62]:
from sklearn.ensemble import GradientBoostingClassifier

In [63]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train) #Fitting training set to the model

GradientBoostingClassifier(random_state=0)

In [64]:
#Predicting test results based on our model
pred_gbc = GBC.predict(xv_test)

In [65]:
#Checking the accuracy score
GBC.score(xv_test, y_test)

0.893968253968254

In [66]:
#Evaluating the performance of the model
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       0.88      0.91      0.89       780
           1       0.91      0.88      0.89       795

    accuracy                           0.89      1575
   macro avg       0.89      0.89      0.89      1575
weighted avg       0.89      0.89      0.89      1575



### 4. Random Forest Classifier

In [67]:
from sklearn.ensemble import RandomForestClassifier

In [68]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train) #Fitting training set to the model

RandomForestClassifier(random_state=0)

In [69]:
#Predicting test results based on our model
pred_rfc = RFC.predict(xv_test)

In [70]:
#Checking the accuracy score
RFC.score(xv_test, y_test)

0.8926984126984127

In [71]:
#Evaluating the performance of the model
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.88      0.91      0.89       780
           1       0.91      0.88      0.89       795

    accuracy                           0.89      1575
   macro avg       0.89      0.89      0.89      1575
weighted avg       0.89      0.89      0.89      1575



# Model Testing With Manual Entry

### News

In [72]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)

    return print("\n\nLogistic Regression Prediction: {} \nDecision Tree Prediction: {} \nGradient Boosting Prediction: {} \nRandom Forest Prediction: {}".format(output_lable(pred_LR[0]), 
                                                                                                              output_lable(pred_DT[0]), 
                                                                                                              output_lable(pred_GBC[0]), 
                                                                                                              output_lable(pred_RFC[0])))

In [73]:
news = str(input())
manual_testing(news)



Logistic Regression Prediction: Not A Fake News 
Decision Tree Prediction: Not A Fake News 
Gradient Boosting Prediction: Not A Fake News 
Random Forest Prediction: Not A Fake News
