Importing all the required libraries

In [70]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

Reading the csv files

In [71]:
df_fake_path = '/Users/siddharth/Code/Python/FakeNewsDetection/data/Fake.csv'
df_true_path = '/Users/siddharth/Code/Python/FakeNewsDetection/data/True.csv'

df_fake = pd.read_csv(df_fake_path)
df_true = pd.read_csv(df_true_path)

In [72]:
df_fake.shape, df_true.shape

((23481, 4), (21417, 4))

Adding a column, 'reliability' to both dataframes to denote the credibility of the article (True - true news, False - fake news)

In [73]:
df_fake["reliability"] = False
df_true["reliability"] = True

Creating separate testing files, from origial dataframe, by choosing 10 random rows of data

In [74]:
df_fake_testing = df_fake.sample(10)
df_true_testing = df_true.sample(10)

df_fake_training = df_fake.drop(df_fake_testing.index)
df_true_training = df_true.drop(df_true_testing.index)

In [75]:
df_fake_training.shape, df_true_training.shape

((23471, 5), (21407, 5))

In [76]:
df_manual_testing = pd.concat([df_fake_testing,df_true_testing], axis = 0)
df_manual_testing = df_manual_testing.sample(frac = 1)
df_manual_testing.reset_index(inplace = True)
df_manual_testing.drop(["index"], axis = 1, inplace = True)
manual_testing_path = '/Users/siddharth/Code/Python/FakeNewsDetection/data/manual_testing.csv'
df_manual_testing.to_csv(manual_testing_path)

Merging the fake and true news testing dataframes

In [77]:
df = pd.concat([df_fake, df_true], axis =0)
df.head(10)

Unnamed: 0,title,text,subject,date,reliability
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",False
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",False
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",False
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",False
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",False
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017",False
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017",False
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017",False
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017",False
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017",False


Removing 'title', 'subject', 'date' columns from training set, since it does not play a role on reliability

In [78]:
df.columns

Index(['title', 'text', 'subject', 'date', 'reliability'], dtype='object')

In [79]:
df = df.drop(["title", "subject","date"], axis = 1)
df.head(10)

Unnamed: 0,text,reliability
0,Donald Trump just couldn t wish all Americans ...,False
1,House Intelligence Committee Chairman Devin Nu...,False
2,"On Friday, it was revealed that former Milwauk...",False
3,"On Christmas day, Donald Trump announced that ...",False
4,Pope Francis used his annual Christmas Day mes...,False
5,The number of cases of cops brutalizing and ki...,False
6,Donald Trump spent a good portion of his day a...,False
7,In the wake of yet another court decision that...,False
8,Many people have raised the alarm regarding th...,False
9,Just when you might have thought we d get a br...,False


Shuffling the data

In [80]:
df = df.sample(frac = 1)

In [81]:
df.head(10)

Unnamed: 0,text,reliability
15749,,False
401,WASHINGTON (Reuters) - The U.S. House of Repre...,True
19100,BRUSSELS (Reuters) - Police searched eight hou...,True
1822,"(Reuters) - Jamie Selway, the leading candidat...",True
144,"When we talk about getting rid of Trump, we al...",False
3700,"Megyn Kelly s new book, Settle For More, see...",False
19949,"Hillary is without a doubt, the worst and most...",False
8727,CLEVELAND (Reuters) - Ohio Governor John Kasic...,True
20277,Bad timing This is not great news for the Blac...,False
3552,Green Party presidential nominee Jill Stein ha...,False


In [82]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

In [83]:
df.head()

Unnamed: 0,text,reliability
0,,False
1,WASHINGTON (Reuters) - The U.S. House of Repre...,True
2,BRUSSELS (Reuters) - Police searched eight hou...,True
3,"(Reuters) - Jamie Selway, the leading candidat...",True
4,"When we talk about getting rid of Trump, we al...",False


Writing a function 'clean' to clean the text of the article by removing extra spaces, special characters, and converting to lower case.

In [84]:
def clean(t):
    t = t.lower()
    t = re.sub('\[.*?\]', '', t)
    t = re.sub("\\W"," ",t) 
    t = re.sub('https?://\S+|www\.\S+', '', t)
    t = re.sub('<.*?>+', '', t)
    t = re.sub('[%s]' % re.escape(string.punctuation), '', t)
    t = re.sub('\n', '', t)
    t = re.sub('\w*\d\w*', '', t)    
    return t

In [85]:
df["text"] = df["text"].apply(clean)

In [86]:
x = df['text']
y = df['reliability']

Splitting the cleaned data into training and testing sets

In [87]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

Converting the text to vectors

In [88]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

Logistic Regression model

In [89]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [90]:
lr_prediction=LR.predict(xv_test)

In [91]:
LR.score(xv_test, y_test)

0.9870824053452116

In [92]:
print(classification_report(y_test, lr_prediction))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99      5920
        True       0.99      0.99      0.99      5305

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



Decision Tree Classification

In [93]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [94]:
dt_prediction = DT.predict(xv_test)

In [95]:
DT.score(xv_test, y_test)

0.995011135857461

In [96]:
print(classification_report(y_test, dt_prediction))

              precision    recall  f1-score   support

       False       0.99      1.00      1.00      5920
        True       1.00      0.99      0.99      5305

    accuracy                           1.00     11225
   macro avg       1.00      0.99      0.99     11225
weighted avg       1.00      1.00      1.00     11225



Gradient Boosting Classifier

In [99]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

In [100]:
gbc_prediction = GBC.predict(xv_test)

In [101]:
GBC.score(xv_test, y_test)

0.995902004454343

In [102]:
print(classification_report(y_test, gbc_prediction))

              precision    recall  f1-score   support

       False       1.00      0.99      1.00      5920
        True       0.99      1.00      1.00      5305

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225



Random Forest Classifier

In [103]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [104]:
rfc_prediction = RFC.predict(xv_test)

In [105]:
RFC.score(xv_test, y_test)

0.9870824053452116

In [106]:
print(classification_report(y_test, rfc_prediction))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99      5920
        True       0.99      0.99      0.99      5305

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



Manual Testing

In [147]:
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(clean) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    LR_predictions = LR.predict(new_xv_test)
    DT_predictions = DT.predict(new_xv_test)
    GBC_predictions = GBC.predict(new_xv_test)
    RFC_predictions = RFC.predict(new_xv_test)
    count_f,count_t = 0,0
    for i in [LR_predictions[0], DT_predictions[0], GBC_predictions[0], RFC_predictions[0]]:
        if i:
            count_t+=1
        else:
            count_f+=1
    if count_f>count_t:
        p = int((count_f/4)*100)
    else:
        p = int((count_t/4)*100)
    if  not LR_predictions[0]:
        return [f'{p}% of the regression models have predicted that the article is fake or unreliable.', False, p]
    else:
        return [f'{p}% of the regression models have predicted that the article is true and reliable.', True, p]

In [150]:
testing = pd.read_csv(manual_testing_path)
txt, truth, percent = '',False,0
score=0
for i in range(20):
    txt, truth, percent = manual_testing(testing.at[i,'text'])
    print('For the article: \"' + testing.at[i,'title'] + '\", ' + txt)
    if truth == testing.at[i,'reliability']:
        score+=1
print(f'The score of the regression models is {score}/20')

For the article: " Well It Happened, Trump Gets Desperate And Goes Full BENGHAZI On Hillary (TWEET)", 100% of the regression models have predicted that the article is fake or unreliable.
For the article: "Trump's prediction of 'massive recession' puzzles economists", 100% of the regression models have predicted that the article is true and reliable.
For the article: " ‘Piece Of Sh*t’: GOP Senator Immediately Regrets Trolling Bernie Sanders Over Trumpcare", 100% of the regression models have predicted that the article is fake or unreliable.
For the article: "Ex-U.S. Attorney Bharara, fired by Trump, joins NYU law school", 100% of the regression models have predicted that the article is true and reliable.
For the article: "COINTEL PRO: Are ‘Anti-Fascist’ Media Personalities Playing to the Cameras?", 100% of the regression models have predicted that the article is fake or unreliable.
For the article: "IT’S COME TO THIS: Leftist Media and Protesters Go For The Jugular In All Out Freak Out 