In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [None]:
data_fake = pd.read_csv('Fake.csv')
data_true = pd.read_csv('True.csv')

In [None]:
data_fake.head()

In [None]:
data_true.head()

In [None]:
data_fake["class"] = 0
data_true["class"] = 1

In [None]:
data_fake.shape, data_true.shape

In [None]:
data_fake_testing = data_fake.tail(10)
for i in range(23480, 23470, -1):
    data_fake.drop([i], axis=0, inplace = True)
    
data_true_testing = data_true.tail(10)
for i in range(21416, 21406, -1):
    data_true.drop([i], axis=0, inplace = True)

In [None]:
data_fake.shape, data_true.shape

In [None]:
data_fake_testing["class"] = 0
data_true_testing["class"] = 1

In [None]:
data_fake_testing.head()

In [None]:
data_true_testing.head()

In [12]:
data_merge = pd.concat([data_fake, data_true], axis = 0)
data_merge.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [13]:
data_merge.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [14]:
data = data_merge.drop(['title','subject','date'], axis=1)

In [15]:
data.isnull().sum()

text     0
class    0
dtype: int64

In [16]:
data = data.sample(frac = 1)

In [17]:
data.head()

Unnamed: 0,text,class
4244,(Reuters) - Highlights for U.S. President Dona...,1
11505,MADRID (Reuters) - Spanish Prime Minister Mari...,1
17653,HARARE (Reuters) - Zimbabwe s ruling ZANU-PF i...,1
17332,The Ferguson #BlackLivesMatter protesters are ...,0
11670,"I can t possibly accept this award, she said...",0


In [18]:
data.reset_index(inplace = True)
data.drop(['index'], axis=1, inplace=True)

In [19]:
data.columns

Index(['text', 'class'], dtype='object')

In [20]:
data.head()

Unnamed: 0,text,class
0,(Reuters) - Highlights for U.S. President Dona...,1
1,MADRID (Reuters) - Spanish Prime Minister Mari...,1
2,HARARE (Reuters) - Zimbabwe s ruling ZANU-PF i...,1
3,The Ferguson #BlackLivesMatter protesters are ...,0
4,"I can t possibly accept this award, she said...",0


In [None]:
data['text'][20]

In [21]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' %re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [22]:
data['text'] = data['text'].apply(wordopt)

In [23]:
x = data['text']
y = data['class']

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [26]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(xv_train, y_train)

In [27]:
pred_lr = lr.predict(xv_test)

In [28]:
lr.score(xv_test, y_test)

0.9868983957219252

In [29]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5913
           1       0.98      0.99      0.99      5307

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [30]:
from sklearn.tree import DecisionTreeClassifier
df = DecisionTreeClassifier()
df.fit(xv_train, y_train)

In [31]:
pred_df = df.predict(xv_test)

In [32]:
df.score(xv_test, y_test)

0.9953654188948307

In [33]:
print(classification_report(y_test, pred_df))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5913
           1       1.00      0.99      1.00      5307

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [34]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state = 0)
gb.fit(xv_train, y_train)

In [35]:
pred_gb = gb.predict(xv_test)

In [36]:
gb.score(xv_test, y_test)

0.9946524064171123

In [37]:
print(classification_report(y_test, pred_gb))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      5913
           1       0.99      1.00      0.99      5307

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [38]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 0)
rf.fit(xv_train, y_train)

In [39]:
pred_rf = rf.predict(xv_test)

In [40]:
rf.score(xv_test, y_test)

0.988680926916221

In [41]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5913
           1       0.99      0.99      0.99      5307

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [42]:
def output(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not a Fake News"

In [43]:
def manual_testing(news):
    testing_news = {"text":[news]}
    new_test = pd.Dataframe(testing_news)
    new_test["text"] = new_test["text"].apply(wordopt)
    new_x_test = new_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_lr = lr.predict(new_xv_test)
    pred_df = df.predict(new_xv_test)
    pred_gb = gb.predict(new_xv_test)
    pred_rf = rf.predict(new_xv_test)
    
    return print("\n\nLR Prediction: {} \nDF Prediction: {} \nGB Prediction: {} \nRF Prediction: {}".format(output(pred_lr[0]),
                                                                                                           output(pred_df[0]),
                                                                                                           output(pred_gb[0]),
                                                                                                           output(pred_rf[0])))

In [None]:
news = str(input())
manual_testing(news)