In [1]:
import pandas as pd
import numpy as np
import re
import string 

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df_fake = pd.read_csv("Fake.csv")
df_true = pd.read_csv("True.csv")

In [3]:
df_fake.head(10)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017"
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017"
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017"
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017"
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017"


In [4]:
df_true.head(10)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017"


### Removing few data for manual testing later 


In [5]:
df_fake["class"] = 0
df_true["class"] = 1

In [6]:
df_fake_manual = df_fake.tail(10)
df_true_manual = df_true.tail(10)

In [7]:
df_fake.shape, df_true.shape

((23481, 5), (21417, 5))

In [8]:
for i in range(23480, 23470, -1):
    df_fake.drop(i, axis=0, inplace=True)
    
for i in range(21416, 21406, -1):
    df_true.drop(i, axis=0, inplace=True)

In [9]:
df_fake.shape, df_true.shape

((23471, 5), (21407, 5))

In [10]:
df_manual = pd.concat([df_fake_manual, df_true_manual], axis=0)
df_manual.to_csv("Manual_testing.csv")

In [11]:
df_merge = pd.concat([df_fake, df_true], axis=0)
df = df_merge.drop(["title", "subject", "date"], axis=1)
df.head(10)

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
5,The number of cases of cops brutalizing and ki...,0
6,Donald Trump spent a good portion of his day a...,0
7,In the wake of yet another court decision that...,0
8,Many people have raised the alarm regarding th...,0
9,Just when you might have thought we d get a br...,0


In [12]:
df = df.sample(frac=1, random_state=1)
df.head(10)

Unnamed: 0,text,class
16085,CARACAS (Reuters) - Three of Venezuela s large...,1
22352,Patrick Henningsen 21st Century WireSo far as ...,0
5959,WASHINGTON (Reuters) - U.S. Senator John McCai...,1
11695,GENEVA (Reuters) - The U.N. s freedom of speec...,1
9601,WASHINGTON (Reuters) - Democrats in the U.S. H...,1
21756,GET OVER YOURSELF! MOOCH PLAYS THE RACE CARD A...,0
5490,It s always depressing when allegations that p...,0
14362,BUDAPEST (Reuters) - Hungary is facing a front...,1
18514,BAGHDAD (Reuters) - Iraqi forces and Shi ite p...,1
18045,The Anheuser-Busch Brewery put beer production...,0


In [13]:
# checking if there are any null data
df.isnull().sum()

text     0
class    0
dtype: int64

In [14]:
# removing uneccary symbols in the text

def edit_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('\\W', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)    
    text = re.sub('\n', '', text)
#     text = re.sub('\w*\d\w*', '', text)
    return text

In [15]:
df['text'] = df['text'].apply(edit_text)

In [16]:
df.head(10)

Unnamed: 0,text,class
16085,caracas reuters three of venezuela s large...,1
22352,patrick henningsen 21st century wireso far as ...,0
5959,washington reuters u s senator john mccai...,1
11695,geneva reuters the u n s freedom of speec...,1
9601,washington reuters democrats in the u s h...,1
21756,get over yourself mooch plays the race card a...,0
5490,it s always depressing when allegations that p...,0
14362,budapest reuters hungary is facing a front...,1
18514,baghdad reuters iraqi forces and shi ite p...,1
18045,the anheuser busch brewery put beer production...,0


In [17]:
x = df['text']
y = df['class']

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [19]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

#### Logistic Regression 

In [20]:
LR = LogisticRegression()
LR.fit(xv_train, y_train)

LogisticRegression()

In [21]:
LR.score(xv_test, y_test)

0.9872549019607844

In [22]:
pred_LR = LR.predict(xv_test)
print(classification_report(y_test, pred_LR))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5833
           1       0.99      0.99      0.99      5387

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



#### Decision Tree Classification 

In [23]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [24]:
DT.score(xv_test, y_test)

0.9967023172905526

In [25]:
pred_DT = DT.predict(xv_test)
print(classification_report(y_test, pred_DT))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5833
           1       1.00      1.00      1.00      5387

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



#### Gradient Boosting Classifier

In [26]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

In [27]:
GBC.score(xv_test, y_test)

0.996524064171123

In [28]:
pred_GBC = GBC.predict(xv_test)
print(classification_report(y_test, pred_GBC))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5833
           1       0.99      1.00      1.00      5387

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



#### Random Forest Classifier

In [29]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [30]:
RFC.score(xv_test, y_test)

0.9894830659536542

In [31]:
pred_RFC = RFC.predict(xv_test)
print(classification_report(y_test, pred_RFC))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5833
           1       0.99      0.99      0.99      5387

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220

