In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: preprocess data 

In [2]:
fake_news = pd.read_csv('./data/Fake.csv')
real_news = pd.read_csv('./data/True.csv')

In [3]:
# lets add the categories 
fake_news['label'] = [1 for i in range(len(fake_news))]
real_news['label'] = [0 for i in range(len(real_news))]
# lets merge the title and the text 
fake_news['full_text'] = fake_news.apply(lambda row: row['title']+row['text'], axis=1)
real_news['full_text'] = fake_news.apply(lambda row: row['title']+row['text'], axis=1)

In [4]:
fake_news

Unnamed: 0,title,text,subject,date,label,full_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,Pope Francis Just Called Out Donald Trump Dur...
...,...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",1,McPain: John McCain Furious That Iran Treated ...
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",1,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",1,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",1,How to Blow $700 Million: Al Jazeera America F...


In [5]:
full_dataset = pd.concat([fake_news, real_news])

In [6]:
full_dataset

Unnamed: 0,title,text,subject,date,label,full_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,Pope Francis Just Called Out Donald Trump Dur...
...,...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0,SHOCKING TAXPAYER TAB FOR OBAMA’S GOLF Trips A...
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0,WOW! WORLD’S TOP PHYSICIST AND DEMOCRAT: Obama...
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0,URGENT! Join #AntiHillaryFlashMob Rally Agains...
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0,IS LAW AND ORDER SVU PANDERING To Cop Hating M...


# Step 2: Split into train, dev and test

In [10]:
# split later 
X_train, X_test_dev, Y_train, Y_test_dev = train_test_split(full_dataset, full_dataset['label'], test_size=0.20, random_state=42)
X_dev, X_test, Y_dev, Y_test = train_test_split(full_dataset, full_dataset['label'], test_size=0.50, random_state=42)

In [11]:
print(len(X_train))
print(len(X_dev))
print(len(X_test))

print(Y_train)

35918
22449
22449
12854    0
12384    1
938      0
1259     0
3558     0
        ..
11284    1
21251    0
14677    0
860      1
15795    1
Name: label, Length: 35918, dtype: int64


In [12]:
X_train = X_train['full_text'].tolist()
X_dev = X_dev['full_text'].tolist()
X_test = X_test['full_text'].tolist()

Ytrain = Y_train.tolist()
Ydev = Y_dev.tolist()
Ytest = Y_test.tolist()

# Step 3: Vectorize data and predict  

In [28]:
vec = TfidfVectorizer()

# combine the vectorizer with a Naive Bayes classifier
classifier = Pipeline( [('vec', vec),
                        ('cls', LinearSVC())] )

# train model (with naive bayes classifier)
# X are the instances/documents of training data, Y are the labels of training data
classifier.fit(X_train, Y_train)

# predict labels on the documents (X) of the test-set
predictions = classifier.predict(X_dev)
print(predictions)

[1 0 1 ... 1 1 1]


# Step 4: analyze predictions

In [26]:
print(classification_report(Y_dev,predictions))
print(accuracy_score(Ydev, predictions))

              precision    recall  f1-score   support

           0       0.59      0.57      0.58     10651
           1       0.62      0.65      0.64     11798

    accuracy                           0.61     22449
   macro avg       0.61      0.61      0.61     22449
weighted avg       0.61      0.61      0.61     22449

0.6092476279567018


In [27]:
confusion_matrix(Y_dev, predictions)

array([[6032, 4619],
       [4153, 7645]])