In [8]:
import pandas as pd
import numpy as np

In [9]:
#Importing the different datasets
dataset1 = pd.read_csv('news_false_final.tsv', delimiter = '\t\t', quoting = 3)
dataset2 = pd.read_csv('news_true_final.tsv', delimiter = '\t\t', quoting = 3)
dataset3= pd.read_csv('fake_or_real_news.csv').iloc[:,1:4]
dataset3 = dataset3[dataset3['Label'].isin(['True','False'])]

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
#Merging the Datasets
dataset=dataset1.append(dataset2, ignore_index=True)
dataset=dataset.append(dataset3, ignore_index=True)

In [23]:
dataset.head()

Unnamed: 0,Headline,Label,Message
0,Alexandria Ocasio-Cortez misrepresents ICE’s d...,False,"""ICE is required to fill 34,000 beds with deta..."
1,"No, it's not correct that 39% of California st...",False,"""39% of All California Students are illegals."""
2,Viral image overstates births to undocumented ...,False,"""More than 66% of ALL births in California are..."
3,Donald Trump off-base in describing GDP growth...,False,"""Watch those GDP numbers. We started off at a ..."
4,"Donald Trump wrong that Mercedes, BMW import c...",False,"""The European Union … they send us Mercedes, t..."


In [15]:
dataset=dataset.dropna()
dataset=dataset.reset_index(drop=True)

In [19]:
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, len(dataset)):
    news = re.sub('[^a-zA-Z]', ' ', dataset['Headline'][i]+dataset['Message'][i])
    news = news.lower()
    news = news.split()
    ps = PorterStemmer()
    news = [ps.stem(word) for word in news if not word in set(stopwords.words('english'))]
    news = ' '.join(news)
    corpus.append(news)
    #For viewing progress
    if i*100.0/len(dataset) in range(1,101):
        print i*100.0/len(dataset),"% Completed"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pratibha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
5.0 % Completed
10.0 % Completed
15.0 % Completed
20.0 % Completed
25.0 % Completed
30.0 % Completed
35.0 % Completed
40.0 % Completed
45.0 % Completed
50.0 % Completed
55.0 % Completed
60.0 % Completed
65.0 % Completed
70.0 % Completed
75.0 % Completed
80.0 % Completed
85.0 % Completed
90.0 % Completed
95.0 % Completed


In [24]:
dataset.shape

(10540, 3)

In [25]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2000)
X = cv.fit_transform(corpus).toarray()

In [26]:
#Encoding dependent Variable
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = dataset.iloc[:, 1].values
y = labelencoder_y.fit_transform(y)


for i in range(0,len(dataset)):
    if(y[i]==2):
        y[i]=0
    if(y[i]==3):
        y[i]=1

In [27]:
#Splitting dataset into training and testing set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)



In [28]:
#With the logistic regression
print "\nResults with Logistic Regression "
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression(random_state=0)
classifier.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print cm

TN=cm[0][0]
FP=cm[0][1]
TP=cm[1][1]
FN=cm[1][0]
print "FP:",FP,"   FN:",FN
print "TP:",TP,"   TN:",TN
accuracy =(float) (TP + TN) / (TP + TN + FP + FN)
print "Accuracy:",accuracy
precision =(float)(TP) / (TP + FP)
print "Precision:",precision
recall = (float)(TP) / (TP + FN)
print "Recall:",recall
f1_Score =(float) (2 * precision * recall) / (precision + recall)
print "F1 Score:",f1_Score


Results with Logistic Regression 
[[843 222]
 [267 776]]
FP: 222    FN: 267
TP: 776    TN: 843
Accuracy: 0.7680265654648957
Precision: 0.7775551102204409
Recall: 0.7440076701821668
F1 Score: 0.7604115629593337


In [29]:
#With the Naive Bayes Classifier
print "\nResults with Naive Bayes Classifier "
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
TN=cm[0][0]
FP=cm[0][1]
TP=cm[1][1]
FN=cm[1][0]
print "FP:",FP,"   FN:",FN
print "TP:",TP,"   TN:",TN
accuracy =(float) (TP + TN)/ (TP + TN + FP + FN)
print "Accuracy:",accuracy
precision =(float)(TP) / (TP + FP)
print "Precision:",precision
recall = (float)(TP) / (TP + FN)
print "Recall:",recall
f1_Score =(float) (2 * precision * recall) / (precision + recall)
print "F1 Score:",f1_Score


Results with Naive Bayes Classifier 
FP: 107    FN: 518
TP: 525    TN: 958
Accuracy: 0.7035104364326376
Precision: 0.8306962025316456
Recall: 0.5033557046979866
F1 Score: 0.6268656716417911


In [30]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
TN=cm[0][0]
FP=cm[0][1]
TP=cm[1][1]
FN=cm[1][0]
print "FP:",FP,"   FN:",FN
print "TP:",TP,"   TN:",TN
accuracy =(float) (TP + TN)/ (TP + TN + FP + FN)
print "Accuracy:",accuracy
precision =(float)(TP) / (TP + FP)
print "Precision:",precision
recall = (float)(TP) / (TP + FN)
print "Recall:",recall
f1_Score =(float) (2 * precision * recall) / (precision + recall)
print "F1 Score:",f1_Score

FP: 142    FN: 305
TP: 738    TN: 923
Accuracy: 0.7879506641366224
Precision: 0.8386363636363636
Recall: 0.7075743048897412
F1 Score: 0.7675507020280812


In [31]:
#With the SVM Classifier
print "\nResults with SVM Classifier"
from sklearn.svm import SVC
classifier = SVC(kernel='rbf',random_state=0)
classifier.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
TN=cm[0][0]
FP=cm[0][1]
TP=cm[1][1]
FN=cm[1][0]
print "FP:",FP,"   FN:",FN
print "TP:",TP,"   TN:",TN
accuracy = (float) (TP + TN) / (TP + TN + FP + FN)
print "Accuracy:",accuracy
precision =(float)(TP) / (TP + FP)
print "Precision:",precision
recall = (float)(TP) / (TP + FN)
print "Recall:",recall
f1_Score =(float) (2 * precision * recall) / (precision + recall)
print "F1 Score:",f1_Score


Results with SVM Classifier
FP: 142    FN: 305
TP: 738    TN: 923
Accuracy: 0.7879506641366224
Precision: 0.8386363636363636
Recall: 0.7075743048897412
F1 Score: 0.7675507020280812
