In [1]:

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer


In [2]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC


In [3]:

from sklearn import metrics
from pandas_ml import ConfusionMatrix
from matplotlib import pyplot as plt
import itertools


In [4]:
df = pd.read_csv("D:/DATATHON/all/train.csv")

In [5]:
df.shape

(20800, 5)

In [6]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
df = df.set_index("id")
# df = df.set_index("Unnamed: 0")

In [8]:
df.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [9]:
df = df.drop(["title","author"], axis=1) ## droping output variable
df.head()

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [10]:
df.isnull().sum()

text     39
label     0
dtype: int64

In [11]:
df = df.dropna(how='any',axis=0) 

In [12]:
y = df.label  # output variable
df = df.drop(["label"], axis=1) ## droping output variable
df.head()

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...
1,Ever get the feeling your life circles the rou...
2,"Why the Truth Might Get You Fired October 29, ..."
3,Videos 15 Civilians Killed In Single US Airstr...
4,Print \nAn Iranian woman has been sentenced to...


In [13]:
# Make training and test sets 
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.33, random_state=53)

### Building Count Vectorizer

In [14]:
# Initialize the `count_vectorizer` 
count_vectorizer = CountVectorizer(stop_words='english')

In [15]:
count_train = count_vectorizer.fit(X_train)
count_train = count_vectorizer.transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [18]:

## Fitting Naive Baye's Classifier for Multinomial Model
clf = MultinomialNB()
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
score_cnt_nb = round(score,3)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=[1,0])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)


accuracy:   0.893
[[2823  603]
 [ 127 3299]]


In [20]:
## Fitting Passive Aggresive Classifier Model

linear_clf = PassiveAggressiveClassifier(n_iter=50)

linear_clf.fit(count_train, y_train)
pred = linear_clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
score_cnt_pa = round(score,3)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=[1,0])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)




accuracy:   0.940
[[3252  174]
 [ 234 3192]]


In [26]:

rf = RandomForestClassifier(random_state=100)
rf.fit(count_train, y_train)
pred = rf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
score_cnt_rf = round(score,3)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=[1,0])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)


accuracy:   0.854
[[2700  726]
 [ 277 3149]]


In [27]:

dt = DecisionTreeClassifier(random_state=100)
dt.fit(count_train, y_train)
pred = dt.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
score_cnt_dt = round(score,3)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=[1,0])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)


accuracy:   0.891
[[3094  332]
 [ 414 3012]]


In [22]:
lr = LogisticRegression()
lr.fit(count_train, y_train)
pred = lr.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
score_cnt_lr = round(score,3)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=[1,0])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.950
[[3269  157]
 [ 189 3237]]


In [24]:
ada = AdaBoostClassifier()
ada.fit(count_train, y_train)
pred = ada.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
score_cnt_ada = round(score,3)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=[1,0])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.926
[[3226  200]
 [ 308 3118]]


In [21]:
knn = KNeighborsClassifier()
knn.fit(count_train, y_train)
pred = knn.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
score_cnt_knn = round(score,3)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=[1,0])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.738
[[3332   94]
 [1699 1727]]


In [46]:
mlp = MLPClassifier(hidden_layer_sizes=(5,5,4)) # one hidden layer with one node,if hidden_layer_sizes=(30,30,30)-> 3 hidden layers with 30 nodes each
mlp.fit(count_train, y_train)
pred = mlp.predict(count_test)
score= metrics.accuracy_score(y_test, pred)
score_cnt_mlp=round(score,3)
#print(score_knn_tfidf)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=[1,0])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.738


NameError: name 'cm_knn_tfidf' is not defined

In [16]:
svm1 = SVC()
svm1.fit(count_train, y_train)
pred = svm1.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
score_cnt_svm1 = round(score,3)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=[1,0])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.762
[[3356   70]
 [1561 1865]]


In [28]:
acc = {'Passive Aggresive':score_cnt_pa,'Logistic':score_cnt_lr,'AdaBoost':score_cnt_ada,'NB':score_cnt_nb,
       'RF':score_cnt_rf,'DT':score_cnt_dt,'KNN':score_cnt_knn,"MLP_NN":score_cnt_mlp,"SVM":score_cnt_svm1}
acc1 = pd.DataFrame([acc],)
acc1.index=['Count Vectorization']

NameError: name 'score_cnt_knn' is not defined

In [32]:
acc = {'Passive Aggresive':score_cnt_pa,'Logistic':score_cnt_lr,'AdaBoost':score_cnt_ada,'NB':score_cnt_nb,
       'RF':score_cnt_rf,'DT':score_cnt_dt,"SVM":score_cnt_svm1}
acc1 = pd.DataFrame([acc],)
acc1.index=['Count Vectorization']
acc1

Unnamed: 0,AdaBoost,DT,Logistic,NB,Passive Aggresive,RF,SVM
Count Vectorization,0.926,0.891,0.95,0.893,0.94,0.854,0.762


### Building TF-IDF Vectorizer

In [25]:
# Initialize the `tfidf_vectorizer` 
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# # Fit the training Data
# tfidf_train = tfidf_vectorizer.fit([X_train[8476]])
# #print(tfidf_train.vocabulary_)
# print(tfidf_train.idf_)

In [26]:
# Initialize the `tfidf_vectorizer` 
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8)
# wich will not appering in all the words. here max_df represent the words accuring <80 of documents are considering.the 

# Fit the training Data
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_train = tfidf_vectorizer.transform(X_train)

#print(tfidf_train.vocabulary_)
#print(tfidf_train.idf_)
tfidf_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [61]:
## Fitting Passive Aggresive Classifier Model

linear_clf = PassiveAggressiveClassifier(n_iter=50)

linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score_pa_tfidf = metrics.accuracy_score(y_test, pred)
score_pa_tfidf = round(score_pa_tfidf,3)
print("accuracy:   %0.3f" % score)
cm_pa_tfidf = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)



accuracy:   0.544
[[827 181]
 [213 870]]


In [62]:
lr = LogisticRegression()
lr.fit(tfidf_train, y_train)
pred = lr.predict(tfidf_test)
score_lr_tfidf = metrics.accuracy_score(y_test, pred)
score_lr_tfidf = round(score_lr_tfidf,3)
print("accuracy:   %0.3f" % score)
cm_lr_tfidf = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.544
[[827 181]
 [213 870]]


In [63]:
ada = AdaBoostClassifier()
ada.fit(tfidf_train, y_train)
pred = ada.predict(tfidf_test)
score_ada_tfidf = metrics.accuracy_score(y_test, pred)
score_ada_tfidf = round(score_ada_tfidf,3)
print("accuracy:   %0.3f" % score)
cm_ada_tfidf = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.544
[[827 181]
 [213 870]]


In [64]:
## Fitting Naive Baye's Classifier for Multinomial Model
clf = MultinomialNB()

clf.fit(tfidf_train, y_train)

pred = clf.predict(tfidf_test)
score_nb_tfidf = metrics.accuracy_score(y_test, pred)
score_nb_tfidf = round(score_nb_tfidf,3)
print("accuracy:   %0.3f" % score)
cm_nb_tfidf = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.544
[[827 181]
 [213 870]]


In [65]:
rf = RandomForestClassifier(random_state=100)
rf.fit(tfidf_train, y_train)
pred = rf.predict(tfidf_test)
score_rf_tfidf = metrics.accuracy_score(y_test, pred)
score_rf_tfidf = round(score_rf_tfidf,3)
print("accuracy:   %0.3f" % score)
cm_rf_tfidf = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.544
[[827 181]
 [213 870]]


In [66]:
dt = DecisionTreeClassifier(random_state=100)
dt.fit(count_train, y_train)
pred = dt.predict(count_test)
score_dt_tfidf = metrics.accuracy_score(y_test, pred)
score_dt_tfidf = round(score_dt_tfidf,3)
print("accuracy:   %0.3f" % score)
cm_dt_tfidf = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.544
[[827 181]
 [213 870]]


In [67]:
knn = KNeighborsClassifier()
knn.fit(tfidf_train, y_train)
pred = knn.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
score_knn_tfidf=round(score,3)
#print(score_knn_tfidf)
print("accuracy:   %0.3f" % score)
cm_knn_tfidf = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm_knn_tfidf)

accuracy:   0.544
[[1006    2]
 [ 951  132]]


In [82]:

mlp = MLPClassifier(hidden_layer_sizes=(30)) # one hidden layer with one node,if hidden_layer_sizes=(30,30,30)-> 3 hidden layers with 30 nodes each
mlp.fit(tfidf_train, y_train)
pred = mlp.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
score_mlp_tfidf1=round(score,3)
#print(score_knn_tfidf)
print("accuracy:   %0.3f" % score)
cm_mlp_tfidf = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm_knn_tfidf)

accuracy:   0.899
[[944  64]
 [148 935]]




In [83]:
mlp = MLPClassifier(hidden_layer_sizes=(5,5,4)) # one hidden layer with one node,if hidden_layer_sizes=(30,30,30)-> 3 hidden layers with 30 nodes each
mlp.fit(tfidf_train, y_train)
pred = mlp.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
score_mlp_tfidf=round(score,3)
#print(score_knn_tfidf)
print("accuracy:   %0.3f" % score)
cm_mlp_tfidf = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm_knn_tfidf)

accuracy:   0.930
[[944  64]
 [148 935]]


In [84]:
acc = {'Passive Aggresive':score_pa_tfidf,'Logistic':score_lr_tfidf,'AdaBoost':score_ada_tfidf,'NB':score_nb_tfidf,
                             'RF':score_rf_tfidf,'DT':score_dt_tfidf,'KNN':score_knn_tfidf,"MLP_NN":score_mlp_tfidf}
acc1 = pd.DataFrame([acc],)
acc1.index=['Model Accuracy']

Unnamed: 0,AdaBoost,DT,KNN,Logistic,MLP_NN,NB,Passive Aggresive,RF
Model Accuracy,0.867,0.812,0.544,0.914,0.93,0.857,0.934,0.844


In [52]:
### Hashvector (its like count vectorizer but it wont generate vocabulary i.e dictionary of words it will 
###give numbering we cant back transforming like count vectorizer)

In [62]:
# Initialize the `hash_vectorizer` 
hashing_vectorizer = HashingVectorizer(stop_words='english', non_negative=True)

In [63]:
hash_train = hashing_vectorizer.fit(X_train)
hash_train = hashing_vectorizer.transform(X_train)
hash_test = hashing_vectorizer.transform(X_test)

In [64]:
## Fitting Naive Baye's Classifier for Multinomial Model
l_clf = MultinomialNB()

l_clf.fit(hash_train, y_train)

pred = l_clf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.852
[[ 733  275]
 [  34 1049]]


In [54]:
## Fitting Passive Aggresive Classifier Model

linear_clf = PassiveAggressiveClassifier(n_iter=50)

linear_clf.fit(hash_train, y_train)
pred = linear_clf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)

accuracy:   0.919
[[932  76]
 [ 94 989]]


In [61]:
print(hash_train)

  (1, 26381)	-0.1336306209562122
  (1, 46353)	0.1336306209562122
  (1, 76282)	-0.1336306209562122
  (1, 124604)	-0.1336306209562122
  (1, 271872)	0.2672612419124244
  (1, 354766)	0.1336306209562122
  (1, 355578)	-0.1336306209562122
  (1, 380136)	-0.2672612419124244
  (1, 399927)	-0.1336306209562122
  (1, 413315)	-0.2672612419124244
  (1, 421751)	-0.2672612419124244
  (1, 452780)	-0.1336306209562122
  (1, 506429)	-0.5345224838248488
  (1, 612563)	-0.1336306209562122
  (1, 615897)	0.1336306209562122
  (1, 626851)	0.1336306209562122
  (1, 639862)	0.1336306209562122
  (1, 691517)	-0.1336306209562122
  (1, 740856)	-0.1336306209562122
  (1, 777362)	-0.2672612419124244
  (1, 798576)	-0.1336306209562122
  (1, 907820)	0.2672612419124244
  (1, 1039472)	-0.1336306209562122
  (2, 14361)	-0.3333333333333333
  (2, 81229)	0.3333333333333333
  :	:
  (4243, 924171)	0.08478501284163323
  (4243, 934801)	0.028261670947211076
  (4243, 935153)	0.028261670947211076
  (4243, 935275)	0.028261670947211076
  (42