In [2]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score,f1_score

In [3]:
#Read the data
df=pd.read_csv(r'E:\SEM 4\AML\PROJECT\DATASET\archive (4)\fake_or_real_news.csv')

#Get shape and head
df.shape
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [5]:
df.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [6]:
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [7]:
#DataFlair - Get the labels
labels=df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [8]:
df1 = df['title'] +"\n"+ df['text']
df1 = df1[:100]

In [9]:
from nltk.corpus import stopwords
import string
def remove_stops(text, stops):
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = text.split()
    final = []
    for word in words:
        if word not in stops:
            final.append(word)
    final = " ".join(final)
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  ", " ")
    return (final)

In [10]:
def clean_docs(docs):
    stops = stopwords.words("english")
    final = []
    for doc in docs:
        clean_doc = remove_stops(doc, stops)
        final.append(clean_doc)
    return (final)

In [11]:
df1 = clean_docs(df['text'])
df1[0]



In [12]:
df['text'][0]



In [13]:
len(df['text'][0])

7518

In [14]:
len(df1[0])

5290

In [15]:
df1

 'Google Pinterest Digg Linkedin Reddit Stumbleupon Print Delicious Pocket Tumblr There two fundamental truths world Paul Ryan desperately wants president And Paul Ryan never president Today proved In particularly staggering example political cowardice Paul Ryan rererereversed course announced back Trump Train This aboutface weeks ago He previously declared would supporting defending Trump tape made public Trump bragged assaulting women Suddenly Ryan appearing proTrump rally boldly declaring already sent vote make President United States It surreal moment The figurehead Republican Party dosed gasoline got stage chilly afternoon Wisconsin lit match SpeakerRyan says voted realDonaldTrump “Republicans time come home” httpstcoVyTTYvoE pictwittercomwCvSCgaI — ABC News Politics ABCPolitics November The Democratic Party couldn’t asked better moment film Ryan’s chances ever becoming president went zero instant In wreckage Trump leave behind wake cravenly backed campaign recover If Ryan’s caree

In [16]:
x_train,x_test,y_train,y_test=train_test_split(df1, labels, test_size=0.2, random_state=7)

In [17]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df =0.9,lowercase=False)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)
tfidf_train

<5068x85898 sparse matrix of type '<class 'numpy.float64'>'
	with 1412091 stored elements in Compressed Sparse Row format>

In [17]:
#DECISION TREE
from sklearn.tree import DecisionTreeClassifier
Dt = DecisionTreeClassifier(criterion = 'entropy')
Dt.fit(tfidf_train,y_train)
y_pred = Dt.predict(tfidf_test)
acc_score=accuracy_score(y_test,y_pred)
print("Confusion matrix : \n",confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
print(f'Accuracy : {round(acc_score*100,2)}%')

Confusion matrix : 
 [[504 134]
 [111 518]]
Accuracy : 80.66%


In [18]:
pre_score = precision_score(y_test,y_pred, pos_label="FAKE")
recall = recall_score(y_test,y_pred,pos_label="FAKE")
f1 = f1_score(y_test,y_pred,pos_label="FAKE")
print("FOR FAKE")
print(f'Precision for "FAKE" : {round((pre_score)*100,2)}%')
print(f'Recall for "FAKE" : {round((recall)*100,2)}%')
print(f'f1_score for "FAKE" : {round((f1)*100,2)}%')

FOR FAKE
Precision for "FAKE" : 81.95%
Recall for "FAKE" : 79.0%
f1_score for "FAKE" : 80.45%


In [19]:
pre_score = precision_score(y_test,y_pred, pos_label="REAL")
recall = recall_score(y_test,y_pred,pos_label="REAL")
f1 = f1_score(y_test,y_pred,pos_label="REAL")
print("FOR REAL")
print(f'Precision for "REAL" : {round((pre_score)*100,2)}%')
print(f'Recall for "REAL" : {round((recall)*100,2)}%')
print(f'f1_score for "REAL" : {round((f1)*100,2)}%')

FOR REAL
Precision for "REAL" : 79.45%
Recall for "REAL" : 82.35%
f1_score for "REAL" : 80.87%


In [20]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(tfidf_train,y_train)
y_pred = knn.predict(tfidf_test)
acc_score=accuracy_score(y_test,y_pred)
print("Confusion matrix : \n",confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
print(f'Accuracy for "FAKE" : {round(acc_score*100,2)}%')

Confusion matrix : 
 [[635   3]
 [547  82]]
Accuracy for "FAKE" : 56.59%


In [21]:
pre_score = precision_score(y_test,y_pred, pos_label="FAKE")
recall = recall_score(y_test,y_pred,pos_label="FAKE")
f1 = f1_score(y_test,y_pred,pos_label="FAKE")
print("FOR FAKE")
print(f'Precision for "FAKE" : {round((pre_score)*100,2)}%')
print(f'Recall for "FAKE" : {round((recall)*100,2)}%')
print(f'f1_score for "FAKE" : {round((f1)*100,2)}%')

FOR FAKE
Precision for "FAKE" : 53.72%
Recall for "FAKE" : 99.53%
f1_score for "FAKE" : 69.78%


In [22]:
pre_score = precision_score(y_test,y_pred, pos_label="REAL")
recall = recall_score(y_test,y_pred,pos_label="REAL")
f1 = f1_score(y_test,y_pred,pos_label="REAL")
print("FOR REAL")
print(f'Precision for "REAL" : {round((pre_score)*100,2)}%')
print(f'Recall for "REAL" : {round((recall)*100,2)}%')
print(f'f1_score for "REAL" : {round((f1)*100,2)}%')

FOR REAL
Precision for "REAL" : 96.47%
Recall for "REAL" : 13.04%
f1_score for "REAL" : 22.97%


In [24]:
#LOGISTIC REGRESSION
from sklearn import linear_model
logr = linear_model.LogisticRegression()
logr.fit(tfidf_train,y_train)
y_pred = logr.predict(tfidf_test)
acc_score=accuracy_score(y_test,y_pred)
print("Confusion matrix : \n",confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
print(f'Accuracy for "FAKE" : {round(acc_score*100,2)}%')

Confusion matrix : 
 [[605  33]
 [ 63 566]]
Accuracy for "FAKE" : 92.42%


In [25]:
pre_score = precision_score(y_test,y_pred, pos_label="FAKE")
recall = recall_score(y_test,y_pred,pos_label="FAKE")
f1 = f1_score(y_test,y_pred,pos_label="FAKE")
print("FOR FAKE")
print(f'Precision for "FAKE" : {round((pre_score)*100,2)}%')
print(f'Recall for "FAKE" : {round((recall)*100,2)}%')
print(f'f1_score for "FAKE" : {round((f1)*100,2)}%')

FOR FAKE
Precision for "FAKE" : 90.57%
Recall for "FAKE" : 94.83%
f1_score for "FAKE" : 92.65%


In [26]:
pre_score = precision_score(y_test,y_pred, pos_label="REAL")
recall = recall_score(y_test,y_pred,pos_label="REAL")
f1 = f1_score(y_test,y_pred,pos_label="REAL")
print("FOR REAL")
print(f'Precision for "REAL" : {round((pre_score)*100,2)}%')
print(f'Recall for "REAL" : {round((recall)*100,2)}%')
print(f'f1_score for "REAL" : {round((f1)*100,2)}%')

FOR REAL
Precision for "REAL" : 94.49%
Recall for "REAL" : 89.98%
f1_score for "REAL" : 92.18%


In [27]:
#NAIVE BAYES
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(tfidf_train.toarray(),y_train)
y_pred = gnb.predict(tfidf_test.toarray())
acc_score=accuracy_score(y_test,y_pred)
print("Confusion matrix : \n",confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
print(f'Accuracy for "FAKE" : {round(acc_score*100,2)}%')

Confusion matrix : 
 [[552  86]
 [ 87 542]]
Accuracy for "FAKE" : 86.35%


In [28]:
pre_score = precision_score(y_test,y_pred, pos_label="FAKE")
recall = recall_score(y_test,y_pred,pos_label="FAKE")
f1 = f1_score(y_test,y_pred,pos_label="FAKE")
print("FOR FAKE")
print(f'Precision for "FAKE" : {round((pre_score)*100,2)}%')
print(f'Recall for "FAKE" : {round((recall)*100,2)}%')
print(f'f1_score for "FAKE" : {round((f1)*100,2)}%')

FOR FAKE
Precision for "FAKE" : 86.38%
Recall for "FAKE" : 86.52%
f1_score for "FAKE" : 86.45%


In [29]:
pre_score = precision_score(y_test,y_pred, pos_label="REAL")
recall = recall_score(y_test,y_pred,pos_label="REAL")
f1 = f1_score(y_test,y_pred,pos_label="REAL")
print("FOR REAL")
print(f'Precision for "REAL" : {round((pre_score)*100,2)}%')
print(f'Recall for "REAL" : {round((recall)*100,2)}%')
print(f'f1_score for "REAL" : {round((f1)*100,2)}%')

FOR REAL
Precision for "REAL" : 86.31%
Recall for "REAL" : 86.17%
f1_score for "REAL" : 86.24%


In [30]:
#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf.fit(tfidf_train,y_train)
y_pred = rf.predict(tfidf_test)
acc_score=accuracy_score(y_test,y_pred)
print("Confusion matrix : \n",confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
print(f'Accuracy for "FAKE" : {round(acc_score*100,2)}%')

Confusion matrix : 
 [[579  59]
 [ 58 571]]
Accuracy for "FAKE" : 90.77%


In [31]:
pre_score = precision_score(y_test,y_pred, pos_label="FAKE")
recall = recall_score(y_test,y_pred,pos_label="FAKE")
f1 = f1_score(y_test,y_pred,pos_label="FAKE")
print("FOR FAKE")
print(f'Precision for "FAKE" : {round((pre_score)*100,2)}%')
print(f'Recall for "FAKE" : {round((recall)*100,2)}%')
print(f'f1_score for "FAKE" : {round((f1)*100,2)}%')

FOR FAKE
Precision for "FAKE" : 90.89%
Recall for "FAKE" : 90.75%
f1_score for "FAKE" : 90.82%


In [32]:
pre_score = precision_score(y_test,y_pred, pos_label="REAL")
recall = recall_score(y_test,y_pred,pos_label="REAL")
f1 = f1_score(y_test,y_pred,pos_label="REAL")
print("FOR REAL")
print(f'Precision for "REAL" : {round((pre_score)*100,2)}%')
print(f'Recall for "REAL" : {round((recall)*100,2)}%')
print(f'f1_score for "REAL" : {round((f1)*100,2)}%')

FOR REAL
Precision for "REAL" : 90.63%
Recall for "REAL" : 90.78%
f1_score for "REAL" : 90.71%


In [33]:
#SUPPORT VECTOR MACHINE
from sklearn import svm
svmc = svm.SVC()
svmc.fit(tfidf_train,y_train)
y_pred = svmc.predict(tfidf_test)
acc_score=accuracy_score(y_test,y_pred)
print("Confusion matrix : \n",confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
print(f'Accuracy for "FAKE" : {round(acc_score*100,2)}%')

Confusion matrix : 
 [[613  25]
 [ 53 576]]
Accuracy for "FAKE" : 93.84%


In [34]:
pre_score = precision_score(y_test,y_pred, pos_label="FAKE")
recall = recall_score(y_test,y_pred,pos_label="FAKE")
f1 = f1_score(y_test,y_pred,pos_label="FAKE")
print("FOR FAKE")
print(f'Precision for "FAKE" : {round((pre_score)*100,2)}%')
print(f'Recall for "FAKE" : {round((recall)*100,2)}%')
print(f'f1_score for "FAKE" : {round((f1)*100,2)}%')

FOR FAKE
Precision for "FAKE" : 92.04%
Recall for "FAKE" : 96.08%
f1_score for "FAKE" : 94.02%


In [35]:
pre_score = precision_score(y_test,y_pred, pos_label="REAL")
recall = recall_score(y_test,y_pred,pos_label="REAL")
f1 = f1_score(y_test,y_pred,pos_label="REAL")
print("FOR REAL")
print(f'Precision for "REAL" : {round((pre_score)*100,2)}%')
print(f'Recall for "REAL" : {round((recall)*100,2)}%')
print(f'f1_score for "REAL" : {round((f1)*100,2)}%')

FOR REAL
Precision for "REAL" : 95.84%
Recall for "REAL" : 91.57%
f1_score for "REAL" : 93.66%


In [36]:
#SUPPORT VECTOR MACHINE -linear
from sklearn import svm
svmcl = svm.SVC(kernel='linear')
svmcl.fit(tfidf_train,y_train)
y_pred = svmcl.predict(tfidf_test)
acc_score=accuracy_score(y_test,y_pred)
print("Confusion matrix : \n",confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
print(f'Accuracy for "FAKE" : {round(acc_score*100,2)}%')

Confusion matrix : 
 [[604  34]
 [ 46 583]]
Accuracy for "FAKE" : 93.69%


In [37]:
from sklearn.ensemble import AdaBoostClassifier
svc = svm.SVC()
ada =  AdaBoostClassifier(estimator=svc,algorithm='SAMME',n_estimators=3)
ada.fit(tfidf_train,y_train)
y_pred = ada.predict(tfidf_test)
acc_score=accuracy_score(y_test,y_pred)
print("Confusion matrix : \n",confusion_matrix(y_test,y_pred, labels=['FAKE','REAL']))
print(f'Accuracy : {round(acc_score*100,2)}%')

Confusion matrix : 
 [[  0 638]
 [  0 629]]
Accuracy : 49.64%


In [18]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn import svm
import numpy as np


logr = linear_model.LogisticRegression()
gnb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=2)
xg = xgb.XGBClassifier(booster='gblinear', objective='binary:logistic') 
svml = svm.SVC(kernel='linear')
svmc = svm.SVC()
Dt = DecisionTreeClassifier(criterion = 'entropy')
rf = RandomForestClassifier(n_estimators=200)

vc = VotingClassifier(estimators=[('lr',logr),('DTC',Dt),('RF',rf),('svmc',svmc),('scml',svml)],voting='soft')

vc.fit(tfidf_train.toarray(),y_train)
y_pred1 = vc.predict(tfidf_test.toarray())
acc_score=accuracy_score(y_test,y_pred1)
print("Confusion matrix : \n",confusion_matrix(y_test,y_pred1, labels=['FAKE','REAL']))
print(f'Accuracy : {round(acc_score*100,2)}%')