In [120]:
import pandas as pd
import numpy as np
import string
import re
from nltk.corpus import stopwords
from autocorrect import spell
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from math import floor,sqrt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,confusion_matrix
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier



In [74]:
data=pd.read_csv('news_data.csv')

In [75]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [76]:
data.rename(columns={'Unnamed: 0':'news_id','text':'news_details'},inplace=True)

In [77]:
data['label']=data['label'].map({'FAKE':0,'REAL':1})

In [78]:
data.head(5)

Unnamed: 0,news_id,title,news_details,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


In [79]:
data.shape

(6335, 4)

In [80]:
data.duplicated(subset='news_id').sum()

0

In [81]:
data.isna().sum()

news_id         0
title           0
news_details    0
label           0
dtype: int64

In [82]:
def convertlower(s):
    return s.lower()
def removeWhiteSpaces(s):
    return s.strip()
def removePunctuations(s):
    return s.translate(str.maketrans('','',string.punctuation))
def removeNumbers(s):
    return re.sub(r'\d+','',s)
def removeURL(s):
    return re.sub(r'^https?:\/\/.*[\r\n]*','',s)
def removeTags(s):
    remove_tags=re.compile('<.*?>')
    return re.sub(remove_tags,'',s)


In [83]:
col_to_clean=['title','news_details']
for col in col_to_clean:
    data[col]=data[col].apply(convertlower)
    data[col]=data[col].apply(removeWhiteSpaces)
    data[col]=data[col].apply(removePunctuations)
    data[col]=data[col].apply(removeNumbers)
    data[col]=data[col].apply(removeURL)
    data[col]=data[col].apply(removeTags)

In [84]:
data.head(5)

Unnamed: 0,news_id,title,news_details,label
0,8476,you can smell hillary’s fear,daniel greenfield a shillman journalism fellow...,0
1,10294,watch the exact moment paul ryan committed pol...,google pinterest digg linkedin reddit stumbleu...,0
2,3608,kerry to go to paris in gesture of sympathy,us secretary of state john f kerry said monday...,1
3,10142,bernie supporters on twitter erupt in anger ag...,— kaydee king kaydeeking november the lesson...,0
4,875,the battle of new york why this primary matters,its primary day in new york and frontrunners h...,1


In [85]:
tfidf=TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,max_df=0.8,min_df=0.2)

In [86]:
tfidf_matrix_news_details=tfidf.fit_transform(data['news_details'])

In [87]:
tfidf_matrix_news_details.shape

(6335, 89)

In [88]:
tfidf_title=TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,max_df=0.8,min_df=0.01)

In [89]:
tfidf_matrix_title=tfidf_title.fit_transform(data['title'])

In [90]:
tfidf_matrix_title.shape

(6335, 60)

In [91]:
data['label'].shape

(6335,)

In [92]:
X_train,X_test,y_train,y_test=train_test_split(tfidf_matrix_news_details,np.array(data['label']),test_size=0.2,random_state=42)

In [None]:
# Logistic Regression

In [93]:
logreg=LogisticRegression()
logreg.fit(X_train,y_train)

LogisticRegression()

In [109]:
def Logregclf(X,y_true):
    y_pred=logreg.predict(X)
    score=f1_score(y_true,y_pred,average='weighted')
    cm=pd.DataFrame(confusion_matrix(y_true,y_pred),columns=['0','1'],index=['0','1'])
    return score,cm

In [110]:
score_test,cm_test=Logregclf(X_test,y_test)

In [111]:
score_train,cm_train=Logregclf(X_train,y_train)

In [112]:
score_test,score_train

(0.8287207450852468, 0.8121489913782154)

In [97]:
data['label'].value_counts()

1    3171
0    3164
Name: label, dtype: int64

In [114]:
cm_train

Unnamed: 0,0,1
0,2073,463
1,489,2043


In [113]:
cm_test

Unnamed: 0,0,1
0,517,111
1,106,533


In [69]:
X_train,X_test,y_train,y_test=train_test_split(tfidf_matrix_title,np.array(data['label']),test_size=0.2,random_state=42)
logreg=LogisticRegression()
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test) 
score=f1_score(y_test,y_pred,average='weighted')

In [70]:
score

0.7068864967079463

In [None]:
#score is very less when using title feature. so we can only include text's features.

In [99]:
#KNN

In [108]:
sq=floor(sqrt(X_train.shape[0]))
k=sq if (sq%2)!=0 else sq+1

In [115]:
knn=neighbors.KNeighborsClassifier(n_neighbors=k,weights='uniform')
knn.fit(X_train,y_train)  

KNeighborsClassifier(n_neighbors=71)

In [117]:
def knnclf(X,y_true):
    y_pred=knn.predict(X) 
    score=f1_score(y_true,y_pred,average='weighted')
    cm=pd.DataFrame(confusion_matrix(y_true,y_pred),columns=['0','1'],index=['0','1'])
    return score,cm

In [127]:
score_test,cm_test=knnclf(X_test,y_test)
score_train,cm_train=knnclf(X_train,y_train)

In [128]:
score_test,score_train

(0.7889606078024994, 0.7846129451201116)

In [121]:
# Decision Tree Classifier

In [123]:
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)

DecisionTreeClassifier()

In [124]:
def dtreeclf(X,y_true):
    y_pred=dtree.predict(X) 
    score=f1_score(y_true,y_pred,average='weighted')
    cm=pd.DataFrame(confusion_matrix(y_true,y_pred),columns=['0','1'],index=['0','1'])
    return score,cm

In [131]:
score_test,cm_test=dtreeclf(X_test,y_test)
score_train,cm_train=dtreeclf(X_train,y_train)

In [132]:
score_test,score_train

(0.7315281206879298, 0.9928962875406454)