In [1]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from math import floor,sqrt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
data=pd.read_csv('news_data.csv')

In [3]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
data.rename(columns={'Unnamed: 0':'news_id','text':'news_details'},inplace=True)

In [5]:
data['label']=data['label'].map({'FAKE':0,'REAL':1})

In [6]:
data.head(5)

Unnamed: 0,news_id,title,news_details,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


In [7]:
data.shape

(6335, 4)

In [8]:
data.duplicated(subset='news_id').sum()

0

In [9]:
data.isna().sum()

news_id         0
title           0
news_details    0
label           0
dtype: int64

In [10]:
def convertlower(s):
    return s.lower()
def removeWhiteSpaces(s):
    return s.strip()
def removePunctuations(s):
    return s.translate(str.maketrans('','',string.punctuation))
def removeNumbers(s):
    return re.sub(r'\d+','',s)
def removeURL(s):
    return re.sub(r'^https?:\/\/.*[\r\n]*','',s)
def removeTags(s):
    remove_tags=re.compile('<.*?>')
    return re.sub(remove_tags,'',s)


In [11]:
col_to_clean=['title','news_details']
for col in col_to_clean:
    data[col]=data[col].apply(convertlower)
    data[col]=data[col].apply(removeWhiteSpaces)
    data[col]=data[col].apply(removePunctuations)
    data[col]=data[col].apply(removeNumbers)
    data[col]=data[col].apply(removeURL)
    data[col]=data[col].apply(removeTags)

In [12]:
data.head(5)

Unnamed: 0,news_id,title,news_details,label
0,8476,you can smell hillary’s fear,daniel greenfield a shillman journalism fellow...,0
1,10294,watch the exact moment paul ryan committed pol...,google pinterest digg linkedin reddit stumbleu...,0
2,3608,kerry to go to paris in gesture of sympathy,us secretary of state john f kerry said monday...,1
3,10142,bernie supporters on twitter erupt in anger ag...,— kaydee king kaydeeking november the lesson...,0
4,875,the battle of new york why this primary matters,its primary day in new york and frontrunners h...,1


In [13]:
tfidf=TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,max_df=0.8,min_df=0.2)

In [14]:
tfidf_matrix_news_details=tfidf.fit_transform(data['news_details'])

In [15]:
tfidf_matrix_news_details.shape

(6335, 89)

In [16]:
tfidf_title=TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,max_df=0.8,min_df=0.01)

In [17]:
tfidf_matrix_title=tfidf_title.fit_transform(data['title'])

In [18]:
tfidf_matrix_title.shape

(6335, 60)

In [19]:
data['label'].shape

(6335,)

In [20]:
X_train,X_test,y_train,y_test=train_test_split(tfidf_matrix_news_details,np.array(data['label']),test_size=0.2,random_state=42)

In [21]:
data['label'].value_counts()

1    3171
0    3164
Name: label, dtype: int64

In [22]:
# k value for knn
sq=floor(sqrt(X_train.shape[0]))
k=sq if (sq%2)!=0 else sq+1

In [23]:
def clf(clf_name,X,y_true):
    y_pred=clf_name.predict(X) 
    score=f1_score(y_true,y_pred,average='weighted')
    cm=pd.DataFrame(confusion_matrix(y_true,y_pred),columns=['0','1'],index=['0','1'])
    return score,cm

In [24]:
classifier_dict={'Logistic':LogisticRegression(),'KNN':neighbors.KNeighborsClassifier(n_neighbors=k,weights='uniform'),'Decision Tree':DecisionTreeClassifier(),'SVM':SVC(),'NB':MultinomialNB()}

In [25]:
scores_dict={}
cm_dict={}
for k,v in classifier_dict.items():
    v.fit(X_train,y_train)
    score_test,cm_test=clf(v,X_test,y_test)
    score_train,cm_train=clf(v,X_train,y_train)
    scores_dict[k]=[score_test,score_train]
    cm_dict[k]=[cm_test,cm_train]


In [26]:
scores_df=pd.DataFrame.from_dict(scores_dict,orient='index',columns=['Test_score','Train_score'])

In [27]:
scores_df.sort_values(by=['Test_score','Train_score'],ascending=False,inplace=True)

In [28]:
scores_df

Unnamed: 0,Test_score,Train_score
SVM,0.850017,0.919078
Logistic,0.828721,0.812149
KNN,0.788961,0.784613
NB,0.788473,0.773086
Decision Tree,0.720573,0.992896


In [29]:
cm_dict

{'Logistic': [     0    1
  0  517  111
  1  106  533,
        0     1
  0  2073   463
  1   489  2043],
 'KNN': [     0    1
  0  473  155
  1  112  527,
        0     1
  0  1931   605
  1   486  2046],
 'Decision Tree': [     0    1
  0  447  181
  1  173  466,
        0     1
  0  2534     2
  1    34  2498],
 'SVM': [     0    1
  0  544   84
  1  106  533,
        0     1
  0  2372   164
  1   246  2286],
 'NB': [     0    1
  0  493  135
  1  133  506,
        0     1
  0  1962   574
  1   576  1956]}

In [30]:
X_train,X_test,y_train,y_test=train_test_split(tfidf_matrix_title,np.array(data['label']),test_size=0.2,random_state=42)
logreg=LogisticRegression()
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test) 
score=f1_score(y_test,y_pred,average='weighted')

In [31]:
score

0.7068864967079463

In [32]:
#score is less when using title feature. so I include only 'text' attribute.