Fake news detection using NLP

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('news_dataset.csv')
data.head()

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...


In [3]:
data.shape

(3729, 2)

## Data Cleaning

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3729 entries, 0 to 3728
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3729 non-null   object
 1   text    3721 non-null   object
dtypes: object(2)
memory usage: 58.4+ KB


In [5]:
data.isnull().sum()

label    0
text     8
dtype: int64

In [6]:
data = data.fillna('')

In [7]:
data.duplicated().sum()

1498

In [8]:
data.drop_duplicates(inplace=True)

In [9]:
data.shape

(2231, 2)

## Text Preprocessing

In [10]:
# convert text into lower alphabet

data['cleaned_text'] = data['text'].str.lower()

In [11]:
import nltk

In [12]:
# removig punctuations and urls

import re

# Urls nand links
def remove_url(text):
  cleaned = re.sub(r'https?://\S+|www\.\S+', '',text)
  return(cleaned)

# puctuations
def remove_punct(text):
  cleaned = re.sub(r'[^\w\s]', '',text)
  return(cleaned)

In [13]:
# ckecking the funtion

text = 'after your come back https://mail.google.com/mail/u/1/#inbox'
remove_url(text)

text = 'after ((your ?? // %'' ;; :  *come back ht$tps://mail.google.com/-mail/u/1/#inb?ox'
remove_punct(text)

'after your       come back httpsmailgooglecommailu1inbox'

In [14]:
data['cleaned_text'] = data['cleaned_text'].apply(remove_url)

In [15]:
data['cleaned_text'] = data['cleaned_text'].apply(remove_punct)

In [16]:
data.sample(5)

Unnamed: 0,label,text,cleaned_text
1417,FAKE,Founder of Post Card News Mahesh Hegde made a ...,founder of post card news mahesh hegde made a ...
3360,FAKE,"A screenshot of a tweet, seemingly by actor Ja...",a screenshot of a tweet seemingly by actor jaa...
1849,FAKE,A viral photo about a new feature is doing the...,a viral photo about a new feature is doing the...
1428,FAKE,Bollywood's veteran lyricist and scriptwriter ...,bollywoods veteran lyricist and scriptwriter j...
3527,FAKE,As India's COVID-19 cases continue to rise and...,as indias covid19 cases continue to rise and e...


In [17]:
# tokenization

from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

data['news_sentences'] = data['cleaned_text'].apply(sent_tokenize)
data['news_words'] = data['cleaned_text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# checking
text = 'If you are looking for random paragraphs, you have come to the right place. When a random word or a random sentence is not quite enough, the next logical step is to find a random paragraph. We created the Random Paragraph Generator with you in mind. The process is quite simple. Choose the number of random paragraphs you would like to see and click the button. Your chosen number of paragraphs will instantly appear.'
print('sent_tokenize =', sent_tokenize(text))
print('word_tokenize =', word_tokenize(text))

sent_tokenize = ['If you are looking for random paragraphs, you have come to the right place.', 'When a random word or a random sentence is not quite enough, the next logical step is to find a random paragraph.', 'We created the Random Paragraph Generator with you in mind.', 'The process is quite simple.', 'Choose the number of random paragraphs you would like to see and click the button.', 'Your chosen number of paragraphs will instantly appear.']
word_tokenize = ['If', 'you', 'are', 'looking', 'for', 'random', 'paragraphs', ',', 'you', 'have', 'come', 'to', 'the', 'right', 'place', '.', 'When', 'a', 'random', 'word', 'or', 'a', 'random', 'sentence', 'is', 'not', 'quite', 'enough', ',', 'the', 'next', 'logical', 'step', 'is', 'to', 'find', 'a', 'random', 'paragraph', '.', 'We', 'created', 'the', 'Random', 'Paragraph', 'Generator', 'with', 'you', 'in', 'mind', '.', 'The', 'process', 'is', 'quite', 'simple', '.', 'Choose', 'the', 'number', 'of', 'random', 'paragraphs', 'you', 'would', '

In [19]:
data.head(5)

Unnamed: 0,label,text,cleaned_text,news_sentences,news_words
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...,payal has accused filmmaker anurag kashyap of ...,[payal has accused filmmaker anurag kashyap of...,"[payal, has, accused, filmmaker, anurag, kashy..."
1,FAKE,A four-minute-long video of a woman criticisin...,a fourminutelong video of a woman criticising ...,[a fourminutelong video of a woman criticising...,"[a, fourminutelong, video, of, a, woman, criti..."
2,FAKE,"Republic Poll, a fake Twitter account imitatin...",republic poll a fake twitter account imitating...,[republic poll a fake twitter account imitatin...,"[republic, poll, a, fake, twitter, account, im..."
3,REAL,"Delhi teen finds place on UN green list, turns...",delhi teen finds place on un green list turns ...,[delhi teen finds place on un green list turns...,"[delhi, teen, finds, place, on, un, green, lis..."
4,REAL,Delhi: A high-level meeting underway at reside...,delhi a highlevel meeting underway at residenc...,[delhi a highlevel meeting underway at residen...,"[delhi, a, highlevel, meeting, underway, at, r..."


In [20]:
# stopwords remove

from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stwd(text):
  cleaned = [word for word in text if word not in stop_words]
  return(cleaned)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
data['news_words'] = data['news_words'].apply(remove_stwd)

In [22]:
data.sample(5)

Unnamed: 0,label,text,cleaned_text,news_sentences,news_words
2289,FAKE,Pakistani singer Rabi Pirzada shared a set of ...,pakistani singer rabi pirzada shared a set of ...,[pakistani singer rabi pirzada shared a set of...,"[pakistani, singer, rabi, pirzada, shared, set..."
183,FAKE,A set of four images of violence over reported...,a set of four images of violence over reported...,[a set of four images of violence over reporte...,"[set, four, images, violence, reported, land, ..."
2951,FAKE,A 2017 photo showing actors Akshay Kumar and B...,a 2017 photo showing actors akshay kumar and b...,[a 2017 photo showing actors akshay kumar and ...,"[2017, photo, showing, actors, akshay, kumar, ..."
2504,FAKE,A screenshot of a NDTV India headline claiming...,a screenshot of a ndtv india headline claiming...,[a screenshot of a ndtv india headline claimin...,"[screenshot, ndtv, india, headline, claiming, ..."
55,FAKE,"A photo more than a decade old, showing a Nepa...",a photo more than a decade old showing a nepal...,[a photo more than a decade old showing a nepa...,"[photo, decade, old, showing, nepalese, police..."


In [23]:
# Lemmatization

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lem = WordNetLemmatizer()

def lem_word(text):
  clean_verb = [lem.lemmatize(word, pos='v') for word in text]
  return clean_verb

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
data['news_words'] = data['news_words'].apply(lem_word)

In [25]:
data['news_words'] = data['news_words'].apply(lambda x: ' '.join(x))

In [26]:
data['news_words_counts'] = data['news_words'].apply(lambda x:len(x))

In [27]:
data.tail()

Unnamed: 0,label,text,cleaned_text,news_sentences,news_words,news_words_counts
3719,FAKE,A set of images is being shared on Facebook wi...,a set of images is being shared on facebook wi...,[a set of images is being shared on facebook w...,set image share facebook claim show instance r...,1093
3720,FAKE,Barely 48 hours ahead of voting in the assembl...,barely 48 hours ahead of voting in the assembl...,[barely 48 hours ahead of voting in the assemb...,barely 48 hours ahead vote assembly elections ...,2881
3722,FAKE,A quote by an impostor Facebook page of Financ...,a quote by an impostor facebook page of financ...,[a quote by an impostor facebook page of finan...,quote impostor facebook page finance minister ...,1207
3726,FAKE,The Bengaluru City Police’s official Twitter h...,the bengaluru city polices official twitter ha...,[the bengaluru city polices official twitter h...,bengaluru city police official twitter handle ...,925
3727,REAL,"Sep 20, 2020, 08:00AM IST\n\nSource: TOI.in\n\...",sep 20 2020 0800am ist\n\nsource toiin\n\nmeet...,[sep 20 2020 0800am ist\n\nsource toiin\n\nmee...,sep 20 2020 0800am ist source toiin meet neelk...,317


In [28]:
data['label'].value_counts()

FAKE    1852
REAL     379
Name: label, dtype: int64

In [29]:
X = data['news_words']
y = data['label']

In [30]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, test_size=0.2, random_state=10)

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

vector = TfidfVectorizer(max_features=1000)

In [32]:
Xtrain = vector.fit_transform(Xtrain)
Xtest = vector.fit_transform(Xtest)

In [33]:
Xtrain.shape

(1784, 1000)

In [34]:
Xtrain = Xtrain.toarray()
Xtest = Xtest.toarray()

Applying naive bayes

In [35]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

gnb = GaussianNB()
mnb = MultinomialNB()

In [36]:
gnb.fit(Xtrain, ytrain)

In [37]:
ypred1 = gnb.predict(Xtest)


In [38]:
ypred1

array(['FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
      

In [39]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


In [40]:
print('accuracy_score ',accuracy_score(ytest, ypred1))
print('precision_score',precision_score(ytest, ypred1, average='weighted'))
print('recall_score', recall_score(ytest, ypred1, average='weighted'))
print('f1_score', f1_score(ytest, ypred1, average='weighted'))

accuracy_score  0.8568232662192393
precision_score 0.8773688064927118
recall_score 0.8568232662192393
f1_score 0.7929376441194359


In [41]:
confusion_matrix(ytest, ypred1)

array([[382,   0],
       [ 64,   1]])

Multinomial Naive Bayes

In [42]:
mnb.fit(Xtrain, ytrain)

In [43]:
ypred2 = mnb.predict(Xtest)
ypred2

array(['FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'REAL', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'REAL', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'REAL', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'REAL', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'REAL',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'REAL', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
       'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE', 'FAKE',
      

In [44]:
print('accuracy_score ',accuracy_score(ytest, ypred2))
print('precision_score',precision_score(ytest, ypred2, average='weighted'))
print('recall_score', recall_score(ytest, ypred2, average='weighted'))
print('f1_score', f1_score(ytest, ypred2, average='weighted'))

accuracy_score  0.7941834451901566
precision_score 0.7345471741580206
recall_score 0.7941834451901566
f1_score 0.7621213268069501


In [45]:
confusion_matrix(ytest, ypred2)

array([[353,  29],
       [ 63,   2]])