Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') reads from it later on.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Print the English stop-word list; these are the words the stemming step filters out.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Pre-processing

In [5]:
# Load the news dataset into a pandas DataFrame.
# NOTE(review): the path is absolute and ends in ".csv.csv" — confirm the doubled
# extension matches the real file name before running elsewhere.
news_dataset = pd.read_csv('/content/news_dataset_with_verdict.csv.csv')

In [6]:
# (number of rows, number of columns) of the loaded frame
news_dataset.shape

(20800, 6)

In [7]:
# Preview the first 5 rows (DataFrame.head() defaults to n=5).
news_dataset.head()

Unnamed: 0,id,title,author,text,label,verdict
0,0,Hillary Clinton Adopts Alien Baby,Mike Hunt,Former Secretary of State Hillary Clinton has ...,1,fake
1,1,Hillary Clinton Adopts Alien Baby,Anonymous,Former Secretary of State Hillary Clinton has ...,1,fake
2,2,NASA Launches New Mars Rover,CNN,NASA's latest Mars rover launched successfully...,0,real
3,3,NASA Confirms Earth Will Experience 15 Days of...,Mike Hunt,"In a shocking revelation, sources claim that t...",1,fake
4,4,Hillary Clinton Adopts Alien Baby,Mike Hunt,A suppressed report suggests vaccines are link...,1,fake


In [8]:
# Count the missing values in each column of the dataset.
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,0
author,0
text,0
label,0
verdict,0


In [9]:
# Replace every missing value with an empty string so that the string
# concatenation used to build `content` never meets a NaN.
news_dataset = news_dataset.fillna('')

In [10]:
 # Build the text used as model input: author name and news title joined by one space.
news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']

In [11]:
# Inspect the combined author + title text.
print(news_dataset['content'])

0              Mike Hunt Hillary Clinton Adopts Alien Baby
1              Anonymous Hillary Clinton Adopts Alien Baby
2                         CNN NASA Launches New Mars Rover
3        Mike Hunt NASA Confirms Earth Will Experience ...
4              Mike Hunt Hillary Clinton Adopts Alien Baby
                               ...                        
20795        CNN Breakthrough in Cancer Research Announced
20796    BBC News Global Markets Rally Ahead of Fed Mee...
20797    BBC News UN Holds Emergency Meeting on Climate...
20798    Admin NASA Confirms Earth Will Experience 15 D...
20799         Jane Smith Hillary Clinton Adopts Alien Baby
Name: content, Length: 20800, dtype: object


In [12]:
# Split the frame into the target column and a feature table holding every other column.
# `columns=` already names the axis to drop from, so no separate `axis` argument is needed.
# NOTE(review): X still carries the `verdict` column, which appears to mirror `label`
# in the rows shown — it would leak the target if this X were ever used for training.
Y = news_dataset['label']
X = news_dataset.drop(columns='label')

In [13]:
# Show the feature table, then the target series.
print(X)
print(Y)

          id                                              title      author  \
0          0                  Hillary Clinton Adopts Alien Baby   Mike Hunt   
1          1                  Hillary Clinton Adopts Alien Baby   Anonymous   
2          2                       NASA Launches New Mars Rover         CNN   
3          3  NASA Confirms Earth Will Experience 15 Days of...   Mike Hunt   
4          4                  Hillary Clinton Adopts Alien Baby   Mike Hunt   
...      ...                                                ...         ...   
20795  20795          Breakthrough in Cancer Research Announced         CNN   
20796  20796          Global Markets Rally Ahead of Fed Meeting    BBC News   
20797  20797       UN Holds Emergency Meeting on Climate Change    BBC News   
20798  20798  NASA Confirms Earth Will Experience 15 Days of...       Admin   
20799  20799                  Hillary Clinton Adopts Alien Baby  Jane Smith   

                                                   

Stemming:

Stemming is the process of reducing a word to its root form (its stem).

Example: acting, acted, acts --> act

In [14]:
# Single Porter stemmer instance, reused by stemming() for every word.
port_stem = PorterStemmer()

In [15]:
_english_stopwords_cache = None


def _english_stopwords():
    """Return the English stop words as a frozenset, reading the corpus only once."""
    global _english_stopwords_cache
    if _english_stopwords_cache is None:
        # stopwords.words() returns a fresh list on every call; caching it as a
        # frozenset avoids re-reading it per token and makes membership tests O(1).
        _english_stopwords_cache = frozenset(stopwords.words('english'))
    return _english_stopwords_cache


def stemming(content):
    """Normalise a text into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. English stop words are dropped;
      4. each remaining word is stemmed with the shared `port_stem` stemmer.

    Args:
        content: the raw text to clean (a `str`).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    stop_words = _english_stopwords()
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

In [16]:
# Overwrite `content` with its cleaned, stemmed form; stemming() is applied to each row's text.
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [17]:
# Inspect the stemmed text.
print(news_dataset['content'])

0          mike hunt hillari clinton adopt alien babi
1             anonym hillari clinton adopt alien babi
2                       cnn nasa launch new mar rover
3        mike hunt nasa confirm earth experi day dark
4          mike hunt hillari clinton adopt alien babi
                             ...                     
20795        cnn breakthrough cancer research announc
20796     bbc news global market ralli ahead fed meet
20797        bbc news un hold emerg meet climat chang
20798        admin nasa confirm earth experi day dark
20799     jane smith hillari clinton adopt alien babi
Name: content, Length: 20800, dtype: object


In [18]:
# Rebind X and Y for modelling: X is the stemmed `content` text and Y the `label`
# column, both taken as arrays via .values. This replaces the earlier X / Y.
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [19]:
# Show the array of stemmed texts.
print(X)

['mike hunt hillari clinton adopt alien babi'
 'anonym hillari clinton adopt alien babi' 'cnn nasa launch new mar rover'
 ... 'bbc news un hold emerg meet climat chang'
 'admin nasa confirm earth experi day dark'
 'jane smith hillari clinton adopt alien babi']


In [20]:
# Show the array of labels.
print(Y)

[1 1 0 ... 0 1 1]


In [21]:
# Convert the textual data to numerical TF-IDF features. fit_transform learns the
# vocabulary / IDF weights and vectorises the texts in a single pass, giving the same
# matrix as a separate fit() followed by transform().
# NOTE(review): the vectoriser is fitted on the whole dataset before the train/test
# split, so test documents influence the vocabulary and IDF weights — consider
# fitting on the training texts only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [22]:
# Show the TF-IDF feature matrix.
print(X)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 137255 stored elements and shape (20800, 62)>
  Coords	Values
  (0, 1)	0.3890605823618897
  (0, 3)	0.3890605823618897
  (0, 7)	0.3890605823618897
  (0, 15)	0.30636699671457546
  (0, 32)	0.3890605823618897
  (0, 34)	0.38772845931979283
  (0, 42)	0.38772845931979283
  (1, 1)	0.42230978468261093
  (1, 3)	0.42230978468261093
  (1, 5)	0.4195578877478737
  (1, 7)	0.42230978468261093
  (1, 15)	0.33254918714958487
  (1, 32)	0.42230978468261093
  (2, 16)	0.41860248112336224
  (2, 37)	0.4218678190463959
  (2, 39)	0.4218678190463959
  (2, 43)	0.335979368130715
  (2, 44)	0.4218678190463959
  (2, 52)	0.4218678190463959
  (3, 17)	0.3626058701466275
  (3, 18)	0.3626058701466275
  (3, 19)	0.3626058701466275
  (3, 22)	0.3626058701466275
  (3, 25)	0.3626058701466275
  (3, 34)	0.3614725136476237
  :	:
  (20796, 41)	0.29280376285575294
  (20796, 45)	0.2912562510853205
  (20796, 48)	0.3725333019030779
  (20797, 8)	0.3705885537942835
  (20797, 13

Splitting the dataset into training & test data

In [23]:
# Hold out 20% of the samples for testing. Stratifying on Y keeps the class
# proportions the same in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Training the Model: Logistic Regression

In [24]:
# Logistic regression classifier created with its default settings.
model = LogisticRegression()

In [25]:
# Fit the classifier on the training split.
model.fit(X_train, Y_train)

Evaluation

Accuracy score

In [26]:
# accuracy score on the training data
# accuracy_score expects (y_true, y_pred): ground-truth labels first, predictions second.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [27]:
# Report the accuracy measured on the training split.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  1.0


In [28]:
# accuracy score on the test data
# accuracy_score expects (y_true, y_pred): ground-truth labels first, predictions second.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [29]:
# Report the accuracy measured on the held-out test split.
# NOTE(review): the recorded run shows a perfect 1.0 — worth checking whether
# identical texts appear in both the training and test splits.
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  1.0


Making a Predictive System

In [30]:
# Classify a single held-out sample (row 3 of the test matrix) and report the result.
X_new = X_test[3]

prediction = model.predict(X_new)
print(prediction)

# A predicted label of 0 is reported as real news; any other label as fake.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

[1]
The news is Fake


In [31]:
# True label of the same test sample, for comparison with the prediction.
print(Y_test[3])

1
