Importing Dependencies

In [10]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [18]:
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be loaded below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admln\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [19]:
# Show the English stop words; the stemming step later drops these from the text.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

DATA COLLECTION AND PREPROCESSING 

About the Dataset:

1. id: unique id for a news article
2. title: the title of a news article
3. author: author of the news article
4. text: the text of the article; could be incomplete
5. label: marks whether the news article is real or fake:
           1: fake news
           0: real news


In [None]:
# Path to the dataset CSV; point this at your own file that has the columns described above.
news_dataset = pd.read_csv('Fake News Dataset.csv')

In [12]:
# Preview the first rows to confirm the columns loaded as expected.
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,1,NASA discovers new exoplanet similar to Earth,John Smith,Schools are now integrating AI models to enhan...,0
1,2,Aliens spotted in downtown New York,Sarah Davis,Eyewitnesses report bright lights and strange ...,1
2,3,Celebrity claims to be time traveler from 3020,Olivia Walker,Scientists now admit that all space missions w...,1
3,4,Government announces new healthcare reform,Michael Brown,NASA confirmed that the new exoplanet discover...,0
4,5,Local schools adopt AI-based learning programs,Sarah Davis,The discovery adds to the biodiversity of the ...,0


In [None]:
# Count the missing values in each column.
news_dataset.isnull().sum()
# This dataset was prepared without missing values because the notebook is only meant as
# practice with categorical data, so no imputation of missing values is performed here.

id        0
title     0
author    0
text      0
label     0
dtype: int64

FEATURE EXTRACTION

In [16]:
# Build a single 'content' text feature by joining each article's author and title
# with a space; this combined text is what the later steps stem and vectorize.
news_dataset['content'] = news_dataset['author'].str.cat(news_dataset['title'], sep=' ')
print(news_dataset['content'])

0       John Smith NASA discovers new exoplanet simila...
1         Sarah Davis Aliens spotted in downtown New York
2       Olivia Walker Celebrity claims to be time trav...
3       Michael Brown Government announces new healthc...
4       Sarah Davis Local schools adopt AI-based learn...
                              ...                        
9995    Emily Johnson Government announces new healthc...
9996    James Lewis New species of bird discovered in ...
9997    John Smith NASA discovers new exoplanet simila...
9998    James Lewis Government announces new healthcar...
9999    Michael Brown NASA discovers new exoplanet sim...
Name: content, Length: 10000, dtype: object


STEMMING PROCESS

In [None]:
# PorterStemmer reduces each word to its root form (e.g. "discovers" -> "discov").
port_stem = PorterStemmer()

# Build the stop-word lookup once. The original evaluated stopwords.words('english')
# for every single token and then did a linear membership scan of the returned list;
# a frozenset gives constant-time lookups and is created only one time.
english_stopwords = frozenset(stopwords.words('english'))

def stemming(content):
    """Normalize one text string into space-separated stemmed tokens.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop English stop words, and Porter-stem what remains.

    Parameters
    ----------
    content : str
        Raw text (here: author name followed by the article title).

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty string
        if no token survives.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(token) for token in tokens if token not in english_stopwords
    )

# Apply the stemming function to every row of 'content'.
# NOTE: this overwrites 'content' in place, so re-running this cell stems the
# already-stemmed text again; re-create 'content' first when re-running.
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [21]:
# Inspect the 'content' column after stemming.
print(news_dataset['content'])

0       john smith nasa discov new exoplanet similar e...
1                 sarah davi alien spot downtown new york
2                  olivia walker celebr claim time travel
3       michael brown govern announc new healthcar reform
4       sarah davi local school adopt ai base learn pr...
                              ...                        
9995    emili johnson govern announc new healthcar reform
9996    jame lewi new speci bird discov amazon rainforest
9997    john smith nasa discov new exoplanet similar e...
9998        jame lewi govern announc new healthcar reform
9999    michael brown nasa discov new exoplanet simila...
Name: content, Length: 10000, dtype: object


SPLITTING DATA AND LABELS

In [24]:
# X: the 'content' text of each article, as an array (vectorized further below).
# Y: the 'label' column (1 = fake news, 0 = real news, per the dataset description).
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [27]:
# Quick look at the text array and the label array.
print(X)
print(Y)

['john smith nasa discov new exoplanet similar earth'
 'sarah davi alien spot downtown new york'
 'olivia walker celebr claim time travel' ...
 'john smith nasa discov new exoplanet similar earth'
 'jame lewi govern announc new healthcar reform'
 'michael brown nasa discov new exoplanet similar earth']
[0 1 1 ... 0 0 0]


VECTORIZING THE CONTENT DATA INTO NUMERICAL DATA

In [28]:
# Turn the text in 'content' into TF-IDF feature vectors (a sparse matrix).
# fit_transform learns the vocabulary/IDF weights and vectorizes in a single call.
# The text is read from news_dataset rather than from X so that this cell can be
# re-run: the original refit on X after X had already been replaced by the sparse
# matrix, which only works on the first execution.
# NOTE(review): the vectorizer is fitted on the full corpus; if a train/test split
# follows, consider fitting on the training portion only so that test-set
# statistics do not leak into the features.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(news_dataset['content'].values)

print(X)

  (0, 55)	0.36795235287473227
  (0, 54)	0.3999265761865985
  (0, 44)	0.22846711407657871
  (0, 43)	0.3999265761865985
  (0, 35)	0.36795235287473227
  (0, 25)	0.3999265761865985
  (0, 22)	0.31518722606174193
  (0, 20)	0.31282283056055166
  (1, 66)	0.40600973441982086
  (1, 57)	0.40600973441982086
  (1, 50)	0.377509666828921
  (1, 44)	0.2357907655673144
  (1, 21)	0.40600973441982086
  (1, 18)	0.377509666828921
  (1, 2)	0.40600973441982086
  (2, 64)	0.39127625873722754
  (2, 61)	0.4164750228696783
  (2, 60)	0.4164750228696783
  (2, 45)	0.39127625873722754
  (2, 13)	0.4164750228696783
  (2, 12)	0.4164750228696783
  (3, 49)	0.4176252045000466
  (3, 44)	0.24297114422556498
  (3, 42)	0.392735700472044
  (3, 29)	0.4176252045000466
  :	:
  (9996, 8)	0.3822073448943373
  (9996, 3)	0.3822073448943373
  (9997, 55)	0.36795235287473227
  (9997, 54)	0.3999265761865985
  (9997, 44)	0.22846711407657871
  (9997, 43)	0.3999265761865985
  (9997, 35)	0.36795235287473227
  (9997, 25)	0.3999265761865985
  (9