Fake News Detection System

Importing dependencies

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [4]:
import nltk
# Fetch the NLTK 'stopwords' corpus that `stopwords.words('english')` reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NEW\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
print(stopwords.words('english'))    # These stopwords add little meaning for our processing, so they are filtered out during stemming

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Preprocessing start

In [8]:
# Location of the news dataset CSV on the local machine.
DATA_PATH = r'C:\Users\NEW\OneDrive\Desktop\Fake News Prediction System\Day4_Fake_News_Data.csv'
# Read the CSV into a DataFrame.
fake_news = pd.read_csv(DATA_PATH)

In [9]:
fake_news.head()    # preview the first rows to check how the columns were loaded

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [10]:
fake_news.describe()    # summary statistics of the numeric columns

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [11]:
fake_news.info()    # column dtypes and non-null counts, which reveal the columns with missing values

<class 'pandas.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      20800 non-null  int64
 1   title   20242 non-null  str  
 2   author  18843 non-null  str  
 3   text    20761 non-null  str  
 4   label   20800 non-null  int64
dtypes: int64(2), str(3)
memory usage: 812.6 KB


In [12]:
fake_news.shape    # (number of rows, number of columns)

(20800, 5)

In [13]:
# Count the missing values in each column.
fake_news.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [14]:
fake_news = fake_news.fillna("")    # replace every missing value with an empty string so the text columns can be concatenated

In [15]:
# Confirm that no missing values remain in any column.
fake_news.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [16]:
# Build one text feature per article: the author name followed by the title, separated by a space.
fake_news['content'] = fake_news['author'] + ' ' + fake_news['title']

In [17]:
print(fake_news['content'])    # inspect the combined author + title text

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: str


In [25]:
fake_news['label'].value_counts()    # number of rows per label value, to check the class balance

label
1    10413
0    10387
Name: count, dtype: int64

In [21]:
# Separate the predictor columns from the target column.
X = fake_news.drop(columns=['label'])    # every column except the label
Y = fake_news.loc[:, 'label']            # the label of each row

In [23]:
print(X.columns)    # columns remaining in X after dropping the label

Index(['id', 'title', 'author', 'text', 'content'], dtype='str')


In [24]:
print(Y)    # the label column on its own

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64


Stemming:
Stemming is the process of reducing a word to its root form (its stem).
Example: connect, connected, connecting, connection --> connect --> the common stem of all four words

In [27]:
port_stem = PorterStemmer()    # shared stemmer instance used by `stemming` for every word

In [30]:
_STOPWORDS_BY_LANGUAGE = {}     # language name -> frozenset of its stopwords, filled on first use


def _get_stopwords(language='english'):
    """Return the NLTK stopwords for `language` as a frozenset, loading them only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def stemming(content):      # for fake_news['content']
    """Clean one document and reduce its words to their stems.

    Steps: replace every non-letter character with a space, lower-case the
    text, split it into words, drop English stopwords, stem each remaining
    word with `port_stem`, and join the stems back with single spaces.

    Parameters
    ----------
    content : str
        Raw text of one document.

    Returns
    -------
    str
        Space-separated stems; an empty string if no word survives filtering.
    """
    # Look the stopwords up once per call (cached across calls) instead of
    # re-reading the stopword list for every single word.
    stop_words = _get_stopwords()
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

In [31]:
# Clean and stem every document in the 'content' column.
# NOTE: this overwrites 'content' in place, so re-running the cell stems the already-stemmed text again.
fake_news['content'] = fake_news['content'].apply(stemming)

In [34]:
# Final model inputs: the stemmed text and its labels as NumPy arrays.
# `.to_numpy()` always returns an ndarray, whereas `.values` can hand back a pandas
# extension array for extension-backed dtypes such as the `str` dtype of 'content'.
X = fake_news['content'].to_numpy()
Y = fake_news['label'].to_numpy()

Converting the textual data into numerical features, because machine-learning models cannot work directly on raw text

In [None]:
# TF-IDF vectorizer with default settings, used to turn the stemmed documents into numeric features.
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the stemmed documents in X.
# NOTE(review): this fits on the full dataset; if a train/test split follows, consider fitting on the training portion only — confirm against later cells.
vectorizer.fit(X)