# Project Name:

### Fake News Prediction

# Problem Statement:

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('news.csv')
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [4]:
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [6]:
df[df.duplicated()]

Unnamed: 0.1,Unnamed: 0,title,text,label


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [8]:
df.describe()

Unnamed: 0.1,Unnamed: 0
count,6335.0
mean,5280.415627
std,3038.503953
min,2.0
25%,2674.5
50%,5271.0
75%,7901.0
max,10557.0


In [9]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE


In [12]:
df.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

### Importing required libraries for NLP business case

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Text preprocessing

In [4]:
df.loc[df['label']=='REAL','label']=1
df.loc[df['label']=='FAKE', 'label']=0

In [22]:
df.label.value_counts()

1    3171
0    3164
Name: label, dtype: int64

In [23]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,28,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,85,Google Pinterest Digg Linkedin Reddit Stumbleu...,0


### Removing Punctuation

In [5]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
# Creating a function to remove the punctuation

def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation]) 
    return text

In [7]:
text = []
for i in df['text']:
    t = remove_punct(i)
    text.append(t)
    

In [8]:
df['Text_clean'] = text
df

Unnamed: 0.1,Unnamed: 0,title,text,label,Text_clean
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,Daniel Greenfield a Shillman Journalism Fellow...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Google Pinterest Digg Linkedin Reddit Stumbleu...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,US Secretary of State John F Kerry said Monday...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,— Kaydee King KaydeeKing November 9 2016 The l...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,Its primary day in New York and frontrunners H...
...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,1,The State Department told the Republican Natio...
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,0,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,0,AntiTrump Protesters Are Tools of the Oligarc...
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",1,ADDIS ABABA Ethiopia —President Obama convened...


### Now we need to convert each of those messages into a vector ( the way the ML models can understand and can work with)

### Model creation

In [9]:
X = df['Text_clean'].values # convert df as array
y = df['label'].values


In [10]:
# Datatype for y is object. lets convert it into int
y = y.astype('int')
y

array([0, 0, 1, ..., 0, 1, 1])

### Creating train test split

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=10)
X_train.shape

(5068,)

**BAG OF WORDS**
* We cannot pass text directly to train our models in Natural Language Processing, thus we need to convert it into numbers, which machine can understand and can perform the required modelling on it.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


In [13]:
# Initialize the object for countvectorizer 
CV = CountVectorizer(stop_words="english")  

In [14]:
# Apply countvectorizer functionality on the training data to convert the categorical data into vectors
X_train_CV = CV.fit_transform(X_train)

### Training a model

### MultinominalNB

In [15]:
# Initialising the model
NB = MultinomialNB()

In [16]:
NB.fit(X_train_CV,y_train)

In [17]:
X_test_CV = CV.transform(X_test)

In [18]:
y_predict = NB.predict(X_test_CV)
y_predict

array([1, 0, 1, ..., 0, 1, 1])

In [19]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.93      0.84      0.89       603
           1       0.87      0.94      0.90       664

    accuracy                           0.90      1267
   macro avg       0.90      0.89      0.89      1267
weighted avg       0.90      0.90      0.90      1267



In [20]:
pd.crosstab(y_test,y_predict)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,508,95
1,37,627


### BernoulliNB

In [21]:
#Initialising a model
bnb = BernoulliNB()

## fitting the model
bnb.fit(X_train_CV,y_train)

## getting the prediction
y_hat1=bnb.predict(X_test_CV) 

## confusion matrix
pd.crosstab(y_test,y_hat1)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,546,57
1,158,506


### Term Frequency-Inverse Document Frequency

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer


tf=TfidfVectorizer()

In [23]:
X_train_cv = tf.fit_transform(X_train)
X_test_cv = tf.transform(X_test)

### MultinominalNB

In [24]:
# Initialising the model
nb = MultinomialNB()
nb.fit(X_train_cv,y_train)  

In [25]:
y_hat = nb.predict(X_test_cv)

In [26]:
# classification report

print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       0.98      0.62      0.76       603
           1       0.74      0.99      0.85       664

    accuracy                           0.81      1267
   macro avg       0.86      0.81      0.80      1267
weighted avg       0.86      0.81      0.81      1267



In [27]:
pd.crosstab(y_test,y_hat)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,374,229
1,6,658


### BernoulliNB

In [28]:
## model object creation
nb=BernoulliNB()

## fitting the model
nb.fit(X_train_cv,y_train)

## getting the prediction
y_hat=nb.predict(X_test_cv) 

In [29]:
y_hat

array([1, 0, 1, ..., 0, 1, 1])

In [30]:
## Evaluating the model
from sklearn.metrics import classification_report,confusion_matrix

In [31]:
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       0.76      0.87      0.81       603
           1       0.87      0.75      0.81       664

    accuracy                           0.81      1267
   macro avg       0.81      0.81      0.81      1267
weighted avg       0.82      0.81      0.81      1267



In [32]:
## confusion matrix
pd.crosstab(y_test,y_hat)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,525,78
1,164,500


### Observation:

* After text preprocessing in NLP, the data was passed through a Count Vectorizer and Term Frequency-Inverse Document Frequency (TF-IDF). The Multinomial Naive Bayes model using the Count Vectorizer yielded better results with less error and 90% accuracy.
