<a href="https://colab.research.google.com/github/RasikaGhodke/Fake-News-Prediction/blob/main/Fake_News_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **FAKE NEWS PREDICTION PROJECT USING MACHINE LEARNING WITH PYTHON**

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by
# path. `google.colab` is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the training CSV on the mounted Drive; it is read into a DataFrame
# further down with pd.read_csv.
file_path = '/content/drive/MyDrive/fake_news_train.csv'

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Download NLTK's stopword corpus; stopwords.words('english') needs it.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Print NLTK's English stopword list (these are the words the stemming
# function filters out).
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

# **Data Pre-processing**

In [None]:
# Read the CSV at file_path into a pandas DataFrame
news_dataset = pd.read_csv(file_path)

In [None]:
# Dimensions of the DataFrame as (number of rows, number of columns)
news_dataset.shape

(10000, 5)

### Column Explanation:
### id: Unique identifier for each article.

### title: The headline/title of the news article.

### author: The author of the article.

### text: The full body text of the article.

### label: The target variable:

### 1 → Fake news

### 0 → Real news

In [None]:
# Preview the first 5 rows of the DataFrame
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,Government Plans to Confiscate Guns,Patricia Brown,New research shows cures suppressed by pharmac...,1
1,1,Mayor Announces Infrastructure Plan,Sam Lee,"In an interview, the mayor highlighted communi...",0
2,2,You Won’t Believe What Politician Said,Alex Johnson,Politician caught in scandal that mainstream m...,1
3,3,You Won’t Believe What Politician Said,John Smith,New research shows cures suppressed by pharmac...,1
4,4,New Study Reveals Benefits of Exercise,Alex Johnson,"In an interview, the mayor highlighted communi...",0


In [None]:
# Count the missing (null) values in each column
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,0
author,0
text,0
label,0


In [None]:
# Merge the author name and the news title into one "content" string per row.
# Missing values are replaced with '' first: string + NaN gives NaN, and a NaN
# in "content" would make the later regex-based stemming step fail.
news_dataset['content'] = news_dataset['author'].fillna('') + ' ' + news_dataset['title'].fillna('')

In [None]:
# Print the new "content" column (author followed by title)
print(news_dataset['content'])

0       Patricia Brown Government Plans to Confiscate ...
1             Sam Lee Mayor Announces Infrastructure Plan
2       Alex Johnson You Won’t Believe What Politician...
3       John Smith You Won’t Believe What Politician Said
4       Alex Johnson New Study Reveals Benefits of Exe...
                              ...                        
9995    Patricia Brown New Study Reveals Benefits of E...
9996        Jane Doe Miracle Cure for Diabetes Discovered
9997     Alex Johnson Government Plans to Confiscate Guns
9998            John Smith Shocking Health Secret Exposed
9999    Patricia Brown New Study Reveals Benefits of E...
Name: content, Length: 10000, dtype: object


In [None]:
# Split the frame into the target and everything else.
# Dependent feature: the label column
Y = news_dataset['label']
# Independent features: every remaining column
X = news_dataset.drop(columns=['label'])

In [None]:
# Show the feature DataFrame (every column except the label)
print(X)


        id                                   title          author  \
0        0     Government Plans to Confiscate Guns  Patricia Brown   
1        1     Mayor Announces Infrastructure Plan         Sam Lee   
2        2  You Won’t Believe What Politician Said    Alex Johnson   
3        3  You Won’t Believe What Politician Said      John Smith   
4        4  New Study Reveals Benefits of Exercise    Alex Johnson   
...    ...                                     ...             ...   
9995  9995  New Study Reveals Benefits of Exercise  Patricia Brown   
9996  9996    Miracle Cure for Diabetes Discovered        Jane Doe   
9997  9997     Government Plans to Confiscate Guns    Alex Johnson   
9998  9998          Shocking Health Secret Exposed      John Smith   
9999  9999  New Study Reveals Benefits of Exercise  Patricia Brown   

                                                   text  \
0     New research shows cures suppressed by pharmac...   
1     In an interview, the mayor highligh

In [None]:
# Show the label column (1 = fake news, 0 = real news)
print(Y)

In [None]:
# Number of rows per label value, to check the class balance
Y.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,5002
0,4998


### Stemming:

### Stemming is the process of reducing a word to its Root word

### example: connect, connected, connecting, connection  --> connect

In [None]:
# One PorterStemmer instance, reused by the stemming function below
port_stem = PorterStemmer()

In [None]:
def stemming(content):
    """Normalise a text string into space-separated, stemmed tokens.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, English stopwords are dropped and
    each remaining word is reduced with the Porter stemmer.

    Args:
        content: The raw text to normalise (a ``str``).

    Returns:
        A single string containing the stemmed, stopword-free tokens joined
        by single spaces (empty string if nothing remains).
    """
    # Read the stopword list once per call and keep it as a set: looking each
    # word up in a freshly loaded list is needlessly slow on a large column.
    english_stopwords = set(stopwords.words('english'))
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in english_stopwords
    )

In [None]:
# Apply the stemming function to the "content" column (author + title),
# overwriting the column with the stemmed text
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [None]:
# Print the stemmed "content" column
print(news_dataset['content'])

0                patricia brown govern plan confisc gun
1              sam lee mayor announc infrastructur plan
2                   alex johnson believ politician said
3                     john smith believ politician said
4         alex johnson new studi reveal benefit exercis
                             ...                       
9995    patricia brown new studi reveal benefit exercis
9996                 jane doe miracl cure diabet discov
9997               alex johnson govern plan confisc gun
9998               john smith shock health secret expos
9999    patricia brown new studi reveal benefit exercis
Name: content, Length: 10000, dtype: object


In [None]:
# Separate the data and the label as NumPy arrays. X is rebound here: from now
# on it holds only the stemmed "content" text, not the earlier DataFrame.
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [None]:
# Show the stemmed text samples
print(X)

['patricia brown govern plan confisc gun'
 'sam lee mayor announc infrastructur plan'
 'alex johnson believ politician said' ...
 'alex johnson govern plan confisc gun'
 'john smith shock health secret expos'
 'patricia brown new studi reveal benefit exercis']


In [None]:
# Show the label array
print(Y)

[1 0 1 ... 1 1 0]


In [None]:
# Number of labels
Y.shape

(10000,)

In [None]:
# Number of text samples (should match the number of labels)
X.shape

(10000,)

## TF-IDF stands for “Term Frequency – Inverse Document Frequency ”

## TF-IDF is a numerical statistic that measures how important a word is to a document within a collection of documents.

## Term Frequency : The number of times a word appears in a text document.

## Inverse Document Frequency : Measures how rare or common a word is across all the documents; rarer words receive a higher weight.

In [None]:
# Convert the textual data to numerical TF-IDF features.
# fit_transform learns the vocabulary and IDF weights and vectorises the text
# in one pass; the result equals fit() followed by transform().
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test splits further down, so the IDF statistics also see the test
# rows. Consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [None]:
# Show the TF-IDF matrix (sparse: one (row, column) coordinate and weight per stored entry)
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 61448 stored elements and shape (10000, 52)>
  Coords	Values
  (0, 4)	0.3756385569307595
  (0, 8)	0.44518781665090107
  (0, 16)	0.44518781665090107
  (0, 18)	0.44518781665090107
  (0, 35)	0.3756385569307595
  (0, 36)	0.3510196188402638
  (1, 1)	0.4457939493835242
  (1, 22)	0.4457939493835242
  (1, 26)	0.3731626344404433
  (1, 30)	0.4457939493835242
  (1, 36)	0.3539810172792543
  (1, 41)	0.3731626344404433
  (2, 0)	0.40424139644544366
  (2, 2)	0.4737009558781698
  (2, 25)	0.40424139644544366
  (2, 37)	0.4737009558781698
  (2, 40)	0.4737009558781698
  (3, 2)	0.4720354940766327
  (3, 24)	0.40715321255977327
  (3, 37)	0.4720354940766327
  (3, 40)	0.4720354940766327
  (3, 47)	0.40715321255977327
  (4, 0)	0.34132358074172603
  (4, 3)	0.4081569844034316
  (4, 13)	0.4081569844034316
  :	:
  (9996, 9)	0.4291892061477907
  (9996, 10)	0.4291892061477907
  (9996, 11)	0.4291892061477907
  (9996, 12)	0.3627578402356845
  (9996, 23)	0.3627

## Splitting the dataset to training & test data

In [None]:
# Hold out 20% of the rows for testing. stratify=Y keeps the label proportions
# the same in both splits; random_state=2 makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

## LOGISTIC REGRESSION:

## Logistic regression is a statistical analysis method to predict a binary outcome, such as yes or no, based on prior observations of a data set. A logistic regression model predicts a dependent data variable by analyzing the relationship between one or more existing independent variables.

In [None]:
# Logistic regression classifier created with its default settings
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split
model.fit(X_train, Y_train)

In [None]:
# Accuracy score on the training data.
# accuracy_score takes (y_true, y_pred). Accuracy itself is unaffected by the
# order, but the true labels go first to match the other metric calls in this
# notebook.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the accuracy measured on the training split
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  1.0


In [None]:
# Accuracy score on the test data.
# accuracy_score takes (y_true, y_pred). Accuracy itself is unaffected by the
# order, but the true labels go first to match the other metric calls in this
# notebook.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the accuracy measured on the test split
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  1.0


## CONFUSION MATRIX:

## The confusion matrix is a 2 dimensional array comparing predicted category labels to the true label.

In [None]:
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Purples):
    """Draw a confusion matrix as an annotated heat map.

    Args:
        cm: 2-D NumPy array of counts. Rows are plotted on the "True label"
            axis and columns on the "Predicted label" axis.
        classes: Tick labels, in the same order as the rows/columns of ``cm``.
        normalize: If True, every row is divided by its row sum before
            plotting, and cell values are shown with two decimals.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the heat map.

    Returns:
        The ``matplotlib.pyplot`` module, after ``plt.show()`` has been called.
    """
    # Normalise before drawing so the colours and the cell annotations
    # describe the same values.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    # np.ndindex walks every (row, column) pair; the previous loop used
    # itertools, which this notebook never imports.
    for i, j in np.ndindex(*cm.shape):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    return plt



In [None]:
# Second split, used for the Multinomial Naive Bayes model: 33% of the rows are
# held out, seeded with random_state=42 (no stratification requested).
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X, Y, test_size=0.33, random_state=42)

### CLASSIFICATION MODEL : MULTINOMIAL NAIVE BAYES

### Multinomial Naive Bayes algorithm is a probabilistic learning method that is mostly used in Natural Language Processing (NLP)

### Multinomial Naïve Bayes uses term frequency i.e. the number of times a given term appears in a document. Term frequency is often normalized by dividing the raw term frequency by the document length.

In [None]:
# Let's implement the model: Multinomial Naive Bayes.
# NOTE: the next cell repeats this import and creates the classifier again
# before fitting it.
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()

In [None]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix


# Fit Multinomial Naive Bayes on the training split and score it on the
# held-out split. X was vectorised with TF-IDF before the split, so the split
# matrices are used directly (no `X1_train_tfidf` / `X1_test_tfidf` variables
# exist in this notebook).
classifier=MultinomialNB()
classifier.fit(X1_train, Y1_train)
prediction1 = classifier.predict(X1_test)
score = metrics.accuracy_score(Y1_test, prediction1)
print("accuracy:   %0.3f" % score)
cm1 = metrics.confusion_matrix(Y1_test, prediction1)
# confusion_matrix orders the classes by sorted label value (0 = real,
# 1 = fake), so the tick labels are REAL first, then FAKE.
plot_confusion_matrix(cm1, classes=['REAL', 'FAKE'])

NameError: name 'X1_train_tfidf' is not defined

### CLASSIFICATION MODEL : PASSIVE AGGRESSIVE CLASSIFIER

### The Passive Aggressive Classifier stays passive (keeps the model unchanged) when a sample is classified correctly and turns aggressive (updates the model) on any misclassification.

In [None]:
# Split for the Passive Aggressive Classifier. Same arguments as the X1/Y1
# split, so it yields the same partition of rows.
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [None]:
# Let's implement the algorithm: Passive Aggressive Classifier.
# X was vectorised with TF-IDF before the split, so the split matrices are
# used directly (no `X2_train_tfidf` / `X2_test_tfidf` variables exist in this
# notebook).
from sklearn.linear_model import PassiveAggressiveClassifier
# random_state pins the data shuffling done during fitting so the reported
# accuracy is reproducible from run to run.
classifier=PassiveAggressiveClassifier(random_state=42)
classifier.fit(X2_train, Y2_train)
prediction2 = classifier.predict(X2_test)
score = metrics.accuracy_score(Y2_test, prediction2)
print("accuracy:   %0.3f" % score)
cm2 = metrics.confusion_matrix(Y2_test, prediction2)
# confusion_matrix orders the classes by sorted label value (0 = real,
# 1 = fake), so the tick labels are REAL first, then FAKE.
plot_confusion_matrix(cm2, classes=['REAL', 'FAKE'])


### BUILDING A PREDICTIVE SYSTEM:

### Building a predictive system to determine whether the first news item in the test set is real or fake, using the Logistic Regression model:

In [None]:
# Classify the first row of the logistic-regression test split.
X_new = X_test[0]

prediction = model.predict(X_new)
print(prediction)

# Label convention: 0 means real news, anything else means fake news.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

In [None]:
# True label of the same test row, for comparison with the prediction
print(Y_test[0])

### Building a predictive system to determine whether the first news item in the test set is real or fake, using the Multinomial Naive Bayes model:

In [None]:
# Show the Multinomial Naive Bayes prediction for the first row of its test
# split. By this point `model` is the Logistic Regression model and
# `classifier` has been rebound to the Passive Aggressive model, so neither
# can be called here; `prediction1` already holds the Naive Bayes predictions
# for every row of X1_test.
X1_new = X1_test[0]  # the sample whose prediction is shown

prediction = prediction1[:1]
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

In [None]:
# True label of the same test row, for comparison with the prediction
print(Y1_test[0])

### Building a predictive system to determine whether the first news item in the test set is real or fake, using the Passive Aggressive Classifier:

In [None]:
# Predict whether the first news item of the Passive Aggressive test split is
# real or fake. `classifier` is the fitted PassiveAggressiveClassifier at this
# point; `model` is the Logistic Regression model and must not be used here.
X2_new = X2_test[0]

prediction = classifier.predict(X2_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

In [None]:
# True label of the same test row, for comparison with the prediction
print(Y2_test[0])

### CLASSIFICATION REPORT:

### A Classification report is used to measure the quality of predictions from a classification algorithm.

### It is used to show the precision, recall, F1 Score, and support of your trained classification model.

In [None]:
# Classification report for the logistic regression model, computed from the
# true test labels and the test-set predictions
from sklearn.metrics import classification_report
print(classification_report(Y_test, X_test_prediction))

In [None]:
# Classification report for Multinomial Naive Bayes.
# target_names follow the label order: 0 -> REAL, 1 -> FAKE.
print("\nClassification Report:\n")
print(classification_report(Y1_test, prediction1, target_names=['REAL', 'FAKE']))



In [None]:
# Classification report for the Passive Aggressive Classifier, computed from
# the true test labels and the test-set predictions
from sklearn.metrics import classification_report
print(classification_report(Y2_test, prediction2))

### MODEL COMPARISON

### Hence, in this dataset we can observe that

### Logistic Regression : Accuracy is 1.0

### Multinomial Naive Bayes : Accuracy is 1.0

### Passive Aggressive Classifier : Accuracy is 0.51

In [None]:
# Closing message for the notebook run
print("Successfully we have predicted the fake news using logistic regression, Multinomial Naive Bayes & Passive Aggressive Classifier ")