## Natural Language Processing : Fake News Classifier

This is a supervised classification problem. The dataset has articles and corresponding labels to identify whether it's REAL or FAKE news. 

In [198]:
# importing important nltk libraries

In [199]:
import nltk 
# Fetch the 'punkt' tokenizer data; the call is a no-op when the package is already installed.
# NOTE(review): no visible cell calls sent_tokenize/word_tokenize — confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Praveen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [200]:
# Stop-word lists; required by stopwords.words("english") used when cleaning the articles.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Praveen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [201]:
# WordNet corpus data.
# NOTE(review): WordNetLemmatizer is imported but never used in the visible cells — confirm this download is still needed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Praveen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Importing libraries

In [202]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics
from sklearn.metrics import accuracy_score, confusion_matrix

In [203]:
# Load the labelled articles; the path is relative to the notebook's working directory.
data = pd.read_csv("fake_or_real_news.csv")

In [204]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Renaming columns

In [205]:
# Give the id and article-body columns descriptive names.
column_names = {'Unnamed: 0': 'article_id', 'text': 'summary'}
data = data.rename(columns=column_names)

In [206]:
data.columns

Index(['article_id', 'title', 'summary', 'label'], dtype='object')

In [207]:
data.head()

Unnamed: 0,article_id,title,summary,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Observing articles

In [208]:
data['summary'][0]



## Pruning article

In [209]:
# Walk through the cleaning steps on the first article only.
# Every character that is not an ASCII letter becomes a space, then the text
# is lower-cased and cut on single spaces (which leaves empty tokens behind).
first_article = data['summary'][0]
letters = re.sub('[^a-zA-Z]', ' ', first_article).lower().split(' ')
len(letters)

1478

In [210]:
# Drop English stop words; a set gives constant-time membership checks.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)
clear_words = [token for token in letters if token not in stop_words]
len(clear_words)

820

In [211]:
clear_words[:20]

['daniel',
 'greenfield',
 '',
 'shillman',
 'journalism',
 'fellow',
 'freedom',
 'center',
 '',
 'new',
 'york',
 'writer',
 'focusing',
 'radical',
 'islam',
 '',
 '',
 'final',
 'stretch',
 'election']

In [212]:
# Discard the empty strings left behind by splitting on single spaces.
clear_words = [token for token in clear_words if token]
len(clear_words)

670

In [213]:
clear_words[:20]

['daniel',
 'greenfield',
 'shillman',
 'journalism',
 'fellow',
 'freedom',
 'center',
 'new',
 'york',
 'writer',
 'focusing',
 'radical',
 'islam',
 'final',
 'stretch',
 'election',
 'hillary',
 'rodham',
 'clinton',
 'gone']

### Function based on above steps

In [214]:
# the steps above were applied to a single article only
# the same cleaning must be repeated for every article in the summary column

def clean_summary(raw_text, stopword_set=None):
    """Normalise one article into a space-separated string of content words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased, and the remaining tokens are filtered against a
    stop-word collection.

    Parameters
    ----------
    raw_text : str
        Text of a single article.
    stopword_set : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` set is
        used, so one-argument callers (e.g. a vectorizer ``preprocessor``)
        behave exactly as before.

    Returns
    -------
    str
        The surviving lower-case words joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text).lower()
    # After the substitution the text holds only letters and spaces, so a bare
    # split() collapses runs of spaces and never yields empty tokens; no
    # separate blank-removal pass is needed.
    return " ".join(w for w in letters_only.split() if w not in stopword_set)

In [215]:
data.summary.size

6335

In [216]:
# Features are the raw article bodies; cleaning happens inside the vectorizers.
X = data['summary']
# Binary target: REAL -> 1, any other label -> 0.
y = (data['label'] == 'REAL').astype('int64')

In [217]:
# Hold out 30% of the articles for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [218]:
X_train.shape

(4434,)

In [219]:
X_test.shape

(1901,)

## First Let's try with CountVectorizer

The CountVectorizer provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.

You can use it as follows:

* Create an instance of the CountVectorizer class.
* Call the fit() function in order to learn a vocabulary from one or more documents.
* Call the transform() function on one or more documents as needed to encode each as a vector.
An encoded vector is returned with a length of the entire vocabulary and an integer count for the number of times each word appeared in the document.

Because these vectors will contain a lot of zeros, we call them sparse. Python provides an efficient way of handling sparse vectors in the scipy.sparse package.

The vectors returned from a call to transform() will be sparse vectors, and you can transform them back to numpy arrays to look and better understand what is going on by calling the toarray() function.

https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

### Going with all available features

In [263]:
# Bag-of-words vectorizer over the full vocabulary.
# `lowercase` must be a real boolean: the string 'True' is rejected by
# parameter validation in scikit-learn >= 1.2. A custom preprocessor overrides
# the built-in lower-casing step anyway, and clean_summary already lower-cases.
vectorizer = CountVectorizer(analyzer='word',
                             preprocessor=clean_summary,
                             stop_words="english",
                             lowercase=True)
                            

In [264]:
# Learn the vocabulary from the training articles only, so nothing from the test set leaks in.
# fit() returns the fitted vectorizer, so countVectr refers to the same object as `vectorizer`.
countVectr = vectorizer.fit(X_train)

In [265]:
# Peek at the first few vocabulary terms learned from the training set.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(countVectr, "get_feature_names_out"):
    feature_names = countVectr.get_feature_names_out()
else:
    feature_names = countVectr.get_feature_names()
list(feature_names[:10])

['aa',
 'aaa',
 'aaahhh',
 'aaas',
 'aab',
 'aachen',
 'aadhar',
 'aadmi',
 'aaeeb',
 'aaib']

In [223]:
# Encode the training articles as a document-term count matrix (one row per article, one column per term).
count_train = countVectr.transform(X_train)

In [224]:
count_train.shape

(4434, 55900)

In [225]:
# The vocabulary was built from the training data only,
# so the test set is just transformed with that same vocabulary (no re-fitting).
count_test = countVectr.transform(X_test)

In [226]:
count_test.shape

(1901, 55900)

## Naive Bayes model

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). 
The multinomial distribution normally requires integer feature counts. 
However, in practice, fractional counts such as tf-idf may also work.

In [227]:
# Multinomial Naive Bayes with default hyper-parameters.
nb_classifier = MultinomialNB()

In [228]:
# Fit the classifier on the training count matrix and its 0/1 labels.
nb_classifier.fit(count_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [229]:
# Predict a 0/1 label (1 = REAL, 0 = FAKE) for every held-out article.
pred = nb_classifier.predict(count_test)

In [230]:
pred

array([1, 1, 0, ..., 0, 1, 1], dtype=int64)

In [231]:
# Accuracy on the held-out test set: fraction of articles classified correctly.
score = accuracy_score(y_test, pred)
print(score)

0.8800631246712257


In [232]:
# Confusion matrix: rows are the true class, columns the predicted class,
# in label order 0 (FAKE) then 1 (REAL).
cm = confusion_matrix(y_test, pred)
print(cm)

[[778 155]
 [ 73 895]]


### Taking top 5000 features

In [266]:
# Bag-of-words vectorizer restricted to the 5000 most frequent terms.
# `lowercase` must be a real boolean: the string 'True' is rejected by
# parameter validation in scikit-learn >= 1.2. A custom preprocessor overrides
# the built-in lower-casing step anyway, and clean_summary already lower-cases.
vectorizer = CountVectorizer(analyzer='word',
                             preprocessor=clean_summary,
                             stop_words="english",
                             lowercase=True,
                             max_features=5000)

In [267]:
# Learn the (capped) vocabulary from the training articles only.
countVectr = vectorizer.fit(X_train)
# Peek at the first few vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(countVectr, "get_feature_names_out"):
    feature_names = countVectr.get_feature_names_out()
else:
    feature_names = countVectr.get_feature_names()
list(feature_names[:10])

['abandon',
 'abandoned',
 'abc',
 'abdullah',
 'abedin',
 'ability',
 'able',
 'abortion',
 'abortions',
 'abroad']

In [268]:
# Encode the training articles with the capped vocabulary and check the matrix shape.
count_train = countVectr.transform(X_train)
count_train.shape

(4434, 5000)

In [269]:
# Encode the test articles with the vocabulary learned from the training set.
count_test = countVectr.transform(X_test)
count_test.shape

(1901, 5000)

In [271]:
# Fresh classifier so nothing carries over from the full-vocabulary model.
nb_classifier = MultinomialNB()
# Fit on the 5000-feature training count matrix.
nb_classifier.fit(count_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [272]:
# Predict a 0/1 label (1 = REAL, 0 = FAKE) for every held-out article.
pred = nb_classifier.predict(count_test)

In [273]:
# Accuracy on the held-out test set with the 5000-feature count model.
score = accuracy_score(y_test, pred)
print(score)

0.857969489742241


In [274]:
# Confusion matrix: rows are the true class, columns the predicted class,
# in label order 0 (FAKE) then 1 (REAL).
cm = confusion_matrix(y_test, pred)
print(cm)

[[796 137]
 [133 835]]


## TfidfVectorizer

One issue with simple counts is that some words like “the” will appear many times and their large counts will not be very meaningful in the encoded vectors.

An alternative is to calculate word frequencies, and by far the most popular method is called TF-IDF. This is an acronym that stands for “Term Frequency – Inverse Document Frequency”, the two components of the resulting score assigned to each word.

* Term Frequency: This summarizes how often a given word appears within a document.
* Inverse Document Frequency: This downscales words that appear a lot across documents.

Without going into the math, TF-IDF are word frequency scores that try to highlight words that are more interesting, e.g. frequent in a document but not across documents.

### Going with all features

In [233]:
# TF-IDF vectorizer over the full vocabulary (no max_features cap).
# max_df=0.7 ignores terms that occur in more than 70% of the training documents.
# `lowercase` must be a real boolean: the string 'True' is rejected by
# parameter validation in scikit-learn >= 1.2. A custom preprocessor overrides
# the built-in lower-casing step anyway, and clean_summary already lower-cases.
tfidfVectorizer = TfidfVectorizer(stop_words='english',
                                  max_df=0.7,
                                  analyzer='word',
                                  preprocessor=clean_summary,
                                  lowercase=True)
                                

In [234]:
# Learn the vocabulary and IDF weights from the training articles only.
tfidfVectr = tfidfVectorizer.fit(X_train)

In [249]:
# Peek at the first few vocabulary terms; many of them are rare tokens that
# are unlikely to carry any signal.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidfVectr, "get_feature_names_out"):
    feature_names = tfidfVectr.get_feature_names_out()
else:
    feature_names = tfidfVectr.get_feature_names()
list(feature_names[:10])

['aa',
 'aaa',
 'aaahhh',
 'aaas',
 'aab',
 'aachen',
 'aadhar',
 'aadmi',
 'aaeeb',
 'aaib']

In [236]:
# Encode the training articles as TF-IDF weighted vectors.
tfidf_train = tfidfVectr.transform(X_train)

In [237]:
tfidf_train.shape

(4434, 55900)

In [238]:
# Encode the test articles with the vocabulary and IDF weights learned from the training set.
tfidf_test = tfidfVectr.transform(X_test)

In [239]:
# Separate Naive Bayes model for the TF-IDF features (default hyper-parameters).
tfidf_classifier = MultinomialNB()

In [240]:
# Fit on the fractional TF-IDF weights instead of integer counts.
tfidf_classifier.fit(tfidf_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [241]:
# Overwrites the earlier `pred`; from here on it holds the TF-IDF model's predictions.
pred = tfidf_classifier.predict(tfidf_test)

In [242]:
# Accuracy on the held-out test set with the full-vocabulary TF-IDF model.
score = accuracy_score(y_test, pred)
print(score)

0.8327196212519726


In [243]:
# Confusion matrix: rows are the true class, columns the predicted class,
# in label order 0 (FAKE) then 1 (REAL).
cm = confusion_matrix(y_test, pred)
print(cm)

[[639 294]
 [ 24 944]]


### Let's try max_features = 5000 instead of using all 55,900 features

#### max_features : If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus

In [260]:
# TF-IDF vectorizer restricted to the 5000 most frequent terms.
# max_df=0.7 ignores terms that occur in more than 70% of the training documents.
# `lowercase` must be a real boolean: the string 'True' is rejected by
# parameter validation in scikit-learn >= 1.2. A custom preprocessor overrides
# the built-in lower-casing step anyway, and clean_summary already lower-cases.
tfidfVectorizer = TfidfVectorizer(stop_words='english',
                                  max_df=0.7,
                                  analyzer='word',
                                  preprocessor=clean_summary,
                                  lowercase=True,
                                  max_features=5000)

In [251]:
# Learn the (capped) vocabulary and IDF weights from the training articles only.
tfidfVectr = tfidfVectorizer.fit(X_train)
# Peek at the first few vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidfVectr, "get_feature_names_out"):
    feature_names = tfidfVectr.get_feature_names_out()
else:
    feature_names = tfidfVectr.get_feature_names()
list(feature_names[:10])


['abandon',
 'abandoned',
 'abc',
 'abdullah',
 'abedin',
 'ability',
 'able',
 'abortion',
 'abortions',
 'abroad']

In [252]:
# these features make much more sense than the earlier ones

In [253]:
# Encode the training articles with the capped TF-IDF vocabulary and check the matrix shape.
tfidf_train = tfidfVectr.transform(X_train)
tfidf_train.shape

(4434, 5000)

In [255]:
# Encode the test articles with the training vocabulary, then start from a fresh classifier.
tfidf_test = tfidfVectr.transform(X_test)
tfidf_classifier = MultinomialNB()

In [256]:
# Fit on the 5000-feature TF-IDF matrix and predict the held-out articles.
tfidf_classifier.fit(tfidf_train, y_train)
pred = tfidf_classifier.predict(tfidf_test)

In [262]:
# Accuracy on the held-out test set with the 5000-feature TF-IDF model.
score = accuracy_score(y_test, pred)
print(" The accuracy score is:{} ".format(score))

 The accuracy score is:0.8726985796948974 


In [259]:
# Although only the top 5000 features were used, accuracy improved over the full-vocabulary TF-IDF model (0.833 -> 0.873).

In [258]:
# Confusion matrix: rows are the true class, columns the predicted class,
# in label order 0 (FAKE) then 1 (REAL).
cm = confusion_matrix(y_test, pred)
print(cm)

[[816 117]
 [125 843]]
