In [2]:
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
                                     learning_curve)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [3]:
np.random.seed(71)
train_data = pd.read_csv(r'C:\Users\My PC\Documents\TXT\Fake-News-Detection-System-master\datasets\train.csv')
valid_data = pd.read_csv(r'C:\Users\My PC\Documents\TXT\Fake-News-Detection-System-master\datasets\valid.csv')
test_data = pd.read_csv(r'C:\Users\My PC\Documents\TXT\Fake-News-Detection-System-master\datasets\test.csv')

In [4]:
def show_eval_scores(model, test_set, model_name):
    y_pred = model.predict(test_set['news'])
    y_true = test_set['label']
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    
    print('Report for {}'.format(model_name))
    print('Accuracy is: {}'.format(accuracy))
    print('F1 score is: {}'.format(f1))
    print('Precision score is: {}'.format(precision))
    print('Recall score is: {}'.format(recall))

In [5]:
print('Train dataset size: {}'.format(train_data.shape))
print('Valid dataset size: {}'.format(valid_data.shape))
print('Test dataset size: {}'.format(test_data.shape))

Train dataset size: (10240, 2)
Valid dataset size: (1284, 2)
Test dataset size: (1267, 2)


In [6]:
training_set = pd.concat([train_data, valid_data], ignore_index=True)
print('Training set size: {}'.format(training_set.shape))
training_set.sample(5)

Training set size: (11524, 2)


Unnamed: 0,label,news
5938,False,Florida's new fingerprint identification syste...
8722,True,Harvard Study Finds States With Most Gun Laws ...
465,True,When terrorists killed more than 250 Americans...
54,False,We have towns in West Texas that are out of wa...
2499,True,Says he sued Obamas EPA seven times.


In [7]:
countV = CountVectorizer()
train_count = countV.fit_transform(training_set['news'].values)

## Logistic Regression 

In [8]:
stopwords_list = list(stopwords.words('english'))

In [9]:
lr_pipeline = Pipeline([
    ('lrCV', CountVectorizer(stop_words=stopwords_list, lowercase=True, ngram_range=(1, 1))),
    ('lr_clf', LogisticRegression(C=0.0001,random_state=42, n_jobs=-1))
])

In [10]:
lr_pipeline.fit(training_set['news'], training_set['label'])

  " = {}.".format(effective_n_jobs(self.n_jobs)))


Pipeline(memory=None,
     steps=[('lrCV', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me',..., penalty='l2', random_state=42,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

In [11]:
show_eval_scores(lr_pipeline, test_data, 'Logistic Regression Count Vectorizer')

Report for Logistic Regression Count Vectorizer
Accuracy is: 0.56353591160221
F1 score is: 0.7208480565371024
Precision score is: 0.56353591160221
Recall score is: 1.0


## Naive Bayes 

In [12]:
nb_pipeline = Pipeline([
    ('nb_CV', CountVectorizer(stop_words=stopwords_list, lowercase=True, ngram_range=(1, 4))),
    ('nb_clf', MultinomialNB(alpha=6.8))
])

In [13]:
nb_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('nb_CV', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None,
        stop_words=['i', 'me'...zer=None, vocabulary=None)), ('nb_clf', MultinomialNB(alpha=6.8, class_prior=None, fit_prior=True))])

In [14]:
show_eval_scores(nb_pipeline, test_data, 'Naive Bayes Count Vectorizer')

Report for Naive Bayes Count Vectorizer
Accuracy is: 0.6203630623520127
F1 score is: 0.7326292384658143
Precision score is: 0.6073732718894009
Recall score is: 0.9229691876750701


## Random Forest 

In [15]:
rf_pipeline = Pipeline([
    ('rf_CV', CountVectorizer(stop_words=stopwords_list, lowercase=False, ngram_range=(1, 1))),
    ('rf_clf', RandomForestClassifier(max_depth=12, n_estimators=300, n_jobs=-1, random_state=42))
])

In [16]:
rf_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('rf_CV', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me...imators=300, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [17]:
show_eval_scores(rf_pipeline, test_data, 'Random Forest Classifier Count Vectorizer')

Report for Random Forest Classifier Count Vectorizer
Accuracy is: 0.5651144435674822
F1 score is: 0.7215765538150581
Precision score is: 0.5644268774703557
Recall score is: 1.0


## Support Vector Machines 

In [18]:
svm_pipeline = Pipeline([
    ('svm_CV', CountVectorizer(stop_words=stopwords_list, lowercase=False, ngram_range=(1, 1))),
    ('svm_clf', SVC(random_state=42, gamma=1.0, kernel='rbf'))
])

In [19]:
svm_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('svm_CV', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'm...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))])

In [20]:
show_eval_scores(svm_pipeline, test_data, 'Random Forest Classifier Count Vectorizer')

Report for Random Forest Classifier Count Vectorizer
Accuracy is: 0.5666929755327546
F1 score is: 0.7211782630777045
Precision score is: 0.5657370517928287
Recall score is: 0.9943977591036415


## Soft Estimator 

In [22]:
rf_voting_pipeline = Pipeline([
    ('rf_CV', CountVectorizer(stop_words=stopwords_list, lowercase=False, ngram_range=(1, 1))),
    ('rf_clf', RandomForestClassifier(max_depth=12, n_estimators=300, n_jobs=-1, random_state=42))
])

In [23]:
lr_voting_pipeline = Pipeline([
    ('lrCV', CountVectorizer(stop_words=stopwords_list, lowercase=True, ngram_range=(1, 1))),
    ('lr_clf', LogisticRegression(C=0.0001,random_state=42, n_jobs=-1))
])

In [25]:
nb_voting_pipeline = Pipeline([
    ('nb_CV', CountVectorizer(stop_words=stopwords_list, lowercase=True, ngram_range=(1, 4))),
    ('nb_clf', MultinomialNB(alpha=6.8))
])

In [26]:
svm_voting_pipeline = Pipeline([
    ('svm_CV', CountVectorizer(stop_words=stopwords_list, lowercase=False, ngram_range=(1, 1))),
    ('svm_clf', SVC(random_state=42, gamma=1.0, kernel='rbf', probability=True))
])

In [27]:
voting_classifier = VotingClassifier(estimators=[
    ('lr', lr_voting_pipeline), ('nb', nb_voting_pipeline),
    ('svm', svm_voting_pipeline), ('rf', rf_voting_pipeline)], voting='soft', n_jobs=-1)

In [28]:
voting_classifier.fit(training_set['news'], training_set['label'])

VotingClassifier(estimators=[('lr', Pipeline(memory=None,
     steps=[('lrCV', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), pre...tors=300, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]))],
         flatten_transform=None, n_jobs=-1, voting='soft', weights=None)

In [29]:
show_eval_scores(voting_classifier, test_data, 'Voting Classifier(soft) Count Vectorizer')

Report for Voting Classifier(soft) Count Vectorizer
Accuracy is: 0.6045777426992897
F1 score is: 0.7293354943273906
Precision score is: 0.5936675461741425
Recall score is: 0.9453781512605042
