In [3]:
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
                                     learning_curve)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
warnings.filterwarnings('ignore')

In [4]:
# Seed NumPy's global random number generator.
np.random.seed(71)

# Directory that holds train.csv / valid.csv / test.csv. Set the
# FAKE_NEWS_DATA_DIR environment variable to load the data from another
# location; the fallback is the path this notebook was originally run with.
DATA_DIR = os.environ.get(
    'FAKE_NEWS_DATA_DIR',
    r'C:\Users\My PC\Documents\TXT\Fake-News-Detection-System-master\datasets',
)

train_data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
valid_data = pd.read_csv(os.path.join(DATA_DIR, 'valid.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [5]:
def show_eval_scores(model, test_set, model_name):
    """Print accuracy, F1, precision and recall of ``model`` on ``test_set``.

    Args:
        model: Fitted estimator exposing ``predict``; it is called with
            ``test_set['news']``.
        test_set: Table-like object indexable by ``'news'`` (model inputs)
            and ``'label'`` (ground-truth labels).
        model_name: Name shown in the report header.

    Returns:
        None. The report is written to standard output.
    """
    predicted = model.predict(test_set['news'])
    expected = test_set['label']

    # Metrics are computed with each scorer's default settings.
    report_lines = [
        'Report for {}'.format(model_name),
        'Accuracy is: {}'.format(accuracy_score(expected, predicted)),
        'F1 score is: {}'.format(f1_score(expected, predicted)),
        'Precision score is: {}'.format(precision_score(expected, predicted)),
        'Recall score is: {}'.format(recall_score(expected, predicted)),
    ]
    print('\n'.join(report_lines))

In [6]:
# Report the (rows, columns) shape of each loaded split.
print('\n'.join([
    'Train dataset size: {}'.format(train_data.shape),
    'Valid dataset size: {}'.format(valid_data.shape),
    'Test dataset size: {}'.format(test_data.shape),
]))

Train dataset size: (10240, 2)
Valid dataset size: (1284, 2)
Test dataset size: (1267, 2)


In [7]:
# Stack the train and validation splits into one frame with a fresh
# 0..n-1 index; this combined frame is what the models below are fitted on.
training_set = pd.concat(
    objs=[train_data, valid_data],
    ignore_index=True,
)
print('Training set size: {}'.format(training_set.shape))

# Show five randomly chosen rows as the cell output.
training_set.sample(n=5)

Training set size: (11524, 2)


Unnamed: 0,label,news
5938,False,Florida's new fingerprint identification syste...
8722,True,Harvard Study Finds States With Most Gun Laws ...
465,True,When terrorists killed more than 250 Americans...
54,False,We have towns in West Texas that are out of wa...
2499,True,Says he sued Obamas EPA seven times.


In [9]:
# English stop words from NLTK, passed to the vectorizer for removal.
stopwords_list = list(stopwords.words('english'))

# Standalone TF-IDF vectorizer fitted on the raw news text; `train_count`
# holds the transformed training documents.
tfidf_V = TfidfVectorizer(
    stop_words=stopwords_list,
    smooth_idf=True,
    use_idf=True,
)
train_count = tfidf_V.fit_transform(training_set['news'].values)

## Logistic Regression 

In [10]:
# English stop-word list used by every TfidfVectorizer in the pipelines below.
# NOTE(review): this repeats the assignment from the earlier TF-IDF cell;
# presumably kept so this section can run on its own — confirm before removing.
stopwords_list = list(stopwords.words('english'))

In [12]:
# TF-IDF features (case preserved, 1- to 5-grams, stop words removed)
# feeding a logistic regression classifier.
lr_pipeline = Pipeline(steps=[
    (
        'lr_TF',
        TfidfVectorizer(
            stop_words=stopwords_list,
            ngram_range=(1, 5),
            lowercase=False,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'lr_clf',
        LogisticRegression(C=1.0, n_jobs=-1, random_state=42),
    ),
])

In [13]:
# Fit the logistic regression pipeline on the combined train + validation data.
lr_pipeline.fit(X=training_set['news'], y=training_set['label'])

Pipeline(memory=None,
     steps=[('lr_TF', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True...  penalty='l2', random_state=42, solver='warn', tol=0.0001,
          verbose=0, warm_start=False))])

In [14]:
# Evaluate on the held-out test split. The label names the TF-IDF
# vectorizer because that is what lr_pipeline actually uses.
show_eval_scores(lr_pipeline, test_data, 'Logistic Regression TFIDF Vectorizer')

Report for Logistic Regression Count Vectorizer
Accuracy is: 0.6314127861089187
F1 score is: 0.7143730886850153
Precision score is: 0.6340933767643865
Recall score is: 0.8179271708683473


## Naive Bayes 

In [15]:
# TF-IDF features (lowercased, unigrams + bigrams, stop words removed)
# feeding a multinomial naive Bayes classifier with alpha=2.0 smoothing.
nb_pipeline = Pipeline(steps=[
    (
        'nb_TF',
        TfidfVectorizer(
            stop_words=stopwords_list,
            ngram_range=(1, 2),
            lowercase=True,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'nb_clf',
        MultinomialNB(alpha=2.0),
    ),
])

In [16]:
# Fit the naive Bayes pipeline on the combined train + validation data.
nb_pipeline.fit(X=training_set['news'], y=training_set['label'])

Pipeline(memory=None,
     steps=[('nb_TF', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,...,
        vocabulary=None)), ('nb_clf', MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True))])

In [17]:
# Evaluate on the held-out test split. The label names the TF-IDF
# vectorizer because that is what nb_pipeline actually uses.
show_eval_scores(nb_pipeline, test_data, 'Naive Bayes TFIDF Vectorizer')

Report for Naive Bayes Count Vectorizer
Accuracy is: 0.6053670086819258
F1 score is: 0.732905982905983
Precision score is: 0.5924006908462867
Recall score is: 0.9607843137254902


## Random Forest 

In [18]:
# TF-IDF features (lowercased, unigrams + bigrams, stop words removed)
# feeding a 200-tree random forest capped at depth 15.
rf_pipeline = Pipeline(steps=[
    (
        'rf_TF',
        TfidfVectorizer(
            stop_words=stopwords_list,
            ngram_range=(1, 2),
            lowercase=True,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'rf_clf',
        RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])

In [19]:
# Fit the random forest pipeline on the combined train + validation data.
rf_pipeline.fit(X=training_set['news'], y=training_set['label'])

Pipeline(memory=None,
     steps=[('rf_TF', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,...imators=200, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [20]:
# Evaluate on the held-out test split. The label names the TF-IDF
# vectorizer because that is what rf_pipeline actually uses.
show_eval_scores(rf_pipeline, test_data, 'Random Forest Classifier TFIDF Vectorizer')

Report for Random Forest Classifier Count Vectorizer
Accuracy is: 0.5722178374112076
F1 score is: 0.7248730964467006
Precision score is: 0.5684713375796179
Recall score is: 1.0


## Support Vector Machines 

In [21]:
# TF-IDF features (lowercased, unigrams + bigrams, stop words removed)
# feeding an RBF-kernel support vector classifier.
svm_pipeline = Pipeline(steps=[
    (
        'svm_TF',
        TfidfVectorizer(
            stop_words=stopwords_list,
            ngram_range=(1, 2),
            lowercase=True,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'svm_clf',
        SVC(kernel='rbf', gamma=0.2, random_state=42),
    ),
])

In [22]:
# Fit the SVM pipeline on the combined train + validation data.
svm_pipeline.fit(X=training_set['news'], y=training_set['label'])

Pipeline(memory=None,
     steps=[('svm_TF', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))])

In [23]:
# Evaluate the SVM pipeline on the held-out test split. The report label
# previously said "Random Forest Classifier Count Vectorizer" (copied from
# the random forest cell), which mislabelled these SVM / TF-IDF results.
show_eval_scores(svm_pipeline, test_data, 'Support Vector Machine TFIDF Vectorizer')

Report for Random Forest Classifier Count Vectorizer
Accuracy is: 0.6006314127861089
F1 score is: 0.7201327433628317
Precision score is: 0.5950639853747715
Recall score is: 0.9117647058823529


## Soft Voting Classifier 

In [24]:
# Fresh, unfitted logistic regression pipeline for the voting ensemble
# (same configuration as lr_pipeline).
lr_voting_pipeline = Pipeline(steps=[
    (
        'lr_TF',
        TfidfVectorizer(
            stop_words=stopwords_list,
            ngram_range=(1, 5),
            lowercase=False,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'lr_clf',
        LogisticRegression(C=1.0, n_jobs=-1, random_state=42),
    ),
])

In [25]:
# Fresh, unfitted naive Bayes pipeline for the voting ensemble
# (same configuration as nb_pipeline).
nb_voting_pipeline = Pipeline(steps=[
    (
        'nb_TF',
        TfidfVectorizer(
            stop_words=stopwords_list,
            ngram_range=(1, 2),
            lowercase=True,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'nb_clf',
        MultinomialNB(alpha=2.0),
    ),
])

In [26]:
# Fresh, unfitted SVM pipeline for the voting ensemble. Unlike svm_pipeline,
# probability=True is set so the classifier can supply class probabilities
# to the soft-voting ensemble.
svm_voting_pipeline = Pipeline(steps=[
    (
        'svm_TF',
        TfidfVectorizer(
            stop_words=stopwords_list,
            ngram_range=(1, 2),
            lowercase=True,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'svm_clf',
        SVC(kernel='rbf', gamma=0.2, probability=True, random_state=42),
    ),
])

In [27]:
# Fresh, unfitted random forest pipeline for the voting ensemble
# (same configuration as rf_pipeline).
rf_voting_pipeline = Pipeline(steps=[
    (
        'rf_TF',
        TfidfVectorizer(
            stop_words=stopwords_list,
            ngram_range=(1, 2),
            lowercase=True,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'rf_clf',
        RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])

In [28]:

# Soft-voting ensemble over the four text-classification pipelines.
voting_classifier = VotingClassifier(
    estimators=[
        ('lr', lr_voting_pipeline),
        ('nb', nb_voting_pipeline),
        ('svm', svm_voting_pipeline),
        ('rf', rf_voting_pipeline),
    ],
    voting='soft',
    n_jobs=-1,
)

In [29]:
# Fit every member pipeline of the ensemble on the combined train + validation data.
voting_classifier.fit(X=training_set['news'], y=training_set['label'])

VotingClassifier(estimators=[('lr', Pipeline(memory=None,
     steps=[('lr_TF', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 5),...tors=200, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]))],
         flatten_transform=None, n_jobs=-1, voting='soft', weights=None)

In [30]:
# Evaluate the soft-voting ensemble on the held-out test split.
show_eval_scores(
    model=voting_classifier,
    test_set=test_data,
    model_name='Voting Classifier(soft) TFIDF Vectorizer',
)

Report for Voting Classifier(soft) TFIDF Vectorizer
Accuracy is: 0.6227308602999211
F1 score is: 0.723699421965318
Precision score is: 0.6161417322834646
Recall score is: 0.876750700280112
