In [0]:
# auxiliar libraries
import pandas as pd
import re
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.pipeline import Pipeline

# models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

PATH = './drive/My Drive/Colab Notebooks/' # <--- Change this

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
trueNews = pd.read_csv(PATH + 'datasets/FakeNews/True.csv')
fakeNews = pd.read_csv(PATH + 'datasets/FakeNews/Fake.csv')

In [4]:
trueNews.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
fakeNews.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


Subjects inside each dataset

In [6]:
trueNews.subject.unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [7]:
fakeNews.subject.unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

### Looking for nan or null values to drop

In [8]:
trueNews.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [9]:
fakeNews.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

The datasets does not have any null value

### Avaliating the text size

In [0]:
true_len = trueNews.text.apply(lambda x: len(x))
fake_len = fakeNews.text.apply(lambda x: len(x))

In [11]:
print(true_len.quantile([.01, .25, .5, .75, .99]))
print(fake_len.quantile([.01, .25, .5, .75, .99]))

0.01     277.00
0.25     914.00
0.50    2222.00
0.75    3237.00
0.99    7097.52
Name: text, dtype: float64
0.01        1.0
0.25     1433.0
0.50     2166.0
0.75     3032.0
0.99    11827.0
Name: text, dtype: float64


As could be seen in the quantiles, the datasets contains really short texts that must be dropped.

In [12]:
trueNews['text_length'] = true_len
fakeNews['text_length'] = fake_len

trueNews = trueNews[trueNews['text_length'] >= 250]
fakeNews = fakeNews[fakeNews['text_length'] >= 250]

trueNews.drop('text_length', axis=1, inplace=True)
fakeNews.drop('text_length', axis=1, inplace=True)

print(len(trueNews), len(fakeNews))

21283 21724


The punctuation count quantiles does not show a significant diference between them, so they will be not considered as features

### Removing urls

In [0]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

trueNews.text = trueNews.text.apply(remove_URL)
fakeNews.text = fakeNews.text.apply(remove_URL)

### Merging the datasets

In [0]:
trueNews['target'] = 'true'
fakeNews['target'] = 'fake'

columns = ['title', 'text', 'target']
df = pd.concat([trueNews[columns], fakeNews[columns]]).reset_index(drop = True)

In [15]:
df.head()

Unnamed: 0,title,text,target
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,True


Splitting the datasets

In [0]:
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(df['text'], df.target, test_size=0.2, shuffle=True, random_state=RANDOM_STATE)

# Evaluating models

In this section, will be evaluated several machine learning models with cross-validation to find a shortlist of promising models.

Defining functions to print some metrics

In [0]:
def display_scores(scores):
    for k in range(len(scores)):
        print("Accuracy of {}-fold: {}%".format(k+1,round(scores[k]*100,2)))
    print("Mean:", round(scores.mean()*100, 2))
    print("Standard deviation:", round(scores.std()*100, 4))
    
def performance_measurement(pipeline, x_train, y_train):
    print("-- Cross Validation --")
    cross_val = cross_val_score(pipeline, x_train, y_train, cv=5, scoring="accuracy")
    display_scores(cross_val)
    
    y_train_pred = cross_val_predict(pipeline, x_train, y_train, cv=5)
    
    print("\n-- Precision --")
    print("Precision of {}%".format(round(precision_score(y_train, y_train_pred, pos_label='true')*100,2)))
    
    print("\n-- Confusion Matrix --")
    print(confusion_matrix(y_train, y_train_pred))

### Defining each metric:

#### Accuracy: 

$$Accuracy = \frac{True Positives}{Total}$$

#### Precision: 

$$Precision = \frac{True Positives}{True Positives + False Positives}$$

#### Confusion Matrix:

<table>
  <tr>
    <td style="border:1px solid black">True Negatives</td>
    <td style="border:1px solid black">False Positives</td>
  </tr>
  <tr>
    <td style="border:1px solid black">False Negatives</td>
    <td style="border:1px solid black">True Positives</td>
  </tr>
</table>

## Logistic Regression

In [22]:
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', LogisticRegression(random_state=RANDOM_STATE))
                     ])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 98.26%
Accuracy of 2-fold: 98.5%
Accuracy of 3-fold: 98.56%
Accuracy of 4-fold: 98.58%
Accuracy of 5-fold: 98.53%
Mean: 98.49
Standard deviation: 0.1175

-- Precision --
Precision of 98.18%

-- Confusion Matrix --
[[17098   312]
 [  209 16786]]


## Linear Support Vector Classifier

In [23]:
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', LinearSVC(random_state=RANDOM_STATE))
                     ])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 99.43%
Accuracy of 2-fold: 99.43%
Accuracy of 3-fold: 99.43%
Accuracy of 4-fold: 99.36%
Accuracy of 5-fold: 99.36%
Mean: 99.4
Standard deviation: 0.0356

-- Precision --
Precision of 99.35%

-- Confusion Matrix --
[[17300   110]
 [   95 16900]]


## Stochastic Gradient Descent

In [24]:
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', SGDClassifier(random_state=RANDOM_STATE))
                     ])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 98.9%
Accuracy of 2-fold: 99.13%
Accuracy of 3-fold: 99.06%
Accuracy of 4-fold: 99.1%
Accuracy of 5-fold: 99.13%
Mean: 99.06
Standard deviation: 0.087

-- Precision --
Precision of 98.9%

-- Confusion Matrix --
[[17222   188]
 [  135 16860]]


## Gradient Boost Classifier

In [25]:
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', XGBClassifier(
                        learning_rate = 0.01,
                        n_estimators = 10,
                        max_depth = 5,
                        objective = 'binary:logistic',
                        random_state=RANDOM_STATE)
                     )])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 99.49%
Accuracy of 2-fold: 99.35%
Accuracy of 3-fold: 99.35%
Accuracy of 4-fold: 99.46%
Accuracy of 5-fold: 99.39%
Mean: 99.41
Standard deviation: 0.0598

-- Precision --
Precision of 99.12%

-- Confusion Matrix --
[[17259   151]
 [   53 16942]]


## Decision Tree

In [26]:
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', DecisionTreeClassifier(
                        criterion='entropy',
                        max_depth = 10, 
                        splitter='best',
                        random_state=RANDOM_STATE)
                     )])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 99.58%
Accuracy of 2-fold: 99.49%
Accuracy of 3-fold: 99.42%
Accuracy of 4-fold: 99.55%
Accuracy of 5-fold: 99.42%
Mean: 99.49
Standard deviation: 0.0656

-- Precision --
Precision of 99.54%

-- Confusion Matrix --
[[17332    78]
 [   97 16898]]


## Random Forest

In [27]:
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', RandomForestClassifier(random_state=RANDOM_STATE)
                     )])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 98.87%
Accuracy of 2-fold: 98.56%
Accuracy of 3-fold: 98.81%
Accuracy of 4-fold: 98.5%
Accuracy of 5-fold: 98.74%
Mean: 98.69
Standard deviation: 0.1404

-- Precision --
Precision of 98.9%

-- Confusion Matrix --
[[17224   186]
 [  263 16732]]


At this moment the Linear Support Vector Classifier, Gradient Boost Classifier, and the Decision Tree had the best results, so they will be fine-tuned to choose the best hyperparameters.

# Fine-Tune the models

## Grid Search

In [0]:
svc_param_grid = [    
    {'max_iter': [1000, 5000, 10000], 'C': [1, 1.5, 2],    
    'loss': ['hinge', 'squared_hinge'], 'tol': [1e-5, 1e-4], 
    'random_state': [RANDOM_STATE]}
]

xgb_param_grid = [
    {'max_depth': [5], 'learning_rate': [0.01, 0.001],
    'objective': ['binary:logistic'], 'n_estimators': [100, 1000], 
    'random_state': [RANDOM_STATE]}
]

dt_param_grid = [
    {'max_depth': [None, 10], 'max_features': ['auto', 'log2', None], 
     'criterion': ['entropy'], 'splitter': ['best'],
     'random_state': [RANDOM_STATE]}
]

In [0]:
data_pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer())
                    ])

x_prepared = data_pipeline.fit_transform(x_train.values)

### Linear Support Vector Classifier

In [0]:
model = LinearSVC()

svc_grid_search = GridSearchCV(model, svc_param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)

svc_grid_search.fit(x_prepared, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [1, 1.5, 2], 'loss': ['hinge', 'squared_hinge'],
                          'max_iter': [1000, 5000, 10000], 'random_state': [42],
                          'tol': [1e-05, 0.0001]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=0)

In [0]:
svc_grid_search.best_params_

{'C': 1.5,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'random_state': 42,
 'tol': 1e-05}

In [0]:
svc_grid_search.best_estimator_

LinearSVC(C=1.5, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=42, tol=1e-05,
          verbose=0)

In [0]:
svc_grid_search.best_score_

0.9944484813253889

### Gradient Boost Classifier

In [0]:
model = XGBClassifier()

xgb_grid_search = GridSearchCV(model, xgb_param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)

xgb_grid_search.fit(x_prepared, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid=[{'learning_rate': [0.01, 0.001], 'max_depth': [5],
                          'n_estimators': [100, 1000],
                          'objective': ['binar

In [0]:
xgb_grid_search.best_params_

{'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 1000,
 'objective': 'binary:logistic',
 'random_state': 42}

In [0]:
xgb_grid_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
xgb_grid_search.best_score_

0.9970643801772997

### Decision Tree

In [0]:
model = DecisionTreeClassifier()

dt_grid_search = GridSearchCV(model, dt_param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)

dt_grid_search.fit(x_prepared, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'criterion': ['entropy'], 'max_depth': [None, 10],
 

In [0]:
dt_grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'random_state': 42,
 'splitter': 'best'}

In [0]:
dt_grid_search.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [0]:
dt_grid_search.best_score_

0.9950007266385699

# Ensembling the models

As we found the best hyperparameters, let's ensemble the models in a voting classifier and compare the results on a last evaluation using the test set.

## Voting Classifier

In [0]:
# classifiers with the best parameters

# --- XGB ---
xgb_clf = XGBClassifier(learning_rate = 0.001, n_estimators = 1000, max_depth = 5,
                        objective = 'binary:logistic',
                        random_state=RANDOM_STATE)

xgb_pipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('model', xgb_clf)])

# --- SVC ---
svc_clf = LinearSVC(C=1.5, class_weight=None, dual=True, fit_intercept=True,
                    intercept_scaling=1, loss='squared_hinge', max_iter=1000,
                    multi_class='ovr', penalty='l2', tol=1e-05,
                    random_state=RANDOM_STATE)

# need to use a calibration over linear svc in order to make it a probabilistic classifier
calibrated_svc_clf = CalibratedClassifierCV(svc_clf, method='sigmoid', cv=5) 

svc_pipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('model', calibrated_svc_clf)])

# --- Decision Tree ---
dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=None,
                                max_features=None, splitter='best', 
                                random_state=RANDOM_STATE)

dt_pipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('model', dt_clf)])

# --- Voting ---
voting_clf = VotingClassifier(
        estimators = [('xgb', xgb_clf), ('svc', calibrated_svc_clf), ('dtree', dt_clf)],    
        voting='soft') 

voting_pipeline = Pipeline([('vect', CountVectorizer()),
                            ('tfidf', TfidfTransformer()),
                            ('model', voting_clf)])

## Final Measurement

In [0]:
svc_pipeline.fit(x_train, y_train)
print('Linear SVC done.')
dt_pipeline.fit(x_train, y_train)
print('Decision Tree done.')
xgb_pipeline.fit(x_train, y_train)
print('XGB done.')
voting_pipeline.fit(x_train, y_train)
print('Voting Classifier done.')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Linear SVC done.
Decision Tree done.
XGB done.
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Voting Classifier done.


In [0]:
joblib.dump(svc_pipeline, PATH + 'models/FakeNews/svc_model.pkl', compress=9)
joblib.dump(xgb_pipeline, PATH + 'models/FakeNews/xgb_model.pkl', compress=9)
joblib.dump(dt_pipeline, P ATH + 'models/FakeNews/dt_model.pkl', compress=9)
joblib.dump(voting_pipeline, P ATH + 'models/FakeNews/voting_model.pkl', compress=9)

In [0]:
svc_pipeline = joblib.load(PATH + 'models/FakeNews/svc_model.pkl')
xgb_pipeline = joblib.load(PATH + 'models/FakeNews/xgb_model.pkl')
dt_pipeline = joblib.load(PATH + 'models/FakeNews/dt_model.pkl')
voting_pipeline = joblib.load(PATH + 'models/FakeNews/voting_model.pkl')

In [0]:
xgb_pred = xgb_pipeline.predict(x_test)
svc_pred = svc_pipeline.predict(x_test)
dt_pred = dt_pipeline.predict(x_test)
voting_pred = voting_pipeline.predict(x_test)

In [0]:
print("Accuracy for XGB Model: %.2f" % (accuracy_score(y_test, xgb_pred) * 100))
print("Accuracy for SVC Model: %.2f" % (accuracy_score(y_test, svc_pred) * 100))
print("Accuracy for Decision Tree Model: %.2f" % (accuracy_score(y_test, dt_pred) * 100))
print("Accuracy for Voting Classifier Model: %.2f" % (accuracy_score(y_test, voting_pred) * 100))

Accuracy for XGB Model: 99.47
Accuracy for SVC Model: 99.61
Accuracy for Decision Tree Model: 99.48
Accuracy for Voting Classifier Model: 99.72


As expected the voting classifier has the best accuracy between the models, so he is the best choice to classify fake news.