In [0]:
# auxiliary libraries
import pandas as pd
import re
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.pipeline import Pipeline

# models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

PATH = './' # <--- Change this: base directory that contains the `datasets/` and `models/` folders used below

In [0]:
# Load the two corpora from PATH: True.csv and Fake.csv (labelled 'true' / 'fake' further down).
trueNews = pd.read_csv(PATH + 'datasets/FakeNews/True.csv')
fakeNews = pd.read_csv(PATH + 'datasets/FakeNews/Fake.csv')

In [5]:
trueNews.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [6]:
fakeNews.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


Subjects inside each dataset

In [7]:
trueNews.subject.unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [8]:
fakeNews.subject.unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

### Looking for nan or null values to drop

In [9]:
trueNews.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [10]:
fakeNews.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

Neither dataset has any null values.

### Evaluating the text size

In [0]:
# Character count of every article body, kept as standalone Series aligned with each frame's index.
true_len = trueNews['text'].map(len)
fake_len = fakeNews['text'].map(len)

In [12]:
print(true_len.quantile([.01, .25, .5, .75, .99]))
print(fake_len.quantile([.01, .25, .5, .75, .99]))

0.01     277.00
0.25     914.00
0.50    2222.00
0.75    3237.00
0.99    7097.52
Name: text, dtype: float64
0.01        1.0
0.25     1433.0
0.50     2166.0
0.75     3032.0
0.99    11827.0
Name: text, dtype: float64


As can be seen in the quantiles, the datasets contain very short texts (some only one character long) that must be dropped.

In [13]:
# Minimum number of characters an article body must have to be kept.
MIN_TEXT_LENGTH = 250

# Filter with the pre-computed length Series directly instead of adding and then
# dropping a temporary `text_length` column. `.copy()` makes each result an
# independent frame, so the column assignments performed in later cells do not
# raise SettingWithCopyWarning (the original dropped a column in place on a slice).
trueNews = trueNews.loc[true_len >= MIN_TEXT_LENGTH].copy()
fakeNews = fakeNews.loc[fake_len >= MIN_TEXT_LENGTH].copy()

print(len(trueNews), len(fakeNews))

21283 21724


The punctuation count quantiles do not show a significant difference between the two datasets, so punctuation counts will not be considered as features.

### Removing urls

In [0]:
def remove_URL(text):
    """Return `text` with every `http://`, `https://` or `www.` URL deleted.

    A URL is matched up to (not including) the next whitespace character;
    the surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Strip URLs from the article bodies of both corpora.
trueNews.text = trueNews.text.map(remove_URL)
fakeNews.text = fakeNews.text.map(remove_URL)

### Removing the news provider name

In [0]:
# Spellings of the news provider's name mapped to their replacement (removed entirely).
to_replace = {'reuters': '', 'Reuters': ''}


def replace_all(text, dic):
    """Apply every `old -> new` substitution in `dic` to `text`, in dict order.

    Substitutions are sequential, so a later key can match text produced by
    an earlier replacement.
    """
    for old in dic:
        text = text.replace(old, dic[old])
    return text

# removing the provider header ("CITY (Reuters) - ") from the start of the genuine articles.
# maxsplit=1 keeps the whole body even when the marker appears again later in the text;
# the original `split(...)[1]` silently discarded everything after a second occurrence.
# When the marker is absent, split() returns [x], so [-1] yields the text unchanged.
trueNews.text = trueNews.text.apply(lambda x: x.split('(Reuters) - ', 1)[-1])

# removing any remaining mention of the provider name from both corpora
trueNews.text = trueNews.text.apply(lambda x: replace_all(x, to_replace))
fakeNews.text = fakeNews.text.apply(lambda x: replace_all(x, to_replace))

### Merging the datasets

In [0]:
# Label each corpus, then stack the shared columns into one frame with a fresh 0..n-1 index.
trueNews['target'] = 'true'
fakeNews['target'] = 'fake'

columns = ['title', 'text', 'target']
df = pd.concat(
    [trueNews[columns], fakeNews[columns]],
    ignore_index=True,
)

In [17]:
df.head()

Unnamed: 0,title,text,target
0,"As U.S. budget fight looms, Republicans flip t...",The head of a conservative Republican faction ...,True
1,U.S. military to accept transgender recruits o...,Transgender people will be allowed for the fir...,True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,The special counsel investigation of links bet...,True
3,FBI Russia probe helped by Australian diplomat...,Trump campaign adviser George Papadopoulos tol...,True
4,Trump wants Postal Service to charge 'much mor...,President Donald Trump called on the U.S. Post...,True


Splitting the datasets

In [0]:
# Seed shared by the split and by every model so that results are reproducible.
RANDOM_STATE = 42

# Hold out 20% of the shuffled articles for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Evaluating models

In this section, several machine learning models will be evaluated with cross-validation to find a shortlist of promising models.

Defining functions to print some metrics

In [0]:
def display_scores(scores):
    """Print each fold's accuracy as a percentage, then the mean and standard deviation.

    `scores` must support iteration plus `.mean()` and `.std()` (e.g. the
    NumPy array returned by `cross_val_score`); values are fractions in [0, 1].
    """
    for fold, score in enumerate(scores, start=1):
        print("Accuracy of {}-fold: {}%".format(fold, round(score*100,2)))
    print("Mean:", round(scores.mean()*100, 2))
    print("Standard deviation:", round(scores.std()*100, 4))
    
def performance_measurement(pipeline, x_train, y_train, cv=5, pos_label='true'):
    """Print cross-validated accuracy, precision and confusion matrix for `pipeline`.

    Parameters
    ----------
    pipeline : estimator
        Unfitted estimator/pipeline; it is cloned and refitted on every fold.
    x_train, y_train : array-like
        Training samples and their labels.
    cv : int or CV splitter, default 5
        Cross-validation strategy, forwarded to both `cross_val_score` and
        `cross_val_predict` so the two evaluations use the same setting.
    pos_label : default 'true'
        Label treated as the positive class when computing precision.

    Returns
    -------
    None. All results are printed.

    Note: the cross-validation is run twice (once for the per-fold accuracies,
    once for the out-of-fold predictions), so the cost is 2 * cv model fits.
    """
    print("-- Cross Validation --")
    fold_accuracies = cross_val_score(pipeline, x_train, y_train, cv=cv, scoring="accuracy")
    display_scores(fold_accuracies)

    # Out-of-fold predictions: each sample is predicted by a model that never saw it.
    y_train_pred = cross_val_predict(pipeline, x_train, y_train, cv=cv)

    print("\n-- Precision --")
    precision = precision_score(y_train, y_train_pred, pos_label=pos_label)
    print("Precision of {}%".format(round(precision*100,2)))

    print("\n-- Confusion Matrix --")
    print(confusion_matrix(y_train, y_train_pred))

### Defining each metric:

#### Accuracy: 

$$Accuracy = \frac{True Positives + True Negatives}{Total}$$

#### Precision: 

$$Precision = \frac{True Positives}{True Positives + False Positives}$$

#### Confusion Matrix:

<table>
  <tr>
    <td style="border:1px solid black">True Negatives</td>
    <td style="border:1px solid black">False Positives</td>
  </tr>
  <tr>
    <td style="border:1px solid black">False Negatives</td>
    <td style="border:1px solid black">True Positives</td>
  </tr>
</table>

## Logistic Regression

In [0]:
# Bag-of-words counts -> TF-IDF weights -> logistic regression.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LogisticRegression(random_state=RANDOM_STATE)),
])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 97.73%
Accuracy of 2-fold: 97.95%
Accuracy of 3-fold: 98.01%
Accuracy of 4-fold: 98.01%
Accuracy of 5-fold: 98.05%
Mean: 97.95
Standard deviation: 0.1137

-- Precision --
Precision of 97.56%

-- Confusion Matrix --
[[16993   417]
 [  288 16707]]


## Linear Support Vector Classifier

In [0]:
# Bag-of-words counts -> TF-IDF weights -> linear support vector classifier.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LinearSVC(random_state=RANDOM_STATE)),
])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 98.82%
Accuracy of 2-fold: 99.01%
Accuracy of 3-fold: 98.87%
Accuracy of 4-fold: 98.79%
Accuracy of 5-fold: 98.91%
Mean: 98.88
Standard deviation: 0.0763

-- Precision --
Precision of 98.79%

-- Confusion Matrix --
[[17204   206]
 [  179 16816]]


## Stochastic Gradient Descent

In [0]:
# Bag-of-words counts -> TF-IDF weights -> linear model trained with stochastic gradient descent.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', SGDClassifier(random_state=RANDOM_STATE)),
])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 98.18%
Accuracy of 2-fold: 98.37%
Accuracy of 3-fold: 98.43%
Accuracy of 4-fold: 98.49%
Accuracy of 5-fold: 98.49%
Mean: 98.39
Standard deviation: 0.1132

-- Precision --
Precision of 98.05%

-- Confusion Matrix --
[[17077   333]
 [  220 16775]]


## Gradient Boost Classifier

In [0]:
# Bag-of-words counts -> TF-IDF weights -> gradient-boosted trees (XGBoost).
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', XGBClassifier(
        objective = 'binary:logistic',
        n_estimators = 100,
        max_depth = 10,
        learning_rate = 0.01,
        random_state=RANDOM_STATE,
    )),
])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 94.62%
Accuracy of 2-fold: 95.36%
Accuracy of 3-fold: 95.06%
Accuracy of 4-fold: 95.35%
Accuracy of 5-fold: 95.36%
Mean: 95.15
Standard deviation: 0.289

-- Precision --
Precision of 94.6%

-- Confusion Matrix --
[[16483   927]
 [  741 16254]]


## Decision Tree

In [0]:
# Bag-of-words counts -> TF-IDF weights -> depth-limited decision tree.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', DecisionTreeClassifier(
        criterion='entropy',
        splitter='best',
        max_depth = 10,
        random_state=RANDOM_STATE,
    )),
])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 92.76%
Accuracy of 2-fold: 92.65%
Accuracy of 3-fold: 93.14%
Accuracy of 4-fold: 92.66%
Accuracy of 5-fold: 92.97%
Mean: 92.84
Standard deviation: 0.1906

-- Precision --
Precision of 92.13%

-- Confusion Matrix --
[[16052  1358]
 [ 1107 15888]]


## Random Forest

In [0]:
# Bag-of-words counts -> TF-IDF weights -> random forest with default settings.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', RandomForestClassifier(random_state=RANDOM_STATE)),
])

performance_measurement(pipeline, x_train, y_train)

-- Cross Validation --
Accuracy of 1-fold: 97.14%
Accuracy of 2-fold: 97.38%
Accuracy of 3-fold: 97.5%
Accuracy of 4-fold: 97.22%
Accuracy of 5-fold: 97.2%
Mean: 97.29
Standard deviation: 0.134

-- Precision --
Precision of 97.46%

-- Confusion Matrix --
[[16980   430]
 [  503 16492]]


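At this point the Linear Support Vector Classifier and Stochastic Gradient Descent had the best cross-validation results. They will be fine-tuned together with the Random Forest (note that Logistic Regression actually scored slightly higher than the Random Forest: 97.95% vs. 97.29% mean accuracy) to choose the best hyperparameters.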

# Fine-Tune the models

## Grid Search

In [0]:
# Hyper-parameter grids handed to GridSearchCV, one per shortlisted model.
# Every grid pins `random_state` so the searches are reproducible.

# LinearSVC: 3 * 3 * 2 * 2 = 36 candidates
svc_param_grid = [{
    'C': [1, 1.5, 2],
    'loss': ['hinge', 'squared_hinge'],
    'max_iter': [1000, 5000, 10000],
    'tol': [1e-5, 1e-4],
    'random_state': [RANDOM_STATE],
}]

# SGDClassifier: 3 * 3 * 3 = 27 candidates
sgd_param_grid = [{
    'alpha': [0.00001, 0.0001, 0.0003],
    'loss': ['log', 'modified_huber', 'squared_hinge'],
    'learning_rate': ['optimal'],
    'max_iter': [1000, 5000, 10000],
    'random_state': [RANDOM_STATE],
}]

# RandomForestClassifier: 2 * 2 * 3 = 12 candidates
rf_param_grid = [{
    'criterion': ['entropy'],
    'max_depth': [None, 10],
    'max_features': ['auto', 'log2'],
    'n_estimators': [500, 1000, 1500],
    'random_state': [RANDOM_STATE],
}]

In [0]:
# Vectorise the training texts once (token counts -> TF-IDF) so the grid searches
# below can reuse the same matrix instead of re-vectorising for every candidate.
# NOTE(review): the vocabulary and IDF weights are fitted on all of x_train before
# GridSearchCV splits it into folds, so each validation fold has already influenced
# the features - a mild form of leakage. Searching over a full Pipeline (with
# `model__`-prefixed parameter names) would avoid it at the cost of extra compute.
data_pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer())
                    ])

x_prepared = data_pipeline.fit_transform(x_train.values)

### Linear Support Vector Classifier

In [0]:
# Exhaustive 5-fold search over `svc_param_grid`, ranked by accuracy.
model = LinearSVC()
svc_grid_search = GridSearchCV(
    estimator=model,
    param_grid=svc_param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)

svc_grid_search.fit(x_prepared, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [1, 1.5, 2], 'loss': ['hinge', 'squared_hinge'],
                          'max_iter': [1000, 5000, 10000], 'random_state': [42],
                          'tol': [1e-05, 0.0001]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=0)

In [0]:
svc_grid_search.best_params_

{'C': 1.5,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'random_state': 42,
 'tol': 1e-05}

In [0]:
svc_grid_search.best_estimator_

LinearSVC(C=1.5, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=42, tol=1e-05,
          verbose=0)

In [0]:
svc_grid_search.best_score_

0.9892748147071646

### Stochastic Gradient Descent

In [0]:
# Exhaustive 5-fold search over `sgd_param_grid`, ranked by accuracy.
model = SGDClassifier()
sgd_grid_search = GridSearchCV(
    estimator=model,
    param_grid=sgd_param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)

sgd_grid_search.fit(x_prepared, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'alpha': [1e-05, 0.0001, 0.0003],
                          'learning_rate': ['optimal'],
                          'loss': ['log', 'modified_huber', 'squared_hinge'],
               

In [0]:
sgd_grid_search.best_params_

{'alpha': 1e-05,
 'learning_rate': 'optimal',
 'loss': 'modified_huber',
 'max_iter': 1000,
 'random_state': 42}

In [0]:
sgd_grid_search.best_estimator_

SGDClassifier(alpha=1e-05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [0]:
sgd_grid_search.best_score_

0.9887225693939834

### Random Forest

In [21]:
# Exhaustive search over `rf_param_grid`, ranked by accuracy.
# Only 3 folds are used here (the other searches use 5).
model = RandomForestClassifier()
rf_grid_search = GridSearchCV(
    estimator=model,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=3,
    return_train_score=True,
)

rf_grid_search.fit(x_prepared, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [22]:
rf_grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'n_estimators': 1500,
 'random_state': 42}

In [23]:
rf_grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [24]:
rf_grid_search.best_score_

0.9743351096142668

# Ensembling the models

Now that we have found the best hyperparameters, let's ensemble the models in a voting classifier and compare the results in a final evaluation on the test set.

## Voting Classifier

In [0]:
# classifiers with the best parameters
#
# Only the hyperparameters selected by the grid searches are passed explicitly; every
# other argument is left at the library default. The original cell pasted the full
# estimator repr, which pins arguments such as `min_impurity_split` and
# `max_features='auto'` that newer scikit-learn releases no longer accept.
# The seed comes from RANDOM_STATE so it stays consistent with the rest of the notebook.

# --- SVC ---
svc_clf = LinearSVC(C=1.5, loss='squared_hinge', dual=True, max_iter=1000,
                    tol=1e-05, random_state=RANDOM_STATE)

# need to use a calibration over linear svc in order to make it a probabilistic classifier
# (soft voting below requires predict_proba)
calibrated_svc_clf = CalibratedClassifierCV(svc_clf, method='sigmoid', cv=5)

svc_pipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('model', calibrated_svc_clf)])

# --- SGD ---
sgd_clf = SGDClassifier(alpha=1e-05, loss='modified_huber', learning_rate='optimal',
                        max_iter=1000, penalty='l2', tol=0.001,
                        random_state=RANDOM_STATE)

sgd_pipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('model', sgd_clf)])

# --- Random Forest ---
rf_clf = RandomForestClassifier(n_estimators=1500, criterion='entropy',
                                max_depth=None, random_state=RANDOM_STATE)

rf_pipeline = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('model', rf_clf)])

# --- Voting ---
# Soft voting averages the class probabilities predicted by the three classifiers.
voting_clf = VotingClassifier(
        estimators = [('sgd', sgd_clf), ('svc', calibrated_svc_clf), ('rf', rf_clf)],
        voting='soft')

voting_pipeline = Pipeline([('vect', CountVectorizer()),
                            ('tfidf', TfidfTransformer()),
                            ('model', voting_clf)])

## Final Measurement

In [28]:
# Fit every pipeline on the full training set, reporting progress after each one.
for fitted_pipeline, done_message in (
    (svc_pipeline, 'Linear SVC done.'),
    (rf_pipeline, 'Random Forest done.'),
    (sgd_pipeline, 'SGD done.'),
    (voting_pipeline, 'Voting Classifier done.'),
):
    fitted_pipeline.fit(x_train, y_train)
    print(done_message)

Linear SVC done.
Random Forest done.
SGD done.
Voting Classifier done.


In [None]:
# Persist each fitted pipeline (vectoriser + TF-IDF + model) under PATH/models/FakeNews/.
# compress=9 is passed to joblib for every file - presumably the strongest compression
# level, trading slower writes for smaller files; confirm against the joblib docs.
joblib.dump(svc_pipeline, PATH + 'models/FakeNews/svc_model.pkl', compress=9)
joblib.dump(sgd_pipeline, PATH + 'models/FakeNews/sgd_model.pkl', compress=9)
joblib.dump(rf_pipeline, PATH + 'models/FakeNews/rf_model.pkl', compress=9)
joblib.dump(voting_pipeline, PATH + 'models/FakeNews/voting_model.pkl', compress=9)

In [0]:
# Reload the persisted pipelines. The file names must match the ones written by the
# joblib.dump calls in this notebook: the original loaded 'xgb_model.pkl' into
# sgd_pipeline and 'dt_model.pkl' into rf_pipeline, files this notebook never writes,
# so the SGD / Random Forest results would come from the wrong (or missing) models.
# NOTE: joblib.load unpickles arbitrary objects - only load files you created or trust.
svc_pipeline = joblib.load(PATH + 'models/FakeNews/svc_model.pkl')
sgd_pipeline = joblib.load(PATH + 'models/FakeNews/sgd_model.pkl')
rf_pipeline = joblib.load(PATH + 'models/FakeNews/rf_model.pkl')
voting_pipeline = joblib.load(PATH + 'models/FakeNews/voting_model.pkl')

In [0]:
# Predict the held-out test set with each fitted pipeline (same order as the names on the left).
sgd_pred, svc_pred, rf_pred, voting_pred = (
    fitted.predict(x_test)
    for fitted in (sgd_pipeline, svc_pipeline, rf_pipeline, voting_pipeline)
)

In [41]:
# Report the test-set accuracy of every model as a percentage with two decimals.
for template, predictions in (
    ("Accuracy for Stochastic Gradient Descent Model: %.2f", sgd_pred),
    ("Accuracy for SVC Model: %.2f", svc_pred),
    ("Accuracy for Random Forest Model: %.2f", rf_pred),
    ("Accuracy for Voting Classifier Model: %.2f", voting_pred),
):
    print(template % (accuracy_score(y_test, predictions) * 100))

Accuracy for Stochastic Gradient Descent Model: 99.17
Accuracy for SVC Model: 99.04
Accuracy for Random Forest Model: 97.94
Accuracy for Voting Classifier Model: 99.27


As expected, the voting classifier has the best accuracy among the models, so it is the best choice to classify fake news.