# Fake News Classifiers

In [1]:
%%time
import pandas as pd
import joblib
import resources.visualize as vis
import resources.preprocessor as prep
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import GridSearchCV

CPU times: user 821 ms, sys: 166 ms, total: 988 ms
Wall time: 814 ms


# 1. Importing Data

In [None]:
# Number of rows to read from the CSV file.
sample_size = 8000

news = pd.read_csv('data.csv', nrows=sample_size)

# Accuracy per trained model; the training cells below fill this in.
scores = dict()

# Preview the first rows of the loaded frame.
news.head()

In [None]:
# Bar plot of the labels in `news`, drawn by the project's visualize helper
# (presumably the class balance of the dataset -- confirm in resources/visualize).
vis.labels_bar_plot(news)

# 2. Bag Of Words Training

## 2.0 Pre-processing

### Cleaning Data
- remove html code
- remove punctuation
- make everything lower case

In [None]:
# Discard every row that has a missing value in any column.
news = news.dropna(axis=0, how='any')

# Run each raw tweet through the cleaning helper and keep the result in a
# separate 'text' column, leaving the original 'tweet' column untouched.
cleaned_tweets = news['tweet'].apply(vis.clean)
news['text'] = cleaned_tweets

### Encoding labels

In [None]:
# The return value is discarded, so encode_labels presumably rewrites the
# label column of `news` in place -- TODO confirm in resources/preprocessor.
prep.encode_labels(news)
news.head()  # preview the frame after encoding

### Vectorization

Vectorization in NLP is the conversion of text input into vectors of real numbers, which is the format that ML models can work with.
I will start with the bag-of-words model.

In [None]:

# Vectorize `news` with the project's bag-of-words helper; it returns the
# feature matrix X and the labels y (details live in resources/preprocessor).
X, y = prep.bag_of_words_vectorize(news)

# Hold out half of the samples for evaluation. The seed is fixed so that the
# split, and therefore every score reported for the models trained on it, is
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

## 2.1 Multinomial Naive Bayes

In [None]:
# Multinomial Naive Bayes on the bag-of-words features.
mnb = MultinomialNB()
mnb.fit(X=X_train, y=y_train)

# Score the model on the test split and remember its accuracy.
pred = mnb.predict(X_test)
bow_mnb_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=pred)
scores["bow_mnb"] = bow_mnb_accuracy

# Visual summary of the predictions against the true labels.
vis.plot_cf_matrix(y_test, pred)
vis.plot_pie_chart(y_test, pred)

In [None]:
# Persist the fitted bag-of-words Naive Bayes model.
joblib.dump(value=mnb, filename='models/bow_mnb.pkl')

## 2.2 Logistic Regression

In [None]:
# Logistic regression (library defaults) on the bag-of-words features.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

# Accuracy on the test split goes into the shared score table.
pred = lr.predict(X_test)
scores["bow_lr"] = metrics.accuracy_score(y_true=y_test, y_pred=pred)

# Visual summary of the predictions against the true labels.
vis.plot_cf_matrix(y_test, pred)
vis.plot_pie_chart(y_test, pred)

In [None]:
# Persist the fitted bag-of-words logistic regression model.
joblib.dump(value=lr, filename='models/bow_lr.pkl')

## 2.3 Passive Aggressive Classifier

In [None]:
# Passive-aggressive classifier, limited to 50 passes over the training data.
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(X=X_train, y=y_train)

# Accuracy on the test split goes into the shared score table.
pred = pac.predict(X_test)
bow_pac_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=pred)
scores["bow_pac"] = bow_pac_accuracy

# Visual summary of the predictions against the true labels.
vis.plot_cf_matrix(y_test, pred)
vis.plot_pie_chart(y_test, pred)

In [None]:
# Persist the fitted bag-of-words passive-aggressive model.
joblib.dump(value=pac, filename='models/bow_pac.pkl')

## 2.4 Multi-Layer Perceptron

In [None]:
# Multi-layer perceptron with three hidden layers of 50 units each; the
# random seed is fixed so that training is repeatable.
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 50, 50),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=0,
)
mlp.fit(X=X_train, y=y_train)

# Accuracy on the test split goes into the shared score table.
pred = mlp.predict(X_test)
scores["bow_mlp"] = metrics.accuracy_score(y_true=y_test, y_pred=pred)

# Visual summary of the predictions against the true labels.
vis.plot_cf_matrix(y_test, pred)
vis.plot_pie_chart(y_test, pred)

In [None]:
# Persist the fitted bag-of-words MLP.
joblib.dump(value=mlp, filename='models/bow_mlp.pkl')

### Hyper-parameter Tuning

In [None]:

# Candidate MLP settings; the grid search evaluates every combination.
parameter_space = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    learning_rate=['constant', 'adaptive'],
)

# Search the grid starting from the MLP configured above, using all
# available CPU cores (n_jobs=-1).
gscv = GridSearchCV(estimator=mlp, param_grid=parameter_space, n_jobs=-1)
gscv.fit(X_train, y_train)

In [None]:
# Report the parameter combination the grid search selected.
best_params = gscv.best_params_
print('Best params: ', best_params)

In [None]:
# Persist the whole fitted grid-search object (not just the best estimator).
joblib.dump(value=gscv, filename='models/tuned_mlp.pkl')

In [None]:
# Predict on the test split with the fitted grid-search object.
y_pred = gscv.predict(X_test)

# Plot the tuned model's own predictions. `pred` still holds the output of
# the untuned MLP from section 2.4, so plotting it here would show the wrong
# model.
vis.plot_cf_matrix(y_test, y_pred)
vis.plot_pie_chart(y_test, y_pred)

# 3. TF-IDF Training

## 3.0 Pre-processing

Vectorizing the text with the Term Frequency–Inverse Document Frequency (TF-IDF) model.

In [None]:
# Vectorize `news` with the project's TF-IDF helper; it returns the feature
# matrix X and the labels y (details live in resources/preprocessor).
X, y = prep.tfidf_vectorize(news)

# Hold out half of the samples for evaluation. The seed is fixed so that the
# split, and therefore every score reported for the models trained on it, is
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

## 3.1 Multinomial Naive Bayes

In [None]:
# Multinomial Naive Bayes, this time on the TF-IDF features.
mnb = MultinomialNB()
mnb.fit(X=X_train, y=y_train)

# Accuracy on the test split goes into the shared score table.
pred = mnb.predict(X_test)
tfidf_mnb_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=pred)
scores["tfidf_mnb"] = tfidf_mnb_accuracy

# Visual summary of the predictions against the true labels.
vis.plot_cf_matrix(y_test, pred)
vis.plot_pie_chart(y_test, pred)

In [None]:
# Persist the fitted TF-IDF Naive Bayes model.
joblib.dump(value=mnb, filename='models/tfidf_mnb.pkl')

## 3.2 Logistic Regression

In [None]:
# Logistic regression (library defaults) on the TF-IDF features.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

# Accuracy on the test split goes into the shared score table.
pred = lr.predict(X_test)
scores["tfidf_lr"] = metrics.accuracy_score(y_true=y_test, y_pred=pred)

# Visual summary of the predictions against the true labels.
vis.plot_cf_matrix(y_test, pred)
vis.plot_pie_chart(y_test, pred)

In [None]:
# Persist the TF-IDF logistic regression model. The object saved must be
# `lr`; saving `mnb` here would store the Naive Bayes model under the
# logistic-regression file name.
joblib.dump(lr, 'models/tfidf_lr.pkl')

## 3.3 Passive Aggressive Classifier

In [None]:
# Passive-aggressive classifier on the TF-IDF features, limited to 50 passes
# over the training data.
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(X=X_train, y=y_train)

# Accuracy on the test split goes into the shared score table.
pred = pac.predict(X_test)
tfidf_pac_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=pred)
scores["tfidf_pac"] = tfidf_pac_accuracy

# Visual summary of the predictions against the true labels.
vis.plot_cf_matrix(y_test, pred)
vis.plot_pie_chart(y_test, pred)

In [None]:
# Persist the TF-IDF passive-aggressive model. The object saved must be
# `pac`; saving `mnb` here would store the Naive Bayes model under the
# passive-aggressive file name.
joblib.dump(pac, 'models/tfidf_pac.pkl')

## 3.4 Multi-Layer Perceptron

In [None]:
# Multi-layer perceptron on the TF-IDF features: three hidden layers of 50
# units each, with a fixed random seed so that training is repeatable.
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 50, 50),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=0,
)
mlp.fit(X=X_train, y=y_train)

# Accuracy on the test split goes into the shared score table.
pred = mlp.predict(X_test)
scores["tfidf_mlp"] = metrics.accuracy_score(y_true=y_test, y_pred=pred)

# Visual summary of the predictions against the true labels.
vis.plot_cf_matrix(y_test, pred)
vis.plot_pie_chart(y_test, pred)

In [None]:
# Persist the TF-IDF multi-layer perceptron. The object saved must be `mlp`;
# saving `mnb` here would store the Naive Bayes model under the MLP file name.
joblib.dump(mlp, 'models/tfidf_mlp.pkl')

# 4. BERT Training

In [None]:
from bert_sklearn import BertClassifier

If the import above is not working, uncomment the code below and run it to install bert-sklearn.

In [None]:
# Attribution for the install snippet below (a bare string, so it has no
# effect when the cell runs).
"""
code taken from https://colab.research.google.com/drive/1-wTNA-qYmOBdSYG7sRhIdOrxcgPpcl6L
"""

# The install commands are disabled by wrapping them in a triple-quoted
# string that is closed by the final `#"""` line. To enable them, change the
# opening `"""` directly below to `#"""`: the commands then become live code
# and the closing line becomes an ordinary comment.
"""
!git clone -b master https://github.com/charles9n/bert-sklearn
!cd bert-sklearn; pip install .
import os
os.chdir("bert-sklearn")
print(os.listdir())

#"""

In [None]:
#time taken = 26m 35.3s

# Reload the same sample used by the other models: read `sample_size` rows
# (defined in section 1) instead of repeating the literal row count, so that
# all models keep being compared on the same data if the sample size changes.
news = pd.read_csv('data.csv', nrows=sample_size)

news = news.dropna() # remove rows with at least one missing element
# NOTE(review): the cleaned 'text' column is computed here but the split below
# feeds the raw 'tweet' column to BERT -- confirm that this is intentional.
news['text'] = news['tweet'].apply(vis.clean)

prep.encode_labels(news)

# Hold out half of the samples for evaluation. The seed is fixed so that the
# split, and therefore the reported BERT score, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    news['tweet'], news['label'], test_size=0.5, random_state=0
)

bert = BertClassifier(max_seq_length=64, train_batch_size=16)
bert = bert.fit(X_train, y_train)
# NOTE(review): `accy` is not read anywhere in the visible notebook; the
# accuracy is computed again from explicit predictions in a later cell.
accy = bert.score(X_test, y_test)

In [None]:
# Persist the fitted BERT classifier.
joblib.dump(value=bert, filename='models/bert.pkl')

In [None]:
# Predict the test split with BERT and add its accuracy to the score table.
y_pred = bert.predict(X_test)
bert_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
scores["bert"] = bert_accuracy

In [None]:
# Visual summary of the BERT predictions against the true labels, using the
# same two project plotting helpers as the other models (presumably a
# confusion matrix and a pie chart, per their names -- see resources/visualize).
vis.plot_cf_matrix(y_test, y_pred)
vis.plot_pie_chart(y_test, y_pred)

# 5. Results

In [None]:
# Order the models from lowest to highest accuracy.
d = dict(sorted(scores.items(), key=lambda entry: entry[1]))

# Two left-aligned columns: model tag and accuracy.
row_template = "{:<10}      {:<10}"
print(row_template.format("model", "score"))
for model_name, accuracy in d.items():
    print(row_template.format(model_name, accuracy))