For a more detailed view of how the evaluation was made, you can go to work/Evaluation.ipynb.

# Simple model initialization

The simple model is effectively a dictionary of weights for each token. It requires a preprocessing step on the input data to work.

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Tokenizer for the simple model. The pattern matches, in priority order: the
# placeholder tags <num>, <date>, <email> and <url> as single tokens, runs of
# word characters, and any single character that is neither word nor whitespace.
pattern = r'<num>|<date>|<email>|<url>|\w+|[^\w\s]'
tokenizer = RegexpTokenizer(pattern)
stemmer = PorterStemmer()

# Download the NLTK stop-word corpus and keep the English list as a set for
# the membership tests done in preprocess().
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Tokenize, lowercase, drop stop words from, and stem a piece of text.

    Relies on the module-level ``tokenizer``, ``stop_words`` and ``stemmer``.

    Args:
        text: Raw input string.

    Returns:
        List of stemmed, lowercased tokens with stop words removed.
    """
    stems = []
    for raw_token in tokenizer.tokenize(text):
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        stems.append(stemmer.stem(lowered))
    return stems


def classify(sequence, token_importance):
    """Sum the importance weights of the known tokens in ``sequence``.

    Args:
        sequence: Whitespace-separated string of tokens.
        token_importance: Mapping from token to numeric weight.

    Returns:
        The sum of the weights of every token occurrence found in
        ``token_importance``; 0 when no token is known.
    """
    # Tokens missing from the mapping contribute nothing to the score.
    known_weights = [
        token_importance[word]
        for word in sequence.split()
        if word in token_importance
    ]
    total = 0
    for weight in known_weights:
        total += weight
    return total

def bin_classify(sequence, token_importance):
    """Return True when the summed token weights of ``sequence`` are strictly positive.

    Args:
        sequence: Whitespace-separated string of tokens.
        token_importance: Mapping from token to numeric weight.

    Returns:
        ``classify(sequence, token_importance) > 0``.
    """
    return classify(sequence, token_importance) > 0

In [None]:
import pandas as pd


# Load the per-token weights; the CSV is read for its 'token' and
# 'importance' columns.
simple_model = pd.read_csv('simple_model_importance.csv')

# token -> importance lookup consumed by classify() / bin_classify().
token_importance = {
    token: importance
    for token, importance in zip(simple_model['token'], simple_model['importance'])
}


# Simple model evaluation (FakeNews + LIAR) 

We'll start by running FakeNews.

In [None]:
import pandas as pd
# Load the FakeNews data; its 'cleaned_content' and 'type' columns are used below.
complete_data = pd.read_parquet('pre_processed_news.parquet')

Getting validation set:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# 80/20 split of the articles; the 20% hold-out is halved further down.
X_train, X_test, y_train, y_test = train_test_split(
    complete_data['cleaned_content'],
    complete_data['type'],
    test_size=0.2,
    random_state=42,
)

# Binary target: True for the "reliable" and "political" types, False otherwise.
y_train = y_train.apply(lambda label: label in ("reliable", "political"))
y_test = y_test.apply(lambda label: label in ("reliable", "political"))

# Halve the hold-out into a test set and a validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42
)

# In this notebook, only the _val sets are used for evaluation.

Applying preprocessing on the validation set:

In [None]:
# Run preprocess() on every validation text; each entry becomes a list of tokens.
X_val = X_val.apply(preprocess)

Check the F1, accuracy, and matrices:

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# preprocess() produced token lists, while bin_classify() expects a
# whitespace-separated string, so the tokens are re-joined first.
simple_validation_pred = X_val.apply(
    lambda tokens: bin_classify(' '.join(tokens), token_importance)
)

# Weighted F1 of the simple model on the validation set:
print("F1: ")
print(f1_score(y_val, simple_validation_pred, average='weighted'))

# Plain accuracy on the same predictions:
print("Accuracy: ")
print(accuracy_score(y_val, simple_validation_pred))

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Per-class precision/recall/F1 of the simple model on the validation set.
print(classification_report(y_val, simple_validation_pred))

# Confusion matrix for the same predictions.
print(confusion_matrix(y_val, simple_validation_pred))

Next, we can try this on the LIAR dataset:

In [None]:
import pandas as pd

# The LIAR file has no header row, so columns are addressed by position:
# column 1 holds the six-way label and column 2 the statement text that is
# preprocessed and classified below.
df = pd.read_csv('LIAR/train.tsv', sep='\t', header=None)

# Collapse the six labels into a boolean target.
# NOTE(review): 'barely-true' is mapped to True while 'half-true' is mapped to
# False -- confirm this is the intended binarisation.
# infer_objects() makes the replaced label column a real bool column. Newer
# pandas versions no longer downcast inside replace(), which would leave an
# object column that the sklearn metrics may reject.
bin_data = df.replace({1: {
    'barely-true': True,
    'true': True,
    'mostly-true': True,
    'half-true': False,
    'pants-fire': False,
    'false': False,
}}).infer_objects()

In [None]:
# Preprocess the LIAR statements (column 2) in place, then score each token
# list with the simple model.
bin_data[2] = bin_data[2].apply(preprocess)
simple_model_pred = bin_data[2].apply(
    lambda tokens: bin_classify(' '.join(tokens), token_importance)
)

# Weighted F1 against the binarised labels in column 1:
print("F1: ")
print(f1_score(bin_data[1], simple_model_pred, average='weighted'))

# Plain accuracy on the same predictions:
print("Accuracy: ")
print(accuracy_score(bin_data[1], simple_model_pred))

In [None]:
# Per-class metrics and confusion matrix of the simple model on LIAR.
liar_simple_report = classification_report(bin_data[1], simple_model_pred)
print(liar_simple_report)

liar_simple_confusion = confusion_matrix(bin_data[1], simple_model_pred)
print(liar_simple_confusion)

# Complex model (GRU) initialization

In [None]:
import pickle

# Restore the tokenizer used to encode text for the complex model.
# NOTE: unpickling can execute arbitrary code, so only load a trusted file.
with open('tokenizer.pickle', 'rb') as pickle_file:
    complex_tokenizer = pickle.load(pickle_file)

import pickle
import tensorflow as tf
from tensorflow import keras

# Load the saved Keras model from disk.
complex_model = keras.models.load_model('model3.h5')

# summary() prints the architecture itself and returns None, so it is called
# without print().
complex_model.summary()
# Show the tokenizer loaded for this model (complex_tokenizer), not the NLTK
# `tokenizer` defined for the simple model.
print(complex_tokenizer)


# Complex model evaluation (FakeNews + LIAR)

Once again we start with FakeNews:

In [None]:
import pandas as pd
# Reload the FakeNews data; its 'cleaned_content' and 'type' columns are used below.
complete_data = pd.read_parquet('pre_processed_news.parquet')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Same 80/20 split (same random_state) as in the simple-model section; redoing
# it yields the raw validation texts again, since X_val was replaced by token
# lists there.
X_train, X_test, y_train, y_test = train_test_split(
    complete_data['cleaned_content'],
    complete_data['type'],
    test_size=0.2,
    random_state=42,
)

# Binary target: True for the "reliable" and "political" types, False otherwise.
y_train = y_train.apply(lambda label: label in ("reliable", "political"))
y_test = y_test.apply(lambda label: label in ("reliable", "political"))

# Halve the hold-out into a test set and a validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42
)

# In this notebook, only the _val sets are used for evaluation.

In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


# Encode the raw validation texts with the tokenizer loaded for the complex
# model (complex_tokenizer). The name `tokenizer` refers to the NLTK
# RegexpTokenizer of the simple model, which has no texts_to_sequences().
sequences = complex_tokenizer.texts_to_sequences(X_val)
# Bring every sequence to length 800, padding at the end.
padded = pad_sequences(sequences, maxlen=800, padding='post')


This next step actually runs the model and can take a while, since it's not set to run on a GPU (I don't know what machines you will run this on).

In [None]:
# Run the complex model on the padded validation sequences.
predictions = complex_model.predict(padded)

# Outputs above the threshold count as True, everything else as False.
threshold = 0.5
bin_predictions = predictions > threshold

# Weighted F1 on the validation set:
print("F1: ")
print(f1_score(y_val, bin_predictions, average='weighted'))

# Plain accuracy on the same predictions:
print("Accuracy: ")
print(accuracy_score(y_val, bin_predictions))

In [None]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_val, bin_predictions))
# target_names must follow the sorted label order (False, True). True marks
# the "reliable"/"political" articles, so False is 'fake' and True is 'real'.
labels = ['fake', 'real']
print(metrics.classification_report(y_val, bin_predictions, target_names=labels))

Then LIAR:

In [None]:
import pandas as pd
# The LIAR file has no header row, so columns are addressed by position:
# column 1 holds the six-way label and column 2 the statement text that is
# tokenized and classified below.
df = pd.read_csv('LIAR/train.tsv', sep='\t', header=None)

# Collapse the six labels into a boolean target.
# NOTE(review): 'barely-true' is mapped to True while 'half-true' is mapped to
# False -- confirm this is the intended binarisation.
# infer_objects() makes the replaced label column a real bool column. Newer
# pandas versions no longer downcast inside replace(), which would leave an
# object column that the sklearn metrics may reject.
bin_data = df.replace({1: {
    'barely-true': True,
    'true': True,
    'mostly-true': True,
    'half-true': False,
    'pants-fire': False,
    'false': False,
}}).infer_objects()

# Class balance of the binarised labels.
print(bin_data[1].value_counts())


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the LIAR statements (column 2) with the tokenizer loaded for the
# complex model (complex_tokenizer). The name `tokenizer` refers to the NLTK
# RegexpTokenizer of the simple model, which has no texts_to_sequences().
sequences = complex_tokenizer.texts_to_sequences(bin_data[2])
# NOTE(review): the FakeNews run pads to maxlen=800 while this one uses 50 --
# confirm the model accepts both input lengths.
padded = pad_sequences(sequences, maxlen=50, padding='post')



In [None]:
import numpy as np

# Run the complex model on the padded LIAR sequences.
predictions = complex_model.predict(padded)

# Outputs above the threshold count as True, everything else as False.
threshold = 0.5
bin_predictions = predictions > threshold

In [None]:
# Store the LIAR statements next to their predicted labels. The CSV export is
# only there for further analysis; strictly speaking it is unnecessary.
new_df = pd.DataFrame({
    'text': bin_data[2],
    # First column of the 2-D prediction array.
    'predictions': bin_predictions[:, 0],
})
new_df.to_csv('complex_model_predictions.csv', index=False)

In [None]:
from sklearn.metrics import classification_report

# Read the exported predictions back in.
complex_preds = pd.read_csv('complex_model_predictions.csv')

# Classification report of complex_preds against the binarised LIAR labels
# (column 1).
print(classification_report(bin_data[1], complex_preds['predictions']))

In [None]:
# Weighted F1 and accuracy of the complex model on LIAR.
liar_truth = bin_data[1]
liar_pred = complex_preds['predictions']

print("F1: ")
print(f1_score(liar_truth, liar_pred, average='weighted'))

print("Accuracy: ")
print(accuracy_score(liar_truth, liar_pred))
