In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow.keras as keras
print("Tensorflow version: {}".format(tf.__version__))
import numpy as np

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import string

import os

Tensorflow version: 2.0.0


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
tf.test.is_gpu_available()

True

In [3]:
# Constants!
DATASET_SIZE = 9408908
TRAINING_SIZE = (DATASET_SIZE // 8) * 6
TESTING_SIZE = DATASET_SIZE // 8
VALIDATION_SIZE = DATASET_SIZE // 8
BATCH_SIZE = 128
SELECT_COLUMNS = ['type', 'content', 'title']
TYPES = ['fake', 'satire', 'bias', 'conspiracy', 'state', 'junksci', 'hate', 'clickbait', 'unreliable', 'political', 'reliable', 'unknown', 'rumor']
#MAPPED_TYPES = ['fake', 'unreliable', 'reliable']
MAPPED_TYPES = ['fake', 'reliable']

#TYPE_MAPPING = {
#    'fake': 'fake',
#    'satire': 'fake',
#    'bias': 'unreliable',
#    'conspiracy': 'unreliable',
#    'state': 'fake',
#    'junksci': 'fake',
#    'hate': 'unreliable',
#    'clickbait': 'unreliable',
#    'unreliable': 'unreliable',
#    'political': 'reliable',
#    'reliable': 'reliable',
#    'unknown': 'reliable',
#    'rumor': 'unreliable'
#}

TYPE_MAPPING = {
    'fake': 'fake',
    'satire': 'fake',
    'bias': 'fake',
    'conspiracy': 'fake',
    'state': 'fake',
    'junksci': 'fake',
    'hate': 'fake',
    'clickbait': 'fake',
    'unreliable': 'fake',
    'political': 'reliable',
    'reliable': 'reliable',
    'unknown': 'reliable',
    'rumor': 'fake'
}

CLASS_WEIGHTS = {
    0: 0.75,
    1: 0.25,
    2: 0.75
}

TYPE_INDEX_MAPPING = {}
for t, m in TYPE_MAPPING.items():
  TYPE_INDEX_MAPPING[t] = MAPPED_TYPES.index(m)

EMBEDDING_DIM = 300
MAX_WORDS = 128

In [4]:
# Utility functions
def show_batch(dataset):
  for batch, labels in dataset.take(1):
    for key, value in batch.items():
      print("{:20s}: {}".format(key,value.numpy()))
    print("Types: {}".format(labels))

In [5]:
if not os.path.exists("news_cleaned_2018_02_13.csv"):
    !wget https://storage.googleapis.com/researchably-fake-news-recognition/news_cleaned_2018_02_13.csv.zip
    !unzip news_cleaned_2018_02_13.csv.zip

In [6]:
DATASET_PATH = 'news_cleaned_2018_02_13.csv'

In [7]:
# Dataset found at: https://github.com/several27/FakeNewsCorpus
# Dataset file located at: /gdrive/My Drive/news_cleaned_2018_02_13.csv

#DATASET_PATH = '/gdrive/My Drive/news_cleaned_2018_02_13.csv'
dataset = tf.data.experimental.make_csv_dataset(file_pattern=DATASET_PATH, batch_size=BATCH_SIZE, select_columns=SELECT_COLUMNS, label_name='type', ignore_errors=True)

Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.
Instructions for updating:
Use `tf.data.Dataset.shuffle(buffer_size, seed)` followed by `tf.data.Dataset.repeat(count)`. Static tf.data optimizations will take care of using the fused implementation.


In [8]:
# Let's try to setup an embedding layer...

In [9]:
GLOVE_FILE_PATH = 'glove.6B.{}d.txt'.format(EMBEDDING_DIM)

In [10]:
if not os.path.exists(GLOVE_FILE_PATH):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove.6B.zip

In [11]:
# Open and parse the GloVe embeddings
#GLOVE_FILE_PATH = '/gdrive/My Drive/glove.6B.100d.txt'
glove_lookup_dict = {}
with open(GLOVE_FILE_PATH, 'r') as glove_f:
  index = 0
  for line in glove_f:
      values = line.split()
      word = values[0]
      if word.isalpha():
        if word not in stopwords.words('english'):
          coefs = np.asarray(values[1:], dtype='float32')
          glove_lookup_dict[word] = (index, coefs)
          index += 1

In [12]:
# Create the weighting matrix
embedding_matrix = np.zeros((len(glove_lookup_dict) + 1, EMBEDDING_DIM))
for i, embedding_vector in glove_lookup_dict.values():
  embedding_matrix[i+1] = embedding_vector

# Make default value average?
#embedding_matrix[0] = np.average(embedding_matrix[1:], axis=0)

In [13]:
# Dictionary translation hash
keys_tensor = tf.constant(list(glove_lookup_dict.keys()))
vals_tensor = tf.constant(list(map(lambda v: v[0], list(glove_lookup_dict.values()))))
vocab_table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(keys_tensor, vals_tensor), 0)

In [14]:
# Label number hash
labels_tensor = tf.constant(TYPES)
label_numbers_tensor = tf.range(len(TYPES))
labels_table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(labels_tensor, label_numbers_tensor), -1)

In [15]:
# Mapped label number hash
labels_to_map_tensor = tf.constant(list(TYPE_INDEX_MAPPING.keys()))
mapped_indexes_tensor = tf.constant(list(TYPE_INDEX_MAPPING.values()))
mapped_labels_table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(labels_to_map_tensor, mapped_indexes_tensor), -1)

In [16]:
# Cleanup the dataset, map to word indexes

def clean_text(t):
  t_lower = tf.strings.lower(t)
  t_filtered = tf.strings.regex_replace(t_lower, '[%s]' % re.escape(string.punctuation), '')
  t_split = tf.strings.split(t_filtered, maxsplit=MAX_WORDS-1)
  t_tokenized = vocab_table.lookup(t_split)
  return t_tokenized

def clean_and_tokenize_dataset(features, labels):
  combined_tensor = features['title'] + tf.constant(np.full(features['title'].shape, b' ')) + features['content'] + tf.constant(np.full(features['title'].shape, MAX_WORDS*b' b'))
  features = tf.map_fn(clean_text, combined_tensor, dtype=np.int32)
  #mapped_labels = tf.one_hot(labels_table.lookup(labels), len(TYPES))
  mapped_labels = tf.one_hot(mapped_labels_table.lookup(labels), len(MAPPED_TYPES))
  #mapped_labels = mapped_labels_table.lookup(labels)
  return features, mapped_labels

cleaned_dataset = dataset.map(clean_and_tokenize_dataset)

In [17]:
# Split the dataset!
def is_train(i, d):
  return i % 8 != 0 and i % 8 != 1

def is_test(i, d):
  return i % 8 == 0

def is_val(i, d):
  return i % 8 == 1

def remove_enumerate(i, d):
  return d

train_dataset = cleaned_dataset.enumerate().filter(is_train).map(remove_enumerate)
test_dataset = cleaned_dataset.enumerate().filter(is_test).map(remove_enumerate)
val_dataset = cleaned_dataset.enumerate().filter(is_val).map(remove_enumerate)

In [18]:
embedding_layer = keras.layers.Embedding(embedding_matrix.shape[0], EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_WORDS, trainable=False)

In [19]:
# Let's do the model a different way...
#inp = tf.keras.layers.Input(shape=(MAX_WORDS, EMBEDDING_DIM,))

#filter_sizes = [1, 2, 3, 5]
#pool_sizes = [5, 3, 2]
#num_filters = 256

#maxpool_pool = []
#for f_size in filter_sizes:
  #conv = tf.keras.layers.Conv1D(num_filters, f_size, activation='relu')(inp)
  #for p_size in pool_sizes:
    #conv = tf.keras.layers.MaxPooling1D(pool_size=p_size)(conv)
  #conv = tf.keras.layers.Flatten()(conv)
  #maxpool_pool.append(conv)

#outp = tf.keras.layers.Concatenate(axis=1)(maxpool_pool)
#outp = tf.keras.layers.Flatten()(outp)

#conv_model = tf.keras.Model(inputs=inp, outputs=outp)

In [20]:
#model = keras.models.Sequential()
#model.add(embedding_layer)
#model.add(conv_model)
#model.add(keras.layers.Dense(128, activation='relu'))
#model.add(keras.layers.Dropout(0.2))
#model.add(keras.layers.Dense(64, activation='relu'))
#model.add(keras.layers.Dropout(0.2))
#model.add(keras.layers.Dense(len(MAPPED_TYPES), activation='softmax'))
#model.add(keras.layers.Dense(1, activation='sigmoid'))

#model.summary()

In [21]:
# From https://link.springer.com/chapter/10.1007/978-3-030-03928-8_17
# New model!

model = keras.models.Sequential()
model.add(embedding_layer)
model.add(keras.layers.Reshape((MAX_WORDS, EMBEDDING_DIM, 1)))
model.add(keras.layers.Conv2D(filters=128, kernel_size=(3, EMBEDDING_DIM), strides=(1, 1), data_format='channels_last', activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Conv2D(filters=64, kernel_size=(16, 1), strides=(1, 1), activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.MaxPooling2D(pool_size=(2, 1)))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(1024, activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(256, activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(len(MAPPED_TYPES), activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 128, 300)          98082900  
_________________________________________________________________
reshape (Reshape)            (None, 128, 300, 1)       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 126, 1, 128)       115328    
_________________________________________________________________
dropout (Dropout)            (None, 126, 1, 128)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 111, 1, 64)        131136    
_________________________________________________________________
dropout_1 (Dropout)          (None, 111, 1, 64)        0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 55, 1, 64)         0

In [22]:
# Compile the model
opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
#model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [23]:
# Epoch callback
MODEL_NAME = 'DATAVISMODEL13PAPERBASED'
CHECKPOINT_DIR = 'datavismodels/checkpoints/'
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=CHECKPOINT_DIR + MODEL_NAME + '-{epoch:02d}-{val_accuracy:.2f}.hdf5',
                                                 verbose=1)

In [None]:
# Train the model
EPOCHS = 10
history = model.fit(
    train_dataset,
    steps_per_epoch=(TRAINING_SIZE // BATCH_SIZE),
    validation_data=val_dataset,
    validation_steps=(VALIDATION_SIZE // BATCH_SIZE),
    epochs=EPOCHS,
    callbacks=[cp_callback]
    #class_weight=CLASS_WEIGHTS
    )

Train for 55130 steps, validate for 9188 steps
Epoch 1/10
Epoch 00001: saving model to datavismodels/checkpoints/DATAVISMODEL13PAPERBASED-01-0.55.hdf5
Epoch 2/10
Epoch 00002: saving model to datavismodels/checkpoints/DATAVISMODEL13PAPERBASED-02-0.55.hdf5
Epoch 3/10
Epoch 00003: saving model to datavismodels/checkpoints/DATAVISMODEL13PAPERBASED-03-0.55.hdf5
Epoch 4/10
10305/55130 [====>.........................] - ETA: 4:23:17 - loss: 314984028.6121 - accuracy: 0.6740

In [None]:
test_set = test_dataset.take(1)

In [None]:
print("Predicted labels:")
pred_labels = np.argmax(model.predict(test_set), axis=1)
pred_labels

In [None]:
print("Actual labels:")
(k, v) = next(iter(test_set))
actual_labels = np.argmax(v, axis=1)
actual_labels

In [None]:
print("Test Set Accuracy: {}".format(np.sum((pred_labels == actual_labels) / pred_labels.size)))