In [None]:
# The following is data scraped and filtered from https://www.politifact.com.
# The framework used here for deep learning with NLP comes from Dipanjan Sarkar's "Text Analytics with Python".

In [2]:
# install if needed
# !pip install tensorflow==1.13.1
# !pip install tensorflow-hub

In [3]:
# essential dependencies
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import os
from bs4 import BeautifulSoup
import unicodedata
import re
import time

In [4]:
# load dataset
path = os.path.abspath('politifact_post_eda')
df = pd.read_csv(path, index_col=0)

# filter to only what we will be using
poli_df = df[["statement","veracity"]]

# transform to binary classification data
poli_df.veracity = poli_df.veracity.map(
    {'False': False, 'True': True, 'Pants on Fire!': False})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [5]:
# checks if tensorflow will be using a GPU
print(tf.test.is_gpu_available())
print(tf.test.gpu_device_name())

False



In [6]:
poli_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1455 entries, 18 to 16610
Data columns (total 2 columns):
statement    1455 non-null object
veracity     1455 non-null bool
dtypes: bool(1), object(1)
memory usage: 24.2+ KB


In [7]:
poli_df.head()

Unnamed: 0,statement,veracity
18,"""The vast majority of Wisconsin students canno...",False
21,"""President Trump has sent 14,000 American troo...",True
27,"""To be clear, I’m not talking about confiscati...",False
30,"""When my father became commander in chief of t...",False
42,"""Ohio, Michigan, and Pennsylvania actually in ...",False


In [8]:
# text preprocessing functions
def strip_html_tags(text):
    soup = BeautifulSoup(text,"html.parser")
    [s.extract() for s in soup(['iframe','script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+','\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode(
        'ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def preprocess_document(document):
    # strip HTML tags using function above
    document = strip_html_tags(document)
    # remove accented characters using function above
    document = remove_accented_chars(document)
    # lowercase
    document = document.lower()
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    # finished
    return document

In [9]:
# vectorize for deep learning
preprocess_corpus = np.vectorize(preprocess_document)

# generate arrays
statements = preprocess_corpus(poli_df.statement.values)
veracity = poli_df.veracity.values

In [10]:
# let's do a manual train/test/split as Dipanjan Sarkar does in his book (note you can also use train_test_split)
# 1445 rows (train = 60% -> 874, validation = 10% -> 145, test = 30% -> 436; some numbers were rounded)
train_statements = statements[:874]
train_veracity = veracity[:874]

validation_statements = statements[874:1019]
validation_veracity = veracity[874:1019]

test_statements = statements[1019:1455]
test_veracity = veracity[1019:1455]

train_statements.shape, validation_veracity.shape, test_statements.shape

((874,), (145,), (436,))

In [11]:
# Build data ingestion functions and feature engineering pipelines
# training input on the whole training set with no limits on training epochs
train_input_fn = tf.estimator.inputs.numpy_input_fn(
                    {'sentence': train_statements}, 
                    train_veracity,
                    batch_size = 256, num_epochs = None, shuffle = True)

# prediction on the whole training set
predict_train_input_fn = tf.estimator.inputs.numpy_input_fn(
                            {'sentence': train_statements}, 
                            train_veracity,
                            shuffle = False)

# prediction on the whole validation set
predict_validation_input_fn = tf.estimator.inputs.numpy_input_fn(
                            {'sentence': validation_statements}, 
                            validation_veracity,
                            shuffle = False)

# prediction on the test set
predict_test_input_fn = tf.estimator.inputs.numpy_input_fn(
                            {'sentence': test_statements}, 
                            test_veracity,
                            shuffle = False)

In [20]:
# define the sentence-embedding features that leverages Google's Universal Sentence Encoder
# takes about 20 minutes to load; as of 2019/11/13 tensorflow version 2+ and encoder 3 do not work with this setup
embedding_feature = hub.text_embedding_column(
                    key = 'sentence', 
                    module_spec = "https://tfhub.dev/google/universal-sentence-encoder/2",
                    trainable = False)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


In [22]:
# simple feed-forward DNN with 2 hidden layers as we are just seeing how well a simple deep learning model performs

dnn = tf.estimator.DNNClassifier(
    hidden_units = [512, 128], 
    feature_columns = [embedding_feature],
    n_classes = 2,
    activation_fn = tf.nn.relu,
    dropout = 0.1,
    optimizer = tf.train.AdagradOptimizer(learning_rate = 0.005))

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_model_dir': '/var/folders/lx/dgklbb3d721cj2ghzkdtfn100000gn/T/tmpapaxg_6l', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb2c890eb8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': '/var/folders/lx/dgklbb3d721cj2ghzkdtfn100000gn/T/tmpapaxg_6l', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb2c890eb8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [25]:
# train model
tf.logging.set_verbosity(tf.logging.ERROR)

TOTAL_STEPS = 1500
STEP_SIZE = 100
for step in range(0, TOTAL_STEPS+1, STEP_SIZE):
    print()
    print('-'*100)
    print('Training for step =', step)
    start_time = time.time()
    dnn.train(input_fn = train_input_fn, steps = STEP_SIZE)
    elapsed_time = time.time() - start_time
    print('Train Time (s):', elapsed_time)
    print("")
    print('Eval Metrics (Train):', dnn.evaluate(input_fn = predict_train_input_fn))
    print("")
    print('Eval Metrics (Validation):', dnn.evaluate(input_fn = predict_validation_input_fn))


----------------------------------------------------------------------------------------------------
Training for step = 0
Train Time (s): 60.506800174713135
Eval Metrics (Train): {'accuracy': 0.97940505, 'accuracy_baseline': 0.6441648, 'auc': 0.99756986, 'auc_precision_recall': 0.9944781, 'average_loss': 0.14703967, 'label/mean': 0.35583523, 'loss': 18.358952, 'precision': 0.96507937, 'prediction/mean': 0.36793247, 'recall': 0.977492, 'global_step': 200}
Eval Metrics (Validation): {'accuracy': 0.6068966, 'accuracy_baseline': 0.5103448, 'auc': 0.66673017, 'auc_precision_recall': 0.6634409, 'average_loss': 0.7654497, 'label/mean': 0.5103448, 'loss': 55.495102, 'precision': 0.6164383, 'prediction/mean': 0.4957144, 'recall': 0.6081081, 'global_step': 200}

----------------------------------------------------------------------------------------------------
Training for step = 100
Train Time (s): 60.60412907600403
Eval Metrics (Train): {'accuracy': 1.0, 'accuracy_baseline': 0.6441648, 'auc

Train Time (s): 61.869592905044556
Eval Metrics (Train): {'accuracy': 1.0, 'accuracy_baseline': 0.6441648, 'auc': 1.0, 'auc_precision_recall': 1.0, 'average_loss': 0.0022163682, 'label/mean': 0.35583523, 'loss': 0.2767294, 'precision': 1.0, 'prediction/mean': 0.3557735, 'recall': 1.0, 'global_step': 1300}
Eval Metrics (Validation): {'accuracy': 0.62068963, 'accuracy_baseline': 0.5103448, 'auc': 0.6379901, 'auc_precision_recall': 0.6377767, 'average_loss': 1.3933696, 'label/mean': 0.5103448, 'loss': 101.019295, 'precision': 0.62666667, 'prediction/mean': 0.48358113, 'recall': 0.6351351, 'global_step': 1300}

----------------------------------------------------------------------------------------------------
Training for step = 1200
Train Time (s): 65.16779971122742
Eval Metrics (Train): {'accuracy': 1.0, 'accuracy_baseline': 0.6441648, 'auc': 1.0, 'auc_precision_recall': 1.0, 'average_loss': 0.0019411017, 'label/mean': 0.35583523, 'loss': 0.24236043, 'precision': 1.0, 'prediction/mean':

In [26]:
dnn.evaluate(input_fn = predict_train_input_fn)

{'accuracy': 1.0,
 'accuracy_baseline': 0.6441648,
 'auc': 1.0,
 'auc_precision_recall': 1.0,
 'average_loss': 0.0013723333,
 'label/mean': 0.35583523,
 'loss': 0.1713456,
 'precision': 1.0,
 'prediction/mean': 0.35584152,
 'recall': 1.0,
 'global_step': 1700}

In [27]:
dnn.evaluate(input_fn = predict_test_input_fn)

{'accuracy': 0.61009175,
 'accuracy_baseline': 0.57339454,
 'auc': 0.6300968,
 'auc_precision_recall': 0.58227587,
 'average_loss': 1.650802,
 'label/mean': 0.4266055,
 'loss': 179.93742,
 'precision': 0.5555556,
 'prediction/mean': 0.33982703,
 'recall': 0.43010753,
 'global_step': 1700}

# the model was able to predict about 4% better than the baseline