In [1]:
import os
import pickle
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.metrics import f1_score, make_scorer, confusion_matrix, \
    classification_report
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, \
    StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

from keras_preprocessing.sequence import pad_sequences
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

%matplotlib inline

  from ._conv import register_converters as _register_converters


# Data

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


learn_labels = _load_pickle("../Data/Learn/labels.pkl")
learn_sequences = _load_pickle("../Data/Learn/sequences.pkl")

# Hold out 30% of the samples for testing. The split is shuffled, stratified
# on the labels and seeded so that it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    learn_sequences,
    learn_labels,
    test_size=0.3,
    shuffle=True,
    stratify=learn_labels,
    random_state=42,
)

# Test pipeline with bag of words

In [17]:
class DNNModel(BaseEstimator, ClassifierMixin):
    """Scikit-learn style wrapper around a custom ``tf.estimator.Estimator``.

    Samples are vectorized into counts of uni- and bi-grams with a
    ``CountVectorizer``, labels are integer-encoded with a ``LabelEncoder``,
    and the count vectors are fed to the convolutional network built by
    ``network_fn``. The loss is weighted per sample so that class ``"M"``
    can be up-weighted.

    Parameters
    ----------
    model_name : str or None
        Sub-directory of ``checkpoints_dir`` used as the estimator's
        ``model_dir``. With ``None`` the estimator receives ``model_dir=None``.
    checkpoints_dir : str
        Parent directory of the model directory.
    hidden_units : tuple of int
        Forwarded to the estimator ``params``. ``network_fn`` does not read
        it at the moment.
    batch_size : int
        Number of rows sliced out of the data for each step.
    weight_class_M : float
        Loss weight of samples labelled ``"M"``; every other sample weighs 1.
    features_key : str
        Key of the feature matrix in the features dict given to the estimator.
    weight_key : str
        Key of the per-sample weights in the features dict.

    Raises
    ------
    ValueError
        If ``model_name`` is given and the model directory already exists.
    """

    # Convolution branches built by ``network_fn``: one branch per kernel
    # size, each producing ``CONV_FILTERS`` feature maps.
    CONV_FILTERS = 5
    CONV_KERNEL_SIZES = (2, 3)

    def __init__(self,
                 model_name=None,
                 checkpoints_dir="../checkpoints/",
                 hidden_units=(10,),
                 batch_size=128,
                 weight_class_M=1.0,
                 features_key="x",
                 weight_key="weight"
                 ):
        # scikit-learn's get_params()/clone()/repr() read every constructor
        # argument back from an attribute of the same name, so all of them
        # (model_name and checkpoints_dir included) have to be stored.
        self.model_name = model_name
        self.checkpoints_dir = checkpoints_dir
        self.hidden_units = hidden_units
        self.batch_size = batch_size
        self.features_key = features_key
        self.weight_key = weight_key
        self.weight_class_M = weight_class_M
        self.set_model_directory(checkpoints_dir, model_name)

    def fit(self, X, y, num_epochs=1, warm_start=True):
        """Train the classifier on ``X`` / ``y`` for ``num_epochs`` epochs.

        When ``warm_start`` is true and the model was already fitted, the
        existing vectorizer, label encoder and classifier are reused and
        training continues. Otherwise the transformers are re-fitted and a
        new classifier is created.

        Returns
        -------
        self
        """
        warm_start = self.check_warm_start(warm_start)
        if not warm_start:
            X, y = self.fit_and_apply_transformers(X, y)
            self.classifier_ = self.create_dnn_classifier()
        else:
            X, y = self.apply_transformers(X, y)

        self.classifier_.train(self.input_fn(
            tf.estimator.ModeKeys.TRAIN, X, y, num_epochs))
        return self

    def fit_and_apply_transformers(self, X, y):
        """Fit the vectorizer and the label encoder, return transformed data.

        Sets ``vectorizer_``, ``n_features_``, ``label_encoder_`` and
        ``n_classes_`` on the instance.
        """
        # Fit and transform X
        self.vectorizer_ = CountVectorizer(lowercase=False,
                                           tokenizer=self.tokens_to_str,
                                           ngram_range=(1, 2))
        X = self.vectorizer_.fit_transform(X)
        self.n_features_ = len(self.vectorizer_.vocabulary_)

        # Fit and transform y
        self.label_encoder_ = LabelEncoder()
        y = self.label_encoder_.fit_transform(y)
        self.n_classes_ = len(self.label_encoder_.classes_)

        return X, y

    def apply_transformers(self, X, y):
        """Transform ``X`` and ``y`` with the already fitted transformers."""
        X = self.vectorizer_.transform(X)
        y = self.label_encoder_.transform(y)
        return X, y

    def predict(self, X):
        """Return the predicted labels (in the original label space) for ``X``."""
        X = self.vectorizer_.transform(X)
        classes = list(self.classifier_.predict(self.input_fn(
            tf.estimator.ModeKeys.PREDICT, X)))
        labels = self.label_encoder_.inverse_transform(classes)
        return labels

    def score(self, X, y):
        """Evaluate the classifier on ``X`` / ``y`` and return its "f1-score" metric."""
        X, y = self.apply_transformers(X, y)
        results = self.classifier_.evaluate(self.input_fn(
            tf.estimator.ModeKeys.EVAL, X, y))
        return results["f1-score"]

    def create_dnn_classifier(self):
        """Build the ``tf.estimator.Estimator`` with the f1-score metric attached.

        Requires ``n_features_`` and ``n_classes_`` to be set, i.e. the
        transformers must have been fitted first.
        """
        # Columns of X
        self.feature_columns_ = [tf.feature_column.numeric_column(
            key=self.features_key, shape=self.n_features_
        )]

        # Column of weights which will be used to compute loss
        weight_column = tf.feature_column.numeric_column(self.weight_key)

        # Model parameters
        params = {
            "feature_columns": self.feature_columns_,
            "weight_column": weight_column,
            "hidden_units": self.hidden_units,
            "n_classes": self.n_classes_,
        }

        # Create model
        model = tf.estimator.Estimator(model_fn=self.model_fn,
                                       model_dir=self.model_dir,
                                       params=params)
        model = tf.contrib.estimator.add_metrics(model, self.f1_score)
        return model

    def model_fn(self, features, labels, mode, params):
        """Estimator ``model_fn``: returns the ``EstimatorSpec`` for ``mode``."""
        # Network
        logits = self.network_fn(features, params)

        # Predict
        predicted_classes = tf.argmax(logits, 1)
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode, predictions=predicted_classes)

        # Loss, weighted per sample with the weight column of the features
        weights = tf.feature_column.input_layer(features, params['weight_column'])
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                      logits=logits,
                                                      weights=weights)

        # Eval
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss, predictions=predicted_classes)

        # Train
        optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    def f1_score(self, labels, predictions):
        """Metric function given to ``add_metrics``; exposes the "f1-score" metric."""
        return {"f1-score": self.f1_metric_fn(labels=labels, predictions=predictions)}

    def f1_metric_fn(self, labels, predictions):
        """Return the ``(value, update_op)`` pair of a streaming F1 score.

        The score is the harmonic mean of ``tf.metrics.precision`` and
        ``tf.metrics.recall``. When both are 0 the score is defined as 0
        instead of the NaN produced by a 0 / 0 division.
        """
        # NOTE(review): tf.metrics.precision/recall appear to treat their
        # inputs as booleans, so this is presumably only a meaningful F1 for
        # binary labels - confirm before using more than two classes.
        p, p_op = tf.metrics.precision(labels=labels, predictions=predictions)
        r, r_op = tf.metrics.recall(labels=labels, predictions=predictions)
        denominator = p + r
        f1 = tf.where(tf.greater(denominator, 0),
                      2 * p * r / denominator,
                      tf.zeros_like(denominator))
        return f1, tf.group(p_op, r_op)

    def input_fn(self, mode, X, y=None, num_epochs=1):
        """Create the estimator input function for the given mode.

        Parameters
        ----------
        mode : tf.estimator.ModeKeys
        X : scipy sparse matrix
            Vectorized samples, one row per sample.
        y : array of int, optional
            Encoded labels; not used in PREDICT mode.
        num_epochs : int
            Number of passes over the batches.

        Returns
        -------
        callable
            Function without arguments returning the features dict in
            PREDICT mode and ``(features dict, labels)`` otherwise.
        """
        n = X.shape[0]
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Only full batches are used for training, but keep at least one
            # (partial) batch so that fewer than batch_size samples do not
            # result in an empty input pipeline.
            num_batches = max(1, n // self.batch_size)
        else:
            # Evaluation and prediction must cover every sample, including
            # the trailing partial batch.
            num_batches = int(np.ceil(n / self.batch_size))

        def mode_input_fn():
            # Convert to sparse tensor
            tf_X = self.convert_sparse_matrix_to_sparse_tensor(X)

            # Batch iterator
            i = tf.train.range_input_producer(
                limit=num_batches, num_epochs=num_epochs, shuffle=False
            ).dequeue()

            # Slice sparse tensor using batch number, then convert to dense
            tf_X = tf.sparse.to_dense(tf.sparse.slice(
                tf_X, start=[i * self.batch_size, 0],
                size=[self.batch_size, self.n_features_]
            ), validate_indices=False)

            if mode == tf.estimator.ModeKeys.PREDICT:
                return {self.features_key: tf_X}

            # Slice labels tensor using batch number
            tf_y = tf.convert_to_tensor(y)
            tf_y = tf_y[i * self.batch_size:(i + 1) * self.batch_size]

            # Create weights tensor using given weight of class "M":
            # weight_class_M for samples of class "M", 1 for all the others
            class_M = self.label_encoder_.transform(["M"])
            weights = tf.cast(tf.equal(tf_y, class_M), tf.float64)
            weights = tf.multiply(weights, (self.weight_class_M - 1)) + 1

            # Return tensors in correct format
            return {self.features_key: tf_X, self.weight_key: weights}, tf_y

        return mode_input_fn

    def check_warm_start(self, warm_start):
        """Return ``warm_start``, or ``False`` when the model is not fitted yet."""
        # A warm start needs an existing classifier to continue from.
        if warm_start and not hasattr(self, "classifier_"):
            return False
        return warm_start

    def set_model_directory(self, checkpoints_dir, model_name):
        """Set ``self.model_dir`` from ``checkpoints_dir`` and ``model_name``.

        Raises
        ------
        ValueError
            If ``model_name`` is given and the resulting directory exists.
        """
        if model_name is not None:
            # os.path.join also works when checkpoints_dir has no trailing
            # separator.
            self.model_dir = os.path.join(checkpoints_dir, model_name)
            # Check model_dir doesn't already exist
            if os.path.exists(self.model_dir):
                raise ValueError("model_dir already exists")
        else:
            self.model_dir = None

    @staticmethod
    def tokens_to_str(tokens):
        """Return ``tokens`` as a list of strings."""
        # Used in CountVectorizer because we already have tokens
        return [str(token) for token in tokens]

    @staticmethod
    def convert_sparse_matrix_to_sparse_tensor(X):
        """Convert a scipy sparse matrix into a ``tf.SparseTensorValue``."""
        coo = X.tocoo()
        # One (row, col) pair per stored value. np.mat is avoided because it
        # was removed from NumPy 2.0; a plain int64 array is used instead.
        indices = np.stack([coo.row, coo.col], axis=1).astype(np.int64)
        return tf.SparseTensorValue(indices, coo.data, coo.shape)

    def network_fn(self, features, params):
        """Build the network and return the logits, one column per class.

        Each kernel size in ``CONV_KERNEL_SIZES`` gets its own 1-D
        convolution over the feature axis followed by a max-pooling over the
        whole axis; the pooled feature maps are concatenated and fed to a
        dense layer.
        """
        inputs = tf.feature_column.input_layer(features, params['feature_columns'])
        # Add a channel axis for the 1-D convolutions
        inputs = inputs[:, :, tf.newaxis]
        feature_maps = []
        for kernel_size in self.CONV_KERNEL_SIZES:
            tmp = tf.layers.conv1d(inputs, self.CONV_FILTERS, kernel_size, padding="same")
            tmp = tf.layers.max_pooling1d(tmp, [self.n_features_], strides=1, padding="valid")
            feature_maps.append(tmp)
        n_pooled = self.CONV_FILTERS * len(self.CONV_KERNEL_SIZES)
        feature_maps = tf.reshape(tf.concat(feature_maps, axis=2), [-1, n_pooled])
        logits = tf.layers.dense(feature_maps, self.n_classes_, activation=None)
        return logits

In [20]:
tf.logging.set_verbosity(tf.logging.INFO)

# Weight class "M" by the C-to-M ratio of the training labels.
label_counts = Counter(y_train)
weight_class_M = label_counts["C"] / label_counts["M"]

num_epochs = 5
model = DNNModel(hidden_units=(10,), weight_class_M=weight_class_M)

# One warm-started fit() call per epoch on the first 500 training samples,
# each followed by an evaluation on the test split.
separator = "-" * 80 + "\n"
train_subset = slice(None, 500)
for epoch in range(num_epochs):
    model.fit(X_train[train_subset], y_train[train_subset],
              num_epochs=1, warm_start=True)
    f1 = model.score(X_test, y_test)
    print("\nEPOCH %d: test f1-score = %.3f" % (epoch, f1))
    print(separator)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/omar/tmp/tmpggh347p3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7278648da0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using config: {'_model_dir': '/home/omar/tmp/tmpggh347p3', '_tf_random_seed': None, '_save_summary_steps': 100, '_s

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-16-14:25:12
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /home/omar/tmp/tmpggh347p3/model.ckpt-15
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-11-16-14:25:17
INFO:tensorflow:Saving dict for global step 15: f1-score = 0.22729744, global_step = 15, loss = 1.198664
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 15: /home/omar/tmp/tmpggh347p3/model.ckpt-15

EPOCH 4: test f1-score = 0.227
--------------------------------------------------------------------------------

