In [2]:
import os
import pickle
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.metrics import f1_score, make_scorer, confusion_matrix, \
    classification_report
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, \
    StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder


import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

%matplotlib inline

  from ._conv import register_converters as _register_converters


# Data

In [3]:
with open("../Data/Learn/labels.pkl", "rb") as f:
    learn_labels = pickle.load(f)

with open("../Data/Learn/sequences.pkl", "rb") as f:
    learn_sequences = pickle.load(f)

In [98]:
X_train, X_test, y_train, y_test = train_test_split(
    learn_sequences, learn_labels, test_size=0.3,
    shuffle=True, stratify=learn_labels, random_state=42
)

# Model

In [74]:
def tokens_to_str(tokens):
    return list(map(str, tokens))

In [7]:
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)

In [65]:
def train_input_fn(x, y, shuffle=True, batch_size=128, num_epochs=None):
    n = x.shape[0]
    if n < batch_size:
        raise Exception("batch_size bigger than input")
    epoch_size = n // batch_size
    
    def input_fn():
        x_data = convert_sparse_matrix_to_sparse_tensor(x)
        y_data = tf.convert_to_tensor(y)
        i = tf.train.range_input_producer(epoch_size, num_epochs, shuffle=False).dequeue()
        ret_x = tf.sparse.slice(x_data, [i * batch_size, 0], [batch_size, x.shape[1]])
        ret_y = y_data[i * batch_size:(i + 1) * batch_size]
        return {"x": tf.sparse.to_dense(ret_x, validate_indices=False)}, ret_y
        
    return input_fn

In [75]:
def test_input_fn(x, y, batch_size=128):
    n = x.shape[0]
    if n < batch_size:
        raise Exception("batch_size bigger than input")
    epoch_size = n // batch_size
    
    def input_fn():
        x_data = convert_sparse_matrix_to_sparse_tensor(x)
        y_data = tf.convert_to_tensor(y)
        i = tf.train.range_input_producer(epoch_size, num_epochs=1, shuffle=False).dequeue()
        ret_x = tf.sparse.slice(x_data, [i * batch_size, 0], [batch_size, x.shape[1]])
        ret_y = y_data[i * batch_size:(i + 1) * batch_size]
        return {"x": tf.sparse.to_dense(ret_x, validate_indices=False)}, ret_y
        
    return input_fn

In [97]:
def score_model(model, X_test, y_test, batch_size=128):
    y_test = y_test[:X_test.shape[0] - X_test.shape[0] % batch_size]
    y_pred = np.array([x["class_ids"][0] for x in model.predict(
        test_input_fn(X_test, y_test, batch_size))])
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, digits=3))
    return f1_score(y_test, y_pred, average="macro")

In [None]:
vectorizer = CountVectorizer(lowercase=False, tokenizer=tokens_to_str, ngram_range=(1, 2))
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [71]:
vector_feature_column = tf.feature_column.numeric_column(key="x", shape=X_train_vect.shape[1])
classifier = tf.estimator.DNNClassifier(
    feature_columns=[vector_feature_column],
    hidden_units=[10],
    n_classes=2,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/omar/tmp/tmpvlcbz51f', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7164c31be0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [73]:
classifier.train(train_input_fn(X_train_vect, y_train, batch_size=128, num_epochs=10))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /home/omar/tmp/tmpvlcbz51f/model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /home/omar/tmp/tmpvlcbz51f/model.ckpt.
INFO:tensorflow:loss = 89.579666, step = 1
INFO:tensorflow:global_step/sec: 8.99633
INFO:tensorflow:loss = 29.498064, step = 101 (11.116 sec)
INFO:tensorflow:global_step/sec: 9.3962
INFO:tensorflow:loss = 32.90124, step = 201 (10.643 sec)
INFO:tensorflow:global_step/sec: 9.40878
INFO:tensorflow:loss = 13.064793, step = 301 (10.628 sec)
INFO:tensorflow:global_step/sec: 9.41532
INFO:tensorflow:loss = 21.742981, step = 401 (10.621 sec)
INFO:tensorflow:global_step/sec: 9.33222
INFO:tensorflow:loss = 13.841312, step = 501 (10.715 sec)
INFO:tensorflow:global_step/sec: 9.48408
INFO:tensorflow:los

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f7164c311d0>

In [98]:
score_model(classifier, X_test_vect, y_test)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /home/omar/tmp/tmpvlcbz51f/model.ckpt-2510
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[[11593   310]
 [  949   844]]
              precision    recall  f1-score   support

           0      0.924     0.974     0.948     11903
           1      0.731     0.471     0.573      1793

   micro avg      0.908     0.908     0.908     13696
   macro avg      0.828     0.722     0.761     13696
weighted avg      0.899     0.908     0.899     13696



0.7606412545132412

# Clean model

In [117]:
class DNNModel(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 model_name=None,
                 checkpoints_dir="../checkpoints/",
                 hidden_units=(10,),
                 batch_size=128,
                 weight_class_M=1.0,
                 features_key="x",
                 weight_key="weight"
                 ):
        self.set_model_directory(checkpoints_dir, model_name)
        self.hidden_units = hidden_units
        self.batch_size = batch_size
        self.features_key = features_key
        self.weight_key = weight_key
        self.weight_class_M = weight_class_M

    def fit(self, X, y, num_epochs=1, warm_start=True):
        warm_start = self.check_warm_start(warm_start)
        if not warm_start:
            X, y = self.fit_and_apply_transformers(X, y)
            self.classifier_ = self.create_dnn_classifier()
        else:
            X, y = self.apply_transformers(X, y)

        self.classifier_.train(self.train_input_fn(X, y, num_epochs))
        return self

    def fit_and_apply_transformers(self, X, y):
        # Fit and transform X
        self.vectorizer_ = CountVectorizer(lowercase=False,
                                           tokenizer=self.tokens_to_str,
                                           ngram_range=(1, 2))
        X = self.vectorizer_.fit_transform(X)
        self.n_features_ = len(self.vectorizer_.vocabulary_)

        # Fit and transform y
        self.label_encoder_ = LabelEncoder()
        y = self.label_encoder_.fit_transform(y)
        self.n_classes_ = len(self.label_encoder_.classes_)

        return X, y

    def apply_transformers(self, X, y):
        X = self.vectorizer_.transform(X)
        y = self.label_encoder_.transform(y)
        return X, y

    def predict(self, X):
        X = self.vectorizer_.transform(X)
        predictions = list(self.classifier_.predict(self.predict_input_fn(X)))
        classes = np.array([p["class_ids"][0] for p in predictions])
        labels = self.label_encoder_.inverse_transform(classes)
        return labels

    def score(self, X, y, print_report=False):
        predictions = self.predict(X)
        if print_report:
            print(confusion_matrix(y, predictions))
            print(classification_report(y, predictions, digits=3))
        return f1_score(y, predictions, average="macro")

    def create_dnn_classifier(self):
        # Columns of X
        self.feature_columns_ = [tf.feature_column.numeric_column(
            key=self.features_key, shape=self.n_features_
        )]

        # Column of weights which will be used to compute loss
        weight_column = tf.feature_column.numeric_column(self.weight_key)

        # Model
        return tf.estimator.DNNClassifier(
            feature_columns=self.feature_columns_,
            weight_column=weight_column,
            hidden_units=self.hidden_units,
            n_classes=self.n_classes_,
            model_dir=self.model_dir
        )

    def train_input_fn(self, X, y, num_epochs):
        n = X.shape[0]
        num_batches = n // self.batch_size

        def input_fn():
            # Convert to tensors
            tf_X = self.convert_sparse_matrix_to_sparse_tensor(X)
            tf_y = tf.convert_to_tensor(y)

            # Batch iterator
            i = tf.train.range_input_producer(
                limit=num_batches, num_epochs=num_epochs, shuffle=False
            ).dequeue()

            # Slice sparse tensor using batch number, then convert to dense
            tf_X = tf.sparse.to_dense(tf.sparse.slice(
                tf_X, start=[i * self.batch_size, 0],
                size=[self.batch_size, self.n_features_]
            ), validate_indices=False)

            # Slice labels tensor using batch number
            output_y = tf_y[i * self.batch_size:(i + 1) * self.batch_size]

            # Create weights tensor using given weight of class "M"
            class_M = self.label_encoder_.transform(["M"])
            weights = tf.cast(tf.equal(output_y, class_M), tf.float64)
            weights = tf.multiply(weights, (self.weight_class_M - 1)) + 1

            # Return tensors in correct format
            return {self.features_key: tf_X, self.weight_key: weights}, output_y

        return input_fn

    def predict_input_fn(self, X):
        n = X.shape[0]

        # Ensure num_batches has the right value so that we predict every point
        if n % self.batch_size == 0:
            num_batches = n // self.batch_size
        else:
            num_batches = n // self.batch_size + 1

        def input_fn():
            # Convert to tensor
            tf_X = self.convert_sparse_matrix_to_sparse_tensor(X)

            # Batch iterator
            i = tf.train.range_input_producer(
                limit=num_batches, num_epochs=1, shuffle=False
            ).dequeue()

            # Slice sparse tensor using batch number, then convert to dense
            tf_X = tf.sparse.to_dense(tf.sparse.slice(
                tf_X, start=[i * self.batch_size, 0],
                size=[self.batch_size, self.n_features_]
            ), validate_indices=False)

            # Return tensor in correct format
            return {self.features_key: tf_X}

        return input_fn

    def check_warm_start(self, warm_start):
        if warm_start:
            # Check if model was already fitted
            try:
                self.classifier_
            except:
                warm_start = False
        return warm_start

    def set_model_directory(self, checkpoints_dir, model_name):
        if model_name is not None:
            self.model_dir = checkpoints_dir + model_name
            # Check model_dir doesn't already exist
            if os.path.exists(self.model_dir):
                raise ValueError("model_dir already exists")
        else:
            self.model_dir = None

    @staticmethod
    def tokens_to_str(tokens):
        # Used in CountVectorizer because we already have tokens
        return list(map(str, tokens))

    @staticmethod
    def convert_sparse_matrix_to_sparse_tensor(X):
        coo = X.tocoo()
        indices = np.mat([coo.row, coo.col]).transpose()
        return tf.SparseTensorValue(indices, coo.data, coo.shape)

In [118]:
num_epochs = 5
weight_class_M = Counter(y_train)["C"] / Counter(y_train)["M"]
range_units = [(10,), (25,), (50,), (100,), (250,), (500,), (1000,)]

for hidden_units in range_units:
    print("\n" + "-" * 80 + "\n")
    model = DNNModel(weight_class_M=weight_class_M, hidden_units=hidden_units)
    print(model)
    model.fit(X_train, y_train, num_epochs=num_epochs, warm_start=True)
    print("TEST F1-SCORE:", model.score(X_test, y_test, print_report=True))


--------------------------------------------------------------------------------

DNNModel(batch_size=128, checkpoints_dir=None, features_key='x',
     hidden_units=(10,), model_name=None, weight_class_M=6.631379064799431,
     weight_key='weight')
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/omar/tmp/tmpro8kmmk1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8818048f60>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_clus

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /home/omar/tmp/tmpnqlbhxy0/model.ckpt.
INFO:tensorflow:loss = 139.13034, step = 1
INFO:tensorflow:global_step/sec: 7.7548
INFO:tensorflow:loss = 87.221634, step = 101 (12.896 sec)
INFO:tensorflow:global_step/sec: 8.02155
INFO:tensorflow:loss = 93.37489, step = 201 (12.466 sec)
INFO:tensorflow:global_step/sec: 8.02623
INFO:tensorflow:loss = 16.97586, step = 301 (12.459 sec)
INFO:tensorflow:global_step/sec: 8.05867
INFO:tensorflow:loss = 42.193398, step = 401 (12.409 sec)
INFO:tensorflow:global_step/sec: 7.9737
INFO:tensorflow:loss = 20.851974, step = 501 (12.541 sec)
INFO:tensorflow:global_step/sec: 8.09493
INFO:tensorflow:loss = 4.958822, step = 601 (12.353 sec)
INFO:tensorflow:global_step/sec: 8.01549
INFO:

INFO:tensorflow:loss = 1.9631517, step = 801 (27.369 sec)
INFO:tensorflow:global_step/sec: 3.7135
INFO:tensorflow:loss = 2.2439332, step = 901 (26.929 sec)
INFO:tensorflow:global_step/sec: 3.69752
INFO:tensorflow:loss = 1.1247187, step = 1001 (27.045 sec)
INFO:tensorflow:global_step/sec: 3.71946
INFO:tensorflow:loss = 1.3478184, step = 1101 (26.886 sec)
INFO:tensorflow:global_step/sec: 3.70525
INFO:tensorflow:loss = 0.8157321, step = 1201 (26.989 sec)
INFO:tensorflow:Saving checkpoints for 1255 into /home/omar/tmp/tmp_pstabic/model.ckpt.
INFO:tensorflow:Loss for final step: 0.83735275.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /home/omar/tmp/tmp_pstabic/model.ckpt-1255
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[[11376   598]
 [  767  1038]]
              precision    recall  f1-score   support

           C      0.937     0.950     0.943

KeyboardInterrupt: 

In [119]:
num_epochs = 5
weight_class_M = Counter(y_train)["C"] / Counter(y_train)["M"]
range_units = [(100, 10), (100, 25), (100, 50), (100, 75), (100, 100)]

for hidden_units in range_units:
    print("\n" + "-" * 80 + "\n")
    model = DNNModel(weight_class_M=weight_class_M, hidden_units=hidden_units)
    print(model)
    model.fit(X_train, y_train, num_epochs=num_epochs, warm_start=True)
    print("TEST F1-SCORE:", model.score(X_test, y_test, print_report=True))


--------------------------------------------------------------------------------

DNNModel(batch_size=128, checkpoints_dir=None, features_key='x',
     hidden_units=(100, 10), model_name=None,
     weight_class_M=6.631379064799431, weight_key='weight')
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/omar/tmp/tmp1n54vr1c', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f885440fd30>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /home/omar/tmp/tmphanujsdo/model.ckpt.
INFO:tensorflow:loss = 139.03014, step = 1
INFO:tensorflow:global_step/sec: 6.07428
INFO:tensorflow:loss = 90.6378, step = 101 (16.464 sec)
INFO:tensorflow:global_step/sec: 6.25363
INFO:tensorflow:loss = 86.006714, step = 201 (15.990 sec)
INFO:tensorflow:global_step/sec: 6.26854
INFO:tensorflow:loss = 8.981398, step = 301 (15.953 sec)
INFO:tensorflow:global_step/sec: 6.28045
INFO:tensorflow:loss = 26.874666, step = 401 (15.922 sec)
INFO:tensorflow:global_step/sec: 6.22976
INFO:tensorflow:loss = 12.67927, step = 501 (16.052 sec)
INFO:tensorflow:global_step/sec: 6.25709
INFO:tensorflow:loss = 0.7920958, step = 601 (15.982 sec)
INFO:tensorflow:global_step/sec: 6.24613
INFO

KeyboardInterrupt: 

In [None]:
num_epochs = 10
weight_class_M = Counter(y_train)["C"] / Counter(y_train)["M"]
model = DNNModel(weight_class_M=weight_class_M, hidden_units=(250,))

for epoch in range(num_epochs):
    print("\n" + "-" * 80 + "\n" + "EPOCH %d\n" % epoch)
    model.fit(X_train, y_train, num_epochs=1, warm_start=True)
    print("TEST F1-SCORE:", model.score(X_test, y_test, print_report=True))


--------------------------------------------------------------------------------
EPOCH 0

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/omar/tmp/tmpd1vukvx7', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f878c093828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:t

[[11319   655]
 [  751  1054]]
              precision    recall  f1-score   support

           C      0.938     0.945     0.942     11974
           M      0.617     0.584     0.600      1805

   micro avg      0.898     0.898     0.898     13779
   macro avg      0.777     0.765     0.771     13779
weighted avg      0.896     0.898     0.897     13779

TEST F1-SCORE: 0.7707050212534846

--------------------------------------------------------------------------------
EPOCH 5

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /home/omar/tmp/tmpd1vukvx7/model.ckpt-1255
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1255 into /home/omar/tmp/tmpd1vukvx7/model.ckpt.
INFO:tensorflow:loss = 0.32170808, step = 1256
INFO:tensorflow:global_step/sec: 3.6626
INFO:tensorflow:loss