In [4]:
import pickle
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.metrics import f1_score, make_scorer, confusion_matrix, \
    classification_report
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, \
    StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier


import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

%matplotlib inline

# Data

In [5]:
with open("../Data/Learn/labels.pkl", "rb") as f:
    learn_labels = pickle.load(f)

with open("../Data/Learn/sequences.pkl", "rb") as f:
    learn_sequences = pickle.load(f)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    learn_sequences, learn_labels, test_size=0.3,
    shuffle=True, stratify=learn_labels, random_state=42
)

# Model

In [7]:
def tokens_to_str(tokens):
    return list(map(str, tokens))

In [11]:
vectorizer = CountVectorizer(lowercase=False, tokenizer=tokens_to_str, ngram_range=(1, 2))
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [78]:
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)

X_train_tensor = convert_sparse_matrix_to_sparse_tensor(X_train_vect)

In [80]:
def train_input_fn(features, labels, batch_size):
    dataset = tf.data.Dataset.from_tensors((features, labels))
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()

inp = train_input_fn(X_train_tensor, y_train, batch_size=128)

In [81]:
tf.reset_default_graph()
vector_feature_column = tf.feature_column.numeric_column(key="x",
                                                         shape=X_train_tensor.shape[1])
classifier = tf.estimator.DNNClassifier(
    feature_columns=[vector_feature_column],
    hidden_units=[10, 10],
    n_classes=2,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/omar/tmp/tmpu8kbo5vg', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbd653cbba8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [84]:
classifier.train(lambda : inp, steps=2)

INFO:tensorflow:Calling model_fn.


ValueError: features should be a dictionary of `Tensor`s. Given type: <class 'tensorflow.python.framework.ops.Tensor'>

# Model

In [292]:
def my_model_fn(features, labels, mode):
    logits = compute_logits(features)
    predicted_labels = tf.argmax(logits, axis=1)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {"classes": predicted_labels, "probabilities": tf.nn.softmax(logits)}
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {"accuracy": score(labels, predicted_labels)}
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

def compute_logits(features):
    logits = tf.layers.dense(flat, units=2, activation=tf.nn.relu)
    return logits

def score(labels, predictions):
    return tf.metrics.accuracy(labels, predictions)

In [293]:
classifier = tf.estimator.Estimator(model_fn=my_model_fn, model_dir="checkpoints14")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'checkpoints14', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f1631292588>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [294]:
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=np.array(padded_sequences),
    y=np.array(hot_labels),
    batch_size=128,
    num_epochs=10,
    shuffle=True
)

test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=np.array(padded_sequences),
    y=np.array(hot_labels),
    batch_size=len(hot_labels),
    shuffle=True
)

In [295]:
classifier.train(
    input_fn=train_input_fn,
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into checkpoints14/model.ckpt.
INFO:tensorflow:loss = 0.6913747787475586, step = 1
INFO:tensorflow:global_step/sec: 171.892
INFO:tensorflow:loss = 0.3540460467338562, step = 101 (0.582 sec)
INFO:tensorflow:global_step/sec: 218.991
INFO:tensorflow:loss = 0.3380691409111023, step = 201 (0.457 sec)
INFO:tensorflow:global_step/sec: 220.418
INFO:tensorflow:loss = 0.5283647179603577, step = 301 (0.454 sec)
INFO:tensorflow:global_step/sec: 219.47
INFO:tensorflow:loss = 0.9800107479095459, step = 401 (0.456 sec)
INFO:tensorflow:global_step/sec: 218.623
INFO:tensorflow:loss = 0.10859549790620804, step = 501 (0.457 sec)
INFO:tensorflow:Saving checkpoints for 547 into checkpoints14/model.ckpt.
INFO:tensorflow:Loss for final

<tensorflow.python.estimator.estimator.Estimator at 0x7f16312922b0>

In [296]:
list(classifier.predict(
    input_fn=test_input_fn,
))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from checkpoints14/model.ckpt-547
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'classes': 1, 'probabilities': array([0.18624519, 0.81375481])},
 {'classes': 1, 'probabilities': array([0.09213663, 0.90786337])},
 {'classes': 1, 'probabilities': array([0.10569803, 0.89430197])},
 {'classes': 1, 'probabilities': array([0.08121251, 0.91878749])},
 {'classes': 1, 'probabilities': array([0.13793903, 0.86206097])},
 {'classes': 1, 'probabilities': array([0.13340942, 0.86659058])},
 {'classes': 1, 'probabilities': array([0.0958954, 0.9041046])},
 {'classes': 1, 'probabilities': array([0.07924536, 0.92075464])},
 {'classes': 1, 'probabilities': array([0.08732024, 0.91267976])},
 {'classes': 1, 'probabilities': array([0.1042783, 0.8957217])},
 {'classes': 1, 'probabilities': array([0.16692035, 0.83307965])},
 {'classes': 1, 'probabilities': array([0.05728464, 0.94271536])},
 {'classes': 1, 'probabilities': array([0.0987131, 0.9012869])},
 {'classes': 1, 'probabilities': array([0.09571445, 0.90428555])},
 {'classes': 1, 'probabilities': array([0.19438799, 0.80561201])},
 