In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import warnings
from time import time

from collections import Counter

from sklearn.metrics import f1_score, make_scorer, confusion_matrix, \
    classification_report
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, \
    StratifiedShuffleSplit, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from keras_preprocessing.sequence import pad_sequences
import tensorflow as tf

# Show TensorFlow log messages at INFO level and above for the cells below.
tf.logging.set_verbosity(tf.logging.INFO)

%matplotlib inline

  from ._conv import register_converters as _register_converters


# Data

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load these artefacts if they were produced by a trusted pipeline.
with open("../Data/Learn/labels.pkl", "rb") as f:
    learn_labels = pickle.load(f)

with open("../Data/generated/my_learn_sequences.pkl", "rb") as f:
    learn_sequences = pickle.load(f)

with open("../Data/generated/my_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

# Stratified 70/30 hold-out split; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    learn_sequences, learn_labels, test_size=0.3,
    shuffle=True, stratify=learn_labels, random_state=42 + 2
)

# The sequences have different lengths (the resulting arrays are 1-D, see the
# shapes displayed below), so they must be stored as object arrays. Recent
# NumPy versions raise ValueError when a ragged list is converted without an
# explicit dtype=object.
X_train, X_test = np.array(X_train, dtype=object), np.array(X_test, dtype=object)
y_train, y_test = np.array(y_train), np.array(y_test)

embeddings.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape

((28934, 300), (32151,), (32151,), (13779,), (13779,))

# Model

In [16]:
class LSTMModel(BaseEstimator, ClassifierMixin):
    """LSTM sentence classifier exposing a scikit-learn style API.

    Sentences are sequences of integer token ids. They are zero-padded to
    ``sentence_length``, mapped to embedding vectors, fed through a single
    LSTM layer, and the output at the last non-padding step is projected to
    class logits. Training and evaluation are delegated to a
    ``tf.estimator.Estimator``.

    Parameters
    ----------
    sentence_length : int
        Length every sentence is padded / truncated to.
    embeddings : array with a ``shape`` attribute
        Embedding matrix, one row per token id. One extra trainable row is
        appended to it inside the graph.
        NOTE(review): the graph is built in float64, so this is presumably a
        float64 matrix -- confirm against the pickled embeddings.
    num_units : int
        Number of LSTM units.
    batch_size : int
        Batch size used by the numpy input function.
    learning_rate : float
        Learning rate of the Adagrad optimizer.
    dropout_keep_prob : float
        Keep probability of the dropout applied to LSTM outputs (training only).
    weight_class_M : float
        Loss weight for examples labelled ``"M"``; other examples weigh 1.
    model_name : str or None
        When given, checkpoints go to ``checkpoints_dir/model_name``; when
        None, ``model_dir`` is None and the estimator chooses its own directory.
    checkpoints_dir : str
        Parent directory for named models.

    Attributes set by ``fit``
    -------------------------
    label_encoder_ : LabelEncoder fitted on the training labels.
    n_classes_ : number of distinct labels.
    classifier_ : the underlying ``tf.estimator.Estimator``.
    """

    def __init__(self,
                 sentence_length,
                 embeddings,
                 num_units=50,
                 batch_size=128,
                 learning_rate=0.05,
                 dropout_keep_prob=1.0,
                 weight_class_M=1.0,
                 model_name=None,
                 checkpoints_dir="../checkpoints/",
                 ):
        self.sentence_length = sentence_length
        self.embeddings = embeddings
        self.embedding_dim = self.embeddings.shape[1]
        self.num_units = num_units
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.dropout_keep_prob = dropout_keep_prob
        self.weight_class_M = weight_class_M
        # Every constructor argument must be readable as an attribute of the
        # same name, otherwise BaseEstimator.get_params() (used by clone(),
        # repr() and the search utilities) cannot find it.
        self.model_name = model_name
        self.checkpoints_dir = checkpoints_dir
        self.features_key = "x"
        self.seq_lengths_key = "lengths"
        self.model_dir = self.set_model_directory(checkpoints_dir, model_name)

    @staticmethod
    def set_model_directory(checkpoints_dir, model_name):
        """Return the checkpoint directory for ``model_name`` (None if unnamed).

        Raises
        ------
        ValueError
            If the resulting directory already exists, so that a previous
            model's checkpoints are never reused by accident.
        """
        if model_name is None:
            return None
        # os.path.join also works when checkpoints_dir has no trailing separator.
        model_dir = os.path.join(checkpoints_dir, model_name)
        if os.path.exists(model_dir):
            raise ValueError("model_dir already exists")
        return model_dir

    def check_warm_start(self, warm_start):
        """Return True only if a warm start was requested and the model was already fitted."""
        return bool(warm_start) and hasattr(self, "classifier_")

    @staticmethod
    def f1_metric_fn(labels, predictions):
        """Build a streaming F1 metric as a ``(value, update_op)`` pair.

        The value is derived from ``tf.metrics.precision`` and
        ``tf.metrics.recall``. It is 0 (instead of NaN) while precision and
        recall are both 0.
        """
        p, p_op = tf.metrics.precision(labels=labels, predictions=predictions)
        r, r_op = tf.metrics.recall(labels=labels, predictions=predictions)
        denominator = p + r
        # Replace a zero denominator by 1: the numerator is 0 in that case,
        # so the metric evaluates to 0 rather than 0/0 = NaN.
        safe_denominator = tf.where(
            tf.greater(denominator, 0), denominator, tf.ones_like(denominator))
        return 2 * p * r / safe_denominator, tf.group(p_op, r_op)

    def f1_score(self, labels, predictions):
        """Metric function handed to ``add_metrics``; returns ``{"f1-score": metric}``."""
        return {"f1-score": self.f1_metric_fn(labels=labels, predictions=predictions)}

    def model_fn(self, features, labels, mode, params):
        """Estimator ``model_fn``: build the graph and return the spec for ``mode``.

        ``params`` is only forwarded to ``network_fn``.
        """
        # Network (the mode is passed on so dropout is active in training only)
        logits = self.network_fn(features, params, mode)

        # Predict
        predicted_classes = tf.argmax(logits, 1)
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode, predictions=predicted_classes)

        # Loss: examples of class "M" weigh weight_class_M, all others weigh 1.
        if self.weight_class_M == 1:
            # Uniform weighting: no need to look up "M", which would raise
            # for label sets that do not contain it.
            weights = 1.0
        else:
            class_M = self.label_encoder_.transform(["M"])
            weights = tf.cast(tf.equal(labels, class_M), tf.float64)
            weights = tf.multiply(weights, (self.weight_class_M - 1)) + 1
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits, weights=weights)

        # Eval
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss, predictions=predicted_classes)

        # Train
        optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    def pad_sentences(self, sentences):
        """Pad (at the end) every sentence to ``sentence_length`` with ``pad_sequences``."""
        return pad_sequences(sentences, self.sentence_length, padding="post")

    @staticmethod
    def sequences_lengths(X):
        """Return, for each row of the 2-D array ``X``, the position after its last non-zero entry.

        Zero is treated as padding. A row made only of zeros has length 0.
        """
        non_padding = X != 0
        # argmax on the reversed row finds the distance from the end of the
        # row to its last non-zero entry.
        lengths = X.shape[1] - np.argmax(non_padding[:, ::-1], axis=1)
        # argmax returns 0 for an all-False row, which would otherwise be
        # reported as a full-length sentence.
        return np.where(non_padding.any(axis=1), lengths, 0)

    def input_fn(self, mode, X, y=None, num_epochs=1):
        """Build a numpy input function feeding padded sentences and their lengths.

        In PREDICT mode the data is read once, in order, without labels;
        in TRAIN and EVAL modes it is shuffled.
        """
        if mode in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL]:
            shuffle = True
        else:
            shuffle, num_epochs, y = (False, 1, None)

        lengths = self.sequences_lengths(X)
        X = {self.features_key: X, self.seq_lengths_key: lengths}

        return tf.estimator.inputs.numpy_input_fn(X, y, batch_size=self.batch_size,
                                                  num_epochs=num_epochs, shuffle=shuffle)

    def create_dnn_classifier(self):
        """Create the feature columns and the estimator, with the F1 metric attached."""
        # Feature columns
        self.feature_columns_ = [tf.feature_column.numeric_column(
            key=self.features_key, shape=self.sentence_length)]
        self.seq_lengths_column_ = [tf.feature_column.numeric_column(
            key=self.seq_lengths_key)]

        # Model
        run_config = tf.estimator.RunConfig(model_dir=self.model_dir, log_step_count_steps=50)
        model = tf.estimator.Estimator(model_fn=self.model_fn, config=run_config)
        model = tf.contrib.estimator.add_metrics(model, self.f1_score)
        return model

    def apply_transformers(self, X, y):
        """Pad ``X`` and encode ``y`` with the already fitted label encoder."""
        X = self.pad_sentences(X)
        y = self.label_encoder_.transform(y)
        return X, y

    def fit_and_apply_transformers(self, X, y):
        """Fit a fresh label encoder on ``y``, then pad ``X`` and encode ``y``."""
        X = self.pad_sentences(X)
        self.label_encoder_ = LabelEncoder()
        y = self.label_encoder_.fit_transform(y)
        self.n_classes_ = len(self.label_encoder_.classes_)
        return X, y

    def fit(self, X, y, num_epochs=1, warm_start=True):
        """Train for ``num_epochs`` passes over ``(X, y)`` and return ``self``.

        With ``warm_start=True`` and an already fitted model, training
        continues with the existing estimator and label encoder; otherwise
        both are re-created.
        """
        warm_start = self.check_warm_start(warm_start)
        if not warm_start:
            X, y = self.fit_and_apply_transformers(X, y)
            self.classifier_ = self.create_dnn_classifier()
        else:
            X, y = self.apply_transformers(X, y)

        self.classifier_.train(self.input_fn(tf.estimator.ModeKeys.TRAIN, X, y, num_epochs))
        return self

    def predict(self, X):
        """Return the predicted labels (decoded by the label encoder) for ``X``."""
        X = self.pad_sentences(X)
        classes = list(self.classifier_.predict(self.input_fn(tf.estimator.ModeKeys.PREDICT, X)))
        labels = self.label_encoder_.inverse_transform(classes)
        return labels

    def score(self, X, y):
        """Evaluate on ``(X, y)`` and return the ``"f1-score"`` metric."""
        X, y = self.apply_transformers(X, y)
        results = self.classifier_.evaluate(self.input_fn(tf.estimator.ModeKeys.EVAL, X, y))
        return results["f1-score"]

    def network_fn(self, features, params, mode=None):
        """Build the embedding -> LSTM -> dense graph and return the logits.

        ``params`` is unused. ``mode`` is the estimator mode; dropout is only
        applied when it is TRAIN (or when it is None, which keeps the
        behaviour of callers that do not pass a mode).
        """
        # Create embedding matrix: the given rows plus one trainable row.
        embeddings = tf.convert_to_tensor(self.embeddings)
        unknown_words_embedding = tf.Variable(tf.random_uniform(
            [1, self.embedding_dim], -1.0, 1.0, tf.float64), trainable=True)
        embeddings = tf.concat([embeddings, unknown_words_embedding], axis=0)

        # Extract sequences embeddings
        sequences = tf.feature_column.input_layer(features, self.feature_columns_)
        embeddings = tf.nn.embedding_lookup(embeddings, tf.cast(sequences, tf.int64))

        # Extract sequences lengths
        cur_batch_size = tf.shape(embeddings)[0]
        lengths = tf.feature_column.input_layer(features, self.seq_lengths_column_)
        lengths = tf.cast(tf.reshape(lengths, [cur_batch_size]), tf.int32)

        # Dropout must not perturb evaluation or prediction.
        training = mode is None or mode == tf.estimator.ModeKeys.TRAIN
        keep_prob = self.dropout_keep_prob if training else 1.0

        # LSTM layer with dropout on outputs
        cell = tf.nn.rnn_cell.LSTMCell(self.num_units, activation="relu")
        cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
        initial_state = cell.zero_state(cur_batch_size, tf.float64)
        outputs, _ = tf.nn.dynamic_rnn(cell, embeddings, initial_state=initial_state,
                                       sequence_length=lengths)

        # Get last relevant output: flatten (batch, time) and pick, for each
        # sentence, the row of its last non-padding step. The step is clamped
        # at 0 so that an empty sentence does not index the previous sentence.
        last_step = tf.maximum(lengths - 1, 0)
        index = tf.range(0, cur_batch_size) * self.sentence_length + last_step
        flat = tf.reshape(outputs, [-1, self.num_units])
        relevant_output = tf.gather(flat, index)

        # Softmax (logits only; the softmax itself is applied inside the loss)
        logits = tf.layers.dense(relevant_output, self.n_classes_, activation=None)
        return logits

# Hyperparameters

### Learning rate and number of epochs

In [None]:
# Only report TensorFlow errors: the search below trains many models.
tf.logging.set_verbosity(tf.logging.ERROR)

n_splits, num_epochs = 3, 20
learning_rates = [0.001, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1]
splitter = StratifiedKFold(n_splits, shuffle=True, random_state=1)
# results[lr][epoch] collects one validation F1 score per fold.
results = {lr: {epoch: [] for epoch in range(num_epochs)} for lr in learning_rates}

# These values do not depend on the learning rate or the fold, so compute them once.
label_counts = Counter(y_train)
weight_class_M = label_counts["C"] / label_counts["M"]
max_sentence_length = max(map(len, X_train))

for lr in learning_rates:
    for train_ind, test_ind in splitter.split(X_train, y_train):
        model = LSTMModel(
            weight_class_M=weight_class_M,
            sentence_length=max_sentence_length,
            embeddings=embeddings,
            num_units=50,
            batch_size=128,
            dropout_keep_prob=1.0,
            learning_rate=lr,
        )
        X_fold_train, y_fold_train = X_train[train_ind], y_train[train_ind]
        X_fold_valid, y_fold_valid = X_train[test_ind], y_train[test_ind]

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            # Each fit call trains one more epoch on top of the previous ones
            # (fit warm-starts by default), so the score is tracked per epoch.
            for epoch in range(num_epochs):
                model.fit(X_fold_train, y_fold_train)
                score = model.score(X_fold_valid, y_fold_valid)
                results[lr][epoch].append(score)

In [None]:
# with open("lstm_results.p", "wb") as f:
#     pickle.dump(results, f)