In [1]:
import os
import pickle
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.metrics import f1_score, make_scorer, confusion_matrix, \
    classification_report
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, \
    StratifiedShuffleSplit, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from keras_preprocessing.sequence import pad_sequences
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

%matplotlib inline

  from ._conv import register_converters as _register_converters


# Data

In [4]:
with open("../Data/Learn/labels.pkl", "rb") as f:
    learn_labels = pickle.load(f)

with open("../Data/generated/my_learn_sequences.pkl", "rb") as f:
    learn_sequences = pickle.load(f)

with open("../Data/generated/my_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

X_train, X_test, y_train, y_test = train_test_split(
    learn_sequences, learn_labels, test_size=0.3,
    shuffle=True, stratify=learn_labels, random_state=42 + 2
)
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

In [5]:
embeddings.shape

(28934, 300)

# Model

In [4]:
import os

import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder


class CNNModel(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 sentence_length,
                 embeddings,
                 filters_by_ksize=50,
                 kernel_sizes=(2,),
                 batch_size=128,
                 learning_rate=0.01,
                 dropout_keep_prob=1.0,
                 model_name=None,
                 checkpoints_dir="../checkpoints/",
                 ):
        self.sentence_length = sentence_length
        self.embeddings = embeddings
        self.embedding_dim = self.embeddings.shape[1]
        self.filters_by_ksize = filters_by_ksize
        self.kernel_sizes = kernel_sizes
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.dropout_keep_prob = dropout_keep_prob
        self.features_key = "x"
        self.weight_key = "weight"
        self.model_dir = self.set_model_directory(checkpoints_dir, model_name)

    @staticmethod
    def set_model_directory(checkpoints_dir, model_name):
        if model_name is not None:
            model_dir = checkpoints_dir + model_name
            # Check model_dir doesn't already exist
            if os.path.exists(model_dir):
                raise ValueError("model_dir already exists")
        else:
            model_dir = None
        return model_dir

    def check_warm_start(self, warm_start):
        if warm_start:
            # Check if model was already fitted
            try:
                self.classifier_
            except AttributeError:
                warm_start = False
        return warm_start

    @staticmethod
    def f1_metric_fn(labels, predictions):
        p, p_op = tf.metrics.precision(labels=labels, predictions=predictions)
        r, r_op = tf.metrics.recall(labels=labels, predictions=predictions)
        return 2 * p * r / (p + r), tf.group(p_op, r_op)

    def f1_score(self, labels, predictions):
        return {"f1-score": self.f1_metric_fn(labels=labels, predictions=predictions)}

    def network_fn(self, features, params):
        # Create embedding matrix
        embeddings = tf.convert_to_tensor(self.embeddings)
        unknown_words_embedding = tf.Variable(tf.random_uniform(
            [1, self.embedding_dim], -1.0, 1.0, tf.float64), trainable=True)
        embeddings = tf.concat([embeddings, unknown_words_embedding], axis=0)

        # Extract sequences embeddings
        sequences = tf.feature_column.input_layer(features, params['feature_columns'])
        embeddings = tf.nn.embedding_lookup(embeddings, tf.cast(sequences, tf.int64))

        # Convolutions and max poolings
        feature_maps = []
        iterator = zip([self.filters_by_ksize] * len(self.kernel_sizes), self.kernel_sizes)
        for filters, kernel_size in iterator:
            tmp = tf.layers.conv1d(embeddings, filters, kernel_size, padding="same")
            tmp = tf.layers.max_pooling1d(tmp, [self.sentence_length], strides=1, padding="valid")
            feature_maps.append(tmp)

        # Concat all feature maps, add dropout, and add softmax
        shape = [-1, self.filters_by_ksize * len(self.kernel_sizes)]
        feature_maps = tf.reshape(tf.concat(feature_maps, axis=2), shape)
        feature_maps = tf.nn.dropout(feature_maps, self.dropout_keep_prob)
        logits = tf.layers.dense(feature_maps, self.n_classes_, activation=None)
        return logits

    def model_fn(self, features, labels, mode, params):
        # Network
        logits = self.network_fn(features, params)

        # Predict
        predicted_classes = tf.argmax(logits, 1)
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode, predictions=predicted_classes)

        # Loss
        class_M = self.label_encoder_.transform(["M"])
        weights = tf.cast(tf.equal(labels, class_M), tf.float64)
        weights = tf.multiply(weights, (6.63 - 1)) + 1
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits, weights=weights)

        # Eval
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss, predictions=predicted_classes)

        # Train
        optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
    
    def input_fn(self, mode, X, y=None, num_epochs=1):
        if mode in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL]:
            shuffle = True
        else:
            shuffle, num_epochs, y = (False, 1, None)
        X = {self.features_key: X}
        return tf.estimator.inputs.numpy_input_fn(X, y, self.batch_size, num_epochs, shuffle)

    def create_dnn_classifier(self):
        # Columns of X
        self.feature_columns_ = [tf.feature_column.numeric_column(
            key=self.features_key, shape=self.sentence_length)]

        # Params
        params = {"feature_columns": self.feature_columns_, "n_classes": self.n_classes_}
        run_config = tf.estimator.RunConfig(model_dir=self.model_dir, log_step_count_steps=10)
        
        # Model
        model = tf.estimator.Estimator(model_fn=self.model_fn, params=params, config=run_config)
        model = tf.contrib.estimator.add_metrics(model, self.f1_score)
        return model

    def apply_transformers(self, X, y):
        X = pad_sequences(X, self.sentence_length)
        y = self.label_encoder_.transform(y)
        return X, y

    def fit_and_apply_transformers(self, X, y):
        X = pad_sequences(X, self.sentence_length)
        self.label_encoder_ = LabelEncoder()
        y = self.label_encoder_.fit_transform(y)
        self.n_classes_ = len(self.label_encoder_.classes_)
        return X, y

    def fit(self, X, y, num_epochs=1, warm_start=True):
        warm_start = self.check_warm_start(warm_start)
        if not warm_start:
            X, y = self.fit_and_apply_transformers(X, y)
            self.classifier_ = self.create_dnn_classifier()
        else:
            X, y = self.apply_transformers(X, y)
        self.classifier_.train(self.input_fn(tf.estimator.ModeKeys.TRAIN, X, y, num_epochs))
        return self

    def predict(self, X):
        X = pad_sequences(X, self.sentence_length)
        classes = list(self.classifier_.predict(self.input_fn(tf.estimator.ModeKeys.PREDICT, X)))
        labels = self.label_encoder_.inverse_transform(classes)
        return labels

    def score(self, X, y):
        X, y = self.apply_transformers(X, y)
        results = self.classifier_.evaluate(self.input_fn(tf.estimator.ModeKeys.EVAL, X, y))
        return results["f1-score"]

# Hyperparameters

### Grid search

In [17]:
gs = GridSearchCV(
    CNNModel(sentence_length=max(map(len, X_train)), embeddings=embeddings),
    param_grid={
        "dropout_keep_prob": [1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
        "learning_rate": [0.001, 0.005, 0.01, 0.05, 0.1],
    }, 
    n_jobs=-1, 
    refit=False, 
    cv=3, 
    return_train_score=False
)

gs.fit(X_train, y_train, num_epochs=10)

In [21]:
pd.DataFrame(gs.cv_results_).sort_values("rank_test_score")

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_dropout_keep_prob,param_learning_rate,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,std_fit_time,std_score_time,std_test_score
3,2517.571017,40.272612,0.521074,1.0,0.05,"{'dropout_keep_prob': 1.0, 'learning_rate': 0.05}",1,0.507187,0.543007,0.513028,1.255948,0.457329,0.015691
9,2521.973347,28.311614,0.514757,0.9,0.1,"{'dropout_keep_prob': 0.9, 'learning_rate': 0.1}",2,0.516556,0.516984,0.51073,3.537341,2.068971,0.002853
18,2488.88129,44.128287,0.494524,0.7,0.05,"{'dropout_keep_prob': 0.7, 'learning_rate': 0.05}",3,0.492471,0.492549,0.498552,4.388833,10.377894,0.002849
14,2507.087153,37.452126,0.49238,0.8,0.1,"{'dropout_keep_prob': 0.8, 'learning_rate': 0.1}",4,0.489097,0.510522,0.47752,4.993876,2.774273,0.013671
13,2511.981449,38.861667,0.484987,0.8,0.05,"{'dropout_keep_prob': 0.8, 'learning_rate': 0.05}",5,0.500743,0.522393,0.431818,17.379806,10.620683,0.038618
2,2517.89546,40.560565,0.478341,1.0,0.01,"{'dropout_keep_prob': 1.0, 'learning_rate': 0.01}",6,0.479931,0.485823,0.469268,3.528913,1.850987,0.006851
23,2496.496011,36.983,0.465365,0.6,0.05,"{'dropout_keep_prob': 0.6, 'learning_rate': 0.05}",7,0.471088,0.472493,0.452512,12.869703,13.201122,0.009106
29,1249.867789,13.059176,0.452812,0.5,0.1,"{'dropout_keep_prob': 0.5, 'learning_rate': 0.1}",8,0.481883,0.407701,0.46885,852.400119,3.262004,0.032339
17,2487.29934,35.334724,0.452049,0.7,0.01,"{'dropout_keep_prob': 0.7, 'learning_rate': 0.01}",9,0.461778,0.450176,0.444191,0.798169,0.767798,0.007301
7,2525.84768,31.402206,0.448915,0.9,0.01,"{'dropout_keep_prob': 0.9, 'learning_rate': 0.01}",10,0.446457,0.451007,0.449282,11.335145,9.321308,0.001876


In [None]:
tf.logging.set_verbosity(tf.logging.ERROR)

n_splits, num_epochs = 3, 20
splitter = StratifiedKFold(n_splits, shuffle=True, random_state=1)
results = {epoch: [] for epoch in range(num_epochs)}

for cv, (train_ind, test_ind) in enumerate(splitter.split(X_train, y_train)):
    print("Split %d..." % cv)
    model = CNNModel(
        sentence_length=max(map(len, X_train)),
        embeddings=embeddings,
        filters_by_ksize=50, 
        kernel_sizes=(2,), 
        batch_size=128, 
        learning_rate=0.05,
        dropout_keep_prob=1.0,
    )
    for epoch in range(num_epochs):
        print("=> Epoch %d..." % epoch)
        model.fit(X_train[train_ind], y_train[train_ind])
        score = model.score(X_train[test_ind], y_train[test_ind])
        results[epoch].append(score)

print("DONE")

In [68]:
pd.DataFrame(results).mean(axis=0) - pd.DataFrame(results).std(axis=0, ddof=0)

0     0.331304
1     0.451423
2     0.378235
3     0.392895
4     0.503359
5     0.455275
6     0.472321
7     0.404910
8     0.426447
9     0.466970
10    0.488289
11    0.498498
12    0.452369
13    0.490205
14    0.511028
15    0.509235
16    0.512176
17    0.505529
18    0.467169
19    0.496655
dtype: float64

In [None]:
tf.logging.set_verbosity(tf.logging.ERROR)

n_splits, num_epochs = 3, 20
splitter = StratifiedKFold(n_splits, shuffle=True, random_state=1)
results2 = {epoch: [] for epoch in range(num_epochs)}

for cv, (train_ind, test_ind) in enumerate(splitter.split(X_train, y_train)):
    print("Split %d..." % cv)
    model = CNNModel(
        sentence_length=max(map(len, X_train)),
        embeddings=embeddings,
        filters_by_ksize=50, 
        kernel_sizes=(2,), 
        batch_size=128, 
        learning_rate=0.1,
        dropout_keep_prob=0.9,
    )
    for epoch in range(num_epochs):
        print("=> Epoch %d..." % epoch)
        model.fit(X_train[train_ind], y_train[train_ind])
        score = model.score(X_train[test_ind], y_train[test_ind])
        results2[epoch].append(score)

print("DONE")

In [69]:
pd.DataFrame(results2).mean(axis=0) - pd.DataFrame(results2).std(axis=0, ddof=0)

0     0.285033
1     0.361691
2     0.293884
3     0.413216
4     0.500309
5     0.425285
6     0.392782
7     0.366639
8     0.423945
9     0.467207
10    0.503105
11    0.428178
12    0.453636
13    0.499964
14    0.490692
15    0.468583
16    0.465390
17    0.400012
18    0.477036
19    0.402809
dtype: float64

### Choice:
- learning_rate = 0.05
- dropout = 1.0
- num_epochs = 5

### Test

In [71]:
tf.logging.set_verbosity(tf.logging.ERROR)

model = CNNModel(
    sentence_length=max(map(len, X_train)),
    embeddings=embeddings,
    filters_by_ksize=50, 
    kernel_sizes=(2,), 
    batch_size=128, 
    learning_rate=0.05,
    dropout_keep_prob=1.0,
)

model.fit(X_train, y_train, num_epochs=5)
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           C       0.94      0.90      0.92     11974
           M       0.48      0.61      0.54      1805

   micro avg       0.86      0.86      0.86     13779
   macro avg       0.71      0.76      0.73     13779
weighted avg       0.88      0.86      0.87     13779



### Paper configs

In [None]:
tf.logging.set_verbosity(tf.logging.ERROR)

n_splits, num_epochs = 3, 5
splitter = StratifiedKFold(n_splits, shuffle=True, random_state=1)
results3 = {epoch: [] for epoch in range(num_epochs)}

for cv, (train_ind, test_ind) in enumerate(splitter.split(X_train, y_train)):
    print("Split %d..." % cv)
    model = CNNModel(
        sentence_length=max(map(len, X_train)),
        embeddings=embeddings,
        filters_by_ksize=100, 
        kernel_sizes=(3, 4, 5), 
        batch_size=50, 
        learning_rate=0.001,
        dropout_keep_prob=0.5,
    )
    for epoch in range(num_epochs):
        print("=> Epoch %d" % epoch, end="... ")
        model.fit(X_train[train_ind], y_train[train_ind])
        score = model.score(X_train[test_ind], y_train[test_ind])
        print(score)
        results3[epoch].append(score)

Split 0...
=> Epoch 0... 