In [1]:
from collections import Counter
from sklearn.model_selection import KFold
import numpy as np
import re
import os
import shutil

from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Bidirectional, Dense, Input, Masking, Lambda
import keras.backend as K
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback, LearningRateScheduler, Callback

Using TensorFlow backend.


In [2]:
def read_train_val(filename, val_prob=0.1):
    """Read a ``text<TAB>label`` file and randomly split it into train/validation.

    Every line is stripped, lower-cased and split on a tab into exactly two
    fields; each non-word character of the text field is then replaced by a
    single space.  A row is sent to the validation part when a fresh
    ``np.random.uniform()`` draw is ``<= val_prob`` and to the training part
    otherwise, so the split depends on NumPy's global random state.

    Args:
        filename: path of the tab-separated file to read.
        val_prob: threshold for the uniform draw (approximate validation share).

    Returns:
        Tuple ``(train_X, train_Y, val_X, val_Y)`` of NumPy arrays.

    Raises:
        ValueError: if a line does not split into exactly two tab-separated fields.
    """
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    non_word = re.compile(r"\W")
    train_X, train_Y, val_X, val_Y = [], [], [], []
    with open(filename) as handler:
        for line in handler:
            x, y = line.strip().lower().split("\t")
            x = non_word.sub(" ", x)
            # Exactly one random draw per row, as before.
            if np.random.uniform() > val_prob:
                train_X.append(x)
                train_Y.append(y)
            else:
                val_X.append(x)
                val_Y.append(y)
    return np.array(train_X), np.array(train_Y), np.array(val_X), np.array(val_Y)

def read_test(filename):
    """Read an unlabelled file, one text per line, and normalise every line.

    Each line is stripped, lower-cased, has every non-word character replaced
    by a space, and is finally stripped again (so leading/trailing punctuation
    leaves no padding spaces).  The result is identical to the previous
    implementation, which applied the same normalisation twice in a row.

    Args:
        filename: path of the file to read.

    Returns:
        NumPy array with one normalised string per input line.
    """
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    non_word = re.compile(r"\W")
    with open(filename) as handler:
        return np.array([
            non_word.sub(" ", line.strip().lower()).strip()
            for line in handler
        ])

def acc_metric(labels, predictions):
    """Return the share of positions where ``labels`` equals ``predictions``.

    Both arguments must be NumPy arrays; the count of element-wise matches is
    divided by ``len(labels)``.
    """
    for argument in (labels, predictions):
        assert isinstance(argument, np.ndarray)
    matches = np.count_nonzero(labels == predictions)
    return matches / len(labels)

def meashure_model(model, X, Y):
    """Score ``model`` with 2-fold cross-validation.

    For each fold the model is fitted on the training part, asked to predict
    the held-out part, and scored with ``acc_metric``.

    Returns:
        Tuple ``(mean, variance)`` of the per-fold scores.
    """
    fold_scores = []
    splitter = KFold(n_splits=2)
    for fit_idx, eval_idx in splitter.split(X):
        fit_X, eval_X = X[fit_idx], X[eval_idx]
        fit_Y, eval_Y = Y[fit_idx], Y[eval_idx]
        model.fit(fit_X, fit_Y)
        predicted = model.predict(eval_X)
        fold_scores.append(acc_metric(predicted, eval_Y))
    return np.mean(fold_scores), np.var(fold_scores)

class MostPopularModel:
    """Baseline that always answers with the most frequent label seen in ``fit``."""

    def fit(self, X, Y):
        """Remember the most common element of ``Y``; ``X`` is not used."""
        label_counts = Counter(Y)
        self._answer = label_counts.most_common(1)[0][0]

    def predict(self, X):
        """Return an array holding ``len(X)`` copies of the remembered label."""
        return np.array([self._answer for _ in range(len(X))])

In [3]:
# Input files: labelled training data and unlabelled test data.
TRAIN_FILENAME = "names_and_rubrics_learn.tsv"
TEST_FILENAME = "names_and_rubrics_test_no_rubric.tsv"
# Frequency threshold for the vocabulary: make_index_by_token keeps a token
# only when its count is strictly greater than this value.
MIN_FREQ = 5

# Embedding input size (first argument of the Embedding layer).
# NOTE(review): hard-coded; presumably len(index_by_token) for this dataset - confirm.
TOKENS_NUM = 150767
# Width of the embedding, of each LSTM direction and of each hidden Dense layer.
HIDDEN_DIM = 512
# Fixed number of token positions per example; shorter inputs are zero-padded.
MAX_LEN = 45
# Number of stacked bidirectional LSTM layers (the last one returns a single vector).
LSTM_NUM = 2
# Count used by the Dense-layer loops of the classifier (name is misspelled, kept as is).
HIDDEN_LAYESR_NUM = 2
# Number of examples per batch produced by the classification generators.
BATCH_SIZE = 64
# Size of the softmax output layer.
# NOTE(review): hard-coded; presumably len(label_by_y) for this dataset - confirm.
LABELS_NUM = 1222
# Activation of the hidden Dense layers.
ACTIVATION = 'tanh'

In [4]:
# Load the labelled file and split it randomly into train / validation parts
# (default val_prob=0.1).  No np.random seed is set in the cells shown here,
# so the split is different on every run.
train_X, train_Y, val_X, val_Y = read_train_val(TRAIN_FILENAME)
# Load the unlabelled test texts with the same text normalisation.
test_X = read_test(TEST_FILENAME)

In [5]:
# Sanity check: sizes of the train / validation / test parts (X and Y must match).
len(train_X), len(train_Y), len(val_X), len(val_Y), len(test_X)

(8026112, 8026112, 890990, 890990, 1000000)

In [6]:
def make_tokens_counter(*datas):
    """Count whitespace-separated tokens over every text of every collection.

    Args:
        *datas: any number of iterables of strings.

    Returns:
        ``Counter`` mapping token -> number of occurrences.
    """
    counts = Counter()
    for dataset in datas:
        for text in dataset:
            counts.update(text.split())
    return counts

def make_index_by_token(tokens_counter, min_freq):
    """Assign consecutive integer ids (starting at 0) to frequent tokens.

    A token is kept only when its count is strictly greater than ``min_freq``;
    ids follow the iteration order of ``tokens_counter``.

    NOTE(review): id 0 is handed to a real token, while the batch generators
    in this notebook also use 0 as the padding value - confirm this is intended.
    """
    frequent = (
        token
        for token, count in tokens_counter.items()
        if count > min_freq
    )
    return {token: position for position, token in enumerate(frequent)}

def make_label_by_y(*Ys):
    """Map every distinct target value to an integer label id.

    Ids are assigned in sorted order of the distinct values, so the mapping is
    reproducible across interpreter runs.  (Enumerating a ``set`` of strings
    directly gives an order that changes with hash randomisation, which would
    make saved weights and predictions incomparable between sessions.)

    Args:
        *Ys: any number of iterables of mutually comparable target values.

    Returns:
        Dict ``value -> id`` with ids ``0 .. n_distinct - 1``.
    """
    distinct = set()
    for Y in Ys:
        distinct.update(Y)
    return {y: i for i, y in enumerate(sorted(distinct))}

def make_indices_by_label(Y, X, label_by_y, index_by_token):
    """Group the token-id encodings of the texts in ``X`` by their label id.

    Returns:
        Dict ``label id -> list of token-id lists``; every label occurring in
        ``Y`` has an entry.
    """
    grouped = {}
    for y in set(Y):
        grouped[label_by_y[y]] = []
    for text, y in zip(X, Y):
        bucket = grouped[label_by_y[y]]
        bucket.append(make_indices_from_x(text, index_by_token))
    return grouped

def get_max_X_len(*Xs):
    """Return the largest token count of any text in the given collections.

    Returns ``-1`` when called without arguments; an empty collection raises
    ``ValueError`` from ``max``.
    """
    longest_per_collection = [
        max(len(text.split()) for text in X)
        for X in Xs
    ]
    return max([-1] + longest_per_collection)

def make_indices_from_x(x, index_by_token):
    """Encode text ``x`` as the ids of its known tokens, in order.

    Tokens that are missing from ``index_by_token`` are silently dropped.
    """
    encoded = []
    for token in x.split():
        if token in index_by_token:
            encoded.append(index_by_token[token])
    return encoded

def make_positives_data_gen(X, Y, label_by_y, index_by_token):
    """Endless generator of ``(token ids, label id)`` pairs.

    On every step one position is drawn uniformly at random (with replacement)
    via ``np.random.randint`` and the text at that position is encoded.
    """
    while True:
        position = np.random.randint(0, len(X))
        yield (
            make_indices_from_x(X[position], index_by_token),
            label_by_y[Y[position]]
        )

def make_nagative(label, labels_nd_array, indices_by_label):
    """Pick a random encoded text whose label differs from ``label``.

    A label is re-drawn from ``labels_nd_array`` until it is different from
    ``label``; then one of the encodings stored for it is chosen uniformly.
    The re-draw loop never ends if ``labels_nd_array`` holds only ``label``.
    """
    while True:
        other_label = np.random.choice(labels_nd_array)
        if other_label != label:
            break
    candidates = indices_by_label[other_label]
    return candidates[np.random.randint(0, len(candidates))]

def make_data_gen(X, Y, label_by_y, index_by_token, indices_by_label, half_batch_size):
    """Endless generator of (name, label) pair batches with +1 / -1 targets.

    Every batch has ``2 * half_batch_size`` rows.  The first half pairs a
    randomly drawn name with its own label (target ``1``); the second half
    pairs a name drawn from a *different* label with that same label
    (target ``-1``).

    Changes compared to the previous version:
      * the unused ``np.eye(len(index_by_token))`` is gone - it allocated a
        dense vocabulary x vocabulary float matrix for nothing;
      * names longer than ``MAX_LEN`` tokens are truncated instead of raising
        on the slice assignment.

    Args:
        X, Y: texts and their targets.
        label_by_y: mapping target value -> label id.
        index_by_token: mapping token -> token id.
        indices_by_label: mapping label id -> list of encoded texts, used to
            draw negatives.
        half_batch_size: number of positive (and of negative) rows per batch.

    Yields:
        ``({'name_input': (2*half, MAX_LEN) array, 'label_input': one-hot labels},
        {'output': targets})``.
    """
    labels_nd_array = np.array([label_by_y[y] for y in set(Y)])
    positive_data_gen = make_positives_data_gen(X, Y, label_by_y, index_by_token)
    labels_eye = np.eye(len(label_by_y))
    # Targets are identical for every batch: positives first, then negatives.
    outputs = np.array([1] * half_batch_size + [-1] * half_batch_size)

    def data_gen():
        while True:
            positive_labels = []
            names_inputs = np.zeros([half_batch_size * 2, MAX_LEN])
            for i in range(half_batch_size):
                positive_tokens_indices, label = next(positive_data_gen)
                positive_tokens_indices = positive_tokens_indices[:MAX_LEN]
                names_inputs[i, :len(positive_tokens_indices)] = positive_tokens_indices
                positive_labels.append(label)
            for i, label in enumerate(positive_labels):
                negative_tokens_indices = make_nagative(
                    label, labels_nd_array, indices_by_label
                )[:MAX_LEN]
                names_inputs[half_batch_size + i, :len(negative_tokens_indices)] = negative_tokens_indices

            # Negative rows reuse the positive labels: the *name* is the mismatch.
            pair_labels = positive_labels + positive_labels
            yield (
                {'name_input': names_inputs, 'label_input': labels_eye[pair_labels]},
                {'output': outputs}
            )
    return data_gen()


def make_classification_data_gen(X, Y, label_by_y, index_by_token, batch_size):
    """Endless generator of ``(inputs, one-hot labels)`` classification batches.

    Each batch holds ``batch_size`` randomly drawn examples; token ids are
    written left-aligned into a zero-padded ``(batch_size, MAX_LEN)`` array.

    Changes compared to the previous version:
      * the unused ``np.eye(len(index_by_token))`` (a dense vocabulary x
        vocabulary float matrix) and the unused label array are gone;
      * names longer than ``MAX_LEN`` tokens are truncated instead of raising
        on the slice assignment.

    Args:
        X, Y: texts and their targets.
        label_by_y: mapping target value -> label id.
        index_by_token: mapping token -> token id.
        batch_size: number of rows per batch.

    Yields:
        Tuple ``(inputs, labels)`` where ``labels`` has one one-hot row of
        width ``len(label_by_y)`` per example.
    """
    positive_data_gen = make_positives_data_gen(X, Y, label_by_y, index_by_token)
    labels_eye = np.eye(len(label_by_y))

    def data_gen():
        while True:
            labels = []
            inputs = np.zeros([batch_size, MAX_LEN])
            for i in range(batch_size):
                positive_tokens_indices, label = next(positive_data_gen)
                positive_tokens_indices = positive_tokens_indices[:MAX_LEN]
                inputs[i, :len(positive_tokens_indices)] = positive_tokens_indices
                labels.append(label)

            yield inputs, labels_eye[labels]
    return data_gen()

In [7]:
# Label ids are built from the targets of both labelled parts.
label_by_y = make_label_by_y(train_Y, val_Y)
# Token statistics also include the unlabelled test texts.
tokens_counter = make_tokens_counter(train_X, val_X, test_X)
# Vocabulary: tokens occurring strictly more than MIN_FREQ times.
index_by_token = make_index_by_token(tokens_counter, MIN_FREQ)
# Encoded texts grouped by label id, separately for train and validation
# (consumed by make_data_gen when drawing negative examples).
indices_by_label_train = make_indices_by_label(train_Y, train_X, label_by_y, index_by_token)
indices_by_label_val = make_indices_by_label(val_Y, val_X, label_by_y, index_by_token)

In [None]:
def my_cosine_proximity(y_true, y_pred):
    """Loss: negated mean of the element-wise product of scores and targets."""
    agreement = y_pred * y_true
    return -K.mean(agreement)

def mean_positive_score(y_true, y_pred):
    """Mean over all entries of the scores with non-positive entries zeroed.

    ``(y_true + 1) / 2`` is 1 where the target is +1 and 0 where it is -1.
    The mean is taken over every entry, so masked-out entries still count in
    the denominator.
    """
    positive_mask = (y_true + 1) / 2
    masked_scores = y_pred * positive_mask
    return K.mean(masked_scores)

def mean_positive_var(y_true, y_pred):
    """Mean squared deviation of the positive-masked scores from ``mean_positive_score``.

    The mean runs over every entry, including those zeroed by the mask.
    """
    center = mean_positive_score(y_true, y_pred)
    positive_mask = (y_true + 1) / 2
    deviation = y_pred * positive_mask - center
    return K.mean(deviation ** 2)

def get_pred(y_true, y_pred):
    """Share of pairs classified correctly by a midpoint threshold.

    The threshold is the midpoint between ``mean_positive_score`` and
    ``mean_negative_score``.  A pair with target +1 counts as correct when its
    score is above the threshold, a pair with target -1 when it is not.

    NOTE(review): the previous body ended with an expression copied from
    ``mean_positive_var`` that referenced an undefined ``filter_mult`` (so any
    call raised ``NameError``) and never used ``threshold`` or the two masks.
    The thresholded accuracy below is the assumed intent - please confirm.
    """
    mean_positive = mean_positive_score(y_true, y_pred)
    mean_negative = mean_negative_score(y_true, y_pred)

    threshold = (mean_positive + mean_negative) / 2

    # 1 where the target is +1 / -1 respectively, 0 elsewhere.
    positive_mult = (y_true + 1) / 2
    negative_mult = (1 - y_true) / 2

    predicted_positive = K.cast(K.greater(y_pred, threshold), K.floatx())
    correct = positive_mult * predicted_positive + negative_mult * (1 - predicted_positive)
    return K.mean(correct)

def mean_negative_score(y_true, y_pred):
    """Mean over all entries of the scores with non-negative entries zeroed.

    ``(1 - y_true) / 2`` is 1 where the target is -1 and 0 where it is +1.
    The mean is taken over every entry, so masked-out entries still count in
    the denominator.
    """
    negative_mask = (1 - y_true) / 2
    masked_scores = y_pred * negative_mask
    return K.mean(masked_scores)

def mean_negative_var(y_true, y_pred):
    """Mean squared deviation of the negative-masked scores from ``mean_negative_score``.

    The mean runs over every entry, including those zeroed by the mask.
    """
    center = mean_negative_score(y_true, y_pred)
    negative_mask = (1 - y_true) / 2
    deviation = y_pred * negative_mask - center
    return K.mean(deviation ** 2)

def normalize(embedding):
    """L2-normalise ``embedding`` along its last axis."""
    unit_embedding = K.l2_normalize(embedding, axis=-1)
    return unit_embedding

def dot_product(embeddings):
    """Sum over the last axis of the element-wise product of the first two items."""
    first = embeddings[0]
    second = embeddings[1]
    return K.sum(first * second, axis=-1)

def reshape_to_prediction(score):
    """Reshape ``score`` into a single column (shape ``(-1, 1)``)."""
    column_shape = (-1, 1)
    return K.reshape(score, column_shape)

def make_dssm_predictions(X, y_set, model, label_by_y, index_by_token, limit):
    """Score the first ``limit`` texts against every candidate in ``y_set``.

    For each text the padded token-id row is repeated once per candidate and
    fed to ``model.predict`` together with the candidates' one-hot labels; the
    position of the best-scoring candidate is recorded and its score printed.

    Changes compared to the previous version:
      * one-hot rows have width ``len(label_by_y)`` (as in ``make_data_gen``)
        instead of ``len(y_set)``, which produced a wrong width or an
        ``IndexError`` whenever ``y_set`` did not cover every label;
      * texts longer than ``MAX_LEN`` tokens are truncated instead of raising.

    Args:
        X: texts to classify.
        y_set: candidate target values (iterated once, order defines positions).
        model: object whose ``predict`` accepts ``name_input`` / ``label_input``.
        label_by_y: mapping target value -> label id.
        index_by_token: mapping token -> token id.
        limit: number of leading texts of ``X`` to process.

    Returns:
        List with, per text, the argmax position within the iteration order of
        ``y_set`` (not the label id).
    """
    labels_eye = np.eye(len(label_by_y))
    labels = labels_eye[[label_by_y[y] for y in y_set]]
    predictions = []
    for x in X[:limit]:
        names_input = np.zeros(MAX_LEN)
        indices = make_indices_from_x(x, index_by_token)[:MAX_LEN]
        names_input[:len(indices)] = indices
        # Same name row for every candidate label.
        names_input = names_input.reshape([1, MAX_LEN]).repeat(len(labels), axis=0)
        scores = model.predict({"name_input": names_input, "label_input": labels}).reshape(-1)
        predictions.append(np.argmax(scores))
        print(scores[predictions[-1]])
    return predictions

In [None]:
class EvaluateCallback(Callback):
    """Periodically evaluate the model during training and checkpoint its weights.

    Every ``validation_batch_divider`` batches the model is evaluated for
    ``validation_steps`` steps on both generators; the metrics are appended to
    ``history`` and to the metrics file, and the weights are saved under
    ``models_folder``.

    Args:
        model: compiled Keras model to evaluate and save.
        models_folder: directory for weight snapshots (wiped on train start).
        metrics_file: text file receiving one metrics line per evaluation
            (emptied on train start).
        train_generator: generator evaluated for the "train" metrics.
        test_generator: generator evaluated for the "test" metrics.
        validation_steps: number of generator steps per evaluation.
        validation_batch_divider: evaluate when ``batch % divider == 0``.
    """

    def __init__(self,
                 model,
                 models_folder,
                 metrics_file,
                 train_generator,
                 test_generator,
                 validation_steps,
                 validation_batch_divider):
        # Let the Keras base class initialise its own attributes.
        super(EvaluateCallback, self).__init__()
        self._model = model
        self._models_folder = models_folder
        self._metrics_file = metrics_file
        self._test_generator = test_generator
        self._train_generator = train_generator
        self._validation_steps = validation_steps
        self._validation_batch_divider = validation_batch_divider
        self._epoch = 0

        # One list per metric, separately for train and test evaluations.
        self.history = {}
        for name in self._model.metrics_names:
            self.history["train_" + name] = []
            self.history["test_" + name] = []

    def on_train_begin(self, logs=None):
        """Start from an empty models folder and an empty metrics file."""
        if os.path.exists(self._models_folder):
            shutil.rmtree(self._models_folder)
        # Always (re)create the folder; previously it was only created when it
        # had already existed, so a first run could not save any weights.
        os.makedirs(self._models_folder)
        # Mode "w" creates the file or truncates an existing one.
        open(self._metrics_file, "w").close()

    def on_batch_end(self, batch, logs=None):
        """Evaluate, log and checkpoint on every ``validation_batch_divider``-th batch."""
        if batch % self._validation_batch_divider == 0:
            test_evals = self._model.evaluate_generator(
                self._test_generator,
                steps=self._validation_steps
            )
            train_evals = self._model.evaluate_generator(
                self._train_generator,
                steps=self._validation_steps
            )
            for metric_name, metric in zip(self._model.metrics_names, train_evals):
                self.history["train_" + metric_name].append(metric)
            for metric_name, metric in zip(self._model.metrics_names, test_evals):
                self.history["test_" + metric_name].append(metric)
            short_model_name = "epoch_{}_batch_{}".format(
                self._epoch,
                batch)
            metrics_string = short_model_name + "_train_{}_test_{}".format(
                "_".join(map(str, train_evals)),
                "_".join(map(str, test_evals))
            )
            with open(self._metrics_file, "a") as handler:
                handler.write(metrics_string + "\n")

            self._model.save_weights(os.path.join(self._models_folder, short_model_name))

    def on_epoch_end(self, epoch, logs=None):
        """Advance the epoch counter used in snapshot names."""
        self._epoch += 1

In [None]:
# Endless random batch generator over the training part (BATCH_SIZE rows per batch).
classification_train_data = make_classification_data_gen(
    train_X,
    train_Y,
    label_by_y,
    index_by_token,
    BATCH_SIZE
)
# Same generator type over the validation part; both share the label and token mappings.
classification_val_data = make_classification_data_gen(
    val_X,
    val_Y,
    label_by_y,
    index_by_token,
    BATCH_SIZE
)

In [None]:
# Classifier: token ids -> embedding -> stacked BiLSTM -> Dense stack -> softmax.
name_input = Input(shape=(MAX_LEN,))  # shape: (BATCH_SIZE, MAX_LEN)
# mask_zero=True makes the embedding itself emit a per-timestep mask for the
# padding id 0, which the recurrent layers then honour.  The previous
# Masking(mask_value=0) layer placed before the embedding worked on the 2-D id
# tensor, and Embedding does not forward an incoming mask unless mask_zero is
# set, so padding positions were not actually masked.
encoded = Embedding(TOKENS_NUM, HIDDEN_DIM, mask_zero=True)(name_input)  # shape: (BATCH_SIZE, MAX_LEN, HIDDEN_DIM)
for i in range(LSTM_NUM - 1):
    # Bidirectional concatenates both directions by default, doubling the width.
    encoded = Bidirectional(LSTM(HIDDEN_DIM, return_sequences=True))(encoded)  # shape: (BATCH_SIZE, MAX_LEN, 2 * HIDDEN_DIM)
hidden = Bidirectional(LSTM(HIDDEN_DIM))(encoded)  # shape: (BATCH_SIZE, 2 * HIDDEN_DIM)
for i in range(HIDDEN_LAYESR_NUM):
    hidden = Dense(HIDDEN_DIM, activation=ACTIVATION)(hidden)  # shape: (BATCH_SIZE, HIDDEN_DIM)

# NOTE(review): this second loop adds HIDDEN_LAYESR_NUM - 1 further Dense layers
# on top of the loop above (2 * HIDDEN_LAYESR_NUM - 1 in total); it looks like a
# leftover duplicate - confirm before removing, as that changes the architecture.
for i in range(HIDDEN_LAYESR_NUM - 1):
    hidden = Dense(HIDDEN_DIM, activation=ACTIVATION)(hidden)  # shape: (BATCH_SIZE, HIDDEN_DIM)
scores = Dense(LABELS_NUM, activation='softmax')(hidden)  # shape: (BATCH_SIZE, LABELS_NUM)

classificaton_model = Model(inputs=name_input, outputs=scores)

Instructions for updating:
Colocations handled automatically by placer.


In [None]:
# Adam with default settings; categorical cross-entropy matches the one-hot
# label rows produced by make_classification_data_gen.
classificaton_model.compile(
    Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
# Periodic evaluation + weight snapshots for the classifier.
# NOTE(review): classification_train_data is also the generator handed to
# fit_generator, so evaluation batches are drawn from the very same generator
# object that feeds training - confirm that sharing it is safe in this setup.
evaluate_callback_classification = EvaluateCallback(
    classificaton_model,
    "logs_classification",  # models_folder: weight snapshots
    "metrics_classification.txt",  # metrics_file: one line per evaluation
    classification_train_data,  # train_generator
    classification_val_data,  # test_generator
    1000,  # validation_steps: generator steps per evaluation
    1000  # validation_batch_divider: evaluate every 1000th batch
)

In [None]:
# Train the classifier from the endless random generator.  steps_per_epoch is
# the number of BATCH_SIZE batches that would cover train_X once; examples are
# sampled with replacement, so an "epoch" is not an exact pass over the data.
history = classificaton_model.fit_generator(
    classification_train_data,
    steps_per_epoch=int(len(train_X) / (BATCH_SIZE)),
    epochs=10,
    verbose=1,
    callbacks=[evaluate_callback_classification],
    initial_epoch=0
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
     1/125408 [..............................] - ETA: 493:55:26 - loss: 7.1082 - acc: 0.0000e+00

  % delta_t_median)


     2/125408 [..............................] - ETA: 18761:12:55 - loss: 7.0974 - acc: 0.0078  

  % delta_t_median)


    49/125408 [..............................] - ETA: 861:52:52 - loss: 6.3437 - acc: 0.0261