In [1]:
from collections import Counter
from sklearn.model_selection import KFold
import numpy as np
import re
import os
import shutil

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Bidirectional, Dense, Input, Masking, Lambda
import keras.backend as K
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback, LearningRateScheduler, Callback

Using TensorFlow backend.


In [2]:
def read_train_val(filename, val_prob=0.1):
    with open(filename, "rb") as handler:
        train_X, train_Y, val_X, val_Y = [], [], [], []
        for line in handler:
            x, y = line.decode("utf-8").strip().lower().split("\t")
            x = re.sub("\W", " ", x)
            if np.random.uniform() > val_prob:
                train_X.append(x)
                train_Y.append(y)
            else:
                val_X.append(x)
                val_Y.append(y)
        return np.array(train_X), np.array(train_Y), np.array(val_X), np.array(val_Y)

def read_test(filename):
    with open(filename, "rb") as handler:
        X = []
        for line in handler:
            line = re.sub("\W", " ", line.decode("utf-8").strip().lower())
            x = line.strip().lower()
            X.append(re.sub("\W", " ", x))
        return np.array(X)

def acc_metric(labels, predictions):
    assert isinstance(labels, np.ndarray)
    assert isinstance(predictions, np.ndarray)
    return len(labels[labels == predictions]) / len(labels)

def meashure_model(model, X, Y):
    kf = KFold(n_splits=2)
    metrics = []
    for train_index, test_index in kf.split(X):
        train_X, test_X = X[train_index], X[test_index]
        train_Y, test_Y = Y[train_index], Y[test_index]
        model.fit(train_X, train_Y)
        predictions = model.predict(test_X)
        metrics.append(acc_metric(predictions, test_Y))
    return np.mean(metrics), np.var(metrics)

class MostPopularModel:
    def fit(self, X, Y):
        self._answer = Counter(Y).most_common()[0][0]

    def predict(self, X):
        return np.array([self._answer] * len(X))

In [3]:
TRAIN_FILENAME = "names_and_rubrics_learn.tsv"
TEST_FILENAME = "names_and_rubrics_test_no_rubric.tsv"
MIN_FREQ = 5

TOKENS_NUM = 150767
HIDDEN_DIM = 256
MAX_LEN = 45
LSTM_NUM = 2
HIDDEN_LAYESR_NUM = 2
BATCH_SIZE = 1025
LABELS_NUM = 1222
ACTIVATION = 'tanh'

In [4]:
train_X, train_Y, val_X, val_Y = read_train_val(TRAIN_FILENAME)
test_X = read_test(TEST_FILENAME)

In [5]:
len(train_X), len(train_Y), len(val_X), len(val_Y), len(test_X)

(8026179, 8026179, 890923, 890923, 1000000)

In [6]:
def make_tokens_counter(*datas):
    tokens_counter = Counter()
    for lines in datas:
        for line in lines:
            for word in line.split():
                tokens_counter[word] += 1
    return tokens_counter

def make_index_by_token(tokens_counter, min_freq):
    tokens = [token for token in tokens_counter if tokens_counter[token] > min_freq]
    return {
        token: i
        for i, token in enumerate(tokens)
    }

def make_label_by_y(*Ys):
    all_Y = set()
    for Y in Ys:
        all_Y.update(Y)
    return {
        y: i
        for i, y in enumerate(set(all_Y))
    }

def make_indices_by_label(Y, X, label_by_y, index_by_token):
    indices_by_label = {
        label_by_y[y]: []
        for y in set(Y)
    }
    for x, y in zip(X, Y):
        indices_by_label[label_by_y[y]].append(make_indices_from_x(x, index_by_token))
    return indices_by_label

def get_max_X_len(*Xs):
    max_len = -1
    for X in Xs:
        max_len = max(max_len, max([len(x.split()) for x in X]))
    return max_len

def make_indices_from_x(x, index_by_token):
    return [
        index_by_token[token]
        for token in x.split()
        if token in index_by_token
    ]

def make_positives_data_gen(X, Y, label_by_y, index_by_token):
    def positives_data_gen():
        while True:
            index = np.random.randint(0, len(X))
            yield (
                make_indices_from_x(X[index], index_by_token),
                label_by_y[Y[index]]
            )
    return positives_data_gen()

def make_nagative(label, labels_nd_array, indices_by_label):
    negative_label = label
    while negative_label == label:
        negative_label = np.random.choice(labels_nd_array)
    variants = indices_by_label[negative_label]
    index = np.random.randint(0, len(variants))
    return variants[index]

def make_data_gen(X, Y, label_by_y, index_by_token, indices_by_label, half_batch_size):
    labels_nd_array = np.array([label_by_y[y] for y in set(Y)])
    positive_data_gen = make_positives_data_gen(X, Y, label_by_y, index_by_token)
    labels_eye = np.eye(len(label_by_y))
    outputs = np.array([1] * half_batch_size + [-1] * half_batch_size)
    def data_gen():
        while True:
            positive_labels = []
            names_inputs = np.zeros([half_batch_size * 2, MAX_LEN])
            for i in range(half_batch_size):
                positive_tokens_indices, label = next(positive_data_gen)                
                names_inputs[i, :len(positive_tokens_indices)] = positive_tokens_indices
                positive_labels.append(label)
            negative_labels = []
            for i, label in enumerate(positive_labels):
                negative_tokens_indices = make_nagative(label, labels_nd_array, indices_by_label)
                names_inputs[half_batch_size + i, :len(negative_tokens_indices)] = negative_tokens_indices
                negative_labels.append(label)
                
            yield (
                {'name_input': names_inputs, 'label_input': labels_eye[positive_labels + negative_labels]},
                {'output': outputs}
            )
    return data_gen()


def make_classification_data_gen(X, Y, label_by_y, index_by_token, batch_size):
    labels_nd_array = np.array([label_by_y[y] for y in set(Y)])
    positive_data_gen = make_positives_data_gen(X, Y, label_by_y, index_by_token)
    labels_eye = np.eye(len(label_by_y))
    def data_gen():
        while True:
            labels = []
            inputs = np.zeros([batch_size, MAX_LEN])
            for i in range(batch_size):
                positive_tokens_indices, label = next(positive_data_gen)                
                inputs[i, :len(positive_tokens_indices)] = positive_tokens_indices
                labels.append(label)
                
            yield inputs, labels_eye[labels]
    return data_gen()

In [7]:
label_by_y = make_label_by_y(train_Y, val_Y)
tokens_counter = make_tokens_counter(train_X, val_X, test_X)
index_by_token = make_index_by_token(tokens_counter, MIN_FREQ)
indices_by_label_train = make_indices_by_label(train_Y, train_X, label_by_y, index_by_token)
indices_by_label_val = make_indices_by_label(val_Y, val_X, label_by_y, index_by_token)

In [8]:
def my_cosine_proximity(y_true, y_pred):
    return -K.mean(y_pred * y_true)

def mean_positive_score(y_true, y_pred):
    filter_mult = (y_true + 1) / 2
    return K.mean(y_pred * filter_mult)

def mean_positive_var(y_true, y_pred):
    mean_positive = mean_positive_score(y_true, y_pred)
    filter_mult = (y_true + 1) / 2
    return K.mean((y_pred * filter_mult - mean_positive) ** 2)

def get_pred(y_true, y_pred):
    mean_positive = mean_positive_score(y_true, y_pred)
    mean_negative = mean_negative_score(y_true, y_pred)
    
    threshold = (mean_positive + mean_negative) / 2
    
    positive_mult = (y_true + 1) / 2
    negative_mult = (1 - y_true) / 2

    return K.mean((y_pred * filter_mult - mean_positive) ** 2)

def mean_negative_score(y_true, y_pred):
    filter_mult = (1 - y_true) / 2
    return K.mean(y_pred * filter_mult)

def mean_negative_var(y_true, y_pred):
    mean_negative = mean_negative_score(y_true, y_pred)
    filter_mult = (1 - y_true) / 2
    return K.mean((y_pred * filter_mult - mean_negative) ** 2)

def normalize(embedding):
    return K.l2_normalize(embedding, axis=-1)

def dot_product(embeddings):
    return K.sum(embeddings[0] * embeddings[1], axis=-1)

def reshape_to_prediction(score):
    return K.reshape(score, (-1, 1))

def make_dssm_predictions(X, y_set, model, label_by_y, index_by_token, limit):
    labels_eye = np.eye(len(y_set))
    labels = labels_eye[[label_by_y[y] for y in y_set]]
    predictions = []
    for x in X[:limit]:
        names_input = np.zeros(MAX_LEN)
        indices = make_indices_from_x(x, index_by_token)
        names_input[:len(indices)] = indices
        names_input = names_input.reshape([1, MAX_LEN]).repeat(len(labels), axis=0)
        scores = model.predict({"name_input": names_input, "label_input": labels}).reshape(-1)
        predictions.append(np.argmax(scores))
        print(scores[predictions[-1]])
    return predictions

In [9]:
class EvaluateCallback(Callback):
    def __init__(self,
                 model,
                 models_folder,
                 metrics_file,
                 train_generator,
                 test_generator,
                 validation_steps,
                 validation_batch_divider):
        self._model = model
        self._models_folder = models_folder
        self._metrics_file = metrics_file
        self._test_generator = test_generator
        self._train_generator = train_generator
        self._validation_steps = validation_steps
        self._validation_batch_divider = validation_batch_divider
        self._epoch = 0

        self.history = {}
        for name in self._model.metrics_names:
            self.history["train_" + name] = []
            self.history["test_" + name] = []
            
    def on_train_begin(self, logs):
        if os.path.exists(self._models_folder):
            shutil.rmtree(self._models_folder)
            os.mkdir(self._models_folder)
        if os.path.exists(self._metrics_file):
            os.remove(self._metrics_file)
            open(self._metrics_file, "w").close()

    def on_batch_end(self, batch, logs):
        if batch % self._validation_batch_divider == 0:
            test_evals = self._model.evaluate_generator(
                self._test_generator,
                steps=self._validation_steps
            )
            train_evals = self._model.evaluate_generator(
                self._train_generator,
                steps=self._validation_steps
            )
            for metric_name, metric in zip(self._model.metrics_names, train_evals):
                self.history["train_" + metric_name].append(metric)
            for metric_name, metric in zip(self._model.metrics_names, test_evals):
                self.history["test_" + metric_name].append(metric)
            short_model_name = "epoch_{}_batch_{}".format(
                self._epoch,
                batch)
            metrics_string = short_model_name + "_train_{}_test_{}".format(
                "_".join(map(str, train_evals)),
                "_".join(map(str, test_evals))
            )
            with open(self._metrics_file, "a") as handler:
                handler.write(metrics_string + "\n")
            
            self._model.save_weights(os.path.join(self._models_folder, short_model_name))

    def on_epoch_end(self, epoch, logs):
        self._epoch += 1

In [10]:
classification_train_data = make_classification_data_gen(
    train_X,
    train_Y,
    label_by_y,
    index_by_token,
    BATCH_SIZE
)
classification_val_data = make_classification_data_gen(
    val_X,
    val_Y,
    label_by_y,
    index_by_token,
    BATCH_SIZE
)

In [19]:
name_input = Input(shape=(MAX_LEN,))  # shape: (BATCH_SIZE, MAX_LEN)
masked = Masking(mask_value=0)(name_input)
encoded = Embedding(TOKENS_NUM, HIDDEN_DIM)(masked)  # shape: (BATCH_SIZE, MAX_LEN, HIDDEN_DIM)
for i in range(LSTM_NUM - 1):
    encoded = Bidirectional(LSTM(HIDDEN_DIM, return_sequences=True))(encoded)  # shape: (BATCH_SIZE, HIDDEN_DIM)
hidden = Bidirectional(LSTM(HIDDEN_DIM))(encoded)  # shape: (BATCH_SIZE, HIDDEN_DIM)
for i in range(HIDDEN_LAYESR_NUM):
    hidden = Dense(HIDDEN_DIM, activation=ACTIVATION)(hidden)  # shape: (BATCH_SIZE, HIDDEN_DIM)

for i in range(HIDDEN_LAYESR_NUM - 1):
    hidden = Dense(HIDDEN_DIM, activation=ACTIVATION)(hidden)  # shape: (BATCH_SIZE, HIDDEN_DIM)
scores = Dense(LABELS_NUM, activation='softmax')(hidden)  # shape: (BATCH_SIZE, LABELS_NUM)

classificaton_model = Model(inputs=name_input, outputs=scores)

In [20]:
classificaton_model.compile(
    Adam(lr=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [21]:
evaluate_callback_classification = EvaluateCallback(
    classificaton_model,
    "logs_256",
    "metrics_256.txt",
    classification_train_data,
    classification_val_data,
    50,
    500
)

In [22]:
divide_lr_by = 10
start_lr = 0.001
reduce_lr_every = 5
def learning_rate(epoch):
    return start_lr / divide_lr_by ** (epoch / reduce_lr_every)
lr_decay_callback = LearningRateScheduler(schedule=learning_rate, verbose=1)

In [23]:
history = classificaton_model.fit_generator(
    classification_train_data,
    steps_per_epoch=int(len(train_X) / (BATCH_SIZE)),
    epochs=20,
    verbose=1,
    callbacks=[evaluate_callback_classification, lr_decay_callback],
    initial_epoch=0
)

Epoch 1/20

Epoch 00001: LearningRateScheduler setting learning rate to 0.001.
   1/7830 [..............................] - ETA: 10:31:25 - loss: 7.1080 - acc: 0.0000e+00

  % delta_t_median)


   2/7830 [..............................] - ETA: 43:27:34 - loss: 7.1022 - acc: 0.0132    

  % delta_t_median)


Epoch 2/20

Epoch 00002: LearningRateScheduler setting learning rate to 0.0006309573444801932.
   1/7830 [..............................] - ETA: 2:23:24 - loss: 2.2826 - acc: 0.5649

  % delta_t_median)


   2/7830 [..............................] - ETA: 38:26:28 - loss: 2.2633 - acc: 0.5712

  % delta_t_median)


Epoch 3/20

Epoch 00003: LearningRateScheduler setting learning rate to 0.00039810717055349724.
   1/7830 [..............................] - ETA: 2:23:58 - loss: 1.9304 - acc: 0.6283

  % delta_t_median)


   2/7830 [..............................] - ETA: 38:19:16 - loss: 1.9350 - acc: 0.6302

  % delta_t_median)


Epoch 4/20

Epoch 00004: LearningRateScheduler setting learning rate to 0.000251188643150958.
   1/7830 [..............................] - ETA: 2:23:52 - loss: 1.9137 - acc: 0.6410

  % delta_t_median)


   2/7830 [..............................] - ETA: 38:19:17 - loss: 1.8429 - acc: 0.6517

  % delta_t_median)


Epoch 5/20

Epoch 00005: LearningRateScheduler setting learning rate to 0.00015848931924611134.
   1/7830 [..............................] - ETA: 2:23:57 - loss: 1.7506 - acc: 0.6537

  % delta_t_median)


   2/7830 [..............................] - ETA: 38:20:25 - loss: 1.7856 - acc: 0.6551

  % delta_t_median)


Epoch 6/20

Epoch 00006: LearningRateScheduler setting learning rate to 0.0001.
   1/7830 [..............................] - ETA: 2:23:30 - loss: 1.8347 - acc: 0.6498

  % delta_t_median)


   2/7830 [..............................] - ETA: 38:21:11 - loss: 1.8439 - acc: 0.6380

  % delta_t_median)


1193/7830 [===>..........................] - ETA: 2:10:59 - loss: 1.7757 - acc: 0.6511

KeyboardInterrupt: 

In [24]:
classificaton_model.save("model_256")

In [25]:
for x in test_X:
    indices = make_indices_from_x(x, index_by_token)
    print(indices)
    break

[65262, 137744]


In [26]:
def get_real_labels(filename):
    with open(filename, "rb") as handler:
        real_labels = set()
        for line in handler:
            label = line.decode("utf-8").strip().split("\t")[1]
            real_labels.add(label)
    return real_labels

In [27]:
real_labels = get_real_labels(TRAIN_FILENAME)

In [28]:
get_real_label = {
    real_label.lower(): real_label
    for real_label in real_labels
}

In [40]:
import codecs

In [None]:
file = codecs.open("val_X", "w", "utf-8")
for x in val_X:        
    file.write(x + "\n")
file.close()

In [None]:
file = codecs.open("val_Y", "w", "utf-8")
for x in val_Y:        
    file.write(x + "\n")
file.close()

In [None]:
file = codecs.open("train_X", "w", "utf-8")
for x in train_X:        
    file.write(x + "\n")
file.close()

In [None]:
file = codecs.open("train_Y", "w", "utf-8")
for x in train_Y:        
    file.write(x + "\n")
file.close()

In [29]:
def make_net_input(X):
    net_input = []
    for i, x in enumerate(X):
        if i % 1000 == 0:
            print("{} / {}".format(i, len(X)))
        cur_net_input = np.zeros(MAX_LEN)
        indices = make_indices_from_x(x, index_by_token)
        cur_net_input[:len(indices)] = indices
        net_input.append(cur_net_input)
    return np.array(net_input)

In [30]:
test_net_input = make_net_input(test_X)

0 / 1000000
1000 / 1000000
2000 / 1000000
3000 / 1000000
4000 / 1000000
5000 / 1000000
6000 / 1000000
7000 / 1000000
8000 / 1000000
9000 / 1000000
10000 / 1000000
11000 / 1000000
12000 / 1000000
13000 / 1000000
14000 / 1000000
15000 / 1000000
16000 / 1000000
17000 / 1000000
18000 / 1000000
19000 / 1000000
20000 / 1000000
21000 / 1000000
22000 / 1000000
23000 / 1000000
24000 / 1000000
25000 / 1000000
26000 / 1000000
27000 / 1000000
28000 / 1000000
29000 / 1000000
30000 / 1000000
31000 / 1000000
32000 / 1000000
33000 / 1000000
34000 / 1000000
35000 / 1000000
36000 / 1000000
37000 / 1000000
38000 / 1000000
39000 / 1000000
40000 / 1000000
41000 / 1000000
42000 / 1000000
43000 / 1000000
44000 / 1000000
45000 / 1000000
46000 / 1000000
47000 / 1000000
48000 / 1000000
49000 / 1000000
50000 / 1000000
51000 / 1000000
52000 / 1000000
53000 / 1000000
54000 / 1000000
55000 / 1000000
56000 / 1000000
57000 / 1000000
58000 / 1000000
59000 / 1000000
60000 / 1000000
61000 / 1000000
62000 / 1000000
63000

498000 / 1000000
499000 / 1000000
500000 / 1000000
501000 / 1000000
502000 / 1000000
503000 / 1000000
504000 / 1000000
505000 / 1000000
506000 / 1000000
507000 / 1000000
508000 / 1000000
509000 / 1000000
510000 / 1000000
511000 / 1000000
512000 / 1000000
513000 / 1000000
514000 / 1000000
515000 / 1000000
516000 / 1000000
517000 / 1000000
518000 / 1000000
519000 / 1000000
520000 / 1000000
521000 / 1000000
522000 / 1000000
523000 / 1000000
524000 / 1000000
525000 / 1000000
526000 / 1000000
527000 / 1000000
528000 / 1000000
529000 / 1000000
530000 / 1000000
531000 / 1000000
532000 / 1000000
533000 / 1000000
534000 / 1000000
535000 / 1000000
536000 / 1000000
537000 / 1000000
538000 / 1000000
539000 / 1000000
540000 / 1000000
541000 / 1000000
542000 / 1000000
543000 / 1000000
544000 / 1000000
545000 / 1000000
546000 / 1000000
547000 / 1000000
548000 / 1000000
549000 / 1000000
550000 / 1000000
551000 / 1000000
552000 / 1000000
553000 / 1000000
554000 / 1000000
555000 / 1000000
556000 / 10000

986000 / 1000000
987000 / 1000000
988000 / 1000000
989000 / 1000000
990000 / 1000000
991000 / 1000000
992000 / 1000000
993000 / 1000000
994000 / 1000000
995000 / 1000000
996000 / 1000000
997000 / 1000000
998000 / 1000000
999000 / 1000000


In [31]:
test_prediction = classificaton_model.predict(test_net_input)

In [43]:
test_prediction.shape

(1000000, 1222)

In [44]:
test_prediction.shape

(1000000, 1222)

In [45]:
real_y_by_label = {
    label_by_y[y]: get_real_label[y]
    for y in label_by_y
}

In [46]:
file = codecs.open("test_prediction", "w", "utf-8")
for prediction in test_prediction:
    file.write(real_y_by_label[np.argmax(prediction)] + "\n")

In [None]:
np.save("net_data", prediction)

In [36]:
real_y_by_label

{0: 'Орехи, снеки, сухофрукты',
 1: 'Нефтегазовое оборудование',
 2: 'Медицинская мебель',
 3: 'Фармацевтическая компания',
 4: 'Молочная продукция оптом',
 5: 'Информационная безопасность',
 6: 'Логистическая компания',
 7: 'Детский сад',
 8: 'Памятник, скульптура',
 9: 'Очистные сооружения и оборудование',
 10: 'Стеклянные двери',
 11: 'Теннисный клуб',
 12: 'Багетная мастерская',
 13: 'Негосударственный пенсионный фонд',
 14: 'Клининговое оборудование и инвентарь',
 15: 'Производство продуктов питания',
 16: 'Услуги вышивки',
 17: 'Магазин ткани',
 18: 'Рынок',
 19: 'Магазин электротоваров',
 20: 'Ремонт кассовых аппаратов',
 21: 'Психотерапевтическая помощь',
 22: 'Оператор сотовой связи',
 23: 'Коррекция зрения',
 24: 'Фотошкола',
 25: 'Православный храм',
 26: 'Натяжные и подвесные потолки',
 27: 'Статистическая организация',
 28: 'Барбершоп',
 29: 'Конференц-зал',
 30: 'Школа искусств',
 31: 'Вездеходы, гидроциклы, снегоходы',
 32: '3D-печать',
 33: 'Складское оборудование',
 34

In [None]:
classificaton_model.load_weights("logs_classification_lr=0.0001/epoch_3_batch_7620.h5py")