In [1]:
import numpy as np
import pandas as pd
import scipy

import tensorflow as tf
import tensorflow.keras.layers as L

from IPython.display import clear_output
import matplotlib.pyplot as plt

from transliterate import translit

from file_storage import FileStorage

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from tqdm import tqdm_notebook

import re

In [2]:
import os
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

---

###### Переведем все запросы в нижний регистр и выкинем запросы на языках, отличных от русского и английского, а также транслитерируем запросы на русском языке. 

###### Также извлечем название статьи

In [3]:
%%time

train_data = pd.read_csv('req_ans_learn.tsv', sep='\t', names=['query', 'document'])

train_data['query'] = train_data['query'].apply(lambda x: " ".join(re.sub('[^a-zA-Zа-яА-Я0-9ёЁ]+', ' ', str(x)).split()).lower())
train_data = train_data[train_data['query'] != '']
train_data['query'] = train_data['query'].apply(lambda x: translit(x, 'ru', reversed=True))

train_data['document'] = train_data['document'].apply(lambda x: x.split('/')[-1].lower())

CPU times: user 18.8 s, sys: 94.3 ms, total: 18.9 s
Wall time: 18.9 s


In [4]:
train_data.head()

Unnamed: 0,query,document
1,data integrity chto eto,data_integrity
2,a harmonious rejuvenation add to your retreat ...,balmoral_castle
3,can can,can-can
4,all the quiet,all_quiet_on_the_western_front
5,liliaceae tulipa lord beaconsfield parmesiano,liliaceae


---
###### Добавим к обучающей выборки все найденные статьи из первой домашки

In [5]:
queries = []
documents = []

fs = FileStorage('storage')
for key in list(fs.keys()):
    d = key.split('/')[-1]
    queries.append(d.replace('_', ' '))
    documents.append(d)
    
queries = np.asarray(queries)
documents = np.asarray(documents)
    
train_data_add = pd.DataFrame(np.vstack((queries, documents)).T, columns=['query', 'document'])

train_data = train_data.append(train_data_add, ignore_index=True)

---
###### Подсчитаем вектор частот триграмм символов и вектор частот слов (для запросов и документов)

In [6]:
class Vectorizer:
    def __init__(self, vectorizers):
        self.vectorizers = vectorizers
    
    def fit(self, X):
        for vectorizer in self.vectorizers:
            vectorizer.fit(X)
    
    def transform(self, X):
        vec = []
        for v in self.vectorizers:
            vec.append(v.transform(X))
        return scipy.sparse.hstack(vec).tocsr()

In [7]:
q_vec = Vectorizer([
    CountVectorizer(
        lowercase=True,
        ngram_range=(3, 3),
        max_features=20000,
        analyzer='char'
    ),
    CountVectorizer(
        lowercase=True,
        ngram_range=(1, 1),
        max_features=10000,
        analyzer='word'
    )
])

d_vec = Vectorizer([
    CountVectorizer(
        lowercase=True,
        ngram_range=(3, 3),
        max_features=20000,
        analyzer='char'
    ),
    CountVectorizer(
        lowercase=True,
        ngram_range=(1, 1),
        max_features=10000,
        analyzer='word'
    )
])

q_vec.fit(train_data['query'].to_numpy())
d_vec.fit(train_data['document'].to_numpy())

train_q_vec = q_vec.transform(train_data['query'].to_numpy())
train_d_vec = d_vec.transform(train_data['document'].to_numpy())

---
###### Для простоты переведем все целевые переменные (документы) в числа 

In [8]:
labels = LabelEncoder().fit_transform(train_data['document'].to_numpy())

---
###### Архитектура сети взята из статьи по DSSM

In [9]:
def build_model():
    return tf.keras.Sequential([
        L.Dense(1024, activation='relu'),
        L.BatchNormalization(),
        L.Dropout(0.1),
        L.Dense(512,  activation='relu'),
        L.BatchNormalization(),
        L.Dropout(0.1),
        L.Dense(128,  activation='linear'),
        tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))
    ])

---
###### Напишем функцию ошибки

In [10]:
def calc_loss(batch_q, batch_d, batch_l, gamma=3.0):
    
    cosines = tf.matmul(batch_q, batch_d, transpose_b=True)
    exp_pos = tf.exp(gamma * tf.linalg.diag_part(cosines))
    
    cosines = tf.where(tf.math.equal(batch_l[:, None], batch_l[None, :]), -1000, cosines)
    
    cosines = tf.sort(cosines, axis=-1, direction='DESCENDING')
    exp_neg = tf.exp(gamma * cosines[:, :20])
    
    return -tf.math.log(exp_pos / (exp_pos + tf.reduce_sum(exp_neg, axis=-1)))

---
###### Процесс обучения

In [11]:
q_model = build_model()
d_model = build_model()

BATCH_SIZE = 1024

optimizer = tf.keras.optimizers.Adam(0.001)

for epoch in tqdm_notebook(range(5)):
    print('epoch %d' % (epoch + 1))
    
    loss_hist = []
    
    for i in tqdm_notebook(range(0, len(labels), BATCH_SIZE)):
        batch_q_vec = train_q_vec[i : (i + BATCH_SIZE)].toarray().astype(np.float32)
        batch_d_vec = train_d_vec[i : (i + BATCH_SIZE)].toarray().astype(np.float32)
        batch_l = labels[i : (i + BATCH_SIZE)]

        with tf.GradientTape() as tape:
            batch_q_emb = q_model(batch_q_vec)
            batch_d_emb = d_model(batch_d_vec)

            loss = calc_loss(batch_q_emb, batch_d_emb, batch_l)
            
            loss_hist.append(loss.numpy().mean())

            grads = tape.gradient(loss, q_model.trainable_variables + d_model.trainable_variables)
            optimizer.apply_gradients(zip(grads, q_model.trainable_variables + d_model.trainable_variables))
            
    print('loss: %.2f' % np.mean(loss_hist))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

epoch 1


HBox(children=(IntProgress(value=0, max=665), HTML(value='')))


loss: 2.07
epoch 2


HBox(children=(IntProgress(value=0, max=665), HTML(value='')))


loss: 1.63
epoch 3


HBox(children=(IntProgress(value=0, max=665), HTML(value='')))


loss: 1.54
epoch 4


HBox(children=(IntProgress(value=0, max=665), HTML(value='')))


loss: 1.48
epoch 5


HBox(children=(IntProgress(value=0, max=665), HTML(value='')))


loss: 1.45



---
###### Проведем преобразования для тестовой выборки

In [15]:
test_data = pd.read_csv('req_ans_test_no_url.csv', sep=',')
test_data['Category'] = test_data['Category'].apply(lambda x: " ".join(re.sub('[^a-zA-Zа-яА-Я0-9ёЁ]+', ' ', str(x)).split()).lower())
test_data['Category'] = test_data['Category'].apply(lambda x: translit(x, 'ru', reversed=True))

In [16]:
test_data.head()

Unnamed: 0,Id,Category
0,1,there are two rivers in the city the selenga
1,2,biriani spice
2,3,bernli komanda
3,4,he 114
4,5,vikipedija pro megatrona iz kino 3 vikipedija


---
###### Получим эмбеддинги для запросов и документов

In [21]:
test_q_vec = q_vec.transform(test_data['Category'].to_numpy())

In [31]:
test_q_emb = []

for i in tqdm_notebook(range(0, test_q_vec.shape[0], BATCH_SIZE)):
    test_q_emb.append(q_model(test_q_vec[i:i + BATCH_SIZE].toarray(), training=False).numpy())

test_q_emb = np.concatenate(test_q_emb, axis=0)

HBox(children=(IntProgress(value=0, max=139), HTML(value='')))




In [38]:
test_d_vec = d_vec.transform(documents)

In [40]:
test_d_emb = []

for i in tqdm_notebook(range(0, test_d_vec.shape[0], BATCH_SIZE)):
    test_d_emb.append(d_model(test_d_vec[i:(i + BATCH_SIZE)].toarray(), training=False).numpy())

test_d_emb = np.concatenate(test_d_emb, axis=0)

HBox(children=(IntProgress(value=0, max=185), HTML(value='')))




---
###### Получим предсказания для теста

In [57]:
ans = []

for i, q in tqdm_notebook(enumerate(test_data['Category'])):
    cosines = test_q_emb[i] @ test_d_emb.T
    ans.append('/wiki/' + documents[np.argmax(cosines)])
    
test_data['Category'] = ans

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




---
###### Сохраним предсказания

In [60]:
test_data.to_csv('test_documents.csv', index=False)

---
###### Данное решение получило 62.7% accuracy на тестовой выборке