# Automated Intelligent Module for matching master data of Data Warehouse.
Based on the Word2Vec code from MIPT's Deep Learning and Neural Systems lab:
https://github.com/deepmipt/deep-nlp-seminars/blob/master/seminar_02/embeddings.ipynb

In [None]:
import os
import re
from collections import Counter, deque
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
import pymorphy2
import tensorflow as tf

## Load master data

In [None]:
# Load the master-data export: a semicolon-separated CSV encoded in cp1251.
# Rows that cannot be parsed are skipped instead of aborting the load.
# NOTE(review): `error_bad_lines` was removed in pandas 2.0 (its replacement
# is `on_bad_lines='skip'`); it is kept here for the pandas version this
# notebook was written against.
data_file = pd.read_csv('material.csv', sep=';', encoding='cp1251',
                        error_bad_lines=False)
# Keep only the 'FullName' column as text. Missing values are replaced with
# an empty string first: astype('str') alone would convert NaN into the
# literal text 'nan', which would later be tokenized as a real word.
data_file = data_file[['FullName']].fillna('').astype('str')

## Data preparation

In [None]:
# Turn every text cell of `data_file` into a flat list of normalized words.
morph = pymorphy2.MorphAnalyzer()
tokenizer = RegexpTokenizer(r'\w+')
digits_pattern = re.compile(r'\d+')  # compiled once, reused for every cell
words = []
for i in range(data_file.shape[0]):
    for column in data_file.columns:
        # Strip all digit runs and store the cleaned text back in the frame.
        # `.at` writes directly into `data_file`; the chained form
        # `data_file[column][i] = ...` may assign to a temporary copy.
        cleaned = digits_pattern.sub('', data_file.at[i, column])
        data_file.at[i, column] = cleaned
        for word in tokenizer.tokenize(cleaned):
            # Single-character tokens are dropped.
            if len(word) != 1:
                # Take the normal form of the first morphological parse.
                word = morph.parse(word)[0].normal_form
                words.append(word.lower())

## Creating a dictionary

In [None]:
# (word, frequency) pairs ordered from the most to the least frequent word.
count = Counter(words).most_common()

# A word's id is its position in that frequency ordering.
word_to_index = dict((entry[0], rank) for rank, entry in enumerate(count))

# The corpus re-encoded as a sequence of ids (0 is the lookup fallback).
data = [word_to_index.get(token, 0) for token in words]

# Reverse lookup: id -> word.
index_to_word = {rank: token for token, rank in word_to_index.items()}

## Batch generation function for training the neural network

In [None]:
def generate_batch(data_index, data_size, batch_size, bag_window,
                   sequence=None):
    """Build one CBOW batch by sliding a window over a circular sequence.

    Args:
        data_index: Position in the sequence where the first window starts.
        data_size: Length used for wrap-around; positions are taken modulo
            this value.
        batch_size: Number of (context, target) rows to produce.
        bag_window: Number of context items taken on each side of a target.
        sequence: Indexable sequence of integer ids to read from. When
            omitted, the module-level ``data`` is used, which keeps the
            original four-argument call working.

    Returns:
        A tuple ``(data_index, batch, labels)`` where ``data_index`` is the
        starting position advanced by ``2 * bag_window + 1 + batch_size``
        (modulo ``data_size``), ``batch`` is an int32 array of shape
        ``(batch_size, 2 * bag_window)`` holding the context ids and
        ``labels`` is an int32 array of shape ``(batch_size, 1)`` holding
        the target (window-center) ids.
    """
    if sequence is None:
        sequence = data  # fall back to the notebook-level corpus

    span = 2 * bag_window + 1  # [ bag_window, target, bag_window ]
    batch = np.empty((batch_size, span - 1), dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)

    # A bounded deque drops its oldest item on append, so appending one id
    # shifts the window one position to the right.
    window = deque(maxlen=span)
    for _ in range(span):
        window.append(sequence[data_index])
        data_index = (data_index + 1) % data_size

    for row in range(batch_size):
        context = list(window)
        # Removing the center item leaves exactly the 2 * bag_window
        # context ids; the removed item is the prediction target.
        labels[row, 0] = context.pop(bag_window)
        batch[row] = context

        window.append(sequence[data_index])
        data_index = (data_index + 1) % data_size
    return data_index, batch, labels

## Hyperparameters of the neural network

In [None]:
# Cursor into the encoded corpus (where the next batch starts) and its length.
data_index = 0
data_size = len(data)

# Model dimensions.
vocabulary_size = len(count)  # one embedding row per distinct word
embedding_size = 64           # length of each embedding vector

# Batching settings.
batch_size = 128              # training rows per step
bag_window = 2                # context words on each side of the target

## Creating graph (CBOW)

In [None]:
# Build the CBOW graph: the averaged context embeddings are scored against
# every row of the embedding matrix and trained with softmax cross-entropy.
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    # Inputs: context word ids and the id of the word to predict.
    train_data = tf.placeholder(tf.int32, [batch_size, bag_window * 2])
    train_labels = tf.placeholder(tf.int32, [batch_size, 1])
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size,
                                                embedding_size], -1.0, 1.0))

    embed = tf.nn.embedding_lookup(embeddings, train_data)
    # Despite its name this is the mean of the context embeddings: the sum
    # over the context axis divided by the number of context words.
    context_sum = tf.reduce_sum(embed, 1) / (bag_window * 2)
    # The same embedding matrix is reused as the output projection.
    score = tf.matmul(context_sum, embeddings, transpose_b=True)
    # Flatten the [batch_size, 1] labels before one-hot encoding so the
    # labels are [batch_size, vocabulary_size], the same rank and shape as
    # the logits, rather than [batch_size, 1, vocabulary_size].
    flat_labels = tf.reshape(train_labels, [batch_size])
    one_hot_labels = tf.one_hot(flat_labels, depth=vocabulary_size)
    loss_tensor = tf.nn.softmax_cross_entropy_with_logits_v2(logits=score,
                                                             labels=one_hot_labels)
    loss = tf.reduce_mean(loss_tensor)

    optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)

    # Rows scaled to unit L2 norm. `keepdims` replaces the deprecated
    # `keep_dims` spelling of the same argument.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm

## Training neural network

In [None]:
# Train the model and keep the normalized embedding matrix as a NumPy array.
# NOTE(review): every step consumes a whole batch, so 3 * len(data) steps is
# many passes over the corpus rather than three - confirm this is intended.
num_steps = 3 * len(data)
loss_every_nsteps = 200

with tf.Session(graph=graph) as sess:
    # Initialize outside the interrupt handler: evaluating the embeddings
    # only makes sense once the variables exist.
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    steps_in_window = 0  # losses accumulated since the last report

    try:
        for step in range(num_steps):
            data_index, batch, labels = generate_batch(data_index, data_size,
                                                       batch_size, bag_window)
            feed_dict = {train_data: batch, train_labels: labels}
            _, current_loss = sess.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += current_loss
            steps_in_window += 1
            if step > 0 and step % loss_every_nsteps == 0:
                # Divide by the number of losses actually summed; the first
                # window also contains step 0 and is one step longer.
                average_loss = average_loss / steps_in_window
                print("step = {0}, average_loss = {1}".format(step,
                                                              average_loss))
                average_loss = 0
                steps_in_window = 0
    except KeyboardInterrupt:
        # Stopping early is allowed; the embeddings learned so far are kept.
        print('Training interrupted, keeping current embeddings')
    # Evaluated once, whether training finished or was interrupted.
    final_embeddings = normalized_embeddings.eval()

## Save the embedding matrix to .txt files

In [None]:
# Write the trained embeddings as tab-separated text: one line per word,
# the word first and then the components of its vector.
# The encoding is given explicitly so that writing non-ASCII words does not
# depend on the platform's default locale encoding.
with open('embeddings.txt', 'w', encoding='utf-8') as f:
    for n in range(vocabulary_size):
        s = '\t'.join([index_to_word[n]] +
                      [str(num) for num in final_embeddings[n]])
        f.write(s + '\n')
# Write the vocabulary alone, one word per line, in the same row order.
# NOTE(review): 'mentadata.txt' looks like a typo for 'metadata.txt'; the
# name is left unchanged in case other tooling reads this file.
with open('mentadata.txt', 'w', encoding='utf-8') as f:
    for n in range(vocabulary_size):
        f.write(index_to_word[n] + '\n')