# Исследование векторного представления элементов и настройка параметров

## Корпус справочных элементов и предобработка

Корпус справочных элементов представляет собой набор из 264 тысяч записей справочника "Номенклатуры"

Минимальная предобработка выполнена с помошью библиотекой Python - NLTK и включает в себя следующую последовательность действий:
* Удаление пунктуации и знаков
* Удаление числовых значений
* Приведение слов к нижнему регистру
* Токенизация

In [5]:
from nltk.tokenize import RegexpTokenizer
import re
import pandas as pd
from tqdm import tqdm_notebook as tq

data_file = pd.read_csv('./Data/material.csv', sep=';', encoding = 'cp1251', error_bad_lines=False,
                        low_memory=False)[['FullName']]

data_file = data_file[['FullName']].astype('str')

tokenizer = RegexpTokenizer(r'\w+')

rows = [row[0] for row in data_file.values]

words = []
for row in tq(rows):
    words.append([i.lower() for i in tokenizer.tokenize(re.sub(r'\d+', '', row)) if len(i) > 1])

HBox(children=(IntProgress(value=0, max=263740), HTML(value='')))




## Набор гиперпараметров

Параметры | Значения
----------| --------
Размерность вектора | 25 / 50 / **100** / 200 / 400 / 800
Окно | 1 / 2 / 3 / 4 / **5** / 6 / 7 / 8
Минимальная частота | 0 / **5** / 10 / 20 / 50 / 100 / 200/400 / 800 / 1000 / 1200 / 2400
Негативное семплирование | 1 / 2 / 3 / **5** / 8 /10 / 15
Шаг обучения | 0.0125 / **0.025** / 0.05 / 0.1
Сэмплирование | 0 / 1e-1 / 1e-2 / **1e-3** / 1e-4 / 1e-5 / 1e-6 / 1e-7 / 1e-8 / 1e-9

### Baseline model

In [2]:
from gensim.models import Word2Vec as w2v

def_param = {'size': 100, 'window': 5, 'min_count': 5, 'negative': 5, 'alpha': 0.025, 'sample': 1e-3}

set_param = {
    'size': [25, 50, 100, 200, 400, 800], 
    'window': [1, 2, 3, 4, 5, 5, 7, 8],
    'min_count': [0, 5, 10, 20, 50, 100, 200, 400, 800, 1000, 1200, 2400],
    'negative': [1, 2, 3, 5, 8, 10, 15],
    'alpha': [0.0125, 0.025, 0.05, 0.01],
    'sample': [0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9]
}

param_to_index = {'size': 0, 'window': 1, 'min_count': 2, 'negative': 3, 'alpha': 4, 'sample': 5}
# index_to_param = {0: 'size', 1: 'window', 2: 'min_count', 3: 'negative', 4: 'alpha', 5: 'sample'}

# size = [25, 50, 100, 200, 400, 800]
# window = [1, 2, 3, 4, 5, 5, 7, 8]
# min_count = [0, 5, 10, 20, 50, 100, 200, 400, 800, 1000, 1200, 2400]
# negative = [1, 2, 3, 5, 8, 10, 15]
# alpha = [0.0125, 0.025, 0.05, 0.01]
# sample = [0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9]

# models = w2v(words, size=def_param['size'], window=def_param['window'], min_count=def_param['min_count'], sg=1,
#              negative=def_param['negative'], alpha=def_param['alpha'], sample=def_param['alpha'])

## Внутренняя оценка векторов

Внутренняя оценка проводится по методу оценки сходства слов на размечанном корпусе 380 пар для измерения близости элементов спавочника. 

Элемент справочника $el = \{w_{1}, w_{2}, ..., w_{k}\}$, тогда вектор элемента вычисляется по следующей формуле

$$
x_{el} = \frac{1}{k}\sum_{i=1}^{k}w_{i}
$$

Все пары однозначно определены, следовательно средний показатель косинусной близости должен стремиться к единице для всех примеров.

In [3]:
import pandas as pd
import numpy as np
import scipy

data = pd.read_csv(r'./Data/labeled_data.csv', sep=';', encoding='cp1251', error_bad_lines=False)[['ЭталоннаяПозиция',
                                                                                                   'Номенклатура']]
def tokens(elem, model, size):
    elem = [i.lower() for i in tokenizer.tokenize(re.sub(r'\d+', '', elem)) if len(i) > 1]
    el = []
    for i in elem:
        try:
            el.append(model.wv[i])
        except KeyError:
            el.append(np.zeros(size))
    return el

In [9]:
simil = []
params = [j for j in def_param.values()]
for i in tq(def_param.keys()):
    index = param_to_index[i]
    for k in set_param[i]:
        param_test = params.copy()
        param_test[index] = k
        model = w2v(words, size=param_test[0], window=param_test[1], min_count=param_test[2], sg=1, negative=param_test[3],
                    alpha=param_test[4], sample=param_test[5])
        cos = []
        for _, row in data.iterrows():
            el1 = np.mean(tokens(row['ЭталоннаяПозиция'], model, param_test[0]), axis=0)
            el2 = np.mean(tokens(row['Номенклатура'], model, param_test[0]), axis=0)
            cos.append(1 - scipy.spatial.distance.cosine(el1, el2))
        print(len(cos))
        mean = np.mean(cos)
        print('{0} = {1}'.format(i, k), param_test, '\n', 'Similarity = {0}'.format(mean))
        simil.append(mean)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

379
size = 25 [25, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.9280106619442456
379
size = 50 [50, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.9098054833299476
379
size = 100 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.9026454101055017
379
size = 200 [200, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.901944495173828
379
size = 400 [400, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.9036644331943059
379
size = 800 [800, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.9044860928459704
379
window = 1 [100, 1, 5, 5, 0.025, 0.001] 
 Similarity = 0.9073514897341098
379
window = 2 [100, 2, 5, 5, 0.025, 0.001] 
 Similarity = 0.9072034939545546
379
window = 3 [100, 3, 5, 5, 0.025, 0.001] 
 Similarity = 0.9058057921008699
379
window = 4 [100, 4, 5, 5, 0.025, 0.001] 
 Similarity = 0.9037604111944467
379
window = 5 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.9033542412561906
379
window = 5 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.9025733900469253
379
window = 7 [100, 7, 5, 5, 0.025, 0.001] 
 Similarity = 0.90214415

  dist = 1.0 - uv / np.sqrt(uu * vv)


379
min_count = 20 [100, 5, 20, 5, 0.025, 0.001] 
 Similarity = nan
379
min_count = 50 [100, 5, 50, 5, 0.025, 0.001] 
 Similarity = nan
379
min_count = 100 [100, 5, 100, 5, 0.025, 0.001] 
 Similarity = nan
379
min_count = 200 [100, 5, 200, 5, 0.025, 0.001] 
 Similarity = nan
379
min_count = 400 [100, 5, 400, 5, 0.025, 0.001] 
 Similarity = nan
379
min_count = 800 [100, 5, 800, 5, 0.025, 0.001] 
 Similarity = nan
379
min_count = 1000 [100, 5, 1000, 5, 0.025, 0.001] 
 Similarity = nan
379
min_count = 1200 [100, 5, 1200, 5, 0.025, 0.001] 
 Similarity = nan
379
min_count = 2400 [100, 5, 2400, 5, 0.025, 0.001] 
 Similarity = nan
379
negative = 1 [100, 5, 5, 1, 0.025, 0.001] 
 Similarity = 0.8957014053492978
379
negative = 2 [100, 5, 5, 2, 0.025, 0.001] 
 Similarity = 0.9053314800615279
379
negative = 3 [100, 5, 5, 3, 0.025, 0.001] 
 Similarity = 0.9055048112166174
379
negative = 5 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.9029581469619804
379
negative = 8 [100, 5, 5, 8, 0.025, 0.001] 
 

## Внешняя оценка векторов

Внешняя оценка проводится с помощью заранее обученного классификатора присвоения бухгалтерского счета. Во время тестирование намеренно не будет поизводится тонкая настройка классификатора и дообучение векторов, в связи с тем чтобы более наглядно продемонстрировать зависимость качетсва классификатора от векторов.

Оценивание классификатора проводится по F1-score методу

In [7]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from random import shuffle

classif_model = LinearSVC()

models = w2v(words, size=def_param['size'], window=def_param['window'], min_count=def_param['min_count'], sg=1,
             negative=def_param['negative'], alpha=def_param['alpha'], sample=def_param['alpha'])

nomen = pd.read_csv('./Data/nomenklatura.csv', sep=';',
                    encoding='cp1251', error_bad_lines=False, low_memory=False)[['FullName', 'Count']]

X = []
Y = []
f1_score_test = []
params = [j for j in def_param.values()]
for i in tq(def_param.keys()):
    index = param_to_index[i]
    for k in set_param[i]:
        param_test = params.copy()
        param_test[index] = k
        model = w2v(words, size=param_test[0], window=param_test[1], min_count=param_test[2], sg=1, negative=param_test[3],
                    alpha=param_test[4], sample=param_test[5])
        X = []
        Y = []
        for _, row in nomen.iterrows():
            X.append(np.mean(tokens(row['FullName'], model, size=param_test[0]), axis=0))
            if row['Count'] == '10.11.2001':
                row['Count'] = '10,11,01'
            if row['Count'] == '10.11.2002':
                row['Count'] = '10,11,02'
            if row['Count'] == '10.мар':
                row['Count'] = '10,03'
            if row['Count'] == '10.окт':
                row['Count'] = '10,09'
            Y.append(row['Count'])

        shuffle(X)
        shuffle(Y)

        X_train = X[:3150]
        X_valid = X[3150:4500]

        Y_train = Y[:3150]
        Y_valid = Y[3150:4500]


        classif_model.fit(X_train, Y_train)
        f1_param = f1_score(np.array(Y_valid), classif_model.predict(X_valid), average='macro')
        print('{0} = {1}'.format(i, k), param_test, '\n', 'Similarity = {0}'.format(f1_param))
        f1_score_test.append(f1_param)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

  'precision', 'predicted', average, warn_for)


size = 25 [25, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.04568114274757934


  'precision', 'predicted', average, warn_for)


size = 50 [50, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.04709618094592274


  'precision', 'predicted', average, warn_for)


size = 100 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.05716289541474342


  'precision', 'predicted', average, warn_for)


size = 200 [200, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.05565298823847951


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


size = 400 [400, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.06193799789917489


  'precision', 'predicted', average, warn_for)


size = 800 [800, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.05735572053460841


  'precision', 'predicted', average, warn_for)


window = 1 [100, 1, 5, 5, 0.025, 0.001] 
 Similarity = 0.054129716760425044


  'precision', 'predicted', average, warn_for)


window = 2 [100, 2, 5, 5, 0.025, 0.001] 
 Similarity = 0.0552043606214333


  'precision', 'predicted', average, warn_for)


window = 3 [100, 3, 5, 5, 0.025, 0.001] 
 Similarity = 0.05009777862406339


  'precision', 'predicted', average, warn_for)


window = 4 [100, 4, 5, 5, 0.025, 0.001] 
 Similarity = 0.058029893191591644


  'precision', 'predicted', average, warn_for)


window = 5 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.05912312102788402


  'precision', 'predicted', average, warn_for)


window = 5 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.05563461480406011


  'precision', 'predicted', average, warn_for)


window = 7 [100, 7, 5, 5, 0.025, 0.001] 
 Similarity = 0.05846893727736866


  'precision', 'predicted', average, warn_for)


window = 8 [100, 8, 5, 5, 0.025, 0.001] 
 Similarity = 0.05628870388665333


  'precision', 'predicted', average, warn_for)


min_count = 0 [100, 5, 0, 5, 0.025, 0.001] 
 Similarity = 0.06683637798732309


  'precision', 'predicted', average, warn_for)


min_count = 5 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.0602970902149829


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


min_count = 10 [100, 5, 10, 5, 0.025, 0.001] 
 Similarity = 0.05156934539863536


  'precision', 'predicted', average, warn_for)


min_count = 20 [100, 5, 20, 5, 0.025, 0.001] 
 Similarity = 0.054828865444060675


  'precision', 'predicted', average, warn_for)


min_count = 50 [100, 5, 50, 5, 0.025, 0.001] 
 Similarity = 0.056398957922941466


  'precision', 'predicted', average, warn_for)


min_count = 100 [100, 5, 100, 5, 0.025, 0.001] 
 Similarity = 0.05132894018893602


  'precision', 'predicted', average, warn_for)


min_count = 200 [100, 5, 200, 5, 0.025, 0.001] 
 Similarity = 0.05527746564893318


  'precision', 'predicted', average, warn_for)


min_count = 400 [100, 5, 400, 5, 0.025, 0.001] 
 Similarity = 0.05997719853491226


  'precision', 'predicted', average, warn_for)


min_count = 800 [100, 5, 800, 5, 0.025, 0.001] 
 Similarity = 0.04811518986975668


  'precision', 'predicted', average, warn_for)


min_count = 1000 [100, 5, 1000, 5, 0.025, 0.001] 
 Similarity = 0.04560956939688282


  'precision', 'predicted', average, warn_for)


min_count = 1200 [100, 5, 1200, 5, 0.025, 0.001] 
 Similarity = 0.048628927576235687


  'precision', 'predicted', average, warn_for)


min_count = 2400 [100, 5, 2400, 5, 0.025, 0.001] 
 Similarity = 0.05493025723732925


  'precision', 'predicted', average, warn_for)


negative = 1 [100, 5, 5, 1, 0.025, 0.001] 
 Similarity = 0.057187251814393616


  'precision', 'predicted', average, warn_for)


negative = 2 [100, 5, 5, 2, 0.025, 0.001] 
 Similarity = 0.05855935136568866


  'precision', 'predicted', average, warn_for)


negative = 3 [100, 5, 5, 3, 0.025, 0.001] 
 Similarity = 0.05925096101314759


  'precision', 'predicted', average, warn_for)


negative = 5 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.05035918244465253


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


negative = 8 [100, 5, 5, 8, 0.025, 0.001] 
 Similarity = 0.05312220483741942


  'precision', 'predicted', average, warn_for)


negative = 10 [100, 5, 5, 10, 0.025, 0.001] 
 Similarity = 0.06512146318461651


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


negative = 15 [100, 5, 5, 15, 0.025, 0.001] 
 Similarity = 0.05865565904142566


  'precision', 'predicted', average, warn_for)


alpha = 0.0125 [100, 5, 5, 5, 0.0125, 0.001] 
 Similarity = 0.05333181239133442


  'precision', 'predicted', average, warn_for)


alpha = 0.025 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.05675586187230006


  'precision', 'predicted', average, warn_for)


alpha = 0.05 [100, 5, 5, 5, 0.05, 0.001] 
 Similarity = 0.05828684722591392


  'precision', 'predicted', average, warn_for)


alpha = 0.01 [100, 5, 5, 5, 0.01, 0.001] 
 Similarity = 0.048349672887079725


  'precision', 'predicted', average, warn_for)


sample = 0 [100, 5, 5, 5, 0.025, 0] 
 Similarity = 0.0498261723219573


  'precision', 'predicted', average, warn_for)


sample = 0.1 [100, 5, 5, 5, 0.025, 0.1] 
 Similarity = 0.0647797674875594


  'precision', 'predicted', average, warn_for)


sample = 0.01 [100, 5, 5, 5, 0.025, 0.01] 
 Similarity = 0.061665397238807464


  'precision', 'predicted', average, warn_for)


sample = 0.001 [100, 5, 5, 5, 0.025, 0.001] 
 Similarity = 0.055096356246362575


  'precision', 'predicted', average, warn_for)


sample = 0.0001 [100, 5, 5, 5, 0.025, 0.0001] 
 Similarity = 0.04915294244205953


  'precision', 'predicted', average, warn_for)


sample = 1e-05 [100, 5, 5, 5, 0.025, 1e-05] 
 Similarity = 0.044384057971014496


  'precision', 'predicted', average, warn_for)


sample = 1e-06 [100, 5, 5, 5, 0.025, 1e-06] 
 Similarity = 0.04695304695304696


  'precision', 'predicted', average, warn_for)


sample = 1e-07 [100, 5, 5, 5, 0.025, 1e-07] 
 Similarity = 0.04590758079130172


  'precision', 'predicted', average, warn_for)


sample = 1e-08 [100, 5, 5, 5, 0.025, 1e-08] 
 Similarity = 0.0426313855200294


  'precision', 'predicted', average, warn_for)


sample = 1e-09 [100, 5, 5, 5, 0.025, 1e-09] 
 Similarity = 0.0538961038961039
