In [1]:
import os
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer, util
from numpy import linalg as LA
import re
import pymorphy2
import math

In [2]:
class Transformer:
    """Nearest-question retriever built on a sentence-embedding model.

    On construction the question/answer spreadsheet is loaded, every entry of
    its 'вопрос' column is embedded once, and the embedding norms are cached
    so that each query only needs one matrix-vector product.
    """

    def __init__(self, name, sample_path='augmented_sample.xlsx'):
        """Load the sample table and pre-compute its embeddings.

        Args:
            name: Model identifier passed to ``SentenceTransformer``.
            sample_path: Excel file with the reference questions; it must
                contain a 'вопрос' column. Defaults to the file the notebook
                originally hard-coded.
        """
        self.df = pd.read_excel(sample_path)
        self.model = SentenceTransformer(name)
        # Hand the encoder a plain list: positional access into a Series
        # only works while the frame keeps its default integer index.
        self.sample_encoded = self.model.encode(self.df['вопрос'].tolist())
        self.morph = pymorphy2.MorphAnalyzer()
        # Replacement table keyed by the normal form of a token.
        self.dct = {'мат': 'материальная',
                    'студак': 'студенческий билет',
                    'вышка' : 'НИУ ВШЭ',
                    'академ' : 'академический отпуск',
                    'труба' : 'телефон',
                    'учебок' : 'учебный офис МИЭМ',
                    }
        self.sample_norms = LA.norm(self.sample_encoded, axis=1)

    def preprocessing(self, message):
        """Keep only Cyrillic words and expand the ones listed in ``self.dct``.

        Each word is looked up in ``self.dct`` by the normal form of its first
        morphological parse; words without an entry are kept unchanged.

        Args:
            message: Raw user text.

        Returns:
            The surviving words joined by single spaces ('' when the message
            contains no Cyrillic letters).
        """
        # 'ё' and 'Ё' sit outside the а-я / А-Я ranges, so both must be listed
        # explicitly; otherwise a word such as 'Ёлка' is cut down to 'лка'.
        russian_terms = re.findall(r'[а-яА-ЯёЁ]+', message)
        return " ".join(
            self.dct.get(self.morph.parse(term)[0].normal_form, term)
            for term in russian_terms
        )

    def get_best_three(self, question, threshold=0.7, n=3):
        """Return the ``n`` sample rows most similar to ``question``.

        Args:
            question: Raw user text; it is run through ``preprocessing`` first.
            threshold: Minimum cosine similarity the best match must reach.
            n: Number of rows to return (3 by default, hence the name).

        Returns:
            ``(rows, indices)`` where ``rows`` is ``self.df.iloc[indices].values``
            and ``indices`` are row positions ordered from most to least
            similar, or ``None`` when nothing usable remains after
            preprocessing, the sample is empty, or the best similarity is
            below ``threshold``.
        """
        question = self.preprocessing(question)
        if not question:
            # Nothing Cyrillic survived; the embedding of an empty string is
            # not a meaningful query.
            return None
        vect_quest = self.model.encode(question)
        # Dot product of the query with every sample embedding; negative
        # scores are clipped to zero, as before.
        dots = np.clip(np.asarray(self.sample_encoded) @ np.asarray(vect_quest), 0, None)
        denom = self.sample_norms * LA.norm(vect_quest)
        # Zero-norm vectors get similarity 0 instead of NaN/inf, which would
        # otherwise slip past the threshold comparison below.
        cosine_similarities = np.divide(
            dots, denom, out=np.zeros(dots.shape, dtype=float), where=denom > 0
        )
        indices = (-cosine_similarities).argsort()[:n]
        coefs = cosine_similarities[indices]
        if coefs.size == 0 or coefs[0] < threshold:
            return None
        return self.df.iloc[list(indices), :].values, indices

In [3]:
# Reference table; aggregated answers are looked up in it by row position.
df = pd.read_excel('augmented_sample.xlsx')

# One retriever per sentence-embedding checkpoint, built in the listed order.
pretrained_pavlov, pretrained_qa, pretrained_tiny = (
    Transformer(checkpoint)
    for checkpoint in (
        'DeepPavlov/rubert-base-cased-sentence',
        'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
        'cointegrated/rubert-tiny',
    )
)

Some weights of the model checkpoint at C:\Users\masli/.cache\torch\sentence_transformers\cointegrated_rubert-tiny were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def get_aggregated_results(message, df, *models):
    """Merge the top matches of several models into one short list of rows.

    Every model is asked once via ``model.get_best_three(message)``. Models
    that return ``None`` abstain. If fewer than half of the models (rounded
    up) answer, the function gives up. Otherwise the candidates are merged
    rank by rank: all first-place row indices, then all second-place ones,
    and so on. Within a rank, indices are in ascending order. Duplicates are
    dropped, and the first ``k`` distinct indices are kept, where ``k`` is the
    number of hits each model returned.

    Args:
        message: Raw user question.
        df: DataFrame whose rows the returned indices refer to (positional).
        *models: Objects exposing ``get_best_three(message)`` that returns
            ``None`` or a ``(rows, indices)`` pair.

    Returns:
        ``df.iloc[selected, :].values`` for the selected row positions, or
        ``None`` when no models were given or too few of them answered.
    """
    # Query each model exactly once; every call encodes the message again.
    answers = []
    for model in models:
        best = model.get_best_three(message)
        if best is not None:
            answers.append(np.asarray(best[1]))

    # `not answers` also covers the zero-model call, where vstack would raise.
    if not answers or len(answers) < math.ceil(len(models) / 2):
        return None

    stacked = np.vstack(answers)
    answer_num = stacked.shape[1]

    # Walk the ranks in order and keep each row index only the first time it
    # appears, so one row cannot occupy several slots of the result.
    result = []
    seen = set()
    for rank in range(answer_num):
        for idx in np.unique(stacked[:, rank]):
            idx = int(idx)
            if idx not in seen:
                seen.add(idx)
                result.append(idx)

    return df.iloc[result[:answer_num], :].values

In [5]:
# Demo query: merge the top hits of all three retrievers for one question.
get_aggregated_results(
    'что такое учебный офис',
    df,
    pretrained_pavlov,
    pretrained_qa,
    pretrained_tiny,
)

array([['адрес учебного офиса', 'информация УО',
        '123458, г. Москва, ул. Таллинская, д.34', nan, nan],
       ['часы работы учебного офиса', 'информация УО',
        'Посетить Учебный офис лично можно с понедельника по пятницу с 10:00 до 17:00 (за исключением праздничных и выходных дней)',
        nan, nan],
       ['кабинет руководителя учебного офиса', 'информация УО',
        'кабинет руководителя учебного офиса - к.315', nan, nan]],
      dtype=object)