In [1]:
import json
import numpy as np
import torch


from transformers import BertModel
from transformers import BertTokenizer
from scipy import sparse
from pymystem3 import Mystem
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import KeyedVectors


m = Mystem()
tokenizer = RegexpTokenizer(r'\w+')
count_vectorizer = CountVectorizer()
tf_vectorizer = TfidfVectorizer(use_idf=False)
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

b_tokenizer = BertTokenizer.from_pretrained('sberbank-ai/sbert_large_nlu_ru')
model = BertModel.from_pretrained('sberbank-ai/sbert_large_nlu_ru')


def get_bert_corpus(texts):
  vectors = []
  for text in texts:
    input_sentence = torch.tensor(b_tokenizer.encode("[CLS] " + text)).unsqueeze(0)
    out = model(input_sentence)
    embeddings_of_last_layer = out[0]
    cls_embeddings = embeddings_of_last_layer[0][0].detach().numpy()
    vectors.append(cls_embeddings)
  
  return sparse.csr_matrix(vectors)

def get_query_bert():
    query = input('Введите ваш запрос: ')
    query = preprocessing([query])
    input_sentence = torch.tensor(b_tokenizer.encode("[CLS] " + query[0])).unsqueeze(0)
    out = model(input_sentence)
    embeddings_of_last_layer = out[0]
    cls_embeddings = embeddings_of_last_layer[0][0].detach().numpy()
    return sparse.csr_matrix(cls_embeddings)

def preprocessing(texts):
    preprocessed_texts = []
    for text in texts:
        text_stripped = text.rstrip()
        text_stripped = ' '.join(tokenizer.tokenize(text_stripped.lower()))
        lemmas = m.lemmatize(text_stripped)
        lemmas = [w for w in lemmas if not w.isdigit() and w != ' ']
        preprocessed_texts.append(' '.join(lemmas))

    return preprocessed_texts

In [3]:
preprocessing(['слово'])

['слово \n']