In [None]:
!pip install transformers
!pip install nmslib

#### Пример: семантическая близость

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch


# Mean pooling: masked average of token embeddings, so padding does not skew the result.
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked positions.

    Args:
        model_output: Indexable model result whose element ``[0]`` holds the
            per-token embeddings (presumably ``(batch, seq_len, hidden)`` for
            a Hugging Face encoder -- confirm against the model in use).
        attention_mask: Tensor that, after a trailing dimension is appended,
            can be expanded to the shape of the token embeddings; non-zero
            entries mark tokens that take part in the average.

    Returns:
        Tensor of masked sums over dim 1 divided by the per-row mask totals.
        The divisor is clamped to at least 1e-9, so a fully masked row
        produces zeros instead of a division by zero.
    """
    token_embeddings = model_output[0]

    # Broadcast the mask over the embedding dimension and switch to float so
    # it can act as a per-token weight.
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    weighted_sum = (token_embeddings * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / weight_total

In [None]:
# Demo corpus: four Russian sentences followed by one English sentence.
# Later cells index every entry except the last one and use the last one
# as the search query.
sentences = [
    'Позовите оператора поддержки, бот не помог',
    'Нужна помощь человека для решения вопроса',
    'Мне нужна новая карта',
    'Хотел бы выпустить ещё одну карточку',
    'So, the robot was useless I need a human expert',
]

# Single source of truth for the checkpoint id, so the tokenizer and the
# model can never be loaded from two different checkpoints by accident.
SIMILARITY_MODEL_NAME = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'

tokenizer = AutoTokenizer.from_pretrained(SIMILARITY_MODEL_NAME)
model = AutoModel.from_pretrained(SIMILARITY_MODEL_NAME)

In [None]:
# Tokenize the whole batch in one call, with padding and truncation enabled,
# and request PyTorch tensors.
encoded_input = tokenizer(
    sentences,
    return_tensors='pt',
    truncation=True,
    padding=True,
)

# Inference only: run the forward pass and the pooling without gradient tracking.
with torch.no_grad():
    model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [None]:
# Show the mean-pooled embeddings as this cell's output for a quick visual check.
sentence_embeddings

In [None]:
import nmslib

# HNSW index over the 'cosinesimil' space.
index = nmslib.init(space='cosinesimil', method='hnsw')

# Index every embedding except the last one (kept aside as the query).
# Ids are the row positions, so they can be used directly as indices
# into `sentences`.
index.addDataPointBatch(
    sentence_embeddings[:-1],
    ids=list(range(len(sentence_embeddings) - 1)),
)

index.createIndex({'post': 2}, print_progress=True)

In [None]:
# Query the index with the last row of `sentence_embeddings`, which was left
# out when the index was populated ([:-1]). Up to k=10 neighbours are
# requested; the result is unpacked into neighbour ids and their distances in
# the 'cosinesimil' space chosen at index creation.
# NOTE(review): the index holds fewer than 10 points, so presumably fewer
# results come back -- confirm against the nmslib docs.
ids, distances = index.knnQuery(sentence_embeddings[-1], k=10)

In [None]:
# Print each neighbour's sentence next to its distance, in the order the
# query returned them.
for neighbour_id, distance in zip(ids, distances):
    print(sentences[neighbour_id], '\t', distance)

#### Пример: генерация текста

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Single source of truth for the checkpoint id, so the tokenizer and the
# model are always loaded from the same checkpoint.
GENERATION_MODEL_NAME = 'sberbank-ai/rugpt3small_based_on_gpt2'

# NOTE: this rebinds `tokenizer` and `model`, which the semantic-similarity
# example above also uses. Re-run that example's loading cell before
# re-running any of its later cells.
tokenizer = GPT2Tokenizer.from_pretrained(GENERATION_MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(GENERATION_MODEL_NAME)

In [None]:
# Prompt to be continued by the language model.
text = 'Как же жить хорошо!'
input_ids = tokenizer.encode(text, return_tensors='pt')

# Deterministic beam search (do_sample=False, num_beams=5).
# Sampling-only knobs (top_k, top_p, temperature) are deliberately not
# passed: they have no effect when do_sample=False and only produce
# "flag is only used in sample-based generation modes" warnings.
tokens = model.generate(
    input_ids,
    max_length=64,           # upper bound on prompt + continuation, in tokens
    do_sample=False,
    num_beams=5,
    repetition_penalty=5.0,  # penalise tokens that were already generated
    no_repeat_ngram_size=4,  # forbid repeating any 4-gram
)

# Decode every returned sequence back to text.
print([tokenizer.decode(t) for t in tokens])