In [None]:
!pip install -U pip setuptools wheel
!pip install transformers

In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import spacy


class Place:
  '''
  A set of places (or text prompts), each described by one tag string,
  plus the mean-pooled embeddings computed for those tags.
  '''

  def __init__(self,  tags, vec_model, text_tokenizer):
    '''
    tags = tags for each place or text prompt
    vec_model = Language Model for getting tags embeddings; called as
                vec_model(**encoded), element 0 of its output is read as the
                per-token embeddings
    text_tokenizer = callable that turns tags into the keyword tensors
                     expected by vec_model (must include 'attention_mask')
    '''
    self.vec_model = vec_model
    self.text_tokenizer = text_tokenizer
    self.tags = tags
    # Empty placeholder until compute_embeddings() is called.
    self.embs = torch.tensor([])


  def compute_embeddings(self, is_return=False):
    '''
    Tokenize self.tags, run the model without gradients and store one
    mean-pooled vector per tag in self.embs.

    The leading (per-tag) dimension is always kept, so a single tag yields a
    one-row matrix rather than a bare vector. Downstream code can therefore
    rely on embs.shape[0] being the number of tags.

    is_return = when True the embeddings are also returned, otherwise the
                method returns None
    '''
    tensor_tags = self.text_tokenizer(
      self.tags, padding=True, truncation=True, max_length=24, return_tensors='pt'
    )
    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
      model_out = self.vec_model(**tensor_tags)
    # Deliberately no .squeeze(): it would collapse the batch axis when there
    # is exactly one tag and change the meaning of shape[0].
    self.embs = self._mean_pooling(model_out, tensor_tags['attention_mask'])
    if is_return:
      return self.embs


  def _mean_pooling(self, model_output, attention_mask):
    '''
    Average the token embeddings of every sequence, ignoring masked positions.

    model_output = model result whose first element holds all token embeddings
    attention_mask = mask with 1 for real tokens and 0 for padding; it is
                     expanded along a new last axis to the embeddings' shape
    '''
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Padding positions are multiplied by 0 and so do not contribute to the sum.
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp guards against division by zero for a fully masked sequence.
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

In [None]:
class User:
  '''
  A user described by interest tags and a free-text prompt.

  On construction both are embedded with the given model and averaged into
  self.user_embs, which get_topk_rec() compares against place embeddings.
  '''

  def __init__(self,  tags, prompt, vec_model, text_tokenizer, liked_history=None):
    '''
    tags = user interest tags (tokenized with max_length=24)
    prompt = free-text description of what the user wants (max_length=50)
    vec_model = Language Model used to embed tags and prompt
    text_tokenizer = tokenizer matching vec_model
    liked_history = optional history of liked items; only stored on the instance
    '''
    self.vec_model = vec_model
    self.text_tokenizer = text_tokenizer
    self.prompt = prompt
    self.tags = tags
    self.cos = torch.nn.CosineSimilarity(dim=-1)
    self.liked_history = liked_history
    embs_tags = self._embed(tags, max_length=24)
    embs_prompt = self._embed(prompt, max_length=50)
    # Both operands keep their leading batch axis, so a single prompt row is
    # broadcast over every tag row and user_embs stays 2-D even for one tag.
    self.user_embs = (embs_tags + embs_prompt) / 2


  def get_topk_rec(self, idx, place_embeddings, k=5):
    '''
    Rank places by cosine similarity to one of this user's embeddings.

    idx = row of self.user_embs used as the query vector
    place_embeddings = tensor with one place vector per row; a single 1-D
                       vector is treated as one place, an empty tensor as none
    k = maximum number of places to return; values larger than the number of
        places are clamped to it

    Returns a 1-D tensor of row indices into place_embeddings, most similar
    first. Raises ValueError if k is negative.
    '''
    if k < 0:
      raise ValueError(f"k must be non-negative, got {k}")
    if place_embeddings.numel() == 0:
      # Nothing to rank (e.g. embeddings were never computed).
      return torch.empty(0, dtype=torch.long, device=place_embeddings.device)
    if place_embeddings.dim() == 1:
      # A lone vector: restore the place axis so shape[0] counts places.
      place_embeddings = place_embeddings.unsqueeze(0)

    query = self.user_embs[idx]
    similarities = self.cos(query, place_embeddings)
    # Clamp explicitly instead of catching topk's error for k > number of places.
    top_k = min(k, similarities.shape[0])
    return torch.topk(similarities, top_k).indices


  def _embed(self, texts, max_length):
    '''Tokenize texts and return one mean-pooled embedding row per text.'''
    encoded = self.text_tokenizer(
      texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt'
    )
    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
      model_out = self.vec_model(**encoded)
    return self._mean_pooling(model_out, encoded['attention_mask'])


  def _mean_pooling(self, model_output, attention_mask):
    '''
    Average the token embeddings of every sequence, ignoring masked positions.

    model_output = model result whose first element holds all token embeddings
    attention_mask = mask with 1 for real tokens and 0 for padding
    '''
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp guards against division by zero for a fully masked sequence.
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so the
# token ids produced by one match the vocabulary expected by the other.
MODEL_NAME = "ai-forever/sbert_large_nlu_ru"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
sbert_model = AutoModel.from_pretrained(MODEL_NAME)

In [None]:
# Three candidate places, each described by a single tag string.
sm_places = Place(
  tags=["Спортивный зал", "Музей поэтов", "Театр искусств"],
  vec_model=sbert_model,
  text_tokenizer=tokenizer,
)

# A sample user: two interest tags plus one free-text prompt.
sm_man = User(
  tags=["поэты", "Пушкин"],
  prompt=["Мне очень нравится творчество Пушкина"],
  vec_model=sbert_model,
  text_tokenizer=tokenizer,
)

In [None]:
# Embed the places, then rank them against the user's first embedding row.
place_embs = sm_places.compute_embeddings(is_return=True)
# k=5 is more than the three places available, so all of them come back ranked.
sm_man.get_topk_rec(idx=0, place_embeddings=place_embs, k=5)

In [None]:
# Inspect the embedding matrix: expected to have one row per place tag
# (three here) and the model's hidden size as the second dimension.
place_embs.shape