In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm, trange
from IPython.display import clear_output

In [2]:
# IPython shell escape: print the NVIDIA driver / GPU status table for this machine.
!nvidia-smi

Thu Dec  7 15:20:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  On   | 00000000:03:00.0 Off |                    0 |
| N/A   44C    P0    35W / 250W |      2MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  On   | 00000000:03:01.0 Off |                    0 |
| N/A   32C    P0    34W / 250W |      2MiB / 40960MiB |      0%      Default |
|       

In [3]:
# Use the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# The tokenizer and the encoder are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
# `.to()` returns the module itself, so the load and the move can be chained.
model = AutoModel.from_pretrained("cointegrated/rubert-tiny").to(device)
# Trailing expression: display the loaded module's structure as the cell output.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(29564, 312, padding_idx=0)
    (position_embeddings): Embedding(512, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-2): 3 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

In [5]:
def embed_bert_cls(text, model, tokenizer):
    """Embed text with `model`, keeping the hidden state at sequence position 0.

    Args:
        text: Input forwarded unchanged to `tokenizer` (a string or a batch of
            strings, judging by the call sites in this notebook).
        model: Callable encoder exposing a `.device` attribute and returning an
            object with a `last_hidden_state` tensor indexed as
            (batch, position, feature).
        tokenizer: Callable returning a mapping of tensors when invoked with
            `padding=True, truncation=True, return_tensors='pt'`.

    Returns:
        A NumPy array with one row per input. Each row is the position-0 hidden
        state (presumably the [CLS] token, per the function name), rescaled by
        `torch.nn.functional.normalize` with its default arguments.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Move every tokenizer tensor onto the same device as the model.
    inputs = {}
    for name, tensor in encoded.items():
        inputs[name] = tensor.to(model.device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

    first_token = hidden_states[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(first_token)
    return unit_vectors.cpu().numpy()

In [3]:
# Read the resume table; the first CSV column becomes the row index.
df = pd.read_csv(
    'hh_database_clear.csv',
    index_col=0,
)

In [4]:
# Build one free-text description per row by joining the resume fields with '. '.
#
# 'Age' and 'Money' are converted element-wise with `.astype(str)`.
# `Series.to_string()` must NOT be used here: it renders the *entire* column as
# a single string, which would then be appended to every row instead of that
# row's own value.
df['Text'] = (
    df['Ищет работу на должность:'] + '. '
    + df['Занятость'] + '. '
    + df['График'] + '. '
    + df['Опыт работы'] + '. '
    + df['Последнее/нынешнее место работы'] + '. '
    + df['Последняя/нынешняя должность'] + '. '
    + df['Образование и ВУЗ'] + '. '
    + df['Age'].astype(str) + '. '
    + df['Sex'] + '. '
    + df['Money'].astype(str) + '. '
    + df['Rate']
)

In [5]:
# Plain Python list of the per-row texts, in DataFrame row order.
list_text = df.loc[:, 'Text'].to_list()

In [9]:
# Sanity check: number of texts in `list_text`.
len(list_text)

44423

In [9]:
# Number of texts sliced from `list_text` for each `embed_bert_cls` call.
BATCH_SIZE = 1024

In [10]:
# Embed `list_text` in slices of BATCH_SIZE and stack the results row-wise.
#
# Stepping the range by BATCH_SIZE covers every input, including a final
# shorter slice, without a separate "remainder" step. It never produces an
# empty slice, and it works when there are fewer than BATCH_SIZE texts.
# Per-batch frames are collected in a list and concatenated once at the end;
# concatenating inside the loop re-copies all earlier rows on every iteration.
batch_frames = []
for start in trange(0, len(list_text), BATCH_SIZE):
    embed = embed_bert_cls(list_text[start:start + BATCH_SIZE], model, tokenizer)
    batch_frames.append(pd.DataFrame(embed))
clear_output()  # drop the progress bar from the saved cell output

if batch_frames:
    dataframe = pd.concat(batch_frames, ignore_index=True, axis=0)
else:
    # `pd.concat` raises on an empty list; no texts means an empty result.
    dataframe = pd.DataFrame()

In [11]:
# Persist the embedding table (row index included) next to the notebook.
dataframe.to_csv(path_or_buf='embed.csv')