In [1]:
import torch.nn.functional as F
import pandas as pd
from tqdm import trange
from IPython.display import clear_output
import os

import torch
from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-base')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-base')
model.to(device)

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=Tru

In [2]:
!nvidia-smi

Fri Dec  8 11:32:20 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  On   | 00000000:03:00.0 Off |                    0 |
| N/A   45C    P0    38W / 250W |   2132MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  On   | 00000000:03:01.0 Off |                    0 |
| N/A   33C    P0    34W / 250W |      2MiB / 40960MiB |      0%      Default |
|       

In [3]:
df = pd.read_csv('hh_database_clear.csv', index_col=0)
df['query'] = ['query: '] * df.shape[0]
df['Text'] = df['query'] + df['Ищет работу на должность:'] + '. ' + df['Занятость'] + '. ' + df['График'] + '. ' + df['Опыт работы'] + \
    '. ' + df['Последнее/нынешнее место работы'] + '. ' + df['Последняя/нынешняя должность'] + '. ' + df['Образование и ВУЗ'] + \
        '. ' + df['Age'].to_string() + '. ' + df['Sex'] + '. ' + df['Money'].to_string() + '. ' + df['Rate']
list_text = df['Text'].to_list()

In [5]:
BATCH_SIZE = 32
FLAG = False

In [10]:
torch.cuda.empty_cache()
for i in trange(len(list_text) // BATCH_SIZE):
    if os.path.exists('embed_inter.csv') and FLAG:
        dataframe = pd.read_csv('embed_inter.csv', index_col=0)
        i = dataframe.shape[0] // BATCH_SIZE
        FLAG = False
        continue
    if i == 0:
        input_texts = list_text[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
        batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)
        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        embeddings = F.normalize(embeddings, p=2, dim=1)
        dataframe = pd.DataFrame(embeddings.cpu().detach().numpy())
        del embeddings
        print(dataframe.shape)
        torch.cuda.empty_cache()
        continue
    input_texts = list_text[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)
    outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1) 
    dataframe = pd.concat([dataframe, pd.DataFrame(embeddings.cpu().detach().numpy()).reset_index(drop=True)], ignore_index=True, axis=0)
    print(dataframe.shape)
    if i % 100 == 0:
        clear_output()
        dataframe.to_csv('embed_inter.csv')
    del embeddings
    torch.cuda.empty_cache()
clear_output()
input_texts = list_text[(i+1) * BATCH_SIZE:]
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)
outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
dataframe = pd.concat([dataframe, pd.DataFrame(embeddings.cpu().detach().numpy()).reset_index(drop=True)], ignore_index=True, axis=0)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [16]:
dataframe.to_csv('embed_e5.csv')