В данном ноутбуке попробуем получить эмбеддинги для текстов постов с помощью модели BERT.

In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

from warnings import filterwarnings

filterwarnings('ignore')

In [2]:
from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel

def get_model(model_name):
    """Load the tokenizer and pretrained encoder for one of the supported models.

    Args:
        model_name: One of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        A ``(tokenizer, model)`` pair, both loaded from the same checkpoint.

    Raises:
        AssertionError: If ``model_name`` is not one of the supported names.
    """
    supported = ('bert', 'roberta', 'distilbert')
    assert model_name in supported

    # model name -> (model class, checkpoint on the Hugging Face Hub)
    registry = {
        'bert': (BertModel, 'bert-base-cased'),  # https://huggingface.co/bert-base-cased
        'roberta': (RobertaModel, 'roberta-base'),  # https://huggingface.co/roberta-base
        'distilbert': (DistilBertModel, 'distilbert-base-cased'),  # https://huggingface.co/distilbert-base-cased
    }
    model_cls, checkpoint = registry[model_name]

    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    loaded_model = model_cls.from_pretrained(checkpoint)
    return loaded_tokenizer, loaded_model

Объявляем и скачиваем модель BERT.

In [3]:
# Load the tokenizer and pretrained weights for the 'bert' entry of get_model
# (the 'bert-base-cased' checkpoint); files are downloaded when not already cached.
tokenizer, model = get_model('bert')

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [4]:
from torch.utils.data import DataLoader

In [5]:
from tqdm import tqdm

@torch.inference_mode()
def get_embeddings_labels(model, loader):
    """Embed every batch from ``loader`` and collect the matching labels.

    The model is put into eval mode and run under ``torch.inference_mode``.
    Each sequence is represented by the last-layer hidden state at position 0
    (the ``[CLS]`` token for BERT-style inputs).

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output can be indexed with ``'last_hidden_state'``.
        loader: Iterable of batches; each batch is a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        ``(embeddings, labels)``: the per-batch embeddings concatenated along
        dim 0 and moved to the CPU, and the labels concatenated along dim 0
        with a trailing axis of size 1 added, cast to ``float32``.
    """
    model.eval()

    # Send inputs to wherever the model actually lives instead of relying on a
    # notebook-level `device` variable that may be undefined or out of sync.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    total_embeddings = []
    labels = []

    for batch in tqdm(loader):
        # Labels never touch the model, so they stay on the CPU.
        labels.append(batch['labels'].unsqueeze(1))

        inputs = {key: batch[key].to(model_device) for key in ('attention_mask', 'input_ids')}

        first_token_state = model(**inputs)['last_hidden_state'][:, 0, :]

        # Move results off the accelerator right away to keep its memory free.
        total_embeddings.append(first_token_state.cpu())

    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)

In [6]:
from transformers import DataCollatorWithPadding

# Pads each DataLoader batch to the length of its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenization(example):
    """Tokenize a batch of posts; intended for ``Dataset.map(..., batched=True)``.

    Sequences are truncated to the tokenizer's maximum length but deliberately
    NOT padded here: padding at this stage would stretch every post to the
    longest one in the whole map batch, while ``data_collator`` already pads
    per DataLoader batch, which keeps the model inputs much shorter.

    Args:
        example: Mapping with a ``'text'`` entry holding a list of strings.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``, without
        ``token_type_ids``).
    """
    return tokenizer(
        example['text'],
        add_special_tokens=True,
        return_token_type_ids=False,
        truncation=True,
    )

In [7]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(device)

cuda:0


In [24]:
# скачиваем таблицу с постами и преобразуем её в датасет

from datasets import Dataset, DatasetDict
from sqlalchemy import text
import psycopg2


from sqlalchemy import create_engine

import os

# Credentials must not live in the notebook: read the full SQLAlchemy URL
# (postgresql://user:password@host:port/dbname) from the environment.
database_url = os.environ.get("POSTGRES_URL")
if not database_url:
    raise RuntimeError(
        "Set the POSTGRES_URL environment variable to the PostgreSQL connection URL"
    )

engine = create_engine(database_url)

# The context manager releases the connection when the block exits; no other
# connection is opened, so nothing is left dangling after the query.
with engine.begin() as conn:
    query = text("""SELECT * FROM public.post_text_df""")
    post_df = pd.read_sql_query(query, conn)

# Keep only the text to embed and the post id; the id is renamed to `label`
# so it is carried through the dataset alongside each text.
data = post_df[['text', 'post_id']].rename(columns={'post_id': 'label'})


In [25]:
# Build the DataLoader

# Convert the DataFrame to a Hugging Face dataset and tokenize it in batches.
dataset = Dataset.from_pandas(data).map(tokenization, batched=True)
# Expose only the model inputs and the label, as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
# shuffle=False keeps the batches in the dataset's row order.
loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=data_collator,
    pin_memory=True,
)

Map:   0%|          | 0/7023 [00:00<?, ? examples/s]

In [26]:
# Display the dataset summary (feature names and row count) as the cell output.
dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 7023
})

In [27]:
# Move the model onto the selected device before running inference.
model = model.to(device)

In [28]:
# Compute the embeddings (and collect the labels) for every post

embeddings, labels = get_embeddings_labels(model, loader)

  0%|          | 0/110 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 110/110 [03:20<00:00,  1.83s/it]


In [29]:
# Wrap the embedding tensor in a DataFrame: one row per post, one column per embedding dimension.
emb = pd.DataFrame(embeddings.numpy())

In [42]:
# Append the embedding columns to the posts table and save the result

# concat(axis=1) aligns rows by index label, so a mismatch would silently
# produce NaN-padded rows instead of an error; fail fast in that case.
assert post_df.index.equals(emb.index), "post_df and emb must share the same index"
new_post_df = pd.concat([post_df, emb], axis=1)
new_post_df.to_csv('/content/drive/MyDrive/Colab Notebooks/new_post_df.csv')

In [32]:
# Sanity check: (number of posts, embedding width).
emb.shape

(7023, 768)

In [40]:
# Also save the embeddings on their own to the working directory
# (with pandas defaults the DataFrame index is written as the first CSV column).
emb.to_csv('emb.csv')