In [1]:
import warnings

warnings.filterwarnings('ignore')

import tqdm
import os

import google.colab as colab

import numpy as np
import pandas as pd

import torch
import transformers as ppb

In [2]:
# Mount Google Drive so that data can be read from and saved to it.
my_drive = '/content/drive'
colab.drive.mount(mountpoint=my_drive)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Batch generator: walks a DataFrame in fixed-size row chunks so that callers
# can process one chunk at a time instead of the whole frame (memory saving).
class BatchGenerator:
  """Iterable over consecutive row slices of a DataFrame.

  Each yielded batch is a DataFrame of at most ``batch_size`` rows whose index
  has been reset to 0..len(batch)-1. The last batch may be shorter.

  Args:
    df: source DataFrame (anything exposing ``shape`` and ``iloc``).
    batch_size: maximum number of rows per batch; must be positive.

  Raises:
    ValueError: if ``batch_size`` is not positive (a non-positive step would
      otherwise make iteration never terminate).
  """

  def __init__(self, df, batch_size=2000):

    if batch_size <= 0:
      raise ValueError(f'batch_size must be positive, got {batch_size}')

    self.df = df
    self.batch_size = batch_size

  def __iter__(self):
    """Yield the batches in row order; a fresh pass starts on every call."""

    total_rows = self.df.shape[0]

    # Positional slicing clamps at the end of the frame, so the final
    # (possibly shorter) batch needs no special case.
    for start in range(0, total_rows, self.batch_size):
      yield self.df.iloc[start:start + self.batch_size].reset_index(drop=True)

  def __len__(self):
    """Return the number of batches (ceiling of rows / batch_size)."""

    return (self.df.shape[0] + self.batch_size - 1) // self.batch_size

In [4]:
class InferenceBERT:
  """Embeds text with a pretrained BERT-family model, batch by batch.

  The input DataFrame is expected to hold the text in column ``0`` and the
  target in column ``1``. For every processed batch a CSV file
  ``state_epoch_<batch index>.csv`` is written to ``save_path``; it contains
  the hidden state taken at the first token position of each sequence plus a
  ``target`` column.

  Args:
    df: DataFrame with columns ``0`` (text) and ``1`` (target).
    batch_size: number of rows per batch.
    save_path: directory the per-batch CSV files are written to.
    model_name: one of ``allowed_models``.
    tokenizer_params: keyword arguments forwarded to ``tokenizer.encode``;
      when falsy, a default with special tokens, truncation and
      ``max_length=512`` is used.
  """

  allowed_models = ('distilbert-base-uncased', 'bert-base-uncased')

  def __init__(self, df, batch_size, save_path,
               model_name='distilbert-base-uncased', tokenizer_params=None):

    # Fail before anything is downloaded if the model is not supported.
    assert model_name in self.allowed_models, (
        f'model_name must be one of {self.allowed_models}'
    )

    # Device and batch generator.
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.batches = BatchGenerator(df, batch_size)

    # Load the model and its tokenizer.
    self.model_name = model_name
    self.tokenizer = ppb.AutoTokenizer.from_pretrained(self.model_name)
    self.bert_model = ppb.AutoModel.from_pretrained(self.model_name).to(self.device)

    # Tokenization settings.
    self.tokenizer_params = tokenizer_params if tokenizer_params else {
        'add_special_tokens': True,
        'truncation': True,
        'max_length': 512
    }

    # "Epoch" here means batch index: `epoch` is the index of the next
    # unprocessed batch, `max_epoch` the total number of batches.
    self.epoch = 0
    self.max_epoch = len(self.batches)

    # Output directory.
    self.save_path = save_path

  def __del__(self):
    """Drop the references to the model, tokenizer and batches.

    Uses ``pop`` so that a partially constructed instance (e.g. when
    ``__init__`` failed early) does not raise during garbage collection.
    """

    for attr in ('bert_model', 'tokenizer', 'batches'):
      self.__dict__.pop(attr, None)

  def _embed_batch(self, batch):
    """Tokenize column ``0`` of ``batch``, run the model and return the
    first-position hidden state of every sequence as a NumPy array."""

    token_ids = [
        self.tokenizer.encode(text, **self.tokenizer_params)
        for text in batch[0]
    ]

    # Pad with the tokenizer's pad id when it exposes one, else with 0.
    pad_id = getattr(self.tokenizer, 'pad_token_id', None)
    if pad_id is None:
      pad_id = 0

    max_len = max((len(ids) for ids in token_ids), default=0)
    padded = np.full((len(token_ids), max_len), pad_id, dtype=np.int64)
    mask = np.zeros((len(token_ids), max_len), dtype=np.int64)

    # The mask is built from the real lengths, so it stays correct
    # regardless of which id is used for padding.
    for row, ids in enumerate(token_ids):
      padded[row, :len(ids)] = ids
      mask[row, :len(ids)] = 1

    input_ids = torch.tensor(padded).to(self.device)
    attention_mask = torch.tensor(mask).to(self.device)

    with torch.no_grad():
      last_hidden_states = self.bert_model(
          input_ids, attention_mask=attention_mask
      )

    del input_ids, attention_mask

    # First output element, first token position of every sequence.
    return last_hidden_states[0][:, 0, :].cpu().numpy()

  def inference(self, start, end):
    """Embed the batches with indices in ``[start, end)`` and save each one.

    The file name is derived from the batch index itself, so repeated or
    resumed calls always pair a file with the batch it was computed from.
    After the call ``self.epoch`` is the index following the last processed
    batch.

    Args:
      start: index of the first batch to process (inclusive).
      end: index to stop at (exclusive); must not exceed ``self.max_epoch``.

    Raises:
      AssertionError: if ``end`` is larger than the number of batches.
    """

    assert end <= self.max_epoch

    loop = tqdm.tqdm(self.batches, leave=False)

    try:
      for index, batch in enumerate(loop):

        if index >= end:
          print('All bathes embeded!')
          break

        if index < start:
          # Outside the requested window: skip without running the model.
          continue

        features = self._embed_batch(batch)

        # Save the embeddings together with the targets.
        pd.concat(
            [
              pd.DataFrame(features),
              pd.DataFrame(batch[1].values, columns=['target'])
            ], axis=1
          ).to_csv(os.path.join(self.save_path, f'state_epoch_{index}.csv'))

        self.epoch = index + 1

    finally:
      # Release cached GPU memory even if a batch failed.
      torch.cuda.empty_cache()

In [5]:
# In the repository the archive lives here: https://github.com/Romashka8/AmazonRecomendationSystem/tree/main/data/raw/tonality
# It is loaded from another GitHub repository instead - https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv
TONALITY_PATH = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
EXTENDED_REVIEWS_PATH = '/content/drive/MyDrive/colab_data/ExtendedReviewsForBERT/reviews_with_goods.csv'

# Output locations: a base folder plus one sub-folder per dataset.
SAVE_PATH = '/content/drive/MyDrive/colab_data'
SAVE_TONALITY = 'TonalityBERTStates'
SAVE_REVIEWS = 'RewievsEmbedded'

for path in (SAVE_TONALITY, SAVE_REVIEWS):

  check_exists = os.path.join(SAVE_PATH, path)

  # makedirs(exist_ok=True) creates missing parents as well and does not fail
  # when the folder already exists, so no separate existence check is needed.
  os.makedirs(check_exists, exist_ok=True)

# Подготовим данные

In [6]:
# Sentiment data: tab-separated file without a header row, so the columns
# get integer labels.
df_tonality = pd.read_csv(TONALITY_PATH, sep='\t', header=None)

# Reviews: drop incomplete rows, then renumber the rows from zero (the old
# index is kept as an extra column).
df_reviews = pd.read_csv(EXTENDED_REVIEWS_PATH).dropna().reset_index()

# Only the text and the ids are taken for processing.
df_reviews_bert_ids = df_reviews['user_id'].copy()
df_reviews_bert_text = df_reviews['text'].copy()

# Column 0 holds the text, column 1 a placeholder target of zeros.
df_reviews_bert = pd.DataFrame({
    0: df_reviews_bert_text,
    1: np.zeros(df_reviews_bert_ids.shape[0]),
})

# Подготовим эмбеддинги

In [7]:
# Embed the sentiment dataset in batches of 2000 rows, batch indices 0..3.
tonality_bert = InferenceBERT(
    df=df_tonality,
    batch_size=2000,
    save_path=os.path.join(SAVE_PATH, SAVE_TONALITY),
)
tonality_bert.inference(start=0, end=4)
# Drop the instance before the next run.
del tonality_bert

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



In [8]:
tokenizer_params = dict(
    add_special_tokens=True,
    truncation=True,  # cut token sequences down to the length limit
    max_length=512,  # maximum length for distil-bert
)

# Embed the reviews in batches of 100 rows, batch indices 0..83.
reviews_bert = InferenceBERT(
    df=df_reviews_bert,
    batch_size=100,
    save_path=os.path.join(SAVE_PATH, SAVE_REVIEWS),
    tokenizer_params=tokenizer_params,
)

reviews_bert.inference(start=0, end=84)
del reviews_bert



# Сохраним файлы

In [9]:
# Switch the working directory to the Drive folder that holds the saved outputs.
%cd /content/drive/MyDrive/colab_data

/content/drive/MyDrive/colab_data


In [12]:
# Recursively pack each output folder into a zip archive of the same name.
! zip -r TonalityBERTStates.zip TonalityBERTStates/
! zip -r RewievsEmbedded.zip RewievsEmbedded/

updating: TonalityBERTStates/ (stored 0%)
updating: TonalityBERTStates/state_epoch_0.csv (deflated 59%)
updating: TonalityBERTStates/state_epoch_1.csv (deflated 59%)
updating: TonalityBERTStates/state_epoch_2.csv (deflated 59%)
updating: TonalityBERTStates/state_epoch_3.csv (deflated 59%)
  adding: RewievsEmbedded/ (stored 0%)
  adding: RewievsEmbedded/state_epoch_0.csv (deflated 58%)
  adding: RewievsEmbedded/state_epoch_1.csv (deflated 58%)
  adding: RewievsEmbedded/state_epoch_2.csv (deflated 58%)
  adding: RewievsEmbedded/state_epoch_3.csv (deflated 58%)
  adding: RewievsEmbedded/state_epoch_4.csv (deflated 58%)
  adding: RewievsEmbedded/state_epoch_5.csv (deflated 58%)
  adding: RewievsEmbedded/state_epoch_6.csv (deflated 58%)
  adding: RewievsEmbedded/state_epoch_7.csv (deflated 58%)
  adding: RewievsEmbedded/state_epoch_8.csv (deflated 58%)
  adding: RewievsEmbedded/state_epoch_9.csv (deflated 58%)
  adding: RewievsEmbedded/state_epoch_10.csv (deflated 58%)
  adding: RewievsEmbe

In [13]:
# Download both archives through the Colab files helper.
colab.files.download('TonalityBERTStates.zip')
colab.files.download('RewievsEmbedded.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>