In [1]:
import warnings

warnings.filterwarnings('ignore')

import tqdm
import pickle
import os

import numpy as np
import pandas as pd

import torch
import transformers as ppb

# Подготовим и посмотрим данные

In [2]:
class TonalityDataset(torch.utils.data.Dataset):
  """Map-style dataset over a headerless, tab-separated sentiment file.

  The file is read with ``header=None``, so the resulting frame has
  positional columns; the first column holds the raw sentence text and is
  the only thing ``__getitem__`` returns. The remaining columns stay
  available through ``self.data``.

  Attributes:
    data: the full ``pandas.DataFrame`` read from ``path``.
    max_sentence_len: length of the longest text in the first column,
      measured in *characters* (not tokens); 0 when there is no text.
  """

  def __init__(self, path: str):
    """Read the TSV at ``path`` (local path or URL) and measure its texts.

    Args:
      path: anything ``pandas.read_csv`` accepts as a file location.
    """
    self.data = pd.read_csv(path, delimiter='\t', header=None)

    # Vectorized character count of every text at once instead of building
    # a row Series per sample; missing cells yield NaN and are skipped by
    # max() rather than raising on len().
    text_lengths = self.data.iloc[:, 0].str.len()
    if text_lengths.notna().any():
      self.max_sentence_len = int(text_lengths.max())
    else:
      self.max_sentence_len = 0

  def __len__(self) -> int:
    """Return the number of rows (samples) in the file."""
    return len(self.data)

  def __getitem__(self, index: int) -> str:
    """Return the text of the sample at positional ``index``."""
    # Scalar positional lookup: avoids materializing the whole row first.
    return self.data.iloc[index, 0]

In [3]:
# In the project repository the archive lives here: https://github.com/Romashka8/AmazonRecomendationSystem/tree/main/data/raw/tonality
# Here it is loaded from a different GitHub repository instead - https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv
# NOTE: despite its name, `df` is a TonalityDataset; the DataFrame itself is `df.data`.
df = TonalityDataset('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv')

In [4]:
# Preview the first rows of the underlying DataFrame.
df.data.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [5]:
# (rows, columns) of the underlying DataFrame.
df.data.shape

(6920, 2)

In [6]:
# Column dtypes and non-null counts; memory_usage='deep' asks pandas to
# measure the actual size of object (string) columns.
df.data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6920 entries, 0 to 6919
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       6920 non-null   object
 1   1       6920 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 1.1 MB


# Подготовим эмбеддинги

In [7]:
# Choose the device the model should run on: the GPU when CUDA is usable,
# otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [8]:
# Split the dataset into batches of 40 samples each.
loader = torch.utils.data.DataLoader(
    dataset=df,
    batch_size=40,
)

In [9]:
# Load the pretrained DistilBERT checkpoint and its matching tokenizer
# through the transformers Auto* factories.
model_name = 'distilbert-base-uncased'
tokenizer = ppb.AutoTokenizer.from_pretrained(model_name)
bert_model = ppb.AutoModel.from_pretrained(model_name)

In [15]:
# --- Embedding-extraction settings ------------------------------------------
# NOTE: "epoch" in these names is historical. The values are batch indices of
# `loader` (one pass over the data), not training epochs.
save_every_iter = 1                       # write a state file every N-th batch
save_path = '/content/TonalityBERTStates'  # directory for the pickled states
start_from_epoch = 0                      # first batch index to process (inclusive)
end_on_epoch = 1                          # batch index to stop at (exclusive)

# open() does not create missing directories, so make sure the target exists.
os.makedirs(save_path, exist_ok=True)

# Actually use the device selected earlier; inference only, so keep the model
# in eval mode (no dropout).
bert_model.to(device)
bert_model.eval()

# Run the data through the model
loop = tqdm.tqdm(loader, leave=False)

for epoch, batch in enumerate(loop):

    # Resume support: skip batches before the requested start instead of
    # aborting the whole run on the very first one.
    if epoch < start_from_epoch:
        continue
    if epoch >= end_on_epoch:
        break

    # Tokenize the batch. Pad only to the longest sentence in this batch and
    # truncate to the model's maximum input length: only the first-token
    # vector is stored below, so a fixed padded length is not needed, and
    # df.max_sentence_len is a character count, not a token count.
    inputs = tokenizer(batch, padding=True, truncation=True,
                       return_tensors='pt').to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Hidden states of the last layer
    last_hidden_states = outputs.last_hidden_state

    # Save the embedding
    if epoch % save_every_iter == 0:
        # [:, 0, :] is a view into the full hidden-state tensor; pickling a
        # view serializes its whole backing storage. Copy it out (on the CPU,
        # so the file loads on machines without a GPU) to store just the
        # first-token vectors.
        cls_embeddings = last_hidden_states[:, 0, :].cpu().clone()
        with open(os.path.join(save_path, f'state_epoch_{epoch}.pkl'), 'wb') as f:
            pickle.dump([cls_embeddings,
                         epoch, start_from_epoch, end_on_epoch], f)

# The bar may have been abandoned mid-iteration by `break`; close it explicitly.
loop.close()
print('All bathes embeded!')

                                               

All bathes embeded!


