In [None]:
import warnings

warnings.filterwarnings('ignore')

import tqdm
import pickle
import os

import numpy as np
import pandas as pd

import torch
import transformers as ppb

# Подготовим и посмотрим данные

In [None]:
class TonalityDataset(torch.utils.data.Dataset):
  """Sentence dataset backed by a headerless, tab-separated file.

  The file is loaded into ``self.data`` with pandas. Column 0 holds the text
  that ``__getitem__`` returns; any further columns stay available through
  ``self.data`` but are not yielded by the dataset.
  """

  def __init__(self, path: str):
    """Read the table at ``path`` and record the longest sentence length.

    Args:
      path: Location handed straight to ``pandas.read_csv``.
    """
    self.data = pd.read_csv(path, delimiter='\t', header=None)

    # Length of the longest sentence measured in CHARACTERS (``len`` of the
    # string), not in tokens. Computed in one pass over the text column
    # instead of materialising a row Series per sample through ``iloc``.
    self.max_sentence_len = max(map(len, self.data[0]), default=0)

  def __len__(self) -> int:
    """Return the number of rows (sentences) in the table."""
    return self.data.shape[0]

  def __getitem__(self, index: int):
    """Return the text stored in column 0 of row ``index``."""
    # Scalar positional access; avoids building a mixed-dtype row Series.
    return self.data.iat[index, 0]

In [None]:
# The project repository keeps this archive at:
# https://github.com/Romashka8/AmazonRecomendationSystem/tree/main/data/raw/tonality
# Here it is fetched from a different GitHub repository instead.
train_tsv_url = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = TonalityDataset(train_tsv_url)

In [None]:
# Preview the first rows of the underlying DataFrame
df.data.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [None]:
# (rows, columns) of the loaded table
df.data.shape

(6920, 2)

In [None]:
# Column dtypes, non-null counts and deep (object-aware) memory usage
df.data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6920 entries, 0 to 6919
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       6920 non-null   object
 1   1       6920 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 1.1 MB


# Подготовим эмбеддинги

In [None]:
# Pick the compute device: the GPU when CUDA is available, the CPU otherwise
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cuda'

In [None]:
# Split the dataset into batches of 40 sentences each
loader = torch.utils.data.DataLoader(dataset=df, batch_size=40)

In [None]:
# Load DistilBERT together with its matching tokenizer.
# Both are resolved from the same checkpoint name, tokenizer first.
model_name = 'distilbert-base-uncased'
tokenizer, bert_model = (
    auto_cls.from_pretrained(model_name)
    for auto_cls in (ppb.AutoTokenizer, ppb.AutoModel)
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Make sure the output directory for the pickled embeddings exists.
# makedirs(..., exist_ok=True) replaces the check-then-create pair: it does
# not fail when the directory is already there and also creates missing
# parent directories.
os.makedirs('/content/TonalityBERTStates', exist_ok=True)

In [None]:
# Embed a window of batches with DistilBERT and pickle the [CLS] vectors.
#
# NOTE: "epoch" in this cell is the index of a batch inside `loader`, not a
# training epoch. Only batches with start_from_epoch <= index < end_on_epoch
# are embedded, so the cell can be re-run on consecutive windows.
save_every_iter = 1
save_path = '/content/TonalityBERTStates'
start_from_epoch = 70
end_on_epoch = 90

# Run inference on the selected device; eval() keeps dropout disabled.
bert_model.to(device)
bert_model.eval()

# Push the data through the model
loop = tqdm.tqdm(loader, leave=False)

# enumerate() ties the counter to the real position of the batch in the
# loader. Previously the counter started at start_from_epoch while the loader
# still started at its first batch, so files were written under the wrong index.
for epoch, batch in enumerate(loop):

    # Skip batches that precede the requested window
    if epoch < start_from_epoch:
        continue

    # Window finished: report and stop
    if epoch >= end_on_epoch:
        print('All bathes embeded!')
        break

    # Tokenize the batch. max_length is the longest sentence length in
    # characters (see df.max_sentence_len); truncation guards against a
    # sequence whose token count exceeds that bound.
    inputs = tokenizer(batch, padding='max_length', truncation=True,
                       max_length=df.max_sentence_len, return_tensors='pt')
    inputs = inputs.to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Hidden states of the last layer
    last_hidden_states = outputs.last_hidden_state

    # Save the embedding of the first token of every sentence
    if epoch % save_every_iter == 0:
        # Move to the CPU and copy into a compact tensor of its own: a sliced
        # view can otherwise drag the whole padded hidden-state storage into
        # the pickle.
        cls_embeddings = (
            last_hidden_states[:, 0, :]
            .detach()
            .cpu()
            .clone(memory_format=torch.contiguous_format)
        )
        with open(os.path.join(save_path, f'state_epoch_{epoch}.pkl'), 'wb') as f:
            pickle.dump([cls_embeddings,
                         epoch, start_from_epoch, end_on_epoch], f)

                                                

All bathes embeded!




# Сохраним файлы

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to /content
%cd /content

/content


In [None]:
# Recursively pack the TonalityBERTStates directory into TonalityBERTStates.zip
! zip -r TonalityBERTStates.zip TonalityBERTStates/

  adding: TonalityBERTStates/ (stored 0%)
  adding: TonalityBERTStates/state_epoch_0.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_70.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_86.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_54.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_76.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_61.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_40.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_55.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_15.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_42.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_79.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_50.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_59.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_88.pkl (deflated 7%)
  adding: TonalityBERTStates/state_epoch_58.pkl (deflated 7%)
  adding: TonalityBERTStates/

In [None]:
# Hand the archive to the Colab files helper for download
from google.colab import files
files.download('TonalityBERTStates.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>