In [1]:
import warnings

warnings.filterwarnings('ignore')

import tqdm
import os

import numpy as np
import pandas as pd

import torch
import transformers as ppb

# Подготовим и посмотрим данные

In [2]:
class TonalityDataset(torch.utils.data.Dataset):
  """Sentence dataset backed by a headerless, tab-separated file.

  The file is read with pandas. Column 0 holds the sentence text and is the
  only thing returned by ``__getitem__``; any other columns stay available
  through ``data``.

  Attributes:
    data: The ``pandas.DataFrame`` read from ``path``.
    max_sentence_len: Length of the longest sentence in column 0, measured in
      characters (NOT tokens); 0 when the table has no rows.
  """

  def __init__(self, path: str):
    """Read the table and record the length of its longest sentence.

    Args:
      path: Location handed to ``pandas.read_csv`` (a local path or a URL).
    """
    self.data = pd.read_csv(path, delimiter='\t', header=None)

    # One pass over the text column instead of building a full row Series
    # with ``iloc`` for every sample just to measure a single string.
    lengths = self.data[0].map(len)
    self.max_sentence_len = int(lengths.max()) if len(lengths) else 0

  def __len__(self):
    """Return the number of rows (sentences) in the table."""
    return self.data.shape[0]

  def __getitem__(self, index: int):
    """Return the sentence text stored at row position ``index``."""
    # Scalar access: reads one cell without materialising the whole row.
    return self.data.iat[index, 0]

In [3]:
# A copy of this archive lives in the project repository:
# https://github.com/Romashka8/AmazonRecomendationSystem/tree/main/data/raw/tonality
# Here it is fetched from a different GitHub repository instead.
train_tsv_url = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = TonalityDataset(train_tsv_url)

In [4]:
# Preview the first rows of the loaded table (column 0: sentence text, column 1: integer label)
df.data.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [5]:
# (rows, columns) of the loaded table
df.data.shape

(6920, 2)

In [6]:
# Column dtypes and non-null counts; memory_usage='deep' also measures the string objects themselves
df.data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6920 entries, 0 to 6919
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       6920 non-null   object
 1   1       6920 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 1.1 MB


# Подготовим эмбеддинги

In [7]:
# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cuda'

In [8]:
# Split the dataset into batches of 40 sentences
batch_size = 40
loader = torch.utils.data.DataLoader(dataset=df, batch_size=batch_size)

In [9]:
# Load DistilBERT and its matching tokenizer by checkpoint name
# (`ppb` is the alias under which `transformers` is imported in this notebook)
model_name = 'distilbert-base-uncased'
tokenizer = ppb.AutoTokenizer.from_pretrained(model_name)
bert_model = ppb.AutoModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [10]:
# Colab-only setup: the embeddings are written to Google Drive, so the drive
# must be mounted before any path under /content/drive exists. Mounting an
# already-mounted drive is harmless (Colab only reports that it is mounted).
from google.colab import drive
drive.mount('/content/drive')

# Create the output folder together with any missing parent folders;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs('/content/drive/MyDrive/colab_data/TonalityBERTStates', exist_ok=True)

In [11]:
save_path = '/content/drive/MyDrive/colab_data/TonalityBERTStates'
# NOTE: "epoch" in these names means a batch index of `loader`, not a training
# epoch. Batches with start_from_epoch <= index < end_on_epoch are embedded,
# so a previous partial run can be resumed by raising start_from_epoch.
start_from_epoch = 0
end_on_epoch = 174
epoch = start_from_epoch

# Run inference on the selected device, with dropout disabled.
bert_model.to(device)
bert_model.eval()

# Feed the data through the model, one batch at a time
loop = tqdm.tqdm(loader, leave=False)

for batch_index, batch in enumerate(loop):

  # The loader always starts at batch 0, so batches before the requested
  # start are skipped; this keeps file names aligned with real batch indices.
  if batch_index < start_from_epoch:
    continue
  if batch_index >= end_on_epoch:
    break

  # Tokenize the batch. Pad only to the longest sentence of this batch and
  # truncate at the model limit: `df.max_sentence_len` counts characters, not
  # tokens, so using it as `max_length` over-pads every batch and would exceed
  # the model's maximum input size for long texts. Padding is masked out by
  # the attention mask, so the [CLS] vector is not affected by the pad length.
  inputs = tokenizer(batch, padding=True, truncation=True,
                     return_tensors='pt').to(device)

  # Forward pass without gradient tracking
  with torch.no_grad():
    outputs = bert_model(**inputs)

  # Hidden states of the last layer
  last_hidden_states = outputs.last_hidden_state

  # Save the embedding of the first token ([CLS]) of every sentence. The
  # tensor is moved to the CPU and converted to NumPy explicitly so pandas
  # receives plain floats regardless of the device used.
  pd.DataFrame(last_hidden_states[:, 0, :].cpu().numpy()).to_csv(
      os.path.join(save_path, f'state_epoch_{batch_index}.csv')
  )

  # Index of the next batch to process (useful as the next start_from_epoch)
  epoch = batch_index + 1

# Reported after the loop so it is also shown when the loader runs out of
# batches before end_on_epoch is reached.
print('All bathes embeded!')



# Сохраним файлы

In [13]:
# Mount Google Drive at /content/drive (Colab only) so the saved files are reachable
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Switch the working directory to the Drive folder so later paths can be relative
%cd /content/drive/MyDrive/colab_data

/content/drive/MyDrive/colab_data


In [15]:
# Recursively pack the TonalityBERTStates folder into a single zip archive
! zip -r TonalityBERTStates.zip TonalityBERTStates/

  adding: TonalityBERTStates/ (stored 0%)
  adding: TonalityBERTStates/state_epoch_0.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_1.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_2.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_3.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_4.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_5.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_6.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_7.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_8.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_9.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_10.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_11.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_12.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_13.csv (deflated 58%)
  adding: TonalityBERTStates/state_epoch_14.csv (deflated 58%)
  adding: TonalityBERTS

In [16]:
# Download the archive from the Colab runtime to the local machine (Colab only)
from google.colab import files
files.download('TonalityBERTStates.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>