In [1]:
import warnings

warnings.filterwarnings('ignore')

import tqdm
import os

import numpy as np
import pandas as pd

import torch
import transformers as ppb

# Подготовим и посмотрим данные

In [2]:
# This project's repository keeps the archive at:
# https://github.com/Romashka8/AmazonRecomendationSystem/tree/main/data/raw/tonality
# Here the data is loaded from a different GitHub repository instead.
# The file is tab-separated and has no header row, so the columns get integer labels.
SST2_TRAIN_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(SST2_TRAIN_URL, sep='\t', header=None)

In [3]:
# Split the frame into three consecutive row ranges:
# [0, 2000), [2000, 4000) and [4000, end).
chunk_bounds = [(None, 2000), (2000, 4000), (4000, None)]
batches = [df[first:last] for first, last in chunk_bounds]

In [4]:
# Quick sanity check: (rows, columns) of the loaded frame.
df.shape

(6920, 2)

In [5]:
# Show column dtypes, non-null counts and memory usage; memory_usage='deep'
# makes pandas inspect object columns to report their actual memory consumption.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6920 entries, 0 to 6919
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       6920 non-null   object
 1   1       6920 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 1.1 MB


In [6]:
# Print the (rows, columns) shape of every chunk, one per line.
for chunk in batches:
    print(chunk.shape)

(2000, 2)
(2000, 2)
(2920, 2)


# Подготовим эмбеддинги

In [8]:
# Pick the device the model will run on: the GPU when CUDA is available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [9]:
# Load the pretrained DistilBERT checkpoint and the tokenizer that matches it;
# both are resolved from the same checkpoint name.
model_name = 'distilbert-base-uncased'
bert_model = ppb.AutoModel.from_pretrained(model_name)
tokenizer = ppb.AutoTokenizer.from_pretrained(model_name)

In [10]:
# Make sure the output folder for the embedding CSV files exists on Google Drive.
drive_data_dir = '/content/drive/MyDrive/colab_data'
states_dir = os.path.join(drive_data_dir, 'TonalityBERTStates')

# The parent folder lives on the mounted Drive. If it is missing, Drive is most
# likely not mounted yet, so fail with an explicit message instead of an opaque
# mkdir error (and without creating a stray local directory under /content/drive).
if not os.path.isdir(drive_data_dir):
    raise FileNotFoundError(
        f'{drive_data_dir} does not exist; mount Google Drive before running this cell'
    )

# exist_ok=True makes the cell safe to re-run.
os.makedirs(states_dir, exist_ok=True)

In [11]:
# --- Embedding extraction settings ---
# Folder that receives one CSV file per processed chunk.
save_path = '/content/drive/MyDrive/colab_data/TonalityBERTStates'
# Only chunks whose index lies in [start_from_epoch, end_on_epoch) are processed.
# Raise start_from_epoch to resume an interrupted run without redoing earlier chunks.
start_from_epoch = 0
end_on_epoch = 4
# Number of sentences per forward pass; bounds the peak memory of the model call.
inference_batch_size = 256

# Run the model on the selected device and make sure it is in inference mode.
bert_model.to(device)
bert_model.eval()

# Feed the data through the model, chunk by chunk.
loop = tqdm.tqdm(batches, leave=False)

for epoch, batch in enumerate(loop):

    # Chunks before the resume point are skipped rather than ending the loop.
    if epoch < start_from_epoch:
        continue

    # Past the requested range: nothing left to do.
    if epoch >= end_on_epoch:
        print('All bathes embeded!')
        break

    # Tokenize the whole chunk at once. The tokenizer pads every sequence to the
    # longest one in the chunk with its own pad token and returns the matching
    # attention mask; truncation guards against inputs longer than the model limit.
    encoded = tokenizer(
        batch[0].tolist(),
        add_special_tokens=True,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']

    # Run the model over slices of the padded chunk so memory use stays bounded.
    cls_vectors = []
    with torch.no_grad():
        for start in range(0, input_ids.shape[0], inference_batch_size):
            stop = start + inference_batch_size
            last_hidden_states = bert_model(
                input_ids[start:stop].to(device),
                attention_mask=attention_mask[start:stop].to(device),
            )
            # Keep the last-layer hidden state of the first token of each sequence
            # and bring it back to the CPU so it can be converted to NumPy.
            cls_vectors.append(last_hidden_states[0][:, 0, :].cpu())

    features = torch.cat(cls_vectors).numpy()

    # Save the embeddings of this chunk together with their target labels.
    pd.concat(
        [
            pd.DataFrame(features),
            pd.DataFrame(batch[1].values, columns=['target']),
        ],
        axis=1,
    ).to_csv(os.path.join(save_path, f'state_epoch_{epoch}.csv'))



# Сохраним файлы

In [None]:
# Mount Google Drive at /content/drive so files under it can be read and written.
# NOTE(review): earlier cells already create and write files under /content/drive,
# so on a fresh kernel this mount has to run before them — consider moving it up.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Switch the working directory to the Drive data folder so the commands below
# can use relative paths.
%cd /content/drive/MyDrive/colab_data

/content/drive/MyDrive/colab_data


In [13]:
# Recursively pack the TonalityBERTStates/ folder into TonalityBERTStates.zip
# in the current working directory.
! zip -r TonalityBERTStates.zip TonalityBERTStates/

  adding: TonalityBERTStates/ (stored 0%)
  adding: TonalityBERTStates/state_epoch_0.csv (deflated 59%)
  adding: TonalityBERTStates/state_epoch_1.csv (deflated 59%)
  adding: TonalityBERTStates/state_epoch_2.csv (deflated 59%)


In [14]:
# Ask Colab to download the archive (path relative to the current working
# directory) to the local machine through the browser.
from google.colab import files
files.download('TonalityBERTStates.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>