In [15]:
!pip install transformers



In [11]:
!pip install pipeline



In [16]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils
from torch.optim import Adam
from tqdm import tqdm
from collections import Counter
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms

import pandas as pd

In [17]:
# CSV locations relative to the notebook's working directory.
TRAIN_DATASET_PATH, TEST_DATASET_PATH = (
    'mesenger_train.csv',
    'mesenger_val.csv',
)

In [18]:
import os

# Folder that holds the CSV files. Set the MESSENGER_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell;
# the default keeps the original location.
DATA_DIR = os.environ.get('MESSENGER_DATA_DIR', 'D:/task')
TRAIN_DATASET_PATH = f'{DATA_DIR}/mesenger_train.csv'
TEST_DATASET_PATH = f'{DATA_DIR}/mesenger_val.csv'

In [19]:
# Training dataset: read the CSV and preview its first 10 rows.
df_train = pd.read_csv(TRAIN_DATASET_PATH)
df_train.head(10)

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1
5,5,"Манчестер через час играет, а я не дома (",0
6,6,RT @qukanacipr: да) а я в 2004 в жабу пришол и...,1
7,7,"Момент из ""Мальчик и маньяк"") Ражик гладит по ...",1
8,8,"Я просто неудачик, поцарапал экран на телефоне ((",0
9,9,хахаа тот день запомнился надолго) http://t.co...,1


In [20]:
# Test dataset: read the CSV and preview its last rows.
df_test = pd.read_csv(TEST_DATASET_PATH)
df_test.tail()

Unnamed: 0,id,text,class
22678,204145,А я знаю из-за кого такая паршивая погода была...,1
22679,204146,ааа.... что с Гмейлом. не ужто он умер сегодня...,0
22680,204147,"в учебники химии написано ""Пруст Жозеф Луи""\nя...",1
22681,204148,"Вот все любят,а я не люблю..как дура..говорю,ч...",0
22682,204149,уххх спасибо дорогой Леле Евгеньевне и Штепуху...,1


In [21]:
# (rows, columns) of the training and test frames.
df_train.shape, df_test.shape

((181467, 3), (22683, 3))

In [22]:
# Number of training rows per class label.
df_train['class'].value_counts()

1    92063
0    89404
Name: class, dtype: int64

In [23]:
# Lower-case the text with the vectorized .str accessor; unlike
# .apply(lambda x: x.lower()) it does not raise on missing values.
# NOTE(review): the tokenizer loaded later is the *cased* multilingual
# checkpoint, so lower-casing may discard information it can use -- confirm
# this normalisation is intended.
df_train['text'] = df_train['text'].str.lower()
df_test['text'] = df_test['text'].str.lower()

In [24]:
from transformers import BertTokenizer

In [25]:
# Load the pretrained multilingual (cased) BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Encode one sample sentence, padded/truncated to exactly 10 token ids and
# returned as PyTorch tensors.
example_text = 'Для решения задачи нужны данные'
bert_input = tokenizer(example_text, padding='max_length', max_length=10, 
                       truncation=True, return_tensors="pt")


# Token ids and the matching attention mask (0 marks padding positions).
print(bert_input['input_ids'])
print(bert_input['attention_mask'])

tensor([[   101,  15668,  48143,  66620,    554,  10227, 101870,  41065,    102,
              0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])


In [26]:
# Look up the vocabulary entries behind two token ids.
tokenizer.ids_to_tokens[61947], tokenizer.ids_to_tokens[10227]

('еш', '##у')

In [27]:
# Decode the ids back to text (this overwrites example_text); the special
# tokens [CLS], [SEP] and [PAD] are kept in the decoded string.
example_text = tokenizer.decode(bert_input.input_ids[0])

print(example_text)

[CLS] Для решения задачи нужны данные [SEP] [PAD]


In [28]:
class MessengerDataset(torch.utils.data.Dataset):
    """Dataset of pre-tokenized messages paired with their class labels.

    Every text is tokenized once, at construction time, and the encodings are
    kept in memory; ``__getitem__`` only looks up the stored result.

    Args:
        txts: iterable of message strings.
        labels: indexable collection of labels, aligned with ``txts``.
        max_length: number of token ids each text is padded/truncated to.
            Defaults to 10, the value that used to be hard-coded.
        tokenizer: optional ready-made tokenizer callable. When omitted, the
            pretrained 'bert-base-multilingual-cased' tokenizer is loaded, so
            existing two-argument calls behave as before. Passing one lets the
            train and test datasets share a single tokenizer instance.

    Raises:
        ValueError: if ``txts`` and ``labels`` differ in length.
    """

    def __init__(self, txts, labels, max_length=10, tokenizer=None):
        # Materialize once so generators and pandas Series are handled alike.
        txts = list(txts)
        if len(txts) != len(labels):
            raise ValueError(
                f'txts and labels must have the same length, '
                f'got {len(txts)} and {len(labels)}')

        self._labels = labels

        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.tokenizer = tokenizer

        # Each text is encoded on its own with return_tensors="pt", exactly as
        # before, so the stored encodings keep the same per-item layout.
        self._txts = [self.tokenizer(text, padding='max_length', max_length=max_length,
                                     truncation=True, return_tensors="pt")
                      for text in txts]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self._txts)

    def __getitem__(self, index):
        """Return the ``(encoding, label)`` pair stored at ``index``."""
        return self._txts[index], self._labels[index]

In [29]:
# Label arrays aligned positionally with the text columns.
y_train = df_train['class'].values
y_test = df_test['class'].values

# Both datasets tokenize their texts eagerly inside the constructor.
train_dataset = MessengerDataset(df_train['text'], y_train)
test_dataset = MessengerDataset(df_test['text'], y_test)

# num_workers=0 loads batches in the main process. Worker processes have to
# re-import the Dataset class, which is unreliable for classes defined inside
# a notebook on spawn-based platforms (e.g. Windows) and can leave the loader
# stuck before the first batch. The encodings are already in memory, so
# background workers would not speed anything up here.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)

In [None]:
# Inspect only the first batch: the keys of the collated encoding and the
# shape of its token-id tensor.
for encoded_batch, label_batch in train_loader:
    batch_keys = encoded_batch.keys()
    print(batch_keys)
    ids_shape = encoded_batch['input_ids'].shape
    print(ids_shape)
    break

In [30]:
from transformers import BertModel

In [31]:
class BertClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: dropout probability applied to the pooled sentence embedding.
        num_classes: number of output classes (default 2, as before).
        model_name: pretrained checkpoint to load (default is the checkpoint
            that used to be hard-coded).

    ``forward`` returns raw, unnormalized logits of shape (batch, num_classes).
    ``nn.CrossEntropyLoss`` applies log-softmax itself, so no activation must
    be applied here: the previous Sigmoid squashed the scores into (0, 1)
    before the loss, which limits how confident the model can become and
    weakens the gradients. ``argmax`` predictions are unaffected by removing it.
    """

    def __init__(self, dropout=0.5, num_classes=2, model_name='bert-base-multilingual-cased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Take the input width from the loaded checkpoint rather than assuming 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, x, mask):
        """Compute class logits for token ids ``x`` with attention mask ``mask``."""
        # With return_dict=False the encoder returns a tuple:
        # (per-token embeddings, pooled sentence embedding). Only the latter is used.
        _, pooled_output = self.bert(input_ids=x, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        return self.linear(dropout_output)

In [32]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [33]:
model = BertClassifier().to(device)
criterion = nn.CrossEntropyLoss()

# Only the linear head is handed to the optimizer, so the encoder never gets
# updated. Freezing it stops backward() from computing and storing gradients
# for every BERT weight, which saves a large amount of memory and time
# without changing what is learned.
for encoder_param in model.bert.parameters():
    encoder_param.requires_grad_(False)

optimizer = Adam(model.linear.parameters(), lr=0.001)

In [34]:
# Show the architecture, then compare the size of the whole network with the
# size of the classification head alone.
print(model)

total_param_count = sum(weights.numel() for weights in model.parameters())
head_param_count = sum(weights.numel() for weights in model.linear.parameters())

print("Parameters full train:", total_param_count)
print("Parameters transfer learning:", head_param_count)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None, show_progress=False):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given, the model is switched to training mode and
    updated after every batch. Otherwise it is switched to eval mode and
    gradient tracking is disabled. Both returned values are per-sample averages.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum, correct, seen = 0.0, 0, 0
    batches = tqdm(loader) if show_progress else loader

    # Gradients are only needed while training; turning them off for
    # evaluation avoids building the autograd graph.
    with torch.set_grad_enabled(training):
        for inputs, labels in batches:
            labels = labels.to(device).long()
            # Each sample is tokenized on its own with return_tensors="pt", so
            # the collated tensors carry an extra axis of size 1; drop it for
            # the mask as well as for the ids.
            mask = inputs['attention_mask'].squeeze(1).to(device)
            input_id = inputs['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            batch_loss = criterion(output, labels)

            if training:
                model.zero_grad()
                batch_loss.backward()
                optimizer.step()

            # The criterion returns the batch mean, so weight it by the batch
            # size before accumulating; dividing by the sample count at the end
            # then gives a true per-sample average.
            batch_size = labels.size(0)
            loss_sum += batch_loss.item() * batch_size
            correct += (output.argmax(dim=1) == labels).sum().item()
            seen += batch_size

    seen = max(seen, 1)  # guard against an empty loader
    return loss_sum / seen, correct / seen


for epoch_num in range(2):
    train_loss, train_acc = _run_epoch(model, train_loader, criterion, device,
                                       optimizer=optimizer, show_progress=True)
    # Validation uses the held-out loader built earlier in the notebook
    # (test_loader); the names valid_loader / valid_dataset were never defined.
    val_loss, val_acc = _run_epoch(model, test_loader, criterion, device)

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {train_loss: .3f} \
        | Train Accuracy: {train_acc: .3f} \
        | Val Loss: {val_loss: .3f} \
        | Val Accuracy: {val_acc: .3f}')

  0%|          | 0/2836 [00:00<?, ?it/s]

К сожалению, возможности моей компьютерной системы на этом иссякли.