In [13]:
import pandas as pd
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModel, AutoModelForPreTraining, DataCollatorWithPadding, BertForSequenceClassification
from transformers import BertTokenizer, BertForPreTraining

from model_adapter import FeatureExtractor
from sklearn import preprocessing

In [14]:
df = pd.read_pickle('data/df_first.pkl').reset_index(drop=True)

df.task_type = df.task_type.str.replace('Изменение', 'Другое', regex=True)
df.task_type = df.task_type.str.replace('Продажа', 'Другое', regex=True)

le = preprocessing.LabelEncoder()
df["task_type_int64"] = le.fit_transform(df["task_type"])

In [15]:
df[["description_merge","task_type", "task_type_int64"]]

Unnamed: 0,description_merge,task_type,task_type_int64
0,комп 947 не работает скайп комп 947\n\nПользов...,Другое,0
1,Массовая рассылка(Quick Sales) Рассмотерть воз...,Другое,0
2,Корпорация СКИП Завести клиента ЗАО «Корпораци...,Обслуживание,2
3,tserv Тормозит tserv\n\nЛожкин Кирилл Александ...,Другое,0
4,Outlook При ответе на письмо и вставки таблицы...,Другое,0
...,...,...,...
83840,привести к стандарту принтеры завести компы с ...,Другое,0
83841,ошибка РусТрейдинг Добрый день!\r\n\r\nНа экра...,Другое,0
83842,проблемы с шрифтом в pdf документе Проблемы со...,Обслуживание,2
83843,Необходимо переустановить ключи к 2 банк-клиен...,Другое,0


In [16]:
df_after2018 = df[df.year>2018]
df_before2019 = df[df.year<=2018]

df_tune_test = df_after2018[["description_merge", "task_type_int64"]]
df_tune_trainval = df_before2019[["description_merge", "task_type_int64"]]

In [17]:
from sklearn.model_selection import train_test_split

df_tune_train, df_tune_val=train_test_split(df_tune_trainval, test_size=0.3, stratify=df_tune_trainval.task_type_int64)

In [18]:
train_texts=df_tune_train.description_merge.to_list()
val_texts=df_tune_val.description_merge.to_list()
test_texts=df_tune_test.description_merge.to_list()

In [19]:
# tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny')

In [20]:
train_encodings = tokenizer(train_texts, truncation=True, padding="max_length")
val_encodings = tokenizer(val_texts, truncation=True, padding="max_length")
test_encodings = tokenizer(test_texts, truncation=True, padding="max_length")

In [21]:
train_labels=df_tune_train.task_type_int64.to_list()
val_labels=df_tune_val.task_type_int64.to_list()
test_labels=df_tune_test.task_type_int64.to_list()

In [22]:
import torch

class Mydataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Mydataset(train_encodings, train_labels)
val_dataset = Mydataset(val_encodings, val_labels)
test_dataset = Mydataset(test_encodings, test_labels)

In [23]:
from torch.utils.data import DataLoader
from transformers import AutoModel, AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = BertForSequenceClassification.from_pretrained('cointegrated/rubert-tiny')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not i

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`