In [1]:
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import get_scheduler, AdamW

from typing import Dict

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [37]:
train = pd.read_csv("train.csv", index_col=0)
test = pd.read_csv("test.csv", index_col=0)
new_data = pd.read_csv("new_data.csv")

In [4]:
train["Тип оборудования"].unique()

array(['Ноутбук', 'СХД', 'Сервер'], dtype=object)

In [5]:
label2id, id2label = dict(), dict()
for id_, class_ in enumerate(train["Тип оборудования"].unique()):
    label2id[class_] = id_
    id2label[id_] = class_    

In [6]:
id2label

{0: 'Ноутбук', 1: 'СХД', 2: 'Сервер'}

In [7]:
model = BertForSequenceClassification.from_pretrained("blinoff/roberta-base-russian-v0", num_labels=len(label2id), id2label=id2label, label2id=label2id).to(device)
tokenizer = AutoTokenizer.from_pretrained("blinoff/roberta-base-russian-v0")

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at blinoff/roberta-base-russian-v0 and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight',

In [8]:
class CustomDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame,
                tokenizer: AutoTokenizer,
                label2id: Dict[str, int]):
        self.tokenizer = tokenizer
        self.max_length: int = tokenizer.model_max_length
        self.sep_token: str = tokenizer.sep_token

        dataframe["text"] = dataframe["Тема"] + f" {self.sep_token} " + dataframe["Описание"]
        self.texts = dataframe["text"].to_list()
        self.labels = [label2id[label] for label in dataframe["Тип оборудования"]]
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index: int):
        text :str = self.texts[index]
        label: int = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            # Return PyTorch tensors
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
            }

In [9]:
from sklearn.metrics import f1_score, classification_report


def train_model(model, data_loader, optimizer, lr_scheduler, device):
    model = model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        total_loss += loss.item()
    return total_loss / len(data_loader)

def evaluate_model(model, data_loader, device, last_epoch=False):
    model = model.eval()
    total_loss = 0
    all_predictions = []
    all_true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            all_predictions.extend(predictions.cpu().numpy())
            all_true_labels.extend(labels.cpu().numpy())

    # Calculate F1 Score
    f1 = f1_score(all_true_labels, all_predictions, average='weighted')  # choose average type as 'macro' or 'micro' if needed
    if last_epoch:
        print(classification_report(all_true_labels, all_predictions, target_names=label2id.keys()))
    return total_loss / len(data_loader), f1


def calculate_model(model, data_loader, device, last_epoch=False):
    model = model.eval()
    total_loss = 0
    all_predictions = []
    all_true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            all_predictions.extend(predictions.cpu().numpy())
            all_true_labels.extend(labels.cpu().numpy())

    # Calculate F1 Score

    cl = classification_report(all_true_labels, all_predictions, target_names=label2id.keys())
    return total_loss / len(data_loader), cl

In [39]:
train_dataset = CustomDataset(train, tokenizer, label2id)
test_dataset = CustomDataset(test, tokenizer, label2id) 
new_dataset = CustomDataset(new_data, tokenizer, label2id)

In [40]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)
new_loader = DataLoader(new_dataset, batch_size=4, shuffle=True)

In [13]:
optimizer = AdamW(model.parameters(), lr=1e-5, eps = 1e-8)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

model = model.train()
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, optimizer, lr_scheduler, device)
    val_loss, f1 = evaluate_model(model, test_loader, device, epoch==num_epochs-1)
    print(f"Epoch: {epoch}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation F1: {f1:.4f}")



Epoch: 0, Train Loss: 0.3568, Validation Loss: 0.3677, Validation F1: 0.8517
Epoch: 1, Train Loss: 0.2794, Validation Loss: 0.3565, Validation F1: 0.9060
Epoch: 2, Train Loss: 0.2339, Validation Loss: 0.4116, Validation F1: 0.8834
Epoch: 3, Train Loss: 0.1645, Validation Loss: 0.3213, Validation F1: 0.9060
Epoch: 4, Train Loss: 0.1463, Validation Loss: 0.2590, Validation F1: 0.9268
Epoch: 5, Train Loss: 0.1130, Validation Loss: 0.3578, Validation F1: 0.9060
Epoch: 6, Train Loss: 0.1051, Validation Loss: 0.2374, Validation F1: 0.9268
Epoch: 7, Train Loss: 0.0555, Validation Loss: 0.4189, Validation F1: 0.8834
Epoch: 8, Train Loss: 0.0547, Validation Loss: 0.2860, Validation F1: 0.9268


KeyboardInterrupt: 

In [38]:
val_loss, f1 = evaluate_model(model, test_loader, device, True)

              precision    recall  f1-score   support

     Ноутбук       0.92      1.00      0.96        47
         СХД       1.00      0.67      0.80         3
      Сервер       1.00      0.67      0.80         9

    accuracy                           0.93        59
   macro avg       0.97      0.78      0.85        59
weighted avg       0.94      0.93      0.93        59



In [17]:
def batch_predict(texts, model, tokenizer, device):
    # Ensure the model is in evaluation mode
    model.eval()
    
    # Tokenize the batch of texts
    encodings = tokenizer(texts, max_length=tokenizer.model_max_length, truncation=True, padding=True, return_tensors="pt")
    
    # Move the tensors to the appropriate device
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    
    # Perform inference
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    # Convert logits to probabilities and then to actual predictions
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    predictions = torch.argmax(probabilities, dim=1).cpu().tolist()
    
    return [id2label[prediction] for prediction in predictions]

In [18]:
batch_predict(["Проблемы с компом, сервер SERVER", "открыть кейс по неисправному диску на dd:  dd203..ru  Enclosure=3:Disk=49  открыть кейс по неисправному диску на dd:  dd203..ru  Enclosure=3:Disk=49"], model, tokenizer, device)

['Ноутбук', 'СХД']

In [41]:
_, cl = calculate_model(model, new_loader, device)

In [42]:
print(cl)

              precision    recall  f1-score   support

     Ноутбук       0.84      1.00      0.91      1350
         СХД       1.00      0.01      0.02       110
      Сервер       0.37      0.06      0.10       180

    accuracy                           0.83      1640
   macro avg       0.74      0.35      0.34      1640
weighted avg       0.80      0.83      0.76      1640



In [36]:
train_loader.dataset

<__main__.CustomDataset at 0x1a504cc77c0>

In [25]:
print(cl)

              precision    recall  f1-score   support

     Ноутбук       0.92      1.00      0.96        47
         СХД       1.00      0.67      0.80         3
      Сервер       1.00      0.67      0.80         9

    accuracy                           0.93        59
   macro avg       0.97      0.78      0.85        59
weighted avg       0.94      0.93      0.93        59



In [22]:
new_loader.dataset.iloc[924]

Тема                                          Проблема с DIMM в dd203
Описание            Здравствуйте! У нас возникла критическая ошибк...
Тип оборудования                                                  СХД
Точка отказа                                       Оперативная память
text                Проблема с DIMM в dd203 </s> Здравствуйте! У н...
Name: 924, dtype: object