In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import json

In [2]:
import warnings
warnings.filterwarnings("ignore")

#### Импорт данных

In [3]:
with open('data/data_dish.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
not_dishes_list = data['not_dishes'].split('; ')
dishes_list = data['dishes'].split('; ')

In [5]:
text_list = not_dishes_list + dishes_list
label_list = [0]*len(not_dishes_list) + [1]*len(dishes_list)

In [7]:
df = pd.DataFrame({
    'text': text_list,
    'label': label_list
})
df.head()

Unnamed: 0,text,label
0,Средний чек 1500₽,0
1,"Санкт-Петербург, Большая Морская, 22",0
2,Заказ столиков,0
3,Доставка,0
4,Бизнес-ланч,0


In [8]:
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Загрузка модели и токенизатора

In [61]:
model_name = "DeepPavlov/rubert-base-cased" 
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Токенизация

In [62]:
def tokenize_function(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128)

train_encodings = tokenize_function(X_train.tolist())
test_encodings = tokenize_function(X_test.tolist())

#### Создание Dataset

In [63]:
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(y_train.values)
)

test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(y_test.values)
)

In [64]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

#### Обучение модели

In [65]:
optimizer = AdamW(model.parameters(), lr=1e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);

In [None]:
epochs = 20
best_accuracy = 0  
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        loss = outputs.loss  

        loss.backward()  
        optimizer.step()

        total_loss += loss.item()

        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

    epoch_loss = total_loss / len(train_dataloader)
    epoch_accuracy = correct_predictions / total_predictions

    print(f"Epoch {epoch + 1}/{epochs} - Loss: {epoch_loss:.4f} - Accuracy: {epoch_accuracy:.4f}")

    model.eval()
    correct_predictions_test = 0
    total_predictions_test = 0

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            predictions = torch.argmax(logits, dim=-1)
            correct_predictions_test += (predictions == labels).sum().item()
            total_predictions_test += labels.size(0)

    test_accuracy = correct_predictions_test / total_predictions_test
    print(f"Test Accuracy: {test_accuracy:.4f}")

    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        torch.save(model.state_dict(), 'models/dish_classifier/dish_classifier_model.pth')  
        tokenizer.save_pretrained('models/dish_classifier/dish_tokenizer')  
        print(f"Model saved with test accuracy: {best_accuracy:.4f}")

Epoch 1/25 - Loss: 0.6794 - Accuracy: 0.5506
Epoch 2/25 - Loss: 0.6075 - Accuracy: 0.6139
Epoch 3/25 - Loss: 0.4747 - Accuracy: 0.8101
Epoch 4/25 - Loss: 0.3041 - Accuracy: 0.9747
Epoch 5/25 - Loss: 0.1787 - Accuracy: 0.9873
Epoch 6/25 - Loss: 0.1190 - Accuracy: 0.9873
Epoch 7/25 - Loss: 0.0850 - Accuracy: 0.9873
Epoch 8/25 - Loss: 0.0506 - Accuracy: 0.9937
Epoch 9/25 - Loss: 0.0484 - Accuracy: 0.9810
Epoch 10/25 - Loss: 0.0421 - Accuracy: 0.9937
Epoch 11/25 - Loss: 0.0413 - Accuracy: 0.9937
Epoch 12/25 - Loss: 0.0277 - Accuracy: 0.9937
Epoch 13/25 - Loss: 0.0237 - Accuracy: 0.9937
Epoch 14/25 - Loss: 0.0197 - Accuracy: 0.9873
Epoch 15/25 - Loss: 0.0152 - Accuracy: 1.0000
Epoch 16/25 - Loss: 0.0176 - Accuracy: 0.9873
Epoch 17/25 - Loss: 0.0167 - Accuracy: 0.9937
Epoch 18/25 - Loss: 0.0160 - Accuracy: 0.9937
Epoch 19/25 - Loss: 0.0141 - Accuracy: 0.9937
Epoch 20/25 - Loss: 0.0109 - Accuracy: 1.0000
Epoch 21/25 - Loss: 0.0143 - Accuracy: 0.9937
Epoch 22/25 - Loss: 0.0122 - Accuracy: 0.99

#### Оценка модели

In [67]:
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

test_accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9559


#### Сохранение модели

In [None]:
# torch.save(model.state_dict(), 'models/dish_classifier/dish_classifier_model.pth')

# tokenizer.save_pretrained('models/dish_classifier/dish_tokenizer')

('models/dish_classifier/dish_tokenizer/tokenizer_config.json',
 'models/dish_classifier/dish_tokenizer/special_tokens_map.json',
 'models/dish_classifier/dish_tokenizer/vocab.txt',
 'models/dish_classifier/dish_tokenizer/added_tokens.json')