In [1]:
import pandas as pd
import json
import re
from fuzzywuzzy import fuzz

In [None]:
from transformers import BertTokenizerFast
import torch
from transformers import BertForTokenClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [3]:
import warnings
warnings.filterwarnings("ignore")

### Данные с разметкой

In [59]:
with open('data/data_request_processing_full.json', 'r', encoding='utf-8') as f:
    loaded_data = json.load(f)

texts = loaded_data['texts']
labels = loaded_data['labels']

print (len(texts), len(labels))

1202 1202


### Составление разметки для токенизированных предложений

In [36]:
label_map = {
    "O": 0,
    "B-кухня": 1, # начало названия какой-то кухни, которая нравится
    "I-кухня": 2, # продолжение названия какой-то кухни, которая нравится
    "B-блюдо": 3, # начало названия какого-то блюда, которое нравится
    "I-блюдо": 4, #  продолжение названия какого-то блюда, которое нравится
    "B-кухня-негатив": 5, # начало названия какой-то кухни, которая не нравится
    "I-кухня-негатив": 6, # продолжение названия какой-то кухни, которая не нравится
    "B-блюдо-негатив": 7, # начало названия какого-то блюда, которое не нравится
    "I-блюдо-негатив": 8 #  продолжение названия какого-то блюда, которое не нравится
}

tokenizer = BertTokenizerFast.from_pretrained('DeepPavlov/rubert-base-cased')

tokenized_inputs = tokenizer(texts, padding=True, truncation=True, is_split_into_words=True, return_offsets_mapping=True)

offsets_mapping = tokenized_inputs["offset_mapping"]
token_labels_all = []

for text, label, offset_mapping in zip(texts, labels, offsets_mapping):

  words = text[0].replace(" .", ".").replace(" ,", ",").split()
  word_ranges = []
  start = 0
  for word in words:
      end = start + len(word)
      word_ranges.append((start, end))
      start = end + 1  # +1 для пробела

  word_to_label = {range: label for range, label in zip(word_ranges, label)}

  token_labels = []

  for i, (start, end) in enumerate(offset_mapping):
      if (start, end) == (0, 0):
          token_labels.append("O")
      else:
          for word_range, label in word_to_label.items():
              if start >= word_range[0] and end <= word_range[1]:
                  if start == word_range[0]:
                      token_labels.append(label)
                  else:
                      if label.startswith("B-"):
                          token_labels.append(label.replace("B-", "I-"))
                      else:
                          token_labels.append(label)
                  break
  token_labels_all.append(token_labels)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [37]:
numeric_labels = [[label_map[label] for label in label_seq] for label_seq in token_labels_all]
# numeric_labels

### Обучение модели



In [39]:
input_ids = torch.tensor(tokenized_inputs['input_ids'])
attention_mask = torch.tensor(tokenized_inputs['attention_mask'])
labels = torch.tensor(numeric_labels)

train_inputs, test_inputs, train_labels, test_labels, train_attention_mask, test_attention_mask = train_test_split(
    input_ids, labels, attention_mask, test_size=0.2, random_state=42)

train_dataset = TensorDataset(train_inputs, train_attention_mask, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

test_dataset = TensorDataset(test_inputs, test_attention_mask, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [40]:
model = BertForTokenClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased',
    num_labels=len(label_map)+1)

optimizer = AdamW(model.parameters(), lr=5e-5)

loss_fn = torch.nn.CrossEntropyLoss()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
model.train() 
for epoch in range(8):
    total_loss = 0  
    num_batches = 0 

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch}, Average Loss: {avg_loss:.4f}")


    model.eval()  
    with torch.no_grad():  
        total_test_loss = 0
        num_test_batches = 0
        for batch in test_dataloader:
            input_ids, attention_mask, labels = batch

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_test_loss += loss.item()
            num_test_batches += 1

        avg_test_loss = total_test_loss / num_test_batches
        print(f"Epoch {epoch}, Test Average Loss: {avg_test_loss:.4f}")


    if epoch == 0 or avg_test_loss < best_test_loss:
        best_test_loss = avg_test_loss
        torch.save(model.state_dict(), 'models/request_processing/request_processing_model.pth')
        print(f"Model saved at epoch {epoch} with test loss: {avg_test_loss:.4f}")
    
    model.train()

Epoch 0, Average Loss: 0.2756
Epoch 0, Test Average Loss: 0.0392
Model saved at epoch 0 with test loss: 0.0392
Epoch 1, Average Loss: 0.0335
Epoch 1, Test Average Loss: 0.0238
Model saved at epoch 1 with test loss: 0.0238
Epoch 2, Average Loss: 0.0154
Epoch 2, Test Average Loss: 0.0083
Model saved at epoch 2 with test loss: 0.0083
Epoch 3, Average Loss: 0.0132
Epoch 3, Test Average Loss: 0.0175
Epoch 4, Average Loss: 0.0141
Epoch 4, Test Average Loss: 0.0208
Epoch 5, Average Loss: 0.0093
Epoch 5, Test Average Loss: 0.0056
Model saved at epoch 5 with test loss: 0.0056
Epoch 6, Average Loss: 0.0030
Epoch 6, Test Average Loss: 0.0040
Model saved at epoch 6 with test loss: 0.0040
Epoch 7, Average Loss: 0.0043
Epoch 7, Test Average Loss: 0.0116


### Тестирование модели

In [44]:
model = BertForTokenClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=10)
model.load_state_dict(torch.load('models/request_processing/request_processing_model.pth'))
model.eval();

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
model.eval()
test_loss = 0
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        test_loss += loss.item()

        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.numel()

print(f"Test Loss: {test_loss / len(test_dataloader)}")
print(f"Accuracy: {correct_predictions / total_predictions * 100:.2f}%")

Test Loss: 0.004043731083920984
Accuracy: 99.88%


### Запуск модели

In [None]:
model = BertForTokenClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=10)
model.load_state_dict(torch.load('models/request_processing/request_processing_model.pth'))
model.eval()

tokenizer = BertTokenizerFast.from_pretrained('DeepPavlov/rubert-base-cased')

text = "Я люблю итальянскую кухню, особенно пасту и пиццу, какао, но не хочу бургеры и гуакамоле, а еще не люблю острое, поэтому без тайской кухни"

inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, is_split_into_words=False)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits

predicted_labels = torch.argmax(logits, dim=-1).squeeze().tolist()

tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())
words = tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0].split()

def align_tokens_with_words(tokens, predicted_labels):
    word_labels = []
    current_word = None
    for token, label in zip(tokens, predicted_labels):
        if token.startswith("##"):
            current_word += token[2:]
        else:
            if current_word is not None:
                word_labels.append((current_word, label_map_reverse[current_label]))
            current_word = token
            current_label = label
    if current_word is not None:
        word_labels.append((current_word, label_map_reverse[current_label]))
    return word_labels

label_map_reverse = {v: k for k, v in label_map.items()}

word_labels = align_tokens_with_words(tokens, predicted_labels)

for word, label in word_labels:
    print(f"{word} - {label}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[CLS] - O
Я - O
люблю - O
итальянскую - B-кухня
кухню - I-кухня
, - I-кухня
особенно - O
пасту - B-блюдо
и - I-блюдо
пиццу - B-блюдо
, - I-блюдо
какао - B-блюдо
, - I-блюдо
но - O
не - O
хочу - O
бургеры - B-блюдо-негатив
и - O
гуакамоле - B-блюдо-негатив
, - I-блюдо-негатив
а - O
еще - O
не - O
люблю - O
острое - O
, - O
поэтому - O
без - O
тайской - B-кухня-негатив
кухни - I-кухня-негатив
[SEP] - O


#### Уникальные названия кухонь и блюд из БД

In [47]:
with open('data/unique_cuisines.json', 'r', encoding='utf-8') as f:
    unique_cuisines = set(json.load(f))

with open('data/unique_dishes.json', 'r', encoding='utf-8') as f:
    unique_dishes = set(json.load(f))

#### Обработка результатов модели

In [57]:
cuisine_positive = []
cuisine_negative = []
dish_positive = []
dish_negative = []

current_cuisine_positive = []
current_cuisine_negative = []
current_dish_positive = []
current_dish_negative = []

for word, label in word_labels:
    if label.startswith("B-кухня"): 
        if current_cuisine_positive:
            cuisine_positive.append(" ".join(current_cuisine_positive))
        if current_cuisine_negative:
            cuisine_negative.append(" ".join(current_cuisine_negative))
        current_cuisine_positive = [] 
        current_cuisine_negative = []  
        
        if label == "B-кухня":
            current_cuisine_positive = [word]
        else:
            current_cuisine_negative = [word]

    elif label.startswith("I-кухня"):  
        if current_cuisine_positive:
            current_cuisine_positive.append(word)
        if current_cuisine_negative:
            current_cuisine_negative.append(word)

    else:
        if current_cuisine_positive:
            cuisine_positive.append(" ".join(current_cuisine_positive))
            current_cuisine_positive = []  

        if current_cuisine_negative:
            cuisine_negative.append(" ".join(current_cuisine_negative))
            current_cuisine_negative = [] 

    if label.startswith("B-блюдо"): 
        if current_dish_positive:
            dish_positive.append(" ".join(current_dish_positive))
        if current_dish_negative:
            dish_negative.append(" ".join(current_dish_negative))
        current_dish_positive = []  
        current_dish_negative = []  
        
        if label == "B-блюдо":
            current_dish_positive = [word]
        elif label == "B-блюдо-негатив":
            current_dish_negative = [word]

    elif label.startswith("I-блюдо"):  
        if current_dish_positive:
            current_dish_positive.append(word)
        if current_dish_negative:
            current_dish_negative.append(word)

    elif label.startswith("I-блюдо-негатив"): 
        if current_dish_negative:
            current_dish_negative.append(word)

    else:
        if current_dish_positive:
            dish_positive.append(" ".join(current_dish_positive))
            current_dish_positive = []  
        if current_dish_negative:
            dish_negative.append(" ".join(current_dish_negative))
            current_dish_negative = [] 

if current_cuisine_positive:
    cuisine_positive.append(" ".join(current_cuisine_positive))
if current_cuisine_negative:
    cuisine_negative.append(" ".join(current_cuisine_negative))

if current_dish_positive:
    dish_positive.append(" ".join(current_dish_positive))
if current_dish_negative:
    dish_negative.append(" ".join(current_dish_negative))

def clean_and_lemmatize(text):
    text = re.sub(r'[^\w\s]', '', text) 
    text = re.sub(r'[SEP]|[CLS]', '', text)
    words = text.split()  
    return ' '.join(words)


def check_cuisine_similarity(text, threshold=60):
    max_similarity = 0  
    best_match = None 
    
    for cuisine in unique_cuisines:
        similarity = fuzz.ratio(text.lower(), cuisine.lower())  
        if similarity > max_similarity: 
            max_similarity = similarity
            best_match = cuisine
    
    if max_similarity >= threshold:
        return best_match
    return None

def check_dish_similarity(text, threshold=60):
    max_similarity = 0  
    best_match = None 
    
    for dish in unique_dishes:
        similarity = fuzz.ratio(text.lower(), dish.lower())  
        if similarity > max_similarity:  
            max_similarity = similarity
            best_match = dish
    
    if max_similarity >= threshold:
        return best_match
    return None

print (f"Запрос клиента: {text}")

print("Позитивная кухня:")
for cuisine in cuisine_positive:
    cleaned_cuisine = clean_and_lemmatize(cuisine)
    res = check_cuisine_similarity(cleaned_cuisine)
    if res is not None:
        print (res)
    else:
        print (f"({cleaned_cuisine})")

print("\nНегативная кухня:")
for cuisine in cuisine_negative:
    cleaned_cuisine = clean_and_lemmatize(cuisine)
    res = check_cuisine_similarity(cleaned_cuisine)
    if res is not None:
        print (res)
    else:
        print (f"({cleaned_cuisine})")

print("\nПозитивные блюда:")
for dish in dish_positive:
    cleaned_dish = clean_and_lemmatize(dish)
    res = check_dish_similarity(cleaned_dish)
    if res is not None:
        print (res)
    else:
        print (f"({cleaned_dish})")

print("\nНегативные блюда:")
for dish in dish_negative:
    cleaned_dish = clean_and_lemmatize(dish)
    res = check_dish_similarity(cleaned_dish)
    if res is not None:
        print (res)
    else:
        print (f"({cleaned_dish})")

Запрос клиента: Я люблю итальянскую кухню, особенно пасту и пиццу, какао, но не хочу бургеры и гуакамоле, а еще не люблю острое, поэтому без тайской кухни
Позитивная кухня:
итальянская кухня

Негативная кухня:
тайская кухня

Позитивные блюда:
паста
пицца
какао

Негативные блюда:
бургеры
(гуакамоле)


**Вывод**

Таким образом, наша программа может извлекать из текстовых пожеланий клиента предпочитаемые и непредпочитаемые блюда и кухни. В скобках указаны те блюда и кухни, которые отсутствуют в базе данных ресторанов и кафе. Для остальных можно осуществить поиск подходящих заведений.