In [None]:
!pip install transformers -q

In [None]:
!pip install gdown -q

In [None]:
# This should work because access to the folder has been opened;
# if it does not, the same link can be opened manually to reach the folder with the models.
!gdown --folder "https://drive.google.com/drive/u/0/folders/1QgbnP3IJpMqdGZ-hoJUMtmXM2LAKJypW"

In [None]:
import pandas as pd
import numpy as np
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    pipeline
)
from torch.utils.data import DataLoader, Dataset
import torch
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import pickle

## Получение результатов работы модели сущностей на тестовых данных

In [None]:
# Input file with the reviews (read below as tab-separated id / text pairs).
path_to_test_data = "dev_reviews.txt"
# Output file for the predicted aspect mentions.
path_to_save_preds = "dev_pred_aspects.txt"
# Output file for the predicted per-category review sentiments.
path_to_save_preds_cats = "dev_pred_cats.txt"

In [None]:
# BIO tag name -> integer id; the id is the numeric suffix of the label names
# returned by the classifier (parsed in get_entities below).
label2id = {
    'B-Food': 0, 'B-Interior': 1, 'B-Price': 2, 'B-Service': 3,
    'B-Whole': 4, 'I-Food': 5, 'I-Interior': 6, 'I-Price': 7,
    'I-Service': 8, 'I-Whole': 9, 'O': 10}

id2label = {idx: label for label, idx in label2id.items()}

def get_entities(text_id, text):
    """Run the global ``classifier`` on ``text`` and collect aspect mentions.

    Tokens are merged into spans as follows:
      * a ``##`` word piece is glued to the previous span (no space);
      * a token tagged ``I-*`` is appended to the previous span with a space;
      * any other token opens a new span labelled with its own tag.
    Spans whose opening tag is ``O`` are dropped from the result.

    Args:
        text_id: identifier of the review; written as the first output field.
        text: review text passed to the classifier.

    Returns:
        A list of rows ``[text_id, category, mention, start, end, 'neutral']``
        where every field is a string. The sentiment is a placeholder.
    """
    tokens = classifier(text)
    words = []
    entities = []
    positions = []

    for token in tokens:
        # Label names are expected to end in "_<id>", e.g. "LABEL_3".
        entity = id2label[int(token['entity'].split('_')[-1])]
        piece = token['word']
        start_pos = token['start']
        end_pos = token['end']

        if piece.startswith('##') and words:
            # Word-piece continuation: extend the current span.
            words[-1] += piece[2:]
            positions[-1][1] = end_pos

        elif entity.startswith('I') and entities:
            # NOTE(review): an I-* token is merged into the previous span even
            # if that span is 'O' or of another category - confirm this is intended.
            words[-1] += f' {piece}'
            positions[-1][1] = end_pos

        else:
            # A continuation piece with nothing to continue (no span open yet)
            # starts a span of its own instead of raising IndexError.
            words.append(piece[2:] if piece.startswith('##') else piece)
            entities.append(entity)
            positions.append([start_pos, end_pos])

    # entity[2:] strips the "B-" / "I-" prefix, leaving the category name.
    return [
        [str(text_id), entity[2:], word, str(position[0]), str(position[1]), 'neutral']
        for word, entity, position in zip(words, entities, positions)
        if entity != 'O'
    ]

In [None]:
# Token-classification pipeline used by get_entities.
# NOTE(review): presumably the model path points into the folder downloaded with gdown above - confirm.
classifier = pipeline("ner", model="models/bert_wo_crf_deepvk_uncased_1e02")

### Чтение тестовых данных и получение предсказаний

In [None]:
# The reviews file is headerless and tab-separated; the two columns are named idx and text here.
# NOTE(review): pandas' default quote handling applies - confirm review texts never start with a double quote.
data = pd.read_csv(path_to_test_data, sep='\t', names=['idx', 'text'])

In [None]:
# Collect the predicted aspect rows for every review.
# tqdm is already imported in the imports cell; `total` lets it draw a real
# progress bar, which it cannot do for a bare iterator.
all_results = []
for idx, text in tqdm(zip(data['idx'], data['text']), total=len(data)):
    all_results.extend(get_entities(idx, text))

In [None]:
# Serialize the predictions: tab-separated fields, one aspect row per line.
to_save = '\n'.join(map('\t'.join, all_results))

In [None]:
# Write explicitly as UTF-8: the file is read back with pandas (UTF-8 by default),
# whereas a bare open() uses the platform locale encoding.
with open(path_to_save_preds, 'w', encoding='utf-8') as f:
    f.write(to_save)

### Предсказание тональности аспектов

In [None]:
def extract_context_with_category(text, start, end, category, window=50):
    """Return ``category`` followed by the text surrounding an aspect.

    The context spans ``window`` characters to each side of ``[start, end)``.
    Each edge is then pushed outwards to the nearest space (or to the text
    boundary) so that no word is cut, and the slice is stripped.
    """
    text_len = len(text)

    # Left edge: step back `window` characters, then walk left to a space.
    left = start - window
    if left < 0:
        left = 0
    while left > 0 and text[left] != ' ':
        left -= 1

    # Right edge: step forward `window` characters, then walk right to a space.
    right = end + window
    if right > text_len:
        right = text_len
    while right < text_len and text[right] != ' ':
        right += 1

    snippet = text[left:right].strip()
    # The category is prepended to the context.
    return category + " " + snippet

In [None]:
# Sentiment label -> class id; its size sets the number of output classes below.
label_dict = {'negative': 0, 'neutral': 1, 'positive': 2, 'both': 3}
device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = "models/bert_model.pth"
model_name = 'DeepPavlov/rubert-base-cased'

# Base checkpoint with a classification head of len(label_dict) classes.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_dict))

# Load the saved weights
model.load_state_dict(torch.load(model_path, map_location=device))

model.to(device)

In [None]:
dev_aspects = pd.read_csv(path_to_save_preds, sep='\t', names=['review_id', 'aspect_category', 'aspect_text', 'start_pos', 'end_pos', 'sentiment'])
dev_reviews = pd.read_csv(path_to_test_data, sep='\t', names=['review_id', 'review_text'])

# Attach the review text to every predicted aspect.
# how='left' keeps exactly one output row per aspect, in the aspects' order;
# validate='many_to_one' raises if a review id occurs more than once in the
# reviews file (which would otherwise silently duplicate aspect rows).
merged_data = pd.merge(dev_aspects, dev_reviews, on='review_id', how='left', validate='many_to_one')

# Add the context window around each aspect (prefixed with its category).
merged_data['context'] = merged_data.apply(lambda row: extract_context_with_category(row['review_text'], row['start_pos'], row['end_pos'], row['aspect_category']), axis=1)

In [None]:
# Tokenizer loaded from the same checkpoint name as the sentiment model.
tokenizer = BertTokenizer.from_pretrained(model_name, model_max_length=512)

def predict_sentiment(text):
    """Classify one context string and return its sentiment label name.

    The text is encoded to a fixed length of 64 tokens (padded / truncated),
    run through the global ``model`` without gradient tracking, and the
    arg-max class id is mapped back to its key in ``label_dict``.
    """
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    ids = encoding['input_ids'].to(device)
    mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids, mask).logits
        predicted_id = torch.argmax(logits, dim=1).item()

    # Reverse lookup: the first label whose id equals the predicted class.
    for label_name, label_id in label_dict.items():
        if label_id == predicted_id:
            return label_name
    raise ValueError(f"{predicted_id} is not in list")

# Apply the model to every context and update the sentiment column.
# merged_data comes from a merge, so its index is not guaranteed to line up
# with dev_aspects. Assigning the raw values goes by position and raises on a
# length mismatch instead of silently misaligning or filling NaN.
dev_aspects['sentiment'] = merged_data['context'].apply(predict_sentiment).to_numpy()

dev_aspects.to_csv(path_to_save_preds, sep='\t', index=False, header=False)

### Предсказание тональности отзывов по категориям

In [None]:
# Load the saved model
# NOTE(review): pickle.load can execute arbitrary code - only load a models/lr_model.pkl from a trusted source.
with open('models/lr_model.pkl', 'rb') as file:
    clf = pickle.load(file)

In [None]:
# Re-read the predicted aspects file (headerless, tab-separated) under the
# column names that count() expects.
test_aspects = pd.read_csv(
    path_to_save_preds,
    delimiter='\t',
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment'])

In [None]:
# Aspect categories; count() emits one row per review for each of them, in this order.
CATEGORIES = ['Food', 'Interior', 'Price', 'Whole', 'Service']

In [None]:
def count(aspects):
    """Count positive / negative / neutral / both aspects per review and category.

    Args:
        aspects: DataFrame with at least the columns ``text_id``, ``category``
            and ``sentiment``.

    Returns:
        DataFrame with columns ``text_id``, ``category``, ``positive``,
        ``negative``, ``neutral``, ``both``: one row for every unique
        ``text_id`` (in order of first appearance) and every entry of
        ``CATEGORIES``; combinations with no aspects get zero counts.
        Empty input gives an empty frame that still has these columns.
    """
    sentiments = ['positive', 'negative', 'neutral', 'both']

    # One pass over the data instead of re-filtering the frame per row and label.
    tallies = aspects.groupby(['text_id', 'category', 'sentiment']).size().to_dict()

    entries = []
    for text_id in aspects['text_id'].unique():
        for c in CATEGORIES:
            entry = {'text_id': text_id, 'category': c}
            for sentiment in sentiments:
                entry[sentiment] = tallies.get((text_id, c, sentiment), 0)
            entries.append(entry)

    return pd.DataFrame(entries, columns=['text_id', 'category'] + sentiments)

In [None]:
# Per-review, per-category sentiment counts; these columns are the classifier features below.
test = count(test_aspects)

In [None]:
# Features are every column after the first two (text_id, category).
y_pred = clf.predict(test.iloc[:, 2:])

In [None]:
# Load the classes used during training into the encoder, so that predictions can be mapped back to positive, negative, neutral and both
# NOTE(review): allow_pickle=True unpickles objects from the file - only load a trusted models/classes.npy.
LE = preprocessing.LabelEncoder()
LE.classes_ = np.load('models/classes.npy', allow_pickle=True)

In [None]:
# Decode the predicted class ids back to sentiment label names.
y = LE.inverse_transform(y_pred)
# Build a new frame rather than aliasing `test` and dropping in place: the
# in-place drop removed the feature columns from `test` itself, so re-running
# the prediction cell (or this one) failed. Column order is unchanged:
# text_id, category, sentiment.
new_test = test.drop(columns=['positive', 'negative', 'neutral', 'both']).assign(sentiment=y)

In [None]:
# Save the per-category review sentiments as a headerless tab-separated file.
new_test.to_csv(path_to_save_preds_cats, sep='\t', header=False, index=False)

## Оценивание

In [None]:
# Gold aspect annotations used for evaluation.
gold_test_path = "dev_aspects.txt"
# Predicted aspects; same file name as path_to_save_preds defined earlier.
pred_test_path = "dev_pred_aspects.txt"

In [None]:
from collections import defaultdict

# Gold aspects grouped by review id. Each value holds four parallel lists:
# span starts, span ends, categories and sentiments (fields 3, 4, 1 and 5 of
# each tab-separated line; field 0 is the review id).
gold_aspect_cats = {}
with open(gold_test_path, encoding='utf-8') as fg:
    for line in fg:
        line = line.rstrip('\r\n')
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing with IndexError.
            continue
        line = line.split('\t')
        doc = gold_aspect_cats.setdefault(
            line[0], {"starts":[], "ends":[], "cats":[], "sents":[]})
        doc["starts"].append(int(line[3]))
        doc["ends"].append(int(line[4]))
        doc["cats"].append(line[1])
        doc["sents"].append(line[5])

In [None]:
# Compare every predicted aspect with the gold aspects of the same review.
# Counters:
#   full_match        - prediction whose start and end both equal a gold span
#   partial_match     - prediction that overlaps a gold span (rules below)
#   full_cat_match    - full match whose category also agrees
#   partial_cat_match - full match with a different category, or partial
#                       match with the same category
# The matched (gold, predicted) pairs are kept for the sentiment metrics.
full_match, partial_match, full_cat_match, partial_cat_match = 0, 0, 0, 0
total = 0
fully_matched_pairs = []
partially_matched_pairs = []
with open(pred_test_path, encoding='utf-8') as fp:
    for line in fp:
        line = line.rstrip('\r\n')
        if not line:
            # Blank lines carry no prediction; skip them instead of crashing.
            continue
        total += 1
        line = line.split('\t')
        start, end = int(line[3]), int(line[4])
        category = line[1]
        doc_gold_aspect_cats = gold_aspect_cats.get(line[0])
        if doc_gold_aspect_cats is None:
            # The review has no gold aspects at all: the prediction still
            # counts in `total` but cannot match anything (was a KeyError).
            continue
        # 1) Exact match: same start and same end as a gold span.
        if start in doc_gold_aspect_cats["starts"]:
            i = doc_gold_aspect_cats["starts"].index(start)
            if doc_gold_aspect_cats["ends"][i] == end:
                full_match += 1
                if doc_gold_aspect_cats["cats"][i] == category:
                    full_cat_match += 1
                else:
                    partial_cat_match += 1
                fully_matched_pairs.append(
                    (
                        [
                            doc_gold_aspect_cats["starts"][i],
                            doc_gold_aspect_cats["ends"][i],
                            doc_gold_aspect_cats["cats"][i],
                            doc_gold_aspect_cats["sents"][i]
                        ],
                        line
                    )
                )
                continue
        # 2) Partial match: scan the gold spans of this review in file order.
        for s_pos in doc_gold_aspect_cats["starts"]:
            if start <= s_pos:
                # Prediction starts at or before this gold span.
                i = doc_gold_aspect_cats["starts"].index(s_pos)
                if doc_gold_aspect_cats["ends"][i] == end:
                    # Same end as the gold span.
                    partial_match += 1
                    partially_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats["starts"][i],
                                doc_gold_aspect_cats["ends"][i],
                                doc_gold_aspect_cats["cats"][i],
                                doc_gold_aspect_cats["sents"][i]
                            ],
                            line
                        )
                    )
                    if doc_gold_aspect_cats["cats"][i] == category:
                        partial_cat_match += 1
                    # NOTE(review): this `continue` moves on to the next gold
                    # span while the other branches `break`, so one prediction
                    # may be counted as a partial match more than once.
                    # Left unchanged to keep the metric comparable - confirm.
                    continue
                matched = False
                # Prediction ends inside [s_pos, e_pos] for this or a later
                # gold end; the recorded gold span is still span i.
                for e_pos in doc_gold_aspect_cats["ends"][i:]:
                    if s_pos <= end <= e_pos:
                        partial_match += 1
                        partially_matched_pairs.append(
                            (
                                [
                                    doc_gold_aspect_cats["starts"][i],
                                    doc_gold_aspect_cats["ends"][i],
                                    doc_gold_aspect_cats["cats"][i],
                                    doc_gold_aspect_cats["sents"][i]
                                ],
                                line
                            )
                        )
                        if doc_gold_aspect_cats["cats"][i] == category:
                            partial_cat_match += 1
                        matched = True
                        break
                if matched:
                    break
            if start > s_pos:
                # Prediction starts after this gold span starts: it matches
                # when the gold span ends inside (start, end].
                i = doc_gold_aspect_cats["starts"].index(s_pos)
                if start < doc_gold_aspect_cats["ends"][i] <= end:
                    partial_match += 1
                    partially_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats["starts"][i],
                                doc_gold_aspect_cats["ends"][i],
                                doc_gold_aspect_cats["cats"][i],
                                doc_gold_aspect_cats["sents"][i]
                            ],
                            line
                        )
                    )
                    if doc_gold_aspect_cats["cats"][i] == category:
                        partial_cat_match += 1
                    break

### Метрики по сущностям

In [None]:
# Total number of gold aspect mentions over all reviews.
gold_size = sum(len(doc["cats"]) for doc in gold_aspect_cats.values())
print(f"""
Full match precision: {full_match / total}
Full match recall: {full_match / gold_size}
Partial match ratio in pred: {(full_match + partial_match)  / total}
Full category accuracy: {full_cat_match / total}
Partial category accuracy: {(full_cat_match + partial_cat_match) / total}
""")

### Метрики по тональностям аспектов

In [None]:
def sentiment_accuracy(matches):
    """Print the share of matched pairs whose sentiments agree.

    Args:
        matches: sequence of ``(gold, pred)`` pairs; the sentiment is the
            last element of each of the two sequences.

    Prints ``Mention sentiment accuracy: <value>``. With no pairs there is
    nothing to average, so ``n/a`` is printed instead of raising
    ZeroDivisionError.
    """
    if not matches:
        print("Mention sentiment accuracy: n/a (no matched pairs)")
        return
    matched_sentiment = sum(
        (1. for pair in matches if pair[0][-1] == pair[1][-1]), 0.)
    print(f"Mention sentiment accuracy: {matched_sentiment / len(matches)}")

In [None]:
# Sentiment accuracy over the exactly matched (gold, predicted) pairs.
sentiment_accuracy(fully_matched_pairs)

In [None]:
# Sentiment accuracy over the partially matched (gold, predicted) pairs.
sentiment_accuracy(partially_matched_pairs)

### Метрика по тональностям отзывов

In [None]:
# Gold per-category review sentiments.
gold_test_cats_path = "dev_cats.txt"
# Predicted per-category review sentiments; same file name as path_to_save_preds_cats.
pred_test_cats_path = "dev_pred_cats.txt"

In [None]:
# Share of gold (review, category, sentiment) lines that also occur in the predictions.
# Lines are compared without their line terminators, so a final line lacking a
# trailing newline (or a stray blank line) no longer causes a spurious mismatch.
with open(gold_test_cats_path, encoding='utf-8') as gold_file, \
        open(pred_test_cats_path, encoding='utf-8') as pred_file:
    gold_labels = {line.rstrip('\r\n') for line in gold_file if line.strip()}
    pred_labels = {line.rstrip('\r\n') for line in pred_file if line.strip()}

if gold_labels:
    print(
        "Overall sentiment accuracy:",
        len(gold_labels & pred_labels) / len(gold_labels)
    )
else:
    # An empty gold file would otherwise raise ZeroDivisionError.
    print("Overall sentiment accuracy:", "n/a (gold file is empty)")