### 1. Подгружаем обёртку NER модели из репозитория

In [1]:
%%capture
! wget https://raw.githubusercontent.com/PhilBurub/Project_NLP/main/NER/NER_model.py
#! wget https://raw.githubusercontent.com/PhilBurub/Project_NLP/main/NER/requirements.txt
#! pip install -r requirements.txt
! pip install wget

In [2]:
from NER_model import NER, Vocab
import torch
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import sent_tokenize
import pandas as pd
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 2. Применяем модель

In [3]:
weights = 'https://github.com/PhilBurub/Project_NLP/raw/main/NER/model.pth'
tags = 'https://raw.githubusercontent.com/PhilBurub/Project_NLP/main/NER/tag_vocab.json'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
ner_model = NER(weights, tags, device)

In [4]:
texts = pd.read_csv('https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_reviews.txt',
                    header=None, delimiter='\t', index_col=0, names=['text'])

In [5]:
texts_li = texts['text'].tolist()
ids_li = texts.index.tolist()

In [6]:
# Run the NER model over all reviews and tabulate the extracted mentions,
# indexed by review id.
out_ner = pd.DataFrame(
    ner_model.get_ne(texts_li, ids_li),
    columns=['id', 'entity', 'start', 'end', 'aspect'],
).set_index('id')
# Drop empty mentions. The explicit .copy() makes the filtered frame independent,
# so the column assignment below does not raise pandas' SettingWithCopyWarning.
out_ner = out_ner[out_ner['entity'] != ''].copy()
# Placeholder label: no sentiment has been predicted at this stage.
out_ner['sentiment'] = 0

In [7]:
out_ner

Unnamed: 0_level_0,entity,start,end,aspect,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13823,проводила к столу,188,205,Service,0
13823,дала меню,208,217,Service,0
13823,официантка,242,252,Service,0
13823,приняла заказ,254,267,Service,0
13823,удалилась,270,279,Service,0
...,...,...,...,...,...
11770,блюда,267,272,Food,0
11770,ягод,310,314,Food,0
11770,фрукта и,319,327,Food,0
11770,специй,342,348,Food,0


### 4. Классификатор 1

#### 4.1. Препроцессинг

In [8]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import warnings
from copy import deepcopy
from collections import defaultdict
from transformers import BertModel, BertTokenizer, AdamW

rubert_model = AutoModel.from_pretrained("cointegrated/rubert-tiny-sentiment-balanced").to(device)
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny-sentiment-balanced", device=device)

In [9]:
def boudaries(id_, text):
  """Split ``text`` into sentences and record each sentence's character span.

  Sentences come from ``sent_tokenize``. Each one is located in ``text`` by
  searching forward from the end of the previously located sentence, so
  repeated sentences receive distinct, increasing offsets.

  Args:
    id_: Identifier of the text; copied into every output row.
    text: The text to split.

  Returns:
    A DataFrame with columns ``id``, ``text``, ``start``, ``end`` such that
    ``text[start:end]`` equals the sentence stored in the ``text`` column.
  """
  rows = []
  cursor = 0
  for sent in sent_tokenize(text):
    # Search in the original string from the cursor instead of re-slicing the
    # remaining text on every iteration.
    start = text.find(sent, cursor)
    if start == -1:
      # The sentence cannot be located verbatim after the cursor. Skip it
      # rather than emit offsets computed from -1, which would also shift
      # every following sentence.
      continue
    end = start + len(sent)
    rows.append((id_, sent, start, end))
    cursor = end
  return pd.DataFrame(rows, columns=['id', 'text', 'start', 'end'])

def create_mask(tokens, asp_tokens):
  """Flag which entries of ``tokens`` also occur in ``asp_tokens``.

  Args:
    tokens: Iterable of token ids.
    asp_tokens: Container of aspect token ids; anything supporting ``in``.

  Returns:
    A list as long as ``tokens`` holding 1 where the token is contained in
    ``asp_tokens`` and 0 elsewhere. Membership is by value, so every
    occurrence of an aspect token id is flagged, wherever it appears.
  """
  return [1 if tok in asp_tokens else 0 for tok in tokens]

class ContextDataset(Dataset):
  """Dataset pairing each extracted aspect mention with the sentence containing it.

  ``self.contexts`` holds one row per mention that falls inside exactly one
  sentence of its review, with columns ``id``, ``input`` (the mention text),
  ``aspect``, ``start``, ``end``, ``context`` (the sentence) and ``sentiment``.
  """

  def __init__(self, texts, outputs):
    """Locate the sentence around every mention.

    Args:
      texts: DataFrame indexed by review id with a ``text`` column.
      outputs: DataFrame indexed by review id with columns ``entity``,
        ``aspect``, ``start``, ``end`` and ``sentiment``.
    """
    # Build every per-review sentence table first and concatenate once;
    # growing a DataFrame with pd.concat inside the loop re-copies all
    # previous rows on every iteration.
    frames = [boudaries(id_, row['text']) for id_, row in texts.iterrows()]
    if frames:
      sentences = pd.concat(frames)
    else:
      # No reviews: keep the expected columns so the filter below still works
      # and simply matches nothing.
      sentences = pd.DataFrame(columns=['id', 'text', 'start', 'end'])

    out = []
    for text_id, row in outputs.iterrows():
      slice_ = sentences[(sentences['id'] == text_id)
                         & (sentences['start'] <= row['start'])
                         & (sentences['end'] >= row['end'])]
      if len(slice_) != 1:
        # Zero hits: the mention is not fully inside any single sentence.
        # Several hits: the enclosing sentence is ambiguous.
        # Either way the mention is dropped.
        continue
      out.append((text_id, row['entity'], row['aspect'], row['start'], row['end'], slice_['text'].item(), row['sentiment']))
    self.contexts = pd.DataFrame(out, columns=['id', 'input', 'aspect', 'start', 'end', 'context', 'sentiment'])

    # Label encodings. The key 0 is the placeholder value and maps to -1.
    self.map_sent = {0: -1, 'negative': 0, 'neutral': 1, 'positive': 2,
                     'both': 3}
    self.map_asp = {0: -1, 'Food': 0, 'Interior': 1, 'Price': 2,
                    'Service': 3, 'Whole': 4}

  def __len__(self):
    """Return the number of mentions that were matched to a sentence."""
    return len(self.contexts)

  def __getitem__(self, idx):
    """Encode the ``idx``-th mention and its sentence.

    Uses the module-level ``tokenizer``.

    Returns:
      A dict with:
        ``review_text``: the context sentence;
        ``input_ids``: the tokenizer's ``input_ids`` tensor for the sentence,
          padded/truncated to 150 tokens;
        ``attention_mask``: a plain list of 0/1 produced by ``create_mask``,
          marking sentence tokens whose id occurs among the mention's token
          ids (this is not the tokenizer's padding mask);
        ``targets``: the encoded sentiment as a scalar long tensor.
    """
    row = self.contexts.loc[idx]
    text = row['context']
    tokens = tokenizer.encode_plus(text, max_length=150, add_special_tokens=True, padding='max_length', truncation=True, return_tensors='pt', return_token_type_ids=False)

    aspect_word = row['input']
    aspect_token_ids = tokenizer.encode(aspect_word, add_special_tokens=False, return_tensors='pt')
    asp_tokens = aspect_token_ids[0]
    tokens_list = tokens['input_ids'].tolist()[0]

    attention_mask = create_mask(tokens_list, asp_tokens)

    # NOTE(review): .get() returns None for a label missing from map_sent,
    # which torch.tensor below cannot convert.
    sentiment = [self.map_sent.get(row['sentiment'])]

    return {'review_text': text, 'input_ids': tokens['input_ids'], 'attention_mask': attention_mask, 'targets': torch.tensor(sentiment, dtype = torch.long).squeeze()}

#### 4.2. Загрузка модели

Скачиваем чекпойнт модели *best_model_state.bin* (ячейка ниже сохраняет его в рабочую директорию ноутбука).

In [10]:
!wget "https://drive.google.com/uc?export=download&id=1ntBgLbvp2E2blZrOESo_ZNC54d_l0Boc" -O best_model_state.bin

--2023-12-29 07:56:18--  https://drive.google.com/uc?export=download&id=1ntBgLbvp2E2blZrOESo_ZNC54d_l0Boc
Resolving drive.google.com (drive.google.com)... 74.125.135.100, 74.125.135.101, 74.125.135.139, ...
Connecting to drive.google.com (drive.google.com)|74.125.135.100|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0o-04-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/rthgn60grllepqb5pnmvbm27s1q5v4ad/1703836575000/00315623077463891901/*/1ntBgLbvp2E2blZrOESo_ZNC54d_l0Boc?e=download&uuid=502ff43d-cb84-4f30-adea-151cca9e271e [following]
--2023-12-29 07:56:24--  https://doc-0o-04-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/rthgn60grllepqb5pnmvbm27s1q5v4ad/1703836575000/00315623077463891901/*/1ntBgLbvp2E2blZrOESo_ZNC54d_l0Boc?e=download&uuid=502ff43d-cb84-4f30-adea-151cca9e271e
Resolving doc-0o-04-docs.googleusercontent.com (doc-0o-04-docs.googleusercontent.com)... 74.125.20.132, 2607:f

In [11]:
class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification head."""

  def __init__(self, n_classes):
    """Create the encoder and a head with ``n_classes`` outputs.

    The attribute names (``bert``, ``drop``, ``out``) are part of the
    checkpoint's state-dict keys and must not be renamed.
    """
    super().__init__()
    self.bert = BertModel.from_pretrained("cointegrated/rubert-tiny-sentiment-balanced")
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the head's raw outputs computed from the encoder's pooled output."""
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    return self.out(self.drop(encoded.pooler_output))


def get_predictions(model, data_loader):
  """Run ``model`` over ``data_loader`` in eval mode and collect its outputs.

  Tensors are moved to the module-level ``device`` before the forward pass.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches, each a dict with keys ``review_text``,
      ``input_ids``, ``attention_mask`` and ``targets``.

  Returns:
    A tuple ``(review_texts, predictions, prediction_probs, real_values)``:
    the list of texts, the argmax class per item, the raw model outputs per
    item (no softmax is applied here) and the targets. All tensors are on the
    CPU; for an empty loader they are empty.
  """
  model.eval()

  review_texts = []
  pred_batches = []
  output_batches = []
  target_batches = []

  with torch.no_grad():
    for d in data_loader:
      # Items carry input_ids with an extra axis of length 1 after the batch
      # axis. Squeeze only that axis: a bare squeeze() would also drop the
      # batch axis whenever a batch holds a single item.
      input_ids = d['input_ids'].squeeze(1).to(device)

      attention_mask = d['attention_mask']
      if not torch.is_tensor(attention_mask):
        # A per-item Python list is collated into a list of per-position
        # tensors; stack and transpose to get (batch, positions).
        attention_mask = torch.stack(attention_mask).transpose(0, 1)
      attention_mask = attention_mask.to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )

      review_texts.extend(d["review_text"])
      pred_batches.append(torch.argmax(outputs, dim=1).cpu())
      output_batches.append(outputs.cpu())
      target_batches.append(d["targets"].reshape(-1).cpu())

  if not pred_batches:
    # Nothing to concatenate; return empty tensors instead of raising.
    return review_texts, torch.empty(0, dtype=torch.long), torch.empty(0), torch.empty(0, dtype=torch.long)

  predictions = torch.cat(pred_batches)
  prediction_probs = torch.cat(output_batches)
  real_values = torch.cat(target_batches)
  return review_texts, predictions, prediction_probs, real_values


In [12]:
def classify_with_ner(texts, checkpoint_path='best_model_state.bin'):
  """Extract aspect mentions with the NER model and predict a sentiment for each.

  Uses the module-level ``ner_model`` and ``device``.

  Args:
    texts: DataFrame indexed by review id with a ``text`` column.
    checkpoint_path: Path to the ``SentimentClassifier`` state dict.

  Returns:
    A copy of the dataset's ``contexts`` frame whose ``sentiment`` column
    holds the predicted label ('negative', 'neutral', 'positive' or 'both').
  """
  texts_li = texts['text'].tolist()
  ids_li = texts.index.tolist()

  out_ner = pd.DataFrame(
    ner_model.get_ne(texts_li, ids_li),
    columns=['id', 'entity', 'start', 'end', 'aspect'],
  ).set_index('id')
  # .copy() so the assignment below writes to an independent frame and does
  # not raise SettingWithCopyWarning.
  out_ner = out_ner[out_ner['entity'] != ''].copy()
  out_ner['sentiment'] = 0  # placeholder; replaced by predictions below

  dev_set = ContextDataset(texts, out_ner)
  df_pred = dev_set.contexts.copy()
  if len(dev_set) == 0:
    # No mention could be tied to a sentence: nothing to classify.
    return df_pred

  model = SentimentClassifier(4)
  # map_location lets a checkpoint saved from a GPU load on a CPU-only
  # machine, matching the 'cpu' fallback used for `device`.
  model.load_state_dict(torch.load(checkpoint_path, map_location=device))
  model.to(device)
  model.eval()

  dev_data_loader = DataLoader(dev_set, batch_size=16, num_workers=4)
  _, y_pred, _, _ = get_predictions(model, dev_data_loader)

  df_pred['sentiment'] = y_pred.tolist()
  df_pred['sentiment'] = df_pred['sentiment'].map({0:'negative', 1:'neutral', 2: 'positive', 3:'both'})

  return df_pred


texts = pd.read_csv('https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_reviews.txt',
                    header=None, delimiter='\t', index_col=0, names=['text'])

out = classify_with_ner(texts)
out.to_csv('dev_pred_aspects.txt', sep = '\t', header=False, columns=['id', 'aspect', 'input', 'start', 'end', 'sentiment'], index=False)



In [13]:
out

Unnamed: 0,id,input,aspect,start,end,context,sentiment
0,13823,проводила к столу,Service,188,205,"Зашли в""аппетит"" случайно.Не смотря на то,что ...",positive
1,13823,меню,Food,213,217,"Зашли в""аппетит"" случайно.Не смотря на то,что ...",positive
2,13823,официантка,Service,242,252,"Зашли в""аппетит"" случайно.Не смотря на то,что ...",positive
3,13823,удалилась,Service,270,279,"Зашли в""аппетит"" случайно.Не смотря на то,что ...",positive
4,13823,ресторан,Whole,337,345,"Зашли в""аппетит"" случайно.Не смотря на то,что ...",positive
...,...,...,...,...,...,...,...
679,11770,Муж,Food,156,159,"Муж - испанец, в ресторан просто влюбился.",positive
680,11770,блюда,Food,267,272,"Для меня же минус был в том, что сами блюда сл...",positive
681,11770,ягод,Food,310,314,С добавлением ягод или фрукта и слишком много ...,positive
682,11770,фрукта,Food,319,325,С добавлением ягод или фрукта и слишком много ...,positive


### 5. Классификатор 2. Способ 1.

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

CATEGORIES = ['Whole', 'Interior', 'Service', 'Food', 'Price']

model_checkpoint = 'cointegrated/rubert-tiny-sentiment-balanced'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
  model.cuda()

def get_sentiment(text, return_type='label'):
    """Score the sentiment of ``text`` with the module-level ``model`` and ``tokenizer``.

    ``return_type`` selects the result:
      'label' - the ``model.config.id2label`` entry of the highest-scoring class;
      'score' - dot product of the class scores with ``[-1, 0, 1]``;
      anything else (e.g. 'proba') - the array of per-class sigmoid scores.
    """
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        logits = model(**encoded).logits
        proba = torch.sigmoid(logits).cpu().numpy()[0]

    if return_type == 'score':
        return proba.dot([-1, 0, 1])
    if return_type == 'label':
        return model.config.id2label[proba.argmax()]
    return proba

def score(proba):
  """Turn three class scores into a sentiment label, allowing 'both'.

  Args:
    proba: Array of three scores, unpacked as (negative, neutral, positive).
      NOTE(review): this order is assumed from the unpacking; confirm it
      matches ``model.config.id2label``.

  Returns:
    'both' when the negative and positive scores differ by less than 0.25,
    otherwise the ``model.config.id2label`` entry of the highest score.
  """
  neg, _neut, pos = proba
  # A second test used to follow here: |neg - neut| < 0.1 and
  # |pos - neut| < 0.1 -> 'both'. Those two bounds imply |neg - pos| < 0.2,
  # so the check below always fired first and that branch was unreachable.
  if abs(neg - pos) < 0.25:
    return 'both'
  return model.config.id2label[proba.argmax()]

def get_full_sentiment(text, idx, df_ready, sent_col='sent_pred', categories=None):
    """Yield ``(category, sentiment)`` for every category of one review.

    Args:
      text: Review text (unused; kept for call compatibility).
      idx: Review id to look up in ``df_ready['id']``.
      df_ready: Frame with columns ``id``, ``aspect`` and ``sent_col``.
      sent_col: Name of the column holding the per-category sentiment.
      categories: Categories to report; defaults to the module-level
        ``CATEGORIES``.

    Yields:
      ``(category, 'absence')`` when the review has no row for the category,
      otherwise ``(category, value)`` with the single matching value.

    Raises:
      ValueError: if several rows match one (id, category) pair.
    """
    if categories is None:
        categories = CATEGORIES

    # Filter by review once instead of re-scanning the whole frame per category.
    doc_rows = df_ready.loc[df_ready['id'] == idx]

    for c in categories:
        hit = doc_rows['aspect'] == c
        if not hit.any():
            s = 'absence'
        else:
            s = doc_rows.loc[hit, sent_col].item()
        yield c, s

In [15]:
def category_sentiment(df):
  """Predict one sentiment per (review, aspect) from the joined context sentences.

  All ``context`` sentences of a review/aspect pair are joined with spaces and
  scored with ``score(get_sentiment(..., 'proba'))``. No model is loaded here:
  ``get_sentiment`` and ``score`` read the module-level ``model`` and
  ``tokenizer``, so a second copy held in locals would never be used.

  Args:
    df: Frame with columns ``id``, ``aspect`` and ``context``.

  Returns:
    A frame with columns ``id``, ``aspect``, ``context`` (the joined text)
    and ``sent_pred`` (the predicted label).
  """
  # NOTE(review): contexts are joined as they are, so a sentence listed for
  # several mentions of the same aspect is repeated in the joined text.
  df_ready = df.groupby(['id', 'aspect']).agg({'context': lambda x: ' '.join(x)}).reset_index()
  df_ready['sent_pred'] = df_ready['context'].apply(lambda x: score(get_sentiment(x, 'proba')))

  return df_ready

df_ready = category_sentiment(out)

In [16]:
df_ready

Unnamed: 0,id,aspect,context,sent_pred
0,785,Interior,"Ресторан понравился, интерьер приятный, много ...",positive
1,785,Whole,"Ресторан понравился, интерьер приятный, много ...",positive
2,797,Food,"Один раз днем, все понравилось,второй раз вече...",neutral
3,797,Service,"Один раз днем, все понравилось,второй раз вече...",both
4,797,Whole,"А мы, видимо, были никто, по ее мнению..Но елс...",negative
...,...,...,...,...
217,37819,Service,"Очень понравилась шоу программа и ведущая, кот...",positive
218,37819,Whole,Очень долго выбирали ресторан на Новогодний ка...,positive
219,38299,Interior,"Оказалось не зря... Очень уютная обстановка, н...",positive
220,38299,Service,"Оказалось не зря... Очень уютная обстановка, н...",both


In [17]:
# Re-read the dev reviews with the id as an ordinary column so that ids and
# texts can be iterated side by side.
dev_texts = pd.read_csv(
    'https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_reviews.txt',
    delimiter='\t',
    header=None,
    names=['text_id', 'text'],
)

# One tab-separated line per (review id, category): the predicted label, or
# 'absence' when the category was not found in the review.
with open('dev_pred_cats.txt', 'w') as f:
  for text, idx in zip(dev_texts['text'], dev_texts['text_id']):
    for c, s in get_full_sentiment(text, idx, df_ready):
      f.write("\t".join(map(str, (idx, c, s))) + "\n")

### 6. Классификатор 2. Способ 2.

In [18]:
def get_full_sentiment(text, idx, df_ready, sent_col='sentiment', categories=None):
    """Yield ``(category, sentiment)`` for every category of one review.

    Args:
      text: Review text (unused; kept for call compatibility).
      idx: Review id to look up in ``df_ready['id']``.
      df_ready: Frame with columns ``id``, ``aspect`` and ``sent_col``.
      sent_col: Name of the column holding the per-category sentiment.
      categories: Categories to report; defaults to the module-level
        ``CATEGORIES``.

    Yields:
      ``(category, 'absence')`` when the review has no row for the category,
      otherwise ``(category, value)`` with the single matching value.

    Raises:
      ValueError: if several rows match one (id, category) pair.
    """
    if categories is None:
        categories = CATEGORIES

    # Filter by review once instead of re-scanning the whole frame per category.
    doc_rows = df_ready.loc[df_ready['id'] == idx]

    for c in categories:
        hit = doc_rows['aspect'] == c
        if not hit.any():
            s = 'absence'
        else:
            s = doc_rows.loc[hit, sent_col].item()
        yield c, s

def category_sentiment(df):
  """Pick the most frequent mention-level sentiment per (review, aspect).

  Args:
    df: Frame with columns ``id``, ``aspect`` and ``sentiment``.

  Returns:
    A frame with columns ``id``, ``aspect`` and ``sentiment``, where
    ``sentiment`` is the first entry of ``value_counts()`` for the group.
  """
  def most_frequent(labels):
    return labels.value_counts().index[0]

  majority = df.groupby(['id', 'aspect'])['sentiment'].agg(most_frequent)
  return majority.reset_index()

In [19]:
# Majority-vote sentiment per (review, aspect) from the mention-level predictions.
df_total = category_sentiment(out)

# One tab-separated line per (review id, category): the voted label, or
# 'absence' when the category was not found in the review.
with open('dev_pred_cats_2.txt', 'w') as f:
  for text, idx in zip(dev_texts['text'], dev_texts['text_id']):
    for c, s in get_full_sentiment(text, idx, df_total):
      f.write("\t".join(map(str, (idx, c, s))) + "\n")

### 7. Оценка качества

#### Baseline 1

In [20]:
! wget 'https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_aspects.txt'
! wget 'https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_cats.txt'

--2023-12-29 07:56:40--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_aspects.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57508 (56K) [text/plain]
Saving to: ‘dev_aspects.txt.2’


2023-12-29 07:56:41 (6.36 MB/s) - ‘dev_aspects.txt.2’ saved [57508/57508]

--2023-12-29 07:56:41--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_cats.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7400 (7.2K) [text/plain]
Saving to: ‘dev_cats.txt.1’


2023-12-29 07:5

In [21]:
gold_test_path = "dev_aspects.txt"
pred_test_path = "dev_pred_aspects.txt"

# Read the gold annotations into per-document parallel lists. Tab-separated
# fields used: [0] document id, [1] category, [3] start, [4] end, [5] sentiment.
gold_aspect_cats = {}
with open(gold_test_path) as fg:
    for line in fg:
        line = line.rstrip('\r\n').split('\t')
        gold_doc = gold_aspect_cats.setdefault(
            line[0], {"starts":[], "ends":[], "cats":[], "sents":[]}
        )
        gold_doc["starts"].append(int(line[3]))
        gold_doc["ends"].append(int(line[4]))
        gold_doc["cats"].append(line[1])
        gold_doc["sents"].append(line[5])

# Compare every predicted mention with the gold mentions of the same document.
# Counters:
#   full_match        - predicted span equals a gold span exactly
#   partial_match     - predicted span overlaps a gold span under the rules below
#   full_cat_match    - exact span match with the same category
#   partial_cat_match - exact span match with a different category, or partial
#                       span match with the same category
full_match, partial_match, full_cat_match, partial_cat_match = 0, 0, 0, 0
total = 0
# (gold [start, end, category, sentiment], predicted line fields) pairs, reused
# later for the sentiment accuracy.
fully_matched_pairs = []
partially_matched_pairs = []
with open(pred_test_path) as fp:
    for line in fp:
        total += 1
        line = line.rstrip('\r\n').split('\t')
        start, end = int(line[3]), int(line[4])
        category = line[1]
        # NOTE(review): raises KeyError when a predicted document id has no
        # gold annotations.
        doc_gold_aspect_cats = gold_aspect_cats[line[0]]
        # Exact-match attempt: only the first gold mention with this start is
        # checked (list.index returns the first occurrence).
        if start in doc_gold_aspect_cats["starts"]:
            i = doc_gold_aspect_cats["starts"].index(start)
            if doc_gold_aspect_cats["ends"][i] == end:
                full_match += 1
                if doc_gold_aspect_cats["cats"][i] == category:
                    full_cat_match += 1
                else:
                    partial_cat_match += 1
                fully_matched_pairs.append(
                    (
                        [
                            doc_gold_aspect_cats["starts"][i],
                            doc_gold_aspect_cats["ends"][i],
                            doc_gold_aspect_cats["cats"][i],
                            doc_gold_aspect_cats["sents"][i]
                        ],
                        line
                    )
                )
                continue
        # No exact match (including same start but different end): look for a
        # partial overlap with any gold mention of the document.
        for s_pos in doc_gold_aspect_cats["starts"]:
            if start <= s_pos:
                # Prediction starts at or before this gold mention.
                i = doc_gold_aspect_cats["starts"].index(s_pos)
                if doc_gold_aspect_cats["ends"][i] == end:
                    # Same end as the gold mention.
                    partial_match += 1
                    partially_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats["starts"][i],
                                doc_gold_aspect_cats["ends"][i],
                                doc_gold_aspect_cats["cats"][i],
                                doc_gold_aspect_cats["sents"][i]
                            ],
                            line
                        )
                    )
                    if doc_gold_aspect_cats["cats"][i] == category:
                        partial_cat_match += 1
                    # NOTE(review): this moves on to the next gold start rather
                    # than leaving the loop, so the same prediction can be
                    # counted as a partial match again - confirm this is intended.
                    continue
                matched = False
                # Check whether the predicted end lies between this gold start
                # and the end of this or a later-listed gold mention; the pair
                # recorded is always gold mention i.
                for e_pos in doc_gold_aspect_cats["ends"][i:]:
                    if s_pos <= end <= e_pos:
                        partial_match += 1
                        partially_matched_pairs.append(
                            (
                                [
                                    doc_gold_aspect_cats["starts"][i],
                                    doc_gold_aspect_cats["ends"][i],
                                    doc_gold_aspect_cats["cats"][i],
                                    doc_gold_aspect_cats["sents"][i]
                                ],
                                line
                            )
                        )
                        if doc_gold_aspect_cats["cats"][i] == category:
                            partial_cat_match += 1
                        matched = True
                        break
                if matched:
                    break
            if start > s_pos:
                # Prediction starts after this gold start: it is a partial match
                # when the gold end falls inside the predicted span.
                i = doc_gold_aspect_cats["starts"].index(s_pos)
                if start < doc_gold_aspect_cats["ends"][i] <= end:
                    partial_match += 1
                    partially_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats["starts"][i],
                                doc_gold_aspect_cats["ends"][i],
                                doc_gold_aspect_cats["cats"][i],
                                doc_gold_aspect_cats["sents"][i]
                            ],
                            line
                        )
                    )
                    if doc_gold_aspect_cats["cats"][i] == category:
                        partial_cat_match += 1
                    break

# Total number of gold mentions across all documents.
gold_size = sum(len(doc["cats"]) for doc in gold_aspect_cats.values())

# Every ratio is relative to the number of predictions, except recall, which
# is relative to the number of gold mentions.
print(f"""
Full match precision: {full_match / total}
Full match recall: {full_match / gold_size}
Partial match ratio in pred: {(full_match + partial_match)  / total}
Full category accuracy: {full_cat_match / total}
Partial category accuracy: {(full_cat_match + partial_cat_match) / total}
""")


Full match precision: 0.7339181286549707
Full match recall: 0.4218487394957983
Partial match ratio in pred: 0.881578947368421
Full category accuracy: 0.7119883040935673
Partial category accuracy: 0.8771929824561403



Ср. базовое решение:

```
Full match precision: 0.48
Full match recall: 0.7159663865546219
Partial match ratio in pred: 0.6197183098591549
Full category accuracy: 0.46422535211267607
Partial category accuracy: 0.6033802816901408
```




#### Baseline 2

In [22]:
def sentiment_accuracy(matches):
    """Print the share of matched mention pairs whose sentiments agree.

    Args:
      matches: Sequence of ``(gold, pred)`` pairs; the sentiment is the last
        element of each of the two sequences.

    Prints the accuracy, or a note that it is undefined when ``matches`` is
    empty (instead of dividing by zero). Returns nothing.
    """
    if not matches:
        print("Mention sentiment accuracy: undefined (no matched pairs)")
        return

    matched_sentiment = 0.
    for gold, pred in matches:
        if gold[-1] == pred[-1]:
            matched_sentiment += 1
    print(f"Mention sentiment accuracy: {matched_sentiment / len(matches)}")

sentiment_accuracy(fully_matched_pairs)
sentiment_accuracy(partially_matched_pairs)

Mention sentiment accuracy: 0.6972111553784861
Mention sentiment accuracy: 0.7425742574257426


Ср. базовое решение:



```
Mention sentiment accuracy: 0.6772300469483568
Mention sentiment accuracy: 0.5720670391061452
```



#### Baseline 3


**Способ 1**

In [23]:
gold_test_cats_path = "dev_cats.txt"
pred_test_cats_path = "dev_pred_cats.txt"

# Accuracy = share of distinct gold lines that also occur verbatim among the
# predicted lines. Lines are compared raw, line terminator included.
with open(gold_test_cats_path) as gc, open(pred_test_cats_path) as pc:
    gold_labels = set(gc)
    pred_labels = set(pc)

print("Overall sentiment accuracy:", len(gold_labels & pred_labels) / len(gold_labels))

Overall sentiment accuracy: 0.532394366197183


**Способ 2**

In [24]:
gold_test_cats_path = "dev_cats.txt"
pred_test_cats_path = "dev_pred_cats_2.txt"

# Accuracy = share of distinct gold lines that also occur verbatim among the
# predicted lines. Lines are compared raw, line terminator included.
with open(gold_test_cats_path) as gc, open(pred_test_cats_path) as pc:
    gold_labels = set(gc)
    pred_labels = set(pc)

print("Overall sentiment accuracy:", len(gold_labels & pred_labels) / len(gold_labels))

Overall sentiment accuracy: 0.5971830985915493


Ср. базовое решение:


```
Overall sentiment accuracy: 0.523943661971831
```

