# Начало работы

**Лингвистическая задача, на которой проверяются модели: выбор правильной словоформы известной лексемы.**

Проверяются английские BERT и RoBERTa, а также многоязычная mGPT (на русскоязычных примерах).

In [None]:
# Pin the transformers library to release 4.28.1 for this notebook (IPython shell command).
!pip install transformers==4.28.1

Collecting transformers==4.28.1
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.1)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.2
    Uninstalling transformers-4.38.2:
      Successfully uninstalled transformers-4.38.2
Successfully installed tokenizers-0.13.3 transformers-4.28.1


# Языковая модель с пропусками BERT

In [None]:
from transformers import AutoTokenizer, BertForMaskedLM

# Load the cased English BERT checkpoint: the tokenizer first, then the
# masked-language-model head, which is moved to the GPU.
BERT_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertForMaskedLM.from_pretrained(BERT_CHECKPOINT)
model = model.to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import torch
import numpy as np

def _word_ids_in_context(sentence, word, tokenizer, masked_ids, mask_index):
    """Return the token ids `word` receives when it fills the gap in `sentence`.

    Some tokenizers (e.g. byte-level BPE) encode a word differently depending
    on whether it is preceded by a space, so tokenizing `word` in isolation can
    select the wrong vocabulary entries. The sentence is therefore tokenized
    with the word inserted, and the ids between the shared left and right
    context are taken. If the context tokens are altered by the insertion, the
    isolated tokenization is used as a fallback.
    """
    filled_ids = list(tokenizer(sentence.replace("_", word))["input_ids"])
    right_context = masked_ids[mask_index + 1:]
    word_end = len(filled_ids) - len(right_context)
    context_unchanged = (
        word_end > mask_index
        and filled_ids[:mask_index] == masked_ids[:mask_index]
        and filled_ids[word_end:] == right_context
    )
    if context_unchanged:
        return filled_ids[mask_index:word_end]
    return list(tokenizer(word, add_special_tokens=False)["input_ids"])


def find_word_probability(sentence, word, tokenizer, model):
    """Score `word` as the filler of the single gap in `sentence` with a masked LM.

    The word is predicted subtoken by subtoken, left to right: row ``k`` of the
    batch has the first ``k`` subtokens of the word revealed and the remaining
    ones masked, and the probability of subtoken ``k`` is read from that row.

    Args:
        sentence: text containing exactly one "_" character marking the gap.
        word: candidate word to put into the gap.
        tokenizer: callable returning a mapping with "input_ids"; must expose
            ``mask_token`` and ``mask_token_id``.
        model: callable mapping a batch of token ids to a mapping with
            "logits". The batch is placed on ``model.device`` when the model
            exposes that attribute, otherwise on the CPU.

    Returns:
        A dict with "total_log_prob" (sum of the subtoken log-probabilities)
        and "subtoken_probs" (NumPy array of per-subtoken probabilities).

    Raises:
        ValueError: if the sentence does not contain exactly one gap, or if the
            word produces no tokens.
    """
    if sentence.count("_") != 1:
        raise ValueError("Предложение должно содержать ровно один пропуск.")
    masked_sentence = sentence.replace("_", tokenizer.mask_token)
    masked_ids = list(tokenizer(masked_sentence)["input_ids"])
    index = masked_ids.index(tokenizer.mask_token_id)

    word_ids = _word_ids_in_context(sentence, word, tokenizer, masked_ids, index)
    word_length = len(word_ids)
    if word_length == 0:
        raise ValueError("Слово должно содержать хотя бы один токен.")

    # Widen the single mask to one mask per subtoken of the word.
    masked_ids[index:index + 1] = [tokenizer.mask_token_id] * word_length
    batch = np.array([masked_ids] * word_length, dtype=int)
    # Row k reveals the first k subtokens and keeps the rest masked, e.g. for
    # a three-piece word "a + ero + dynamic":
    #   [active MASK(a) MASK      MASK          control]
    #   [active a       MASK(ero) MASK          control]
    #   [active a       ero       MASK(dynamic) control]
    for prefix_length in range(1, word_length):
        batch[prefix_length, index:index + prefix_length] = word_ids[:prefix_length]

    # Follow the model's device instead of assuming a GPU is present.
    device = getattr(model, "device", "cpu")
    batch = torch.as_tensor(batch, dtype=torch.long, device=device)
    with torch.no_grad():
        logits = model(batch)["logits"]

    # From row k take the distribution at the position of subtoken k.
    rows = torch.arange(word_length, device=logits.device)
    log_probs = torch.log_softmax(logits[rows, index + rows], dim=-1).cpu().numpy()
    subtoken_log_probs = log_probs[np.arange(word_length), word_ids]
    total_prob = subtoken_log_probs.sum()
    return {"total_log_prob": total_prob, "subtoken_probs": np.exp(subtoken_log_probs)}

Проверим, насколько хорошо модель справляется с согласованием времен в предложениях.

In [None]:
# Print NumPy arrays with three decimals so probabilities are easy to compare.
np.set_printoptions(precision=3)

# Sequence-of-tenses test sentences; "_" marks the slot to be filled.
texts = [
    "Mary said she _ go to the party next day.",
    "Abraham is not sure whether he _ reside or not.",
    "I _ had breakfast before I went to school.",
    "Carl commuted more often last week than he _ this week.",
    "I _ whether Sarah would come or not.",
    "She finally _ what her friend was doing.",
    "I found out that the lesson _ lasted for two hours.",
    "I was sure that Maria _ been hiding the truth from all of us for a very long time.",
    "Nobody knew what the teacher _.",
    "I _ sure that he had been misleading me all that time.",
    "You _ me you would help me with my French homework.",
    "A short red-haired man whom I _ never seen before entered the room.",
    "Last week I met a woman who _ believed to be a famous writer of books for children.",
    "Richard didn’t go on a trip to the lake with us yesterday because he _ have a History exam in three days.",
    "The president claimed that people _ to see changes.",
]

# One pair per sentence: the first word is the more grammatical filler of the
# gap, the second word is the less grammatical one.
words = [
    ["would", "will"],
    ["will", "would"],
    ["had", "have"],
    ["does", "did"],
    ["knew", "know"],
    ["realized", "realizes"],
    ["had", "has"],
    ["had", "has"],
    ["meant", "means"],
    ["was", "am"],
    ["promised", "promise"],
    ["had", "have"],
    ["is", "was"],
    ["will", "would"],
    ["want", "wanted"],
]

for number, (sentence, candidates) in enumerate(zip(texts, words), start=1):
    # Show the sentence, then each candidate with its subtoken probabilities.
    print(number, sentence)
    for candidate in candidates:
        scores = find_word_probability(sentence, candidate, tokenizer, model)
        print(candidate, scores["subtoken_probs"])
    print("")


1 Mary said she _ go to the party next day.
would [0.941]
will [0.01]

2 Abraham is not sure whether he _ reside or not.
will [0.674]
would [0.026]

3 I _ had breakfast before I went to school.
had [0.064]
have [0.002]

4 Carl commuted more often last week than he _ this week.
does [0.003]
did [0.683]

5 I _ whether Sarah would come or not.
knew [0.019]
know [0.001]

6 She finally _ what her friend was doing.
realized [0.276]
realizes [0.001]

7 I found out that the lesson _ lasted for two hours.
had [0.357]
has [0.016]

8 I was sure that Maria _ been hiding the truth from all of us for a very long time.
had [0.984]
has [0.014]

9 Nobody knew what the teacher _.
meant [0.24]
means [0.003]

10 I _ sure that he had been misleading me all that time.
was [0.919]
am [0.024]

11 You _ me you would help me with my French homework.
promised [0.397]
promise [0.001]

12 A short red-haired man whom I _ never seen before entered the room.
had [0.975]
have [0.025]

13 Last week I met a woman who _ 

Как видно, в более канонических случаях модель справляется хорошо (вероятность более грамматичного варианта намного выше, чем вероятность менее грамматичного). Однако в менее очевидных случаях, где нужно учитывать более широкий контекст (предложение 4 — **this week**) или характер придаточного (предложения 14 и 15), модель справляется хуже. Видно, что для модели был также немного затруднителен не самый очевидный пример 13: у менее грамматичного варианта не очень низкая вероятность, особенно по сравнению с разницей вероятностей в других примерах.

Источник о суждениях о грамматичности: https://englex.ru/sequence-of-tenses-in-english/

# RoBERTa

In [None]:
from transformers import AutoTokenizer, RobertaForMaskedLM

# Load the RoBERTa base checkpoint: the tokenizer first, then the
# masked-language-model head, which is moved to the GPU.
ROBERTA_CHECKPOINT = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model = RobertaForMaskedLM.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model = roberta_model.to("cuda")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]


Посмотрим, как справится эта модель с согласованием времен.

In [None]:
# Three decimals are enough when printing probability arrays.
np.set_printoptions(precision=3)

# Sequence-of-tenses test sentences; the "_" character is the gap to fill.
texts = [
    "Mary said she _ go to the party next day.",
    "Abraham is not sure whether he _ reside or not.",
    "I _ had breakfast before I went to school.",
    "Carl commuted more often last week than he _ this week.",
    "I _ whether Sarah would come or not.",
    "She finally _ what her friend was doing.",
    "I found out that the lesson _ lasted for two hours.",
    "I was sure that Maria _ been hiding the truth from all of us for a very long time.",
    "Nobody knew what the teacher _.",
    "I _ sure that he had been misleading me all that time.",
    "You _ me you would help me with my French homework.",
    "A short red-haired man whom I _ never seen before entered the room.",
    "Last week I met a woman who _ believed to be a famous writer of books for children.",
    "Richard didn’t go on a trip to the lake with us yesterday because he _ have a History exam in three days.",
    "The president claimed that people _ to see changes.",
]

# Candidate fillers per sentence, ordered as
# (more grammatical option, less grammatical option).
words = [
    ["would", "will"],
    ["will", "would"],
    ["had", "have"],
    ["does", "did"],
    ["knew", "know"],
    ["realized", "realizes"],
    ["had", "has"],
    ["had", "has"],
    ["meant", "means"],
    ["was", "am"],
    ["promised", "promise"],
    ["had", "have"],
    ["is", "was"],
    ["will", "would"],
    ["want", "wanted"],
]

for position, sentence in enumerate(texts):
    better, worse = words[position]
    # Print the sentence and the subtoken probabilities of both options.
    print(position + 1, sentence)
    for filler in (better, worse):
        result = find_word_probability(sentence, filler, roberta_tokenizer, roberta_model)
        print(filler, result["subtoken_probs"])
    print("")

1 Mary said she _ go to the party next day.
would [3.787e-06]
will [1.417e-06]

2 Abraham is not sure whether he _ reside or not.
will [8.18e-06]
would [2.277e-07]

3 I _ had breakfast before I went to school.
had [2.389e-07]
have [1.82e-08]

4 Carl commuted more often last week than he _ this week.
does [5.59e-07]
did [1.825e-06]

5 I _ whether Sarah would come or not.
knew [1.422e-06 2.312e-03]
know [5.267e-08]

6 She finally _ what her friend was doing.
realized [1.303e-08 1.485e-03]
realizes [1.303e-08 3.348e-05]

7 I found out that the lesson _ lasted for two hours.
had [8.608e-07]
has [5.309e-08]

8 I was sure that Maria _ been hiding the truth from all of us for a very long time.
had [1.074e-05]
has [2.916e-06]

9 Nobody knew what the teacher _.
meant [3.324e-07 2.287e-04]
means [3.324e-07 9.152e-05]

10 I _ sure that he had been misleading me all that time.
was [1.606e-05]
am [6.55e-06]

11 You _ me you would help me with my French homework.
promised [3.200e-08 2.025e-05]
promi

Для примера 4 ситуация такая же, как для обычного BERT. Однако изменилась ситуация для примера 14: вероятность стала выше у более грамматичного варианта. Для предложений 5 и 15 сравнить вероятности затруднительно (например, в предложении 5 варианты разбиты на разное число подтокенов).

# Левосторонняя модель GPT

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the mGPT checkpoint and move the causal LM to the GPU.
# NOTE: this rebinds the notebook-level names `tokenizer` and `model`.
MGPT_CHECKPOINT = "sberbank-ai/mGPT"
tokenizer = GPT2Tokenizer.from_pretrained(MGPT_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(MGPT_CHECKPOINT)
model = model.to("cuda")

vocab.json:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.45G [00:00<?, ?B/s]

In [None]:
# Short left context ("Предложение." means "Sentence.") whose token ids are
# placed in front of every scored sentence.
prefix = "Предложение."
# Token ids of the prefix as produced by the current tokenizer.
prefix_input_ids = tokenizer(prefix)["input_ids"]
# Number of prefix tokens; used as the offset of the first sentence token.
prefix_length = len(prefix_input_ids)

In [None]:
# Pad on the right so that every padded sequence starts with its real tokens.
tokenizer.padding_side = 'right'

Будем сравнивать вероятности предложений с глаголами, у которых нет формы 1 лица единственного числа настоящего/будущего времени.

In [None]:
# Sentence pairs for the defective-verb experiment: sentences 1, 3, 5, ...
# are each followed by the sentence they are compared with (2, 4, 6, ...).
texts = [
    "Я бдю.",
    "Я буду бдеть.",
    "Я победю в каждой гонке.",
    "Я буду побеждать в каждой гонке.",
    "Я галдю, потому что выиграл дом!",
    "Я буду галдеть, если выиграю дом",
    "Я чудю, потому что наступила весна!",
    "Я буду чудить, когда наступит весна",
    "Я убедю его прийти к нам",
    "Я буду убеждать его прийти к нам",
    "Я басю после того, как у меня сломался голос.",
    "Я начал басить после того, как у меня сломался голос.",
    "Я ощуту свободу, как только наступит лето.",
    "Я буду ощущать свободу, как только наступит лето.",
    "Через 10 минут я очутюсь в настоящей сказке.",
    "Через 10 минут я окажусь в настоящей сказке.",
    "Я часто дерзю своей маме.",
    "Мне приходится часто дерзить своей маме.",
    "Я голошу над несчастным ребенком.",
    "Я начала голосить над несчастным ребенком.",
]

# Use the end-of-sequence token for padding so the sentences can be batched.
tokenizer.pad_token = tokenizer.eos_token
batch = tokenizer(texts, padding=True, return_tensors="pt").to("cuda")

# One copy of the prefix ids per sentence, glued in front of the sentence ids
# along the sequence dimension. Only "input_ids" is extended; it is the only
# tensor passed to the model below.
batch_prefix_input_ids = torch.LongTensor([prefix_input_ids] * len(texts)).to("cuda")
batch["input_ids"] = torch.cat((batch_prefix_input_ids, batch["input_ids"]), dim=1)

with torch.no_grad():
    logits = model(batch["input_ids"])["logits"]
# Softmax over the vocabulary axis, moved to the CPU as a NumPy array.
probs = logits.softmax(dim=-1).cpu().numpy()
#print(probs.shape)

In [None]:
# Print, for every sentence, the negative log-probability of its tokens summed
# up to (but not including) the first end-of-sequence / padding token.
eos_token_id = tokenizer.eos_token_id
for i, text in enumerate(texts):
    print(i+1, text, end = " ")
    # Skip the shared prefix; only the sentence's own tokens are scored.
    text_token_ids = batch["input_ids"][i, prefix_length:].tolist()
    sum_log_prob = 0.0
    for j, token_id in enumerate(text_token_ids):
        # Padding reuses the EOS token, so the first EOS id ends the sentence.
        if token_id == eos_token_id:
            break
        # The distribution at position p predicts the token at position p + 1,
        # hence the "- 1" when looking up the probability of sentence token j.
        sum_log_prob += -np.log(probs[i, j + prefix_length - 1, token_id])
    print(f"Minus log prob: {sum_log_prob:.2f}")

1 Я бдю. Minus log prob: 27.67
2 Я буду бдеть. Minus log prob: 29.40
3 Я победю в каждой гонке. Minus log prob: 39.14
4 Я буду побеждать в каждой гонке. Minus log prob: 35.36
5 Я галдю, потому что выиграл дом! Minus log prob: 57.51
6 Я буду галдеть, если выиграю дом Minus log prob: 53.43
7 Я чудю, потому что наступила весна! Minus log prob: 55.77
8 Я буду чудить, когда наступит весна Minus log prob: 45.54
9 Я убедю его прийти к нам Minus log prob: 44.50
10 Я буду убеждать его прийти к нам Minus log prob: 39.97
11 Я басю после того, как у меня сломался голос. Minus log prob: 62.83
12 Я начал басить после того, как у меня сломался голос. Minus log prob: 56.78
13 Я ощуту свободу, как только наступит лето. Minus log prob: 58.78
14 Я буду ощущать свободу, как только наступит лето. Minus log prob: 53.66
15 Через 10 минут я очутюсь в настоящей сказке. Minus log prob: 48.42
16 Через 10 минут я окажусь в настоящей сказке. Minus log prob: 42.14
17 Я часто дерзю своей маме. Minus log prob: 47.56


Как видно, модель хорошо распознает глаголы, у которых нет формы первого лица единственного числа настоящего/будущего времени, потому что по сравнению с грамматичными вариантами, у предложений с такими глаголами в основном больше minus log prob. Это не так для пар 1-2, 17-18, 19-20. Возможно, оба предложения из пар одинаково "странные" для модели.