In [7]:
!pip install bertviz allennlp-models -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [2]:
import os
import time

import allennlp_models.tagging
import numpy as np
import pandas as pd
import torch
import requests, json
from tqdm import tqdm

from allennlp.predictors.predictor import Predictor
from bertviz import head_view, model_view
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [187]:
class Model_srl:
    """Project English semantic-role labels onto a source-language sentence.

    `predict` translates the input sentence to English through the Yandex
    Translate REST API, runs the AllenNLP SRL predictor on the translation,
    feeds "<english> </s> <source>" through the sequence-classification model
    with `output_attentions=True`, and uses one attention head (`att_layer`,
    `att_head`) to decide which source-language words each English word
    attends to. The SRL roles of the English words are then rewritten in terms
    of those source-language words.
    """

    # Characters removed by `drop_punc` (the apostrophe is deliberately not listed,
    # exactly as in the original character set).
    _PUNCTUATION = '!()-[]{};:"\\,<>./?@#$%^&*_~'
    _TRANSLATE_URL = 'https://translate.api.cloud.yandex.net/translate/v2/translate'
    _SRL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"

    def __init__(self, path_to_model="../input/xlmrobertamultilang/xlm-roberta",
                 iam_token=None, folder_id=None):
        """Load the models and read the translation credentials.

        Args:
            path_to_model: checkpoint passed to
                `AutoModelForSequenceClassification.from_pretrained` (num_labels=2).
            iam_token: bearer token for the translation API. When omitted it is
                read from the `YANDEX_IAM_TOKEN` environment variable.
            folder_id: cloud folder id sent with each translation request. When
                omitted it is read from `YANDEX_FOLDER_ID`, falling back to the
                id this notebook has always used.
        """
        self.model = AutoModelForSequenceClassification.from_pretrained(path_to_model, num_labels=2)
        self.tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
        self.sep_token = " </s> "
        self.srl_model = Predictor.from_path(self._SRL_ARCHIVE)

        # Minimum word-to-word attention weight for two words to count as aligned.
        self.att_threshold = 0.1
        self.att_n_tok_per_word = 1
        # Layer / head whose attention map is used for the alignment.
        self.att_layer = 6
        self.att_head = 2

        # Credentials must never live in the notebook: a token that was once
        # committed here has to be treated as leaked and rotated.
        if iam_token is None:
            iam_token = os.environ.get("YANDEX_IAM_TOKEN", "")
        if folder_id is None:
            folder_id = os.environ.get("YANDEX_FOLDER_ID", 'b1grvnc3e8sgtm7qcsja')
        self.IAM_TOKEN = iam_token
        self.folder_id = folder_id

    def translate(self, texts, source_language='tt', target_language='en'):
        """Translate `texts` with the Yandex Translate v2 REST endpoint.

        Args:
            texts: list of strings to translate; a single string is accepted
                and wrapped into a one-element list.
            source_language: language code of `texts`.
            target_language: language code to translate into.

        Returns:
            List with the translated text of every returned translation.

        Raises:
            RuntimeError: no IAM token is configured.
            requests.HTTPError: the request still fails after one retry.
        """
        if isinstance(texts, str):
            # The API documents `texts` as an array of strings.
            texts = [texts]
        if not self.IAM_TOKEN:
            raise RuntimeError(
                "No IAM token configured: pass iam_token=... or set YANDEX_IAM_TOKEN"
            )

        body = {
            "targetLanguageCode": target_language,
            "texts": texts,
            "folderId": self.folder_id,
            "sourceLanguageCode": source_language,
        }
        headers = {
            "Content-Type": "application/json",
            "Authorization": "Bearer {0}".format(self.IAM_TOKEN),
        }

        response = requests.post(self._TRANSLATE_URL, json=body, headers=headers, timeout=30)
        if response.status_code != 200:
            # One short back-off and a single retry, as before.
            print('Ожидаю 0.5 секунды...')
            time.sleep(0.5)
            response = requests.post(self._TRANSLATE_URL, json=body, headers=headers, timeout=30)
        # Fail loudly instead of hitting a KeyError on an error payload.
        response.raise_for_status()

        payload = json.loads(response.text)
        return [item['text'] for item in payload['translations']]

    def get_vocab(self, sentence):
        """Tokenize `sentence` and cache the encoding and token lookups on `self`.

        Sets `self.tokenized` (tokenizer output moved to the model's device),
        `self.tokens` (token strings of the first sequence), `self.tok_to_idx`
        and `self.idx_to_tok`.
        """
        self.tokenized = self.tokenizer([sentence], return_tensors='pt').to(self.model.device)
        self.tokens = self.tokenizer.convert_ids_to_tokens(self.tokenized['input_ids'][0].tolist())
        # Repeated token strings keep only their last position in this lookup.
        self.tok_to_idx = {token: idx for idx, token in enumerate(self.tokens)}
        self.idx_to_tok = dict(enumerate(self.tokens))

    def get_word_to_tok(self, desired_output, sentence):
        """Store the word -> token-positions table in `self.word2tok`.

        Args:
            desired_output: one list of token positions per whitespace-separated
                word of `sentence`, in order.
            sentence: the text that `desired_output` was computed from.

        The table has one row per word *occurrence*. Building it through a dict
        keyed by the word text would merge repeated words into one row and
        shift every later row relative to its tokens.
        """
        self.word2tok = pd.DataFrame({'word': sentence.split(), 'tokens': desired_output})
        self.get_vocab(sentence)

    def get_desired_output(self, sentence):
        """Assign consecutive token positions to every whitespace-separated word.

        Each word is encoded on its own (without special tokens) and receives as
        many consecutive positions as it has sub-tokens. Numbering starts at 1 —
        presumably to skip the special token the tokenizer puts first when the
        full sentence is encoded (TODO confirm).

        Side effect: refreshes `self.word2tok` and the cached tokenization via
        `get_word_to_tok`.

        Returns:
            List of position lists, one per word.
        """
        desired_output = []
        next_position = 1
        for word in sentence.split():
            n_pieces = len(self.tokenizer.encode(word, add_special_tokens=False))
            desired_output.append(list(range(next_position, next_position + n_pieces)))
            next_position += n_pieces

        self.get_word_to_tok(desired_output, sentence)
        return desired_output

    def get_words_attention(self, attention):
        """Collapse a token-by-token attention map into a word-by-word one.

        Args:
            attention: 2-D torch tensor indexed [query token, key token].

        Returns:
            `numpy` array of shape (n_words, n_words); entry [i, j] is the sum
            of the attention from all tokens of word i to all tokens of word j.
            Words with no token position inside the tokenized sequence keep
            an all-zero row/column.
        """
        token_attention = attention.detach().cpu().numpy()
        word_tokens = list(self.word2tok.tokens.values)
        n_words = len(word_tokens)
        n_tokens = len(self.tokens)

        def in_range(positions):
            return any(0 <= pos < n_tokens for pos in positions)

        # Step 1: merge the query-side tokens of each word (rows).
        word_to_token = np.zeros((n_words, n_tokens))
        for i, positions in enumerate(word_tokens):
            if in_range(positions):
                word_to_token[i] = token_attention[positions].sum(axis=0)

        # Step 2: merge the key-side tokens of each word (columns).
        word_to_word = np.zeros((n_words, n_words))
        for j, positions in enumerate(word_tokens):
            if in_range(positions):
                word_to_word[:, j] = word_to_token[:, positions].sum(axis=1)
        return word_to_word

    def get_mapping_dict(self, p):
        """Build the word -> attended-words dictionary.

        Only the words before the first "</s>" row are used as keys. For each of
        them, every other word (except "</s>") whose attention weight in `p` is at
        least `self.att_threshold` is recorded, with punctuation stripped.

        Args:
            p: word-by-word attention matrix as returned by `get_words_attention`.

        Returns:
            Dict mapping a word to the list of words it attends to. Words without
            any attended word are absent.
        """
        words = self.word2tok['word'].tolist()
        mapping_dict = {}
        for word_idx, word in enumerate(words):
            if word == "</s>":
                break
            attended = [
                other
                for other, weight in zip(words, p[word_idx])
                if weight >= self.att_threshold and other not in (word, "</s>")
            ]
            if not attended:
                continue
            # A word occurring several times accumulates the targets of all occurrences.
            targets = mapping_dict.setdefault(word, [])
            for other in attended:
                cleaned = self.drop_punc(other)
                if cleaned not in targets:
                    targets.append(cleaned)
        return mapping_dict

    def drop_punc(self, word):
        """Return `word` with every character of `_PUNCTUATION` removed."""
        return word.translate(str.maketrans('', '', self._PUNCTUATION))

    def make_dict(self, description):
        """Parse an SRL description such as "[ARG0: We] [V: went]" into a dict.

        Args:
            description: string with "[LABEL: text]" groups.

        Returns:
            Dict mapping LABEL to text. Groups without a "LABEL: " prefix are
            ignored, and only the first ": " separates label from text, so the
            text itself may contain ": ".
        """
        res = {}
        begin = None
        for i, char in enumerate(description):
            if char == '[':
                begin = i + 1
            elif char == ']' and begin is not None:
                label, separator, text = description[begin:i].partition(': ')
                if separator:
                    res[label] = text
                begin = None
        return res

    def mapping(self, mapping_dict, eng_words):
        """Translate a space-separated phrase word by word through `mapping_dict`.

        Args:
            mapping_dict: word -> list of mapped words (see `get_mapping_dict`).
            eng_words: phrase whose whitespace-separated words are looked up.

        Returns:
            The mapped words joined by single spaces, each listed once, in order
            of first appearance. Words missing from `mapping_dict` contribute
            nothing; the result is '' when nothing maps.
        """
        picked = []
        for word in eng_words.split():
            for mapped in mapping_dict.get(word, ()):
                # Compare whole words: a substring test on the joined string would
                # drop a word that happens to be contained in an earlier one.
                if mapped and mapped not in picked:
                    picked.append(mapped)
        return ' '.join(picked)

    def result(self, mapping_dict, roles, tat_sentence):
        """Rewrite the SRL output in terms of the words of `tat_sentence`.

        Args:
            mapping_dict: word -> mapped words dictionary (see `get_mapping_dict`).
            roles: SRL prediction; `roles['verbs']` is a list of dicts with the
                keys 'verb' and 'description'.
            tat_sentence: the original (untranslated) sentence.

        Returns:
            One dict per kept verb with the keys 'verb' (mapped predicate),
            'description' (role -> mapped words) and 'tags' (one role label, or
            'O', per whitespace-separated word of `tat_sentence`). Verbs whose
            only role is 'V' are skipped.
        """
        sentence_words = tat_sentence.split()
        new_roles = []
        for verb in roles['verbs']:
            # Map every role's phrase to source-language words.
            srl_verb = {
                role_name: self.mapping(mapping_dict, phrase)
                for role_name, phrase in self.make_dict(verb['description']).items()
            }

            # Nothing but the predicate itself: no usable markup for this verb.
            if len(srl_verb) == 1 and 'V' in srl_verb:
                continue

            # Whole-word lookup per role; a substring test would also match
            # partial words and the empty string left by punctuation-only tokens.
            role_words = {role_name: mapped.split() for role_name, mapped in srl_verb.items()}

            new_tags = []
            for word in sentence_words:
                token = self.drop_punc(word)
                tag = 'O'
                for role_name, candidates in role_words.items():
                    if token in candidates:
                        tag = role_name
                        break
                new_tags.append(tag)

            new_roles.append({
                'verb': self.mapping(mapping_dict, verb['verb']),
                'description': srl_verb,
                'tags': new_tags,
            })
        return new_roles

    def predict(self, tat_sentence, source_language='tt'):
        """Label the words of `tat_sentence` with semantic roles.

        Args:
            tat_sentence: sentence in the source language.
            source_language: language code handed to the translation API.

        Returns:
            Dict with 'verbs' (see `result`) and 'words' (the whitespace-separated
            words of `tat_sentence`).
        """
        eng_sentence = self.translate([tat_sentence], source_language)[0]
        roles = self.srl_model.predict(sentence=eng_sentence)

        # English first, then the separator, then the source sentence:
        # `get_mapping_dict` relies on the English words preceding "</s>".
        sentence = eng_sentence + self.sep_token + tat_sentence
        self.get_desired_output(sentence)

        # Inference only, so no autograd graph is needed.
        with torch.no_grad():
            outputs = self.model(
                input_ids=self.tokenized.input_ids,
                attention_mask=self.tokenized.attention_mask,
                output_attentions=True,
            )
        attention = outputs['attentions'][self.att_layer][0][self.att_head]

        word_attention = self.get_words_attention(attention)
        mapping_dict = self.get_mapping_dict(word_attention)
        return {
            'verbs': self.result(mapping_dict, roles, tat_sentence),
            'words': tat_sentence.split(),
        }
    


In [188]:
# Build the pipeline with its default checkpoint path; the constructor loads the
# sequence-classification model, the tokenizer and the SRL predictor.
model = Model_srl()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
# Demo inputs: one sentence per language, passed to `model.predict` in the next cell
# with the language codes 'tt' and 'kk'.
TAT_SENTENCE = 'Бинага керү өчен кирәк булган ачкычлар машинада бикләнгән иде.'
KZ_SENTENCE = 'Ғимаратқа кіру үшін қажетті кілттер көлікте құлыптаулы болған.'

In [32]:
%%time
# Run the pipeline on both demo sentences and print the raw result dictionaries.
for banner, demo_sentence, language_code in (
    ('\n TAT \n', TAT_SENTENCE, 'tt'),
    ('\n KZ \n', KZ_SENTENCE, 'kk'),
):
    print(banner, model.predict(demo_sentence, language_code))


 TAT 
 {'verbs': [{'verb': 'кирәк', 'description': {'ARG1': 'ачкычлар', 'V': 'кирәк', 'ARGM-PRP': 'керү өчен машинада'}, 'tags': ['O', 'ARGM-PRP', 'ARGM-PRP', 'V', 'O', 'ARG1', 'ARGM-PRP', 'O', 'O']}, {'verb': 'керү', 'description': {'V': 'керү', 'ARG1': 'машинада'}, 'tags': ['O', 'V', 'O', 'O', 'O', 'O', 'ARG1', 'O', 'O']}, {'verb': 'ачкычлар', 'description': {'ARG1': 'ачкычлар кирәк керү өчен машинада', 'V': 'ачкычлар', 'ARGM-LOC': 'Бинага машинада'}, 'tags': ['ARGM-LOC', 'ARG1', 'ARG1', 'ARG1', 'O', 'ARG1', 'ARG1', 'O', 'O']}], 'words': ['Бинага', 'керү', 'өчен', 'кирәк', 'булган', 'ачкычлар', 'машинада', 'бикләнгән', 'иде.']}

 KZ 
 {'verbs': [{'verb': 'қажетті', 'description': {'ARG1': 'қажетті кілттер', 'V': 'қажетті', 'ARGM-PRP': 'үшін кіру көлікте'}, 'tags': ['O', 'ARGM-PRP', 'ARGM-PRP', 'ARG1', 'ARG1', 'ARGM-PRP', 'O', 'O']}, {'verb': 'кіру', 'description': {'V': 'кіру', 'ARG1': 'көлікте'}, 'tags': ['O', 'V', 'O', 'O', 'O', 'ARG1', 'O', 'O']}, {'verb': 'құлыптаулы', 'descript

In [33]:
# Second pair of demo inputs (rebinds the names used by the previous example).
TAT_SENTENCE = 'Бүген көн буранлы, әмма салкын түгел'
KZ_SENTENCE = 'Бүгін боран, бірақ суық емес'

In [34]:
%%time
# Run the pipeline on both demo sentences and print the raw result dictionaries.
for banner, demo_sentence, language_code in (
    ('\n TAT \n', TAT_SENTENCE, 'tt'),
    ('\n KZ \n', KZ_SENTENCE, 'kk'),
):
    print(banner, model.predict(demo_sentence, language_code))


 TAT 
 {'verbs': [{'verb': '', 'description': {'ARG1': '', 'V': '', 'ARG2': 'көн', 'ARGM-TMP': ''}, 'tags': ['O', 'ARG2', 'O', 'O', 'O', 'O']}, {'verb': '', 'description': {'ARG1': '', 'V': '', 'ARGM-NEG': 'түгел', 'ARG2': ''}, 'tags': ['O', 'O', 'O', 'O', 'O', 'ARGM-NEG']}], 'words': ['Бүген', 'көн', 'буранлы,', 'әмма', 'салкын', 'түгел']}

 KZ 
 {'verbs': [{'verb': '', 'description': {'ARG1': 'Бүгін', 'V': '', 'ARG2': 'бірақ емес суық'}, 'tags': ['ARG1', 'O', 'ARG2', 'ARG2', 'ARG2']}], 'words': ['Бүгін', 'боран,', 'бірақ', 'суық', 'емес']}
CPU times: user 948 ms, sys: 6.81 ms, total: 955 ms
Wall time: 3.17 s


In [35]:
# Third pair of demo inputs (rebinds the names used by the previous example).
TAT_SENTENCE = 'Без дачага киттек'
KZ_SENTENCE = 'Біз коттеджге бардық'

In [36]:
%%time
# Run the pipeline on both demo sentences and print the raw result dictionaries.
for banner, demo_sentence, language_code in (
    ('\n TAT \n', TAT_SENTENCE, 'tt'),
    ('\n KZ \n', KZ_SENTENCE, 'kk'),
):
    print(banner, model.predict(demo_sentence, language_code))


 TAT 
 {'verbs': [{'verb': 'киттек', 'description': {'ARG0': 'Без', 'V': 'киттек', 'ARG4': 'дачага'}, 'tags': ['ARG0', 'ARG4', 'V']}], 'words': ['Без', 'дачага', 'киттек']}

 KZ 
 {'verbs': [{'verb': 'бардық', 'description': {'ARG0': 'Біз', 'V': 'бардық', 'ARG4': 'бардық коттеджге'}, 'tags': ['ARG0', 'ARG4', 'V']}], 'words': ['Біз', 'коттеджге', 'бардық']}
CPU times: user 753 ms, sys: 5.03 ms, total: 758 ms
Wall time: 2.93 s
