In [1]:
import numpy as np
import pandas as pd

import gensim
import pickle
import scipy
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [2]:
class BertToW2v(torch.nn.Module):
    """BERT encoder followed by a linear projection of a single token state.

    The forward pass picks the encoder output selected by ``emb_layer``, takes
    the state at token position 1 of the first sequence in the batch and maps
    it through a linear layer (``lin_shape_in`` -> ``lin_shape_out``).
    """

    def __init__(self, bert_model_name, lin_shape_in, lin_shape_out, emb_layer):
        """Create the encoder and the projection layer.

        Args:
            bert_model_name: value handed to ``BertModel.from_pretrained``.
            lin_shape_in: input width of the linear layer (e.g. 768).
            lin_shape_out: output width of the linear layer (e.g. 100).
            emb_layer: index into the encoder's per-layer outputs (e.g. 6).
        """
        super().__init__()
        self.emb_layer = emb_layer
        self.bert_model = BertModel.from_pretrained(bert_model_name)
        # NOTE: the encoder is not put into eval mode here.
        projection = torch.nn.Linear(lin_shape_in, lin_shape_out, bias=True)
        # Overwrite the default weight initialisation with U(-0.1, 0.1);
        # the bias keeps its default initialisation.
        torch.nn.init.uniform_(projection.weight, -0.1, 0.1)
        self.linear_model = projection

    def forward(self, input_sentence):
        """Project one token state of ``input_sentence``.

        Args:
            input_sentence: an already tokenized sentence given as a tensor of
                token ids (assumed to be ``(batch, seq_len)`` - TODO confirm).

        Returns:
            The projected state with a leading dimension of size 1 added.
        """
        per_layer_states, _ = self.bert_model(input_sentence)
        chosen_layer = per_layer_states[self.emb_layer]
        # First sequence in the batch, token position 1.
        token_state = chosen_layer[0][1]
        return self.linear_model(token_state).unsqueeze(0)

In [3]:
# Two parts of the Ozhegov dictionary; only `word` and `definition` are kept
# from the first file, the second one is used as-is.
ozhegov_emb = pd.read_csv('rus/ozhegov/ozhegov_emb.csv').loc[:, ['word', 'definition']]
ozhegov_no_emb = pd.read_csv('rus/ozhegov/ozhegov_no_emb.csv')

# Word collection used as the filter below (presumably low-frequency words,
# judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('rus/freq/low_freq.pkl', 'rb') as freq_file:
    ulfreq = pickle.load(freq_file)

# Stack both parts; the original row labels are kept, so labels may repeat.
ozhegov = pd.concat([ozhegov_emb, ozhegov_no_emb], axis=0)

# Flag the entries whose headword is contained in `ulfreq`.
ozhegov['lfreq'] = ozhegov['word'].apply(lambda headword: headword in ulfreq)

# Keep only the flagged entries, restricted to the two text columns.
ozhegov_lfreq = ozhegov.loc[ozhegov['lfreq'], ['word', 'definition']]

In [4]:
# Headwords that have a dictionary definition in `ozhegov_lfreq`.
defs = set(ozhegov_lfreq['word'])

# Tokenizer for the cased multilingual BERT vocabulary; lowercasing is disabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
    do_lower_case=False,
)

# Word2vec vectors stored in the binary word2vec format; undecodable bytes in
# the vocabulary are ignored.
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    'w2v_models/all_norm-sz500-w10-cb0-it3-min5.w2v',
    binary=True,
    unicode_errors='ignore',
)

# Evaluation table; later cells read its `word1` / `word2` columns.
test = pd.read_csv('russe-evaluation/russe/evaluation/test.csv')

In [5]:
# Rebuild the network and restore the trained weights.  The layer sizes given
# here must match the checkpoint, otherwise load_state_dict fails.
bw2v = BertToW2v('bert-base-multilingual-cased', lin_shape_in=768, lin_shape_out=500, emb_layer=6) # !!!
bw2v.load_state_dict(torch.load('models/SUM_cosine_annealing_v0ep_l6.mdl')) # !!!
bw2v.to('cuda')
# The model is only used for inference below: switch to eval mode so dropout
# is disabled and the embeddings are deterministic.  torch.no_grad() alone
# does not turn dropout off.
bw2v.eval();

In [23]:
def find_embedding(word, model, w2v, ozhegov_lfreq, defs, tokenizer):
    """Return an embedding for ``word``.

    Words contained in ``defs`` are embedded from their dictionary definition:
    the definition is wrapped as ``[CLS] [MASK] - <definition> [SEP]``,
    tokenized, converted to ids and passed through ``model``.  Every other
    word is looked up in ``w2v``.

    Args:
        word: the word to embed.
        model: torch module called with a ``(1, n_tokens)`` tensor of token ids.
        w2v: object with ``get_vector(word)`` raising ``KeyError`` for unknown
            words.
        ozhegov_lfreq: DataFrame with ``word`` and ``definition`` columns; the
            first row matching ``word`` supplies the definition.
        defs: container of the words that go through ``model``.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        For words in ``defs``: the model output converted to a NumPy array
        (its shape is whatever the model returns).  Otherwise the ``w2v``
        vector, or ``np.nan`` if ``w2v`` does not know the word.

    Note:
        ``torch.no_grad()`` does not disable dropout; ``model`` should be in
        eval mode if deterministic embeddings are required.
    """
    if word not in defs:
        try:
            return w2v.get_vector(word)
        except KeyError:
            return np.nan

    matches = ozhegov_lfreq.loc[ozhegov_lfreq['word'] == word, 'definition']
    defin = '[CLS] [MASK] - ' + matches.iloc[0] + ' [SEP]'
    tokens = tokenizer.tokenize(defin)

    # Put the ids on the device the model lives on instead of assuming CUDA,
    # so the function also works on CPU-only machines.
    first_param = next(iter(model.parameters()), None) if hasattr(model, 'parameters') else None
    if first_param is not None:
        device = first_param.device
    else:
        # No parameters to inspect: keep the previous preference for CUDA.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tok_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).to(device)
    with torch.no_grad():
        embedding = model(tok_ids)
    return embedding.to('cpu').numpy()
        
def get_cosine_distance(word1, word2, model, w2v, ozhegov_lfreq, defs, tokenizer):
    """Return the cosine similarity between the embeddings of two words.

    Despite the name, the value returned is ``1 - cosine distance``, i.e. a
    similarity.  Both embeddings are obtained with ``find_embedding``.

    Args:
        word1, word2: the words to compare.
        model, w2v, ozhegov_lfreq, defs, tokenizer: forwarded unchanged to
            ``find_embedding``.

    Returns:
        The cosine similarity, or ``np.nan`` when either embedding is missing,
        contains NaN, or is an all-zero vector.
    """
    emb1 = find_embedding(word1, model, w2v, ozhegov_lfreq, defs, tokenizer)
    emb2 = find_embedding(word2, model, w2v, ozhegov_lfreq, defs, tokenizer)
    if np.isnan(np.sum(emb1)) or np.isnan(np.sum(emb2)):
        return np.nan

    # The two embedding sources do not agree on shape (the model branch can
    # return a (1, D) array, word2vec returns (D,)); SciPy's cosine() only
    # accepts 1-D input, so flatten both first.
    vec1 = np.ravel(emb1)
    vec2 = np.ravel(emb2)

    # Cosine is undefined for a zero vector; return NaN explicitly rather than
    # relying on a division by zero inside SciPy.
    if not vec1.any() or not vec2.any():
        return np.nan
    return 1 - scipy.spatial.distance.cosine(vec1, vec2)

#assert abs(get_cosine_distance('абрикос', 'год', bw2v, w2v, ozhegov_lfreq, defs, tokenizer) - (1 - w2v.distance('абрикос', 'год'))) < 0.000001

In [24]:
# Spot check on a single word pair; the result is NaN when no embedding is
# found for at least one of the two words.
get_cosine_distance('диск-жокей', 'зачем-то', bw2v, w2v, ozhegov_lfreq, defs, tokenizer)

nan

In [25]:
# defin = ozhegov_lfreq[ozhegov_lfreq['word'] == 'зачем-то'].reset_index()['definition'][0]
# defin = '[CLS] [MASK] - ' + defin + ' [SEP]'
# tokens = tokenizer.tokenize(defin)
# tok_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).to('cuda')

In [26]:
# with torch.no_grad():
#     tmp = bw2v.bert_model(tok_ids)

In [27]:
def _pair_similarity(row):
    """Score one row of `test` from its `word1` / `word2` columns."""
    return get_cosine_distance(row['word1'], row['word2'], bw2v, w2v, ozhegov_lfreq, defs, tokenizer)

# Score every word pair of the evaluation table, row by row.
test['sim'] = test.apply(_pair_similarity, axis=1)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [28]:
# Pairs that could not be scored (NaN) receive the mean of the available scores.
mean_sim = test['sim'].mean()
test['sim'] = test['sim'].fillna(mean_sim)

In [29]:
# Write the scored pairs next to the evaluation data, without the row index.
test.to_csv(
    'russe-evaluation/russe/evaluation/v0ep_l6.csv',
    index=False,
)

In [30]:
# Show up to 160 characters per cell so long definitions are not truncated.
# The full option name is used: abbreviated names are resolved by pattern
# matching and can stop working if pandas adds another matching option.
pd.set_option('display.max_colwidth', 160)

In [14]:
# Number of filtered dictionary entries (rows) and columns.
ozhegov_lfreq.shape

(8503, 2)

In [15]:
# Random rows for a visual check; no random_state is passed, so the rows
# shown differ from run to run.
ozhegov_lfreq.sample(20)

Unnamed: 0,word,definition
11971,подсудный,подлежащий суду (вообще или в данной судебной инстанции)
19683,ксендз,польский католический священник
12140,неустройство,"отсутствие порядка, правильного устройства"
24750,прострел,об острой боли (обычно в результате простуды)
22233,рагу,"кушанье из мелких тушеных кусочков мяса, рыбы или овощей"
11370,мракобес,"реакционер, враг прогресса, культуры, науки"
21076,бессердечный,"чуждый мягкости, сердечности, бездушный, бездушный, жестокий"
11235,попрошайка,"человек который назойливо выпрашивает, надоедает просьбами"
6667,забастовщик,участник забастовки
638,грейдер,"колесная землеройная машина, употребляемая в дорожном строительстве"


In [16]:
## optional part, sanity check:
# Read the word list (one entry per line) and report how many distinct
# entries it has.  A trailing newline in the file adds an empty string to
# the set.
with open('tmp_words.txt', 'r') as words_file:
    raw_text = words_file.read()
words = set(raw_text.split('\n'))
print(len(words))

2683


In [17]:
# for word in words:
#     if word != '':
#         print(ozhegov_lfreq[ozhegov_lfreq['word'] == word].reset_index()['definition'][0])