In [1]:
import ast
import math
import numpy as np
import pandas as pd
import gensim
import scipy
import torch
import torch.nn.functional as F
import pickle

from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM, AdamW
from torch.utils.data import Dataset, DataLoader
from tensorboardX import SummaryWriter

#np.random.seed = 1

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [2]:
class BertToW2v(torch.nn.Module):
    def __init__(self, bert_model_name, lin_shape_in, lin_shape_out, emb_layer): # -, 768, 100, 6
        super(BertToW2v, self).__init__()
        self.emb_layer = emb_layer
        self.bert_model = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
        #self.bert_model.eval()
        self.linear_model = torch.nn.Linear(lin_shape_in, lin_shape_out, bias=True) # bias?
        torch.nn.init.uniform_(self.linear_model.weight, -0.1, 0.1)
        
    def forward(self, input_sentence, mask): # ожидаем уже токенизированное предложение
        _, _, encoded_layers = self.bert_model(input_sentence, attention_mask=mask)
        bert_output = encoded_layers[self.emb_layer][:,1]
        linear_output = self.linear_model(bert_output)
        return linear_output

In [3]:
ozhegov_emb = pd.read_csv('rus/ozhegov/ozhegov_emb.csv')[['word', 'definition']]
ozhegov_no_emb = pd.read_csv('rus/ozhegov/ozhegov_no_emb.csv')

dfreq = pd.read_csv('rusfreq/freqrnc2011.csv', sep = '\t')

model_name = 'batchify_ep20_l6_5_15'

In [4]:
with open('rus/freq/low_freq.pkl', 'rb') as f:
    ulfreq = pickle.load(file=f)
    
ozhegov = pd.concat([ozhegov_emb, ozhegov_no_emb], axis = 0)

ozhegov['lfreq'] = ozhegov['word'].apply(lambda word: word in ulfreq)

ozhegov_lfreq = ozhegov[ozhegov['lfreq'] == True][['word', 'definition']]

In [5]:
#defs = set(ozhegov_lfreq['word'])
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
w2v = gensim.models.KeyedVectors.load_word2vec_format('w2v_models/all_norm-sz500-w10-cb0-it3-min5.w2v', binary=True, unicode_errors='ignore')

test = pd.read_csv('russe-evaluation/russe/evaluation/test.csv')

In [6]:
bw2v = BertToW2v('bert-base-multilingual-cased', lin_shape_in=768, lin_shape_out=500, emb_layer=6) # !!!
bw2v.load_state_dict(torch.load(f'models/{model_name}.mdl')) # !!!
bw2v.to('cuda');
bw2v.eval()

BertToW2v(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertInter

In [7]:
test_words = pd.Series(np.hstack([test['word1'].unique(), test['word2'].unique()])).unique()

In [16]:
max_len = 128

def find_definition(word, dataset):
    try:
        return dataset[dataset['word'] == word].reset_index().iloc[0]['definition']
    except:
        return ''
    
def find_w2v(word, w2v):
    try:
        return w2v.get_vector(word)
    except KeyError:
        return np.nan
    
def get_bw2v(row, bw2v, tokenizer):
    if (row['defin'] == ''):
        return np.nan
    else:
        word = row['word']
        inp = row['input'].unsqueeze(0)
        attention = row['attention'].unsqueeze(0)
        inp = inp.to('cuda')
        attention = attention.to('cuda')
        with torch.no_grad():
            res = bw2v(inp, attention)
        return res.cpu()[0].numpy()

In [17]:
data = pd.DataFrame(pd.Series(np.hstack([test['word1'].unique(), test['word2'].unique()])).unique(), columns=['word'])

freq = pd.read_csv('rusfreq/freqrnc2011.csv', sep = '\t')

In [19]:
%%time

data['defin'] = data['word'].apply(lambda x: find_definition(x, ozhegov))

data['w2v'] = data['word'].apply(lambda x: find_w2v(x, w2v))  # 307 nan'ов

data['input'] = data['defin'].apply(lambda x: torch.tensor(tokenizer.convert_tokens_to_ids(tokenizer.tokenize('[CLS] [MASK] - ' + x + ' [SEP]'))))

data['input'] = data['input'].apply(lambda x: F.pad(input=x, pad=(0, max_len-(x.shape[0])), mode='constant', value=0))

data['attention'] = data['input'].apply(lambda x: torch.tensor([float(el>0) for el in x]))

data['bw2v'] = data.apply(lambda x: get_bw2v(x, bw2v, tokenizer), axis=1)

data['freq'] = data['word'].apply(lambda x: freq[freq['Lemma'] == x]['Freq(ipm)'].max())

CPU times: user 1min 43s, sys: 113 ms, total: 1min 43s
Wall time: 1min 43s


In [20]:
def numpy_nan(x):
    try:
        return math.isnan(x)
    except TypeError:
        return False

In [21]:
def get_result(row, threshold):
    row1 = data.iloc[(data[data['word'] == row['word1']]).index[0]]
    row2 = data.iloc[(data[data['word'] == row['word2']]).index[0]]
    if (row1['freq'] > threshold) and (row2['freq'] > threshold):
        if (numpy_nan(row1['w2v'])) or (numpy_nan(row2['w2v'])):
            return np.nan
        else:
            return 1 - scipy.spatial.distance.cosine(row1['w2v'], row2['w2v'])
    elif (row1['freq'] <= threshold) and (row2['freq'] > threshold):
        if ((numpy_nan(row1['w2v'])) and numpy_nan(row1['bw2v'])) or (numpy_nan(row2['w2v'])):
            return np.nan
        else:
            if (numpy_nan(row1['bw2v'])):
                return 1 - scipy.spatial.distance.cosine(row1['w2v'], row2['w2v'])
            else:
                return 1 - scipy.spatial.distance.cosine(row1['bw2v'], row2['w2v'])
    elif (row1['freq'] > threshold) and (row2['freq'] <= threshold):
        if ((numpy_nan(row2['w2v'])) and numpy_nan(row2['bw2v'])) or (numpy_nan(row1['w2v'])):
            return np.nan
        else:
            if (numpy_nan(row2['bw2v'])):
                return 1 - scipy.spatial.distance.cosine(row2['w2v'], row1['w2v'])
            else:
                return 1 - scipy.spatial.distance.cosine(row2['bw2v'], row1['w2v'])
    else:
        if ((numpy_nan(row2['w2v'])) and numpy_nan(row2['bw2v'])) or ((numpy_nan(row1['w2v'])) and numpy_nan(row1['bw2v'])):
            return np.nan
        else:
            if (numpy_nan(row1['bw2v'])) and (numpy_nan(row2['bw2v'])):
                return 1 - scipy.spatial.distance.cosine(row1['w2v'], row2['w2v'])
            elif (numpy_nan(row1['bw2v'])) and (not numpy_nan(row2['bw2v'])):
                return 1 - scipy.spatial.distance.cosine(row1['w2v'], row2['bw2v'])
            elif (not numpy_nan(row1['bw2v'])) and (numpy_nan(row2['bw2v'])):
                return 1 - scipy.spatial.distance.cosine(row1['bw2v'], row2['w2v'])
            else:
                return 1 - scipy.spatial.distance.cosine(row1['bw2v'], row2['bw2v'])

In [22]:
freq['percentile'] = pd.qcut(freq['Freq(ipm)'], 10, labels=False)

freqs_gb = freq.groupby(by='percentile')['Freq(ipm)'].max()

In [23]:
freqs_gb

percentile
0        0.5
1        0.7
2        0.9
3        1.3
4        1.9
5        2.8
6        4.3
7        7.7
8       20.0
9    35801.8
Name: Freq(ipm), dtype: float64

In [24]:
for percentile in range(10):
    thresh = freqs_gb[percentile]
    sim = test.apply(lambda x: get_result(x, thresh), axis=1)
    test['sim'] = sim
    test['sim'] = test['sim'].fillna(test['sim'].mean())
    test.to_csv(f'russe-evaluation/russe/evaluation/{model_name}_percentile{(percentile*10+10)}.csv')

In [25]:
data

Unnamed: 0,word,defin,w2v,input,attention,bw2v,freq
0,абрикос,"южное фруктовое дерево семейства розоцветных, ...","[0.10378932, -0.14052582, 0.1942631, -0.305896...","[tensor(101), tensor(103), tensor(118), tensor...","[tensor(1.), tensor(1.), tensor(1.), tensor(1....","[-0.12012055, -0.10375254, 0.023982286, 0.0764...",3.3
1,авангард,"передовая, ведущая часть какой-нибудь обществе...","[-0.15341343, -0.13074593, -0.051256265, 0.167...","[tensor(101), tensor(103), tensor(118), tensor...","[tensor(1.), tensor(1.), tensor(1.), tensor(1....","[-0.120121464, -0.10375175, 0.023977919, 0.076...",9.8
2,аватар,,"[0.014951972, 0.0035384058, 0.24791141, 0.0169...","[tensor(101), tensor(103), tensor(118), tensor...","[tensor(1.), tensor(1.), tensor(1.), tensor(1....",,
3,авиапорт,,"[-0.2978616, -0.4235942, 0.016439708, 0.116158...","[tensor(101), tensor(103), tensor(118), tensor...","[tensor(1.), tensor(1.), tensor(1.), tensor(1....",,
4,авиасообщение,,"[-0.37638554, 0.14005096, -0.050324634, 0.3548...","[tensor(101), tensor(103), tensor(118), tensor...","[tensor(1.), tensor(1.), tensor(1.), tensor(1....",,
5,австриец,,"[-0.1491604, -0.27055186, 0.29353207, 0.142309...","[tensor(101), tensor(103), tensor(118), tensor...","[tensor(1.), tensor(1.), tensor(1.), tensor(1....",,2.6
6,автовокзал,большая станция пригородного или междугородног...,"[-0.4247384, -0.14860067, -0.023689255, -0.336...","[tensor(101), tensor(103), tensor(118), tensor...","[tensor(1.), tensor(1.), tensor(1.), tensor(1....","[-0.12011934, -0.10375419, 0.02398602, 0.07646...",1.0
7,автограф,подлинная рукопись автора,"[-0.294718, -0.30333897, 0.2978587, -0.0615298...","[tensor(101), tensor(103), tensor(118), tensor...","[tensor(1.), tensor(1.), tensor(1.), tensor(1....","[-0.120118864, -0.103755176, 0.023987792, 0.07...",10.3
8,автодеталь,,"[-0.055288125, -0.12642151, -0.023027416, -0.0...","[tensor(101), tensor(103), tensor(118), tensor...","[tensor(1.), tensor(1.), tensor(1.), tensor(1....",,
9,автокомбинат,,"[0.0454641, 0.15902324, 0.058329105, -0.059688...","[tensor(101), tensor(103), tensor(118), tensor...","[tensor(1.), tensor(1.), tensor(1.), tensor(1....",,0.4


In [26]:
row = data.iloc[21]

In [27]:
row['defin']

'такое ведение дел, при котором каждая операция регистрируется немедленно после ее совершения'

In [28]:
word = row['word']
inp = row['input'].unsqueeze(0)
attention = row['attention'].unsqueeze(0)
inp = inp.to('cuda')
attention = attention.to('cuda')

In [50]:
x = 'такое ведение дел, при котором каждая операция регистрируется немедленно после ее совершения'
inp = torch.tensor([tokenizer.convert_tokens_to_ids(tokenizer.tokenize('[CLS] [CLS]' + x + '[SEP]'))]).to('cuda')

In [51]:
with torch.no_grad():
    _, _, tmp  = bw2v.bert_model(inp)

In [52]:
tmp

(tensor([[[-0.4456,  0.0627, -0.1807,  ...,  0.4191, -0.4707,  0.1631],
          [-0.4584, -0.0287,  0.4728,  ...,  0.6191, -0.6317,  0.0543],
          [-0.1495,  0.4490, -0.4619,  ...,  0.1357,  0.4095, -1.4835],
          ...,
          [-0.9577,  0.5846, -0.7484,  ..., -0.3848, -0.6820,  0.3802],
          [-0.7268,  0.7438,  0.7163,  ...,  0.4393, -0.4528, -0.6298],
          [ 0.6444,  0.8613,  1.1060,  ..., -0.0604,  0.8569, -0.2070]]],
        device='cuda:0'),
 tensor([[[ 0.3961, -0.1015, -0.9602,  ..., -0.6380, -0.4597,  1.6722],
          [ 0.4232, -0.1048, -0.9306,  ..., -0.6701, -0.4992,  1.7689],
          [ 0.4018, -0.0315, -1.1045,  ..., -0.7215, -0.3786,  1.7110],
          ...,
          [ 0.3745, -0.0692, -1.1252,  ..., -0.7881, -0.4482,  1.8852],
          [ 0.3728, -0.1073, -1.0238,  ..., -0.7000, -0.4310,  1.7903],
          [ 0.4850, -0.0590, -0.9849,  ..., -0.7394, -0.4821,  1.7925]]],
        device='cuda:0'),
 tensor([[[ 0.3661,  1.0728,  0.1908,  ...,  0.040

In [31]:
tmp2 = tmp[2][6][:,1]

IndexError: index 6 is out of bounds for dimension 0 with size 1

In [None]:
bw2v.linear_model(tmp2)