In [1]:
import ast
import numpy as np
import pandas as pd
import gensim
import scipy
import torch

from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [2]:
# Read the training and validation splits; both tables are consumed later through
# their `definition` and `embedding` columns.
train, valid = map(pd.read_csv, ('rus/bert/bert_train.csv', 'rus/bert/bert_valid.csv'))

In [3]:
class BertToW2v(torch.nn.Module):
    """BERT encoder followed by a linear projection into a target embedding space.

    The hidden state at sequence position 1 of encoder layer ``emb_layer`` is fed
    through a linear layer. The prompts in this notebook are built as
    ``[CLS] [MASK] ...``, so position 1 is where the ``[MASK]`` token sits.
    """

    def __init__(self, bert_model_name, lin_shape_in, lin_shape_out, emb_layer):
        """Load the pretrained encoder and create the projection head.

        Args:
            bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
            lin_shape_in: Input width of the linear layer; must equal the BERT
                hidden size (768 where this notebook instantiates the model).
            lin_shape_out: Output width of the linear layer, i.e. the size of
                the target embedding vectors.
            emb_layer: Index into the sequence of encoder layers returned by
                BERT, selecting which layer's hidden states are projected.
        """
        super().__init__()
        self.emb_layer = emb_layer
        self.bert_model = BertModel.from_pretrained(bert_model_name)
        self.linear_model = torch.nn.Linear(lin_shape_in, lin_shape_out, bias=True)
        # Replace the default weight init with a small uniform range; the bias
        # keeps the torch.nn.Linear default initialisation.
        torch.nn.init.uniform_(self.linear_model.weight, -0.1, 0.1)

    def forward(self, input_sentence):
        """Project the position-1 hidden state of every sequence in the batch.

        Args:
            input_sentence: Tensor of token ids, already tokenised and converted
                to ids, indexed as (batch, sequence).

        Returns:
            Tensor with one projected vector per input sequence,
            i.e. ``(batch, lin_shape_out)``. For a batch of one this is the same
            ``(1, lin_shape_out)`` result as before.

        Note:
            No attention mask is passed to BERT, so sequences in a batch are
            expected to have equal length (no padding).
        """
        encoded_layers, _ = self.bert_model(input_sentence)
        # Select position 1 along the sequence axis for *all* batch items rather
        # than only the first one, so batches larger than 1 are not truncated.
        bert_output = encoded_layers[self.emb_layer][:, 1]
        return self.linear_model(bert_output)

In [4]:
# Multilingual cased BERT; hidden states of encoder layer index 6 (width 768) are
# projected to 500 dimensions. `Module.to` returns the module itself, so the
# construction and the move to the GPU can be chained into one assignment.
bw2v = BertToW2v(
    'bert-base-multilingual-cased',
    lin_shape_in=768,
    lin_shape_out=500,
    emb_layer=6,
).to('cuda')

In [5]:
def _parse_embedding(value):
    """Turn a Python-literal string into the object it spells; pass anything else through.

    The pass-through makes this cell safe to re-run: once the column has been
    parsed, `ast.literal_eval` would raise on the already-parsed (non-string) values.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# The `embedding` column is read from CSV as text; convert each entry back into
# the Python object it represents (later fed to `torch.tensor`).
train['embedding'] = train['embedding'].apply(_parse_embedding)
valid['embedding'] = valid['embedding'].apply(_parse_embedding)

In [6]:
# Regression objective: mean squared error between the projected vector and the target.
loss_function = torch.nn.MSELoss()

# Adam updates every parameter of the model (BERT encoder and linear head alike).
optimizer = torch.optim.Adam(bw2v.parameters(), lr=1e-4)

# Validate on a leading slice of `valid` whose length is one seventh of the
# training set size (integer division), not the full validation table.
small_valid = valid.iloc[:len(train) // 7]

In [7]:
%%time

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

n_epochs = 5

for i in range(n_epochs):
    losses = []
    for row in train.itertuples():
        
        defin = row.definition
        defin = '[CLS] [MASK] - ' + defin + ' [SEP]'
        tok_def = tokenizer.tokenize(defin)
        tok_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tok_def)])
        tok_ids = tok_ids.to('cuda')
        
        optimizer.zero_grad()
        
        linear_output = bw2v(tok_ids)

        #tensor_ones = torch.ones(1).to('cuda')
        y = torch.tensor(row.embedding).unsqueeze(0).to('cuda')
        loss = loss_function(linear_output, y)#, tensor_ones)
        
        loss.backward()
        optimizer.step()

        losses.append(float(loss.cpu()))
    
    valid_losses = []
    
    for row in small_valid.itertuples():
        with torch.no_grad():
            defin = row.definition
            defin = '[CLS] [MASK] ' + defin + ' [SEP]'
            tok_def = tokenizer.tokenize(defin)
            tok_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tok_def)])
            tok_ids = tok_ids.to('cuda')
            
            linear_output = bw2v(tok_ids)
            
            y = torch.tensor(row.embedding).unsqueeze(0).to('cuda')
            loss = loss_function(linear_output, y)
            
            valid_losses.append(float(loss.cpu()))
        
    print('TRAIN_LOSS: {0}, VALID_LOSS: {1}'.format((sum(losses) / len(losses)), (sum(valid_losses) / len(valid_losses))))

TRAIN_LOSS: 0.030604032842510034, VALID_LOSS: 0.03289801505167968
TRAIN_LOSS: 0.02357221544748391, VALID_LOSS: 0.03132070761178135
TRAIN_LOSS: 0.022162485717559046, VALID_LOSS: 0.031117166834700883
TRAIN_LOSS: 0.021210889714224575, VALID_LOSS: 0.030762541722109977
TRAIN_LOSS: 0.020553067121799266, VALID_LOSS: 0.030778037495744256
CPU times: user 23min 15s, sys: 3min 11s, total: 26min 26s
Wall time: 26min 29s


In [8]:
# Persist only the learned parameters (the state dict), not the pickled module;
# loading requires constructing a BertToW2v first and calling load_state_dict.
torch.save(obj=bw2v.state_dict(), f='models/SUM_5epochs_hyphen.mdl')