In [1]:
!pip install pytorch-pretrained-bert



In [0]:
from scipy.stats.stats import pearsonr
from os.path import exists
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import numpy as np
from sklearn import linear_model
from sklearn.svm import SVR
from torch.utils.data import DataLoader, TensorDataset
import torch.optim.adam
import torch.nn.functional as F
import torch.nn as nn


In [0]:
if not exists('enzh_data.zip'):
    !wget -O enzh_data.zip https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
    !unzip enzh_data.zip


In [4]:
print("---EN-ZH---")
print()

with open("./train.enzh.src", "r") as enzh_src:
  enzh_train_src = enzh_src.readlines()
with open("./train.enzh.mt", "r",encoding="utf-8") as enzh_mt:
  enzh_train_mt = enzh_mt.readlines()
with open("./train.enzh.scores", "r") as enzh_scores:
  enzh_train_scores = enzh_scores.readlines()

---EN-ZH---



In [0]:
with open("./dev.enzh.src", "r") as enzh_src:
  enzh_dev_src = enzh_src.readlines()
with open("./dev.enzh.mt", "r",encoding="utf-8") as enzh_mt:
  enzh_dev_mt = enzh_mt.readlines()
with open("./dev.enzh.scores", "r") as enzh_scores:
  enzh_dev_scores = enzh_scores.readlines()

In [6]:
import torch
use_GPU = torch.cuda.is_available()
device = torch.device("cuda" if use_GPU else "cpu")
if use_GPU:
    torch.cuda.manual_seed(0)
print("Using GPU: {}".format(use_GPU))

Using GPU: True


In [0]:
  # Example
  # marked_text_en = [
  #             "The last conquistador then rides on with his sword drawn.",
  #             "He shoves Owen into the pit where Digger rips out his son's heart.",
  #             "Alpha Phi Alpha also participates in the March of Dimes' WalkAmerica and raised over $181,000 in 2006.",
  #             "In 1995, Deftones released their debut album Adrenaline.",
  #             "Kyrgios also supports the North Melbourne Kangaroos Football Club in the Australian Football League."
  # ]
  # marked_text_zh = [、
  #           "最后的征服者骑着他的剑继续前进.",
  #           "他把欧文扔进了挖掘机挖出儿子心脏的坑里.",
  #           "Alpha Phi Alpha 还参加了 Dimes WalkAmerica 的 3 月活动 ， 并在 2006 年筹集了 181 000 美元。",
  #           "1995 年 ， Deftones 发行了首张专辑《肾上腺素》。",
  #           "基尔吉奥斯还在澳大利亚足球联盟中支持北墨尔本袋鼠足球俱乐部."
  # ]

In [0]:
def tokenization(marked_text_en, marked_text_zh, tokenizer):

  indexed_tokens = []
  tokenized_text = []
  segments_ids = []

  for i in range(len(marked_text_en)):

    txt = "[CLS] "+ marked_text_en[i] +" [SEP] " + marked_text_zh[i] + " [SEP]"
    tokens = tokenizer.tokenize(txt)

    tmp = tokens.index("[SEP]")
    sep1 = [0]*(tmp+1)
    sep2 = [1]*(len(tokens)-tmp - 1)
    segments_ids.append(torch.tensor([sep1+sep2]))

    tokenized_text.append(tokens)
    indexed_tokens.append(torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]))

  return indexed_tokens, segments_ids 

In [0]:
def bertProcessing(indexed_tokens,segments_ids,model):

  sentences_embedding = []
  
  with torch.no_grad():
      
      for i in range(len(indexed_tokens)):

        # "encoded_layers" has shape [12 x 1 x 22 x 768]
        encoded_layers, _ = model(indexed_tokens[i].to(device), segments_ids[i].to(device))

        # print("-"*30)
        # print ("Number of layers:", len(encoded_layers))
        # layer_i = 0
        # print ("Number of batches:", len(encoded_layers[layer_i]))
        # batch_i = 0
        # print ("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
        # token_i = 0
        # print ("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))
        
        token_embeddings = torch.stack(encoded_layers, dim=0)

        token_embeddings = torch.squeeze(token_embeddings, dim=1)

        token_embeddings = token_embeddings.permute(1,0,2)

        # print(token_embeddings.size())    

        # "token_embeddings" has shape [22 x 12 x 768]
        token_vecs_sum = []

        for token in token_embeddings:
          sum_vec = torch.sum(token[-4:],dim=0)
          token_vecs_sum.append(sum_vec)

        # "token_vecs" is a tensor with shape [22 x 768]
        token_vecs = torch.stack(token_vecs_sum,dim=0)

        # Calculate the average of all 22 token vectors.        
        sentence_embedding = torch.mean(token_vecs, dim=0)

        # print(sentence_embedding.size())
        sentences_embedding.append(sentence_embedding.cpu().detach().numpy())

  return np.array(sentences_embedding)

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [11]:
## Bert Transformer 1: Text1 Delim Text2
tokens_train_1, seg_ids_train_1 = tokenization(enzh_train_src,enzh_train_mt,tokenizer)
tokens_val_1, seg_ids_val_1 = tokenization(enzh_dev_src,enzh_dev_mt,tokenizer)

model_1 = model.to(device)
model_1.eval()

sentences_embedding_train_1 = bertProcessing(tokens_train_1,seg_ids_train_1,model_1)
sentences_embedding_val_1 = bertProcessing(tokens_val_1,seg_ids_val_1,model_1)

## Bert Transformer 2: Text2 Delim Text1
tokens_train_2, seg_ids_train_2 = tokenization(enzh_train_mt,enzh_train_src,tokenizer)
tokens_val_2, seg_ids_val_2 = tokenization(enzh_dev_mt,enzh_dev_src,tokenizer)

model_2 = model.to(device)
model_2.eval()

sentences_embedding_train_2 = bertProcessing(tokens_train_2,seg_ids_train_2,model_2)
sentences_embedding_val_2 = bertProcessing(tokens_val_2,seg_ids_val_2,model_2)


## Concatenate
sentences_embedding_train = []
# for i in range(len(sentences_embedding_train_1)):
#   sentences_embedding_train.append(np.add((sentences_embedding_train_1[i], sentences_embedding_train_2[i]),axis = 0))
# # sentences_embedding_train= [np.array(sentences_embedding_train_1),np.array(sentences_embedding_train_2)]
# X_train = np.array(sentences_embedding_train_1)
X_train = np.sum([sentences_embedding_train_1,sentences_embedding_train_2],axis=0)

sentences_embedding_val = []
# for i in range(len(sentences_embedding_val_1)):
#   sentences_embedding_val.append(np.add((sentences_embedding_val_1[i], sentences_embedding_val_2[i]),axis=0))
# # sentences_embedding_val = [np.array(sentences_embedding_val_1),np.array(sentences_embedding_val_1)]
# X_val = np.array(sentences_embedding_val_1)
X_val = np.sum([sentences_embedding_val_1, sentences_embedding_val_2],axis = 0)

print(X_train.shape)
print(X_val.shape)

y_train = np.array(enzh_train_scores).astype(np.float32)
y_val = np.array(enzh_dev_scores).astype(np.float32)

(7000, 768)
(1000, 768)


In [0]:
# RMSE
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [0]:
# SVM
for k in ['linear','poly','rbf','sigmoid']:
    clf_t = SVR(kernel=k)
    clf_t.fit(X_train, y_train)
    print(k)
    predictions = clf_t.predict(X_val)
    pearson = pearsonr(y_val, predictions)
    print(f'RMSE: {rmse(predictions,y_val)} Pearson {pearson[0]}')
    print()

In [0]:
# Bayes Regression
reg = linear_model.BayesianRidge()
reg.fit(X_train, y_train)
predictions = reg.predict(X_val)
pearson = pearsonr(y_val, predictions)
print(f'RMSE: {rmse(predictions,y_val)} Pearson {pearson[0]}')
print()

In [0]:
# FFNN
class FeedForwardClassification(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(768,400)
        self.fc2 = nn.Linear(400,100)
        self.fc3 = nn.Linear(100,1)

    def forward(self, x):
        output = self.fc3(F.relu(self.fc2(F.relu(self.fc1(x)))))
        return output

# # RNN
# class RNNClassification(nn.Module):
#     def __init__(self, embedding, hiddenStates, target):
#         super().__init__()
#         self.lstm1 = nn.LSTM(embedding,hiddenStates) 
#         self.fc = nn.Linear(hiddenStates,target)
#
#     def forward(self,embedding)
#           # embedding shape(seq_len, batch_size, embedding_size)
#           x,_ = F.self.lstm1(embedding)
#           # x reshape(embedding) to (seq_len, embedding_size)
#           output = self.fc(x.)
#           return output


In [0]:
def training(model,train_loader, criterion, opt):
  training_loss = 0
  model.train()

  for batch_idx,(X_train, y_train) in enumerate(train_loader):
    
    opt.zero_grad()

    X_train = X_train.to(device)
    y_train = y_train.to(device)

    output = model(X_train)
    
    loss = criterion(output,y_train)

    loss.backward()

    opt.step()

    training_loss += torch.sqrt(loss).item()

    if batch_idx % 200 == 199:
      print('[batch: %d]  loss: %.3f'%(batch_idx+1, training_loss/200))
      training_loss = 0

def testing(model, test_loader, criterion):

  model.eval()
  predictions = []

  with torch.no_grad():
    for X_test, y_test in test_loader:
      
      X_test = X_test.to(device)
      
      y_test = y_test.to(device)

      output = model(X_test)

      predictions.append(output.item())
    
  testing_loss = np.sqrt(((np.array(predictions) - np.array(y_val)) ** 2).mean())
  
  pearson = pearsonr(y_val, predictions)

  print(f'testing_loss: {testing_loss} Pearson {pearson[0]}')


In [0]:
def classification():
  
  train_dat = TensorDataset(torch.tensor(X_train),torch.tensor(y_train))
  test_dat = TensorDataset(torch.tensor(X_val),torch.tensor(y_val))

  train_loader = DataLoader(train_dat, batch_size=5, shuffle=True,num_workers=2)
  val_loader = DataLoader(test_dat, batch_size=1, shuffle=False)

  epochs = 25
  model = FeedForwardClassification().to(device)
  opt = torch.optim.Adam(model.parameters(),lr = 0.0001)
  criterion = nn.MSELoss()

  for i in range(epochs):
    print("Epoch: %d"%(i))
    print("-"*30)
    training(model,train_loader,criterion,opt)
    testing(model,val_loader,criterion)
    print("-"*30)

In [0]:
classification()