<a href="https://colab.research.google.com/github/Rt247/Not_NLP_CW/blob/BERT_method/BERT_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

Download datasets:

In [0]:
from os.path import exists

# Download and unpack the EN-ZH dataset archive, but only when
# 'enzh_data.zip' is not already present in the working directory.
# NOTE(review): the unzip step sits inside the same guard, so it is skipped
# whenever the archive file already exists, even if it was never extracted.
if not exists('enzh_data.zip'):
    !wget -O enzh_data.zip https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
    !unzip enzh_data.zip

--2020-02-26 16:10:42--  https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
Resolving competitions.codalab.org (competitions.codalab.org)... 129.175.22.230
Connecting to competitions.codalab.org (competitions.codalab.org)|129.175.22.230|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://newcodalab.lri.fr/prod-private/dataset_data_file/None/630ec/en-zh.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=788df68e375672f4b565fc2ad87b612636088d1ee0737afdd952146c75355834&X-Amz-Date=20200226T161042Z&X-Amz-Credential=AZIAIOSAODNN7EX123LE%2F20200226%2Fnewcodalab%2Fs3%2Faws4_request [following]
--2020-02-26 16:10:42--  https://newcodalab.lri.fr/prod-private/dataset_data_file/None/630ec/en-zh.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=788df68e375672f4b565fc2ad87b612636088d1ee0737afdd952146c75355834&X-Amz-Date=20200226T161

Check data downloaded successfully:

In [0]:
# Sanity check: print the first line of the source, translation and score
# files. The encoding is pinned to UTF-8 so the Chinese translation is decoded
# the same way regardless of the platform's default locale.
with open("./train.enzh.src", "r", encoding="utf-8") as enzh_src:
  print("Source: ",enzh_src.readline())
with open("./train.enzh.mt", "r", encoding="utf-8") as enzh_mt:
  print("Translation: ",enzh_mt.readline())
with open("./train.enzh.scores", "r", encoding="utf-8") as enzh_scores:
  print("Score: ",enzh_scores.readline())

Source:  The last conquistador then rides on with his sword drawn.

Translation:  最后的征服者骑着他的剑继续前进.

Score:  -1.5284005772625449



## BERT embedding Setup


### Import Libraries

In [0]:
!pip install transformers
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/13/33/ffb67897a6985a7b7d8e5e7878c3628678f553634bd3836404fef06ef19b/transformers-2.5.1-py3-none-any.whl (499kB)
[K     |▋                               | 10kB 14.9MB/s eta 0:00:01[K     |█▎                              | 20kB 4.8MB/s eta 0:00:01[K     |██                              | 30kB 6.8MB/s eta 0:00:01[K     |██▋                             | 40kB 6.3MB/s eta 0:00:01[K     |███▎                            | 51kB 5.4MB/s eta 0:00:01[K     |████                            | 61kB 5.9MB/s eta 0:00:01[K     |████▋                           | 71kB 6.4MB/s eta 0:00:01[K     |█████▎                          | 81kB 6.9MB/s eta 0:00:01[K     |██████                          | 92kB 7.3MB/s eta 0:00:01[K     |██████▋                         | 102kB 7.2MB/s eta 0:00:01[K     |███████▏                        | 112kB 7.2MB/s eta 0:00:01[K     |███████▉                        | 122kB 7.2M

### Set GPU usage

In [0]:
# Choose the torch device used by the rest of the notebook: CUDA when PyTorch
# reports a usable GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and which one sits at index 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


### Get BERT pretrained model

In [0]:
# Load the pretrained multilingual (cased) BERT encoder.
BERT_model = BertModel.from_pretrained("bert-base-multilingual-cased")
# The model is only used for feature extraction, so make inference mode
# explicit (from_pretrained is documented to return the model in eval mode).
BERT_model.eval()
# Move the model to the device selected earlier instead of calling .cuda()
# unconditionally, which raises on machines without a GPU even though the
# device-selection cell supports a CPU fallback. `.to` returns the module, so
# the cell still displays the model summary.
BERT_model.to(device)

HBox(children=(IntProgress(value=0, description='Downloading', max=569, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=714314041, style=ProgressStyle(description_…




BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

### Helper functions
For tokenisation and feature extraction.

In [0]:
def token_raw_inputs(original, translated, max_length=128):
  """Encode (source, translation) sentence pairs for BERT.

  Args:
    original: iterable of source-language sentences.
    translated: iterable of machine-translated sentences, parallel to
      `original`. Pairs are formed with `zip`, so iteration stops at the
      shorter of the two inputs.
    max_length: value forwarded to the tokenizer as `max_length`; together
      with `pad_to_max_length=True` it fixes the length every encoded pair
      is padded to. Defaults to 128, the value previously hard-coded here.

  Returns:
    A tuple `(input_ids, attention_masks)` of two lists, one entry per
    sentence pair, taken from the 'input_ids' and 'attention_mask' fields of
    the tokenizer output.
  """
  # Load pre-trained model tokenizer (vocabulary)
  tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

  # Distinct loop names so the comprehension does not shadow the arguments.
  encoded_pairs = [
      tokenizer.encode_plus(
          src_sentence,
          text_pair=mt_sentence,
          add_special_tokens=True,
          max_length=max_length,
          pad_to_max_length=True,
      )
      for src_sentence, mt_sentence in zip(original, translated)
  ]

  input_ids = [encoded['input_ids'] for encoded in encoded_pairs]
  attention_masks = [encoded['attention_mask'] for encoded in encoded_pairs]
  return input_ids, attention_masks


def get_BERT_embedding(input_tokens, attention_masks):
  """Return one vector per sequence: the position-0 slice of BERT's first output.

  Args:
    input_tokens: nested list of token ids, one inner list per sequence.
    attention_masks: nested list of attention-mask values matching
      `input_tokens`.

  Returns:
    NumPy array obtained by indexing the first model output with `[:, 0, :]`.
    (Position 0 is presumably the [CLS] token when the inputs were built with
    special tokens enabled.)
  """
  ids = torch.tensor(input_tokens).to(device)
  mask = torch.tensor(attention_masks).to(device)

  # Feature extraction only: no gradients are needed.
  with torch.no_grad():
    model_outputs = BERT_model(ids, attention_mask=mask)

  hidden_states = model_outputs[0]
  first_position = hidden_states[:, 0, :]
  return first_position.cpu().numpy()


def get_BERT_words_embedding(input_tokens, attention_masks):
  """Return per-position vectors from BERT's first output, skipping position 0.

  Args:
    input_tokens: nested list of token ids, one inner list per sequence.
    attention_masks: nested list of attention-mask values matching
      `input_tokens`.

  Returns:
    NumPy array obtained by indexing the first model output with `[:, 1:, :]`,
    i.e. every position except the first. Padded positions are not removed.
  """
  ids = torch.tensor(input_tokens).to(device)
  mask = torch.tensor(attention_masks).to(device)

  # Feature extraction only: no gradients are needed.
  with torch.no_grad():
    model_outputs = BERT_model(ids, attention_mask=mask)

  per_position = model_outputs[0][:, 1:, :]
  return per_position.cpu().numpy()

## Process Scores

In [0]:
# Read the human quality scores (one float per line) for the training and
# development splits. Context managers make sure the file handles are closed,
# which the bare open() calls did not.
with open("./train.enzh.scores", 'r') as f_train_scores:
  zh_train_scores = f_train_scores.readlines()

with open("./dev.enzh.scores", 'r') as f_val_scores:
  zh_val_scores = f_val_scores.readlines()

# Convert the text lines to float arrays; these are the regression targets.
train_scores = np.array(zh_train_scores).astype(float)
y_train_zh = train_scores

val_scores = np.array(zh_val_scores).astype(float)
y_val_zh = val_scores

In [0]:
# Setup
def rmse(predictions, targets):
    """Root-mean-square error between predictions and targets.

    Args:
        predictions: array-like of predicted values (list, tuple or ndarray).
        targets: array-like of reference values, broadcastable against
            `predictions`.

    Returns:
        The square root of the mean squared difference over all elements.
    """
    # Coerce to float arrays so plain Python sequences are accepted and
    # unsigned-integer inputs cannot wrap around when subtracted.
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(((predictions - targets) ** 2).mean())

from sklearn.svm import SVR
from scipy.stats.stats import pearsonr


# BERT embedding with regression model


## Feature extraction using BERT

Use BERT to generate sentence-level and word-level embeddings.

In [0]:
# Tokenize the training sentence pairs with the BERT tokenizer.
# Files are read through context managers (so the handles are closed) and with
# an explicit UTF-8 encoding (the .mt files contain Chinese text).
with open("./train.enzh.src", encoding="utf-8") as src_file:
  original_texts = src_file.readlines()
with open("./train.enzh.mt", encoding="utf-8") as mt_file:
  translated_texts = mt_file.readlines()
input_ids, input_attention_masks = token_raw_inputs(original_texts, translated_texts)

# BERT is run over the training set in chunks of this many sentence pairs.
# The chunk boundaries are derived from the data size rather than assuming
# exactly 7 x 1000 examples, and each result is concatenated once at the end
# instead of growing an array inside the loop.
BERT_CHUNK_SIZE = 1000
chunk_starts = range(0, len(input_ids), BERT_CHUNK_SIZE)

# Sentence-level features (one vector per sentence pair).
train_features = np.concatenate([
    get_BERT_embedding(input_ids[start:start + BERT_CHUNK_SIZE],
                       input_attention_masks[start:start + BERT_CHUNK_SIZE])
    for start in chunk_starts
])
train_labels = y_train_zh

# Word-level features (one vector per token position after the first).
word_feature_chunks = []
for i, start in enumerate(chunk_starts):
  print(f'Extracting BERT features batch {i}')
  word_feature_chunks.append(
      get_BERT_words_embedding(input_ids[start:start + BERT_CHUNK_SIZE],
                               input_attention_masks[start:start + BERT_CHUNK_SIZE]))
train_words_features = np.concatenate(word_feature_chunks)

# Tokenize the development sentence pairs; they serve as the held-out set.
with open("./dev.enzh.src", encoding="utf-8") as src_file:
  dev_original_texts = src_file.readlines()
with open("./dev.enzh.mt", encoding="utf-8") as mt_file:
  dev_translated_texts = mt_file.readlines()

test_labels = y_val_zh

test_input_ids, test_attention_masks = token_raw_inputs(dev_original_texts, dev_translated_texts)

test_features = get_BERT_embedding(test_input_ids, test_attention_masks)
test_words_features = get_BERT_words_embedding(test_input_ids, test_attention_masks)


HBox(children=(IntProgress(value=0, description='Downloading', max=995526, style=ProgressStyle(description_wid…




## Ridge Regression

In [0]:
from sklearn.linear_model import Ridge
# Fit a ridge regressor (regularisation strength alpha=2) on the
# sentence-level BERT features of the training split.
clf = Ridge(alpha=2)
clf.fit(train_features, y_train_zh)

# Predict quality scores for the development split.
predictions = clf.predict(test_features)

# pearsonr returns a pair; element 0 is the correlation coefficient that is
# reported alongside the RMSE.
pearson = pearsonr(y_val_zh, predictions)
print(f'RMSE: {rmse(predictions, y_val_zh)} Pearson {pearson[0]}')
# The string below is a result pasted in from an earlier run. It is a bare
# expression, not something computed by this cell.
'RMSE: 0.861979277558717 Pearson 0.3621685325738159'

## SVR with different kernels

In [0]:
# Fit one support-vector regressor per kernel on the sentence-level BERT
# features and report RMSE and Pearson correlation on the development split.
# All other SVR hyper-parameters are left at their defaults.
for k in ['linear','poly','rbf','sigmoid']:
    clf_t = SVR(kernel=k)
    clf_t.fit(train_features, y_train_zh)
    print(k)
    predictions = clf_t.predict(test_features)
    # pearsonr returns a pair; element 0 is the correlation coefficient.
    pearson = pearsonr(y_val_zh, predictions)
    print(f'RMSE: {rmse(predictions, y_val_zh)} Pearson {pearson[0]}')
    print()


# The triple-quoted string below is a record of results pasted in from an
# earlier run. It is a bare expression, not something computed here.
'''linear
RMSE: 0.8847769222998597 Pearson 0.3541554106726864

poly
RMSE: 0.8800424227174211 Pearson 0.3812763230624585

rbf
RMSE: 0.8846797573743066 Pearson 0.3739785444174556

sigmoid
RMSE: 0.9047304999833146 Pearson 0.3336589772215751'''

In [0]:
# Refit the polynomial-kernel SVR on the training features and keep it as
# `clf_poly`; the submission cell at the end of the notebook predicts with it.
clf_poly = SVR(kernel='poly')
clf_poly.fit(train_features, y_train_zh)

## RNN with BERT word embedding

In [0]:
# Create RNN Model
class RNNModel(nn.Module):
  """GRU over a sequence of feature vectors followed by a linear readout.

  Args:
    input_dim: size of each input feature vector.
    hidden_dim: size of the GRU hidden state.
    layer_dim: number of stacked GRU layers.
    output_dim: size of the readout produced per sequence.
    dropout: dropout probability passed to `nn.GRU`.
  """

  def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout):
    super(RNNModel, self).__init__()
    # Number of hidden dimensions
    self.hidden_dim = hidden_dim

    # Number of hidden layers
    self.layer_dim = layer_dim

    # Recurrent encoder; batch_first=True means inputs are (batch, seq, feature).
    self.rnn = nn.GRU(input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout)

    # Readout layer applied to the GRU output at the last time step.
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Run the GRU over `x` and map the last time step to the output.

    Args:
      x: tensor of shape (batch, seq_len, input_dim).

    Returns:
      Tensor of shape (batch, output_dim).
    """
    # Zero initial hidden state created with the same dtype and device as the
    # input, so the model also works after .cuda()/.double() instead of
    # failing on a CPU float32 state.
    h0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)
    # `out` holds the top-layer output for every time step.
    out, _ = self.rnn(x, h0)
    # Only the final time step feeds the readout.
    return self.fc(out[:, -1, :])

In [0]:
# Pytorch train and test sets
# Wrap the word-level BERT features and the score arrays as tensors and pair
# them up so the loaders yield (features, label) batches.
train_tensor = torch.from_numpy(train_words_features)
test_tensor = torch.from_numpy(test_words_features)
train_labels_tensor = torch.from_numpy(y_train_zh)
test_labels_tensor = torch.from_numpy(y_val_zh)
train_dataset = torch.utils.data.TensorDataset(train_tensor, train_labels_tensor)
test_dataset = torch.utils.data.TensorDataset(test_tensor, test_labels_tensor)

# batch_size, epoch and iteration
# The epoch count is derived from a target number of optimizer steps:
# n_iters divided by the number of batches per epoch, truncated to an int.
batch_size = 100
n_iters = 2100
num_epochs = n_iters / (len(train_words_features) / batch_size)
num_epochs = int(num_epochs)

# data loader
# NOTE(review): the training loader does not shuffle, so every epoch visits
# the batches in the same order - confirm this is intended for SGD.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size, shuffle = False)
    
# Create RNN
# input_dim matches the 768-wide hidden size shown in the BERT model summary.
input_dim = 768   # input dimension
hidden_dim = 400  # hidden layer dimension
layer_dim = 1     # number of hidden layers
output_dim = 1   # output dimension
dropout = 0

# The model is not moved to `device` here; it is constructed with default
# (CPU) placement, matching the CPU tensors created above.
model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim, dropout)

# SGD Optimizer
learning_rate = 0.05
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

### Training

In [0]:
# Per-batch training losses, stored as plain Python floats.
loss_list = []

# Mean squared error between predicted and reference scores.
error = nn.MSELoss()

print(f'Start training')
for epoch in range(num_epochs):
  # Validation below switches to eval mode, so re-enable training behaviour
  # at the start of every epoch.
  model.train()
  for i, (features, labels) in enumerate(train_loader):

    # Clear gradients
    optimizer.zero_grad()

    # Forward propagation. The labels are float64, so the predictions are
    # cast to match. reshape(-1) flattens (batch, 1) to (batch,) without
    # assuming the batch holds exactly 100 items, so a smaller final batch
    # no longer raises.
    outputs = model(features).double().reshape(-1)

    # Calculate the mean squared error loss
    loss = error(outputs, labels)
    # Calculating gradients
    loss.backward()

    # Update parameters
    optimizer.step()

    # store loss and iteration
    batch_loss = loss.item()
    loss_list.append(batch_loss)
    pearson = pearsonr(labels.numpy(), outputs.detach().numpy())

    # Print Loss
    print(f'Epoch: {epoch} batch: {i}  Loss: {batch_loss} Pearson: {pearson[0]}')

  # Validation: inference mode, no gradients, whole development set at once.
  model.eval()
  with torch.no_grad():
    outputs = model(test_tensor).cpu().numpy()

  # Flatten (n, 1) to (n,) without hard-coding the development-set size.
  outputs = outputs.reshape(-1)
  # Validation RMSE is computed and kept in `loss`; only Pearson is printed.
  loss = rmse(outputs, test_labels)
  pearson = pearsonr(test_labels, outputs)

  print(f'Validation pearson: {pearson[0]}')

# Results

(Note: the helper functions below have not been tested yet.)

In [0]:
import os
from google.colab import files
from zipfile import ZipFile

def writeScores(scores):
    """Write `scores` to ``predictions.txt``, one value per line.

    Each element is formatted with its default string form and followed by a
    newline. An existing file of the same name is overwritten. An empty line
    is printed to stdout before writing.
    """
    print("")
    target_path = "predictions.txt"
    with open(target_path, 'w') as handle:
        handle.writelines(f"{score}\n" for score in scores)


def downloadScores(method_name, scores):
  """Write the scores, zip them and trigger a Colab download.

  Args:
    method_name: label inserted into the archive name
      ``en-zh_<method_name>.zip``.
    scores: iterable of predictions, written to ``predictions.txt`` by
      `writeScores`.
  """
  writeScores(scores)

  archive_name = f"en-zh_{method_name}.zip"
  with ZipFile(archive_name, "w") as archive:
    archive.write("predictions.txt")

  files.download(archive_name)

In [0]:
# Tokenize the blind test sentence pairs with the BERT tokenizer.
# NOTE: the `dev_*` / `test_*` names are reused here for the blind test set
# and overwrite the development-set values computed earlier.
# Files are read via context managers (handles get closed) with explicit UTF-8.
with open("./test.enzh.src", encoding="utf-8") as src_file:
  dev_original_texts = src_file.readlines()
with open("./test.enzh.mt", encoding="utf-8") as mt_file:
  dev_translated_texts = mt_file.readlines()

test_input_ids, test_attention_masks = token_raw_inputs(dev_original_texts, dev_translated_texts)
test_words_features = get_BERT_words_embedding(test_input_ids, test_attention_masks)

test_tensor = torch.from_numpy(test_words_features)
# Inference only: switch the model to eval mode and disable gradients.
model.eval()
with torch.no_grad():
  outputs = model(test_tensor).cpu().numpy()

# Flatten (n, 1) to (n,) without hard-coding the test-set size.
outputs = outputs.reshape(-1)

downloadScores('RNN_model', outputs)

In [0]:
# Tokenize the blind test sentence pairs with the BERT tokenizer.
# NOTE: the `dev_*` / `test_*` names are reused here for the blind test set
# and overwrite the development-set values computed earlier.
# Files are read via context managers (handles get closed) with explicit UTF-8.
with open("./test.enzh.src", encoding="utf-8") as src_file:
  dev_original_texts = src_file.readlines()
with open("./test.enzh.mt", encoding="utf-8") as mt_file:
  dev_translated_texts = mt_file.readlines()

test_input_ids, test_attention_masks = token_raw_inputs(dev_original_texts, dev_translated_texts)
# Sentence-level features, scored by the polynomial-kernel SVR fitted earlier.
test_features = get_BERT_embedding(test_input_ids, test_attention_masks)
outputs = clf_poly.predict(test_features)
downloadScores('BERT_SVR', outputs)