# NLP Coursework

## Download and Import Libraries

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

from scipy.stats.stats import pearsonr

## Import Data

In [0]:
def read_data(path, encoding="utf-8"):
    """
    Read a text dataset from disk, one entry per line.

    Args:
        path: the path of the dataset, normally in local folder.
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII text (e.g. the Chinese corpus) is decoded the same
            way on every platform instead of depending on the locale default.

    Returns:
        A list with one string per line of the file. Trailing newline
        characters are kept, exactly as returned by ``readlines``.
    """
    with open(path, encoding=encoding) as dataset:
        raw_data = dataset.readlines()

    return raw_data


# Training split: English (.src), Chinese (.mt) and scores files
english_train_path = "train.enzh.src"
chinese_train_path = "train.enzh.mt"
scores_train_path = "train.enzh.scores"
# Validation split: the same three kinds of file
english_validation_path = "dev.enzh.src"
chinese_validation_path = "dev.enzh.mt"
scores_validation_path = "dev.enzh.scores"
# Test split: only the English and Chinese files are listed
english_test_path = "test.enzh.src"
chinese_test_path = "test.enzh.mt"


# Load the raw sentences of every split (English first, then Chinese)
raw_english_train, raw_chinese_train = (
    read_data(english_train_path),
    read_data(chinese_train_path),
)
raw_english_validation, raw_chinese_validation = (
    read_data(english_validation_path),
    read_data(chinese_validation_path),
)
raw_english_test, raw_chinese_test = (
    read_data(english_test_path),
    read_data(chinese_test_path),
)

# Load the raw score lines for the train and validation splits
score_train, score_validation = (
    read_data(scores_train_path),
    read_data(scores_validation_path),
)

## Sentence Embedding - BERT - Multilingual models

Download and Import:

In [0]:
!pip install bert-serving-server # server
!pip install bert-serving-client # client, independent of 'bert-serving-server'
from bert_serving.client import BertClient

#### Multilingual model 


Base model:

(12-layer, 768-hidden, 12-heads, 110M parameters)

In [0]:
# download pretrained BERT model cased_L-12_H-768_A-12
!wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
!unzip multi_cased_L-12_H-768_A-12.zip
ml_model = "multi_cased_L-12_H-768_A-12"

--2020-02-28 15:08:02--  https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.189.128, 2404:6800:4008:c03::80
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.189.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 662903077 (632M) [application/zip]
Saving to: ‘multi_cased_L-12_H-768_A-12.zip’


2020-02-28 15:08:06 (151 MB/s) - ‘multi_cased_L-12_H-768_A-12.zip’ saved [662903077/662903077]

Archive:  multi_cased_L-12_H-768_A-12.zip
   creating: multi_cased_L-12_H-768_A-12/
  inflating: multi_cased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: multi_cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: multi_cased_L-12_H-768_A-12/vocab.txt  
  inflating: multi_cased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: multi_cased_L-12_H-768_A-12/bert_config.json  


Obtain English and Chinese sentence embeddings:

In [0]:
# start server
!nohup bert-serving-start -model_dir=./{ml_model} > out.file 2>&1 &
# open client
bc_ml = BertClient()
# encode English raw corpus
english_train_embeddings = bc_ml.encode(raw_english_train)
english_val_embeddings = bc_ml.encode(raw_english_validation)
english_test_embeddings = bc_ml.encode(raw_english_test)
chninese_train_embeddings = bc_ml.encode(raw_chinese_train)
chinese_val_embeddings = bc_ml.encode(raw_chinese_validation)
chinese_test_embeddings = bc_ml.encode(raw_chinese_test)
# close client
bc_ml.close()

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


### Concatenate vectors

In [0]:
def concat_embeddings(english_embeddings, chinese_embeddings):
    """
    Join the English and Chinese embedding of every sentence pair.

    Args:
        english_embeddings: 2-D array-like, one sentence embedding per row.
        chinese_embeddings: 2-D array-like with the same number of rows.

    Returns:
        A numpy array whose row i is English row i followed by Chinese row i.

    Raises:
        ValueError: if the two inputs do not hold the same number of rows
            (the sentence pairs would be misaligned).
    """
    english = np.asarray(english_embeddings)
    chinese = np.asarray(chinese_embeddings)
    if len(english) != len(chinese):
        raise ValueError(
            f"Mismatched number of sentences: {len(english)} English "
            f"vs {len(chinese)} Chinese"
        )
    # axis=1 appends the Chinese features after the English ones, row by row
    return np.concatenate((english, chinese), axis=1)


# Concatenate train vectors
sentence_embeddings_train = concat_embeddings(
    english_train_embeddings, chninese_train_embeddings
)

# Concatenate validation vectors
sentence_embeddings_val = concat_embeddings(
    english_val_embeddings, chinese_val_embeddings
)

# Concatenate test vectors
sentence_embeddings_test = concat_embeddings(
    english_test_embeddings, chinese_test_embeddings
)

In [0]:
# Turn the three embedding collections into float numpy arrays
sentence_embeddings_train, sentence_embeddings_val, sentence_embeddings_test = (
    np.asarray(split).astype(float)
    for split in (
        sentence_embeddings_train,
        sentence_embeddings_val,
        sentence_embeddings_test,
    )
)

# Same conversion for the score lines of the train and validation splits
score_train, score_validation = (
    np.asarray(scores).astype(float)
    for scores in (score_train, score_validation)
)

## Model: SVR

The SVR achieves the best result when its kernel is set to 'rbf', based on experiments (in the commented part).

In [0]:
from sklearn.svm import SVR

def rmse(predictions, targets):
    """
    Method to calculate the root mean squared error.

    Args:
        predictions: the prediction of the model (array-like of numbers).
        targets: the ground truth, with the same shape as predictions.

    Returns:
        The root mean squared error between predictions and targets,
        as a scalar.
    """
    # Coerce to float arrays so plain Python lists are accepted as well and
    # integer inputs are squared in floating point.
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(((predictions - targets) ** 2).mean())

# Fit an SVR with the 'rbf' kernel on the training embeddings, then predict
# scores for the validation embeddings.
clf_t = SVR(kernel='rbf').fit(sentence_embeddings_train, score_train)
predictions = clf_t.predict(sentence_embeddings_val)
# Element 0 of the pearsonr result is the value reported as "Pearson" below
pearson = pearsonr(score_validation, predictions)
print(f'RMSE: {rmse(predictions,score_validation)} Pearson {pearson[0]}')

RMSE: 0.8879345425076625 Pearson 0.35773168904694735
