# NLP Coursework

## Download and Import Libraries

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

from scipy.stats.stats import pearsonr

## Import Data

In [0]:
def read_data(path, encoding="utf-8"):
    """
    Read a text dataset from disk, one entry per line.

    Args:
        path: the path of the dataset, normally in local folder.
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII corpora (e.g. the Chinese files) are decoded the
            same way on every platform instead of depending on the locale's
            default encoding.

    Returns:
        A list with one string per line of the file. Each string keeps its
        line terminator, exactly as returned by ``readlines``.
    """
    with open(path, encoding=encoding) as dataset:
        raw_data = dataset.readlines()

    return raw_data


# Training split: English file, Chinese file and score file
english_train_path, chinese_train_path, scores_train_path = (
    "train.enzh.src",
    "train.enzh.mt",
    "train.enzh.scores",
)
# Validation split: same three kinds of file
english_validation_path, chinese_validation_path, scores_validation_path = (
    "dev.enzh.src",
    "dev.enzh.mt",
    "dev.enzh.scores",
)
# Test split: only the two sentence files are defined, no scores
english_test_path, chinese_test_path = "test.enzh.src", "test.enzh.mt"


# Load the raw sentence lines of every split (train, validation, test)
raw_english_train, raw_chinese_train = (
    read_data(english_train_path),
    read_data(chinese_train_path),
)
raw_english_validation, raw_chinese_validation = (
    read_data(english_validation_path),
    read_data(chinese_validation_path),
)
raw_english_test, raw_chinese_test = (
    read_data(english_test_path),
    read_data(chinese_test_path),
)

# Load the score lines for the train and validation splits
score_train, score_validation = (
    read_data(scores_train_path),
    read_data(scores_validation_path),
)

## Sentence Embedding - BERT - large models

Download and Import:

In [3]:
!pip install bert-serving-server # server
!pip install bert-serving-client # client, independent of 'bert-serving-server'
from bert_serving.client import BertClient

Collecting bert-serving-server
[?25l  Downloading https://files.pythonhosted.org/packages/b0/bd/cab677bbd0c5fb08b72e468371d2bca6ed9507785739b4656b0b5470d90b/bert_serving_server-1.10.0-py3-none-any.whl (61kB)
[K     |█████▎                          | 10kB 25.4MB/s eta 0:00:01[K     |██████████▋                     | 20kB 4.9MB/s eta 0:00:01[K     |████████████████                | 30kB 7.0MB/s eta 0:00:01[K     |█████████████████████▎          | 40kB 8.8MB/s eta 0:00:01[K     |██████████████████████████▋     | 51kB 5.7MB/s eta 0:00:01[K     |███████████████████████████████▉| 61kB 6.7MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 4.6MB/s 
Collecting GPUtil>=1.3.0
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Collecting pyzmq>=17.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/41/fa/e6e10410f01b03d10ab0705717d1246f63cdbbc0676140c78f0f757db332/pyzmq-

Collecting bert-serving-client
  Downloading https://files.pythonhosted.org/packages/1f/09/aae1405378a848b2e87769ad89a43d6d71978c4e15534ca48e82e723a72f/bert_serving_client-1.10.0-py2.py3-none-any.whl
Installing collected packages: bert-serving-client
Successfully installed bert-serving-client-1.10.0


##### English Model 


Large model:

(24-layer, 1024-hidden, 16-heads, 340M parameters)

In [0]:
# download pretrained BERT model wwm_cased_L-24_H-1024_A-16 (Whole Word Masking)
!wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip
!unzip wwm_cased_L-24_H-1024_A-16.zip
en_model = "wwm_cased_L-24_H-1024_A-16"

--2020-02-28 14:51:41--  https://storage.googleapis.com/bert_models/2019_05_30/wwm_cased_L-24_H-1024_A-16.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.126.128, 2a00:1450:4013:c07::80
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.126.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1242589256 (1.2G) [application/zip]
Saving to: ‘wwm_cased_L-24_H-1024_A-16.zip’


2020-02-28 14:52:09 (42.2 MB/s) - ‘wwm_cased_L-24_H-1024_A-16.zip’ saved [1242589256/1242589256]

Archive:  wwm_cased_L-24_H-1024_A-16.zip
   creating: wwm_cased_L-24_H-1024_A-16/
  inflating: wwm_cased_L-24_H-1024_A-16/bert_model.ckpt.meta  
  inflating: wwm_cased_L-24_H-1024_A-16/bert_model.ckpt.data-00000-of-00001  
  inflating: wwm_cased_L-24_H-1024_A-16/vocab.txt  
  inflating: wwm_cased_L-24_H-1024_A-16/bert_model.ckpt.index  
  inflating: wwm_cased_L-24_H-1024_A-16/bert_config.json  


Obtain English results:

In [0]:
# start server
!nohup bert-serving-start -model_dir=./{en_model} > out.file 2>&1 &
# open client
bc_en = BertClient()
# encode English raw corpus
english_train_embeddings = bc_en.encode(raw_english_train)
english_val_embeddings = bc_en.encode(raw_english_validation)
english_test_embeddings = bc_en.encode(raw_english_test)
# close client
bc_en.close()

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


Save English results:

We save results before we manually shut down the server.

In [0]:
import pandas as pd

# Wrap the English embeddings of each split in a DataFrame
pd_english_train, pd_english_dev, pd_english_test = (
    pd.DataFrame(split_embeddings)
    for split_embeddings in (
        english_train_embeddings,
        english_val_embeddings,
        english_test_embeddings,
    )
)

# Dump every frame to its own csv file, leaving out the row index
for frame, csv_name in (
    (pd_english_train, 'english_train_berteb.csv'),
    (pd_english_dev, 'english_dev_berteb.csv'),
    (pd_english_test, 'english_test_berteb.csv'),
):
    frame.to_csv(csv_name, index=False)

Read English results:

In [0]:
import pandas as pd

# Load the saved English embeddings from their csv files and keep only the
# underlying NumPy arrays (one per split)
english_train_embeddings = pd.read_csv('english_train_berteb.csv').values

english_val_embeddings = pd.read_csv('english_dev_berteb.csv').values

english_test_embeddings = pd.read_csv('english_test_berteb.csv').values

##### Chinese Model

Large model:

(24-layer, 1024-hidden, 16-heads, 340M parameters)

In [5]:
# download pretrained BERT model chinese_roberta_wwm_large_ext_L-24_H-1024_A-16 (Whole Word Masking)
!wget https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip
!unzip chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip -d chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
ch_model = "chinese_roberta_wwm_large_ext_L-24_H-1024_A-16"

--2020-02-28 15:45:16--  https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.15.128, 2a00:1450:400c:c0b::80
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.15.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1213043734 (1.1G) [application/zip]
Saving to: ‘chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip’


2020-02-28 15:45:32 (75.7 MB/s) - ‘chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip’ saved [1213043734/1213043734]

Archive:  chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip
  inflating: chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/bert_config.json  
  inflating: chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/bert_model.ckpt.data-00000-of-00001  
  inflating: chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/bert_model.ckpt.index  
  inflating: chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/bert_model.c

Obtain Chinese results:

In [6]:
# start server
!nohup bert-serving-start -model_dir=./{ch_model} > out.file 2>&1 &
# open client
bc_ch = BertClient()
# encode Chinese raw corpus
chninese_train_embeddings = bc_ch.encode(raw_chinese_train)
chinese_val_embeddings = bc_ch.encode(raw_chinese_validation)
chinese_test_embeddings = bc_ch.encode(raw_chinese_test)
# close client
bc_ch.close()

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


#### Concatenate vectors

In [0]:
# Concatenate vectors
def _concat_embeddings(english, chinese):
    """
    Join English and Chinese sentence vectors row by row.

    Args:
        english: 2-D array-like, one English sentence vector per row.
        chinese: 2-D array-like, one Chinese sentence vector per row.

    Returns:
        A NumPy array in which row i is the English vector of sentence i
        followed by its Chinese vector.

    Raises:
        ValueError: if the two inputs do not have the same number of rows.
            The previous per-row loop silently ignored extra Chinese rows.
    """
    english = np.asarray(english)
    chinese = np.asarray(chinese)
    if len(english) != len(chinese):
        raise ValueError(
            f"row count mismatch: {len(english)} English vs {len(chinese)} Chinese vectors"
        )
    # One vectorised call replaces building a Python list per sentence.
    return np.concatenate([english, chinese], axis=1)


# Concatenate train vectors
sentence_embeddings_train = _concat_embeddings(english_train_embeddings, chninese_train_embeddings)
# Concatenate validation vectors
sentence_embeddings_val = _concat_embeddings(english_val_embeddings, chinese_val_embeddings)
# Concatenate test vectors
sentence_embeddings_test = _concat_embeddings(english_test_embeddings, chinese_test_embeddings)

In [0]:
# Turn the concatenated sentence vectors of every split into float arrays
sentence_embeddings_train, sentence_embeddings_val, sentence_embeddings_test = (
    np.asarray(split_vectors).astype(float)
    for split_vectors in (
        sentence_embeddings_train,
        sentence_embeddings_val,
        sentence_embeddings_test,
    )
)

# Same conversion for the train and validation scores
score_train, score_validation = (
    np.asarray(split_scores).astype(float)
    for split_scores in (score_train, score_validation)
)

## Model: SVR

The SVR achieves the best result when the kernel is set to 'rbf', based on our experiments (in the commented part).

In [9]:
from sklearn.svm import SVR

def rmse(predictions, targets):
    """
    Method to calculate the root mean squared error.

    Args:
        predictions: the prediction of the model (array-like of numbers).
        targets: the ground truth (array-like of numbers, same shape as or
            broadcastable to predictions).

    Returns:
        The root mean squared error between predictions and targets, as a
        floating-point scalar.
    """
    # Coerce to float arrays so plain Python lists are accepted and integer
    # inputs cannot wrap around on subtraction.
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(((predictions - targets) ** 2).mean())

# Fit an RBF-kernel SVR on the training embeddings (fit returns the fitted
# estimator), predict the validation split, then report RMSE and Pearson r.
clf_t = SVR(kernel='rbf').fit(sentence_embeddings_train, score_train)
predictions = clf_t.predict(sentence_embeddings_val)
pearson = pearsonr(score_validation, predictions)
print(f'RMSE: {rmse(predictions,score_validation)} Pearson {pearson[0]}')

RMSE: 0.8533648151501044 Pearson 0.447425094596136


Save test results:

In [0]:
import os

def writeScores(method_name, scores, filename="predictions.txt"):
    """
    Method to write scores to a file, one score per line.

    Args:
        method_name: the name of the method. Not used by the function; kept
            so existing calls keep working.
        scores: iterable of the predicted scores of the model.
        filename: path of the output file. Defaults to "predictions.txt",
            the name that was previously hard-coded. An existing file is
            overwritten.
    """
    print("")
    with open(filename, 'w') as output_file:
        output_file.writelines(f"{score}\n" for score in scores)

In [11]:
# EN_ZH: score the test split with the fitted SVR
predictions_zh = clf_t.predict(sentence_embeddings_test)

from google.colab import files
from zipfile import ZipFile

# Write the test predictions to predictions.txt
writeScores("SVR",predictions_zh)

# Pack predictions.txt into the zip archive
with ZipFile("en-zh_svr.zip","w") as archive:
    archive.write("predictions.txt")

# Hand the archive to Colab's files.download helper
files.download('en-zh_svr.zip')


