# NLP Coursework

## Download and Import Libraries

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

from scipy.stats.stats import pearsonr

## Import Data

In [0]:
def read_data(path, encoding="utf-8"):
    """
    Read a text dataset from disk, one entry per line.

    Args:
        path: the path of the dataset, normally in the local folder.
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII text is decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A list with the lines of the file in file order. Every line keeps its
        trailing newline character, exactly as produced by ``readlines()``.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file content is not valid in ``encoding``.
    """
    with open(path, encoding=encoding) as dataset:
        raw_data = dataset.readlines()

    return raw_data


# Training split: English source, Chinese machine translation, quality scores.
english_train_path = "train.enzh.src"
chinese_train_path = "train.enzh.mt"
scores_train_path = "train.enzh.scores"

raw_english_train = read_data(english_train_path)
raw_chinese_train = read_data(chinese_train_path)
score_train = read_data(scores_train_path)

# Validation split: same three files as the training split.
english_validation_path = "dev.enzh.src"
chinese_validation_path = "dev.enzh.mt"
scores_validation_path = "dev.enzh.scores"

raw_english_validation = read_data(english_validation_path)
raw_chinese_validation = read_data(chinese_validation_path)
score_validation = read_data(scores_validation_path)

# Test split: only the sentence pairs are loaded here, no scores file is read.
english_test_path = "test.enzh.src"
chinese_test_path = "test.enzh.mt"

raw_english_test = read_data(english_test_path)
raw_chinese_test = read_data(chinese_test_path)

## Sentence Embedding - SentenceTransformers

### Sentence Embedding

In [3]:
!pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/c9/91/c85ddef872d5bb39949386930c1f834ac382e145fcd30155b09d6fb65c5a/sentence-transformers-0.2.5.tar.gz (49kB)
[K     |████████████████████████████████| 51kB 3.3MB/s 
[?25hCollecting transformers==2.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 17.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 41.6MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |██████████

In [4]:
# Load the multilingual sentence encoder that is used below to embed both the
# English source sentences and the Chinese translations.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(model_name_or_path='distiluse-base-multilingual-cased')

100%|██████████| 504M/504M [02:42<00:00, 3.11MB/s]


Using raw corpus:

In [0]:
# Encode the raw (not preprocessed) corpus into sentence embeddings.
# Training split: English source first, then the Chinese translation.
english_train_embeddings, chninese_train_embeddings = (
    model.encode(corpus) for corpus in (raw_english_train, raw_chinese_train)
)

# Validation split, in the same order.
english_val_embeddings, chinese_val_embeddings = (
    model.encode(corpus) for corpus in (raw_english_validation, raw_chinese_validation)
)

#### Concatenate vectors

In [0]:
def _concatenate_pairs(source_embeddings, translation_embeddings):
    """
    Join every source embedding with the translation embedding at the same index.

    Args:
        source_embeddings: sequence of embedding vectors for the source sentences.
        translation_embeddings: sequence of embedding vectors for the
            translations, aligned index-by-index with ``source_embeddings``.

    Returns:
        A list with one entry per sentence pair. Each entry is a flat list
        holding the values of the source vector followed by the values of the
        translation vector.

    Raises:
        ValueError: if the two sequences differ in length, which means the
            source and translation sentences are not aligned.
    """
    if len(source_embeddings) != len(translation_embeddings):
        raise ValueError(
            "Source and translation embeddings are misaligned: "
            f"{len(source_embeddings)} vs {len(translation_embeddings)} sentences."
        )
    return [
        list(source) + list(translation)
        for source, translation in zip(source_embeddings, translation_embeddings)
    ]


# Concatenate train vectors
sentence_embeddings_train = _concatenate_pairs(english_train_embeddings, chninese_train_embeddings)

# Concatenate validation vectors
sentence_embeddings_val = _concatenate_pairs(english_val_embeddings, chinese_val_embeddings)

# NOTE(review): no test embeddings are computed in the cells shown here, so the
# test features stay empty.
sentence_embeddings_test = []

In [0]:
# Turn the concatenated feature lists into float arrays.
(sentence_embeddings_train,
 sentence_embeddings_val,
 sentence_embeddings_test) = (
    np.asarray(features).astype(float)
    for features in (sentence_embeddings_train,
                     sentence_embeddings_val,
                     sentence_embeddings_test)
)

# Parse the score lines read from disk into float arrays.
score_train, score_validation = (
    np.asarray(scores).astype(float) for scores in (score_train, score_validation)
)

## Model: SVR

The SVR achieves the best result when the kernel is set to 'rbf', based on experiments (see the commented-out part).

In [8]:
from sklearn.svm import SVR

def rmse(predictions, targets):
    """
    Method to calculate the root mean squared error.

    Both inputs are converted to float arrays first, so plain Python lists are
    accepted and unsigned-integer arrays cannot wrap around when subtracted.

    Args:
        predictions: the prediction of the model (array-like of numbers).
        targets: the ground truth (array-like of numbers, same shape as
            ``predictions``).

    Returns:
        The root mean squared error between ``predictions`` and ``targets``.
    """
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(((predictions - targets) ** 2).mean())

# Fit an RBF-kernel support vector regressor on the training features, then
# report RMSE and Pearson correlation of its predictions on the validation set.
clf_t = SVR(kernel='rbf').fit(sentence_embeddings_train, score_train)
predictions = clf_t.predict(sentence_embeddings_val)
pearson = pearsonr(score_validation, predictions)
print(f'RMSE: {rmse(predictions,score_validation)} Pearson {pearson[0]}')

RMSE: 0.875252344881576 Pearson 0.34235343131352036
