# NLP Coursework

## Download and Import Libraries

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

from scipy.stats.stats import pearsonr

## Import Data

In [0]:
def read_data(path, encoding="utf-8"):
    """
    Read a text dataset from disk, one entry per line.

    Args:
        path: the path of the dataset, normally in local folder.
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII text is decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        Loaded raw dataset: a list with one string per line of the file.
        Line endings are kept, exactly as ``readlines()`` returns them.
    """
    with open(path, encoding=encoding) as dataset:
        raw_data = dataset.readlines()

    return raw_data


# Training split: English text, Chinese text, and the scores file.
english_train_path = "train.enzh.src"
chinese_train_path = "train.enzh.mt"
scores_train_path = "train.enzh.scores"

raw_english_train = read_data(english_train_path)
raw_chinese_train = read_data(chinese_train_path)
score_train = read_data(scores_train_path)

# Validation split: same three files as the training split.
english_validation_path = "dev.enzh.src"
chinese_validation_path = "dev.enzh.mt"
scores_validation_path = "dev.enzh.scores"

raw_english_validation = read_data(english_validation_path)
raw_chinese_validation = read_data(chinese_validation_path)
score_validation = read_data(scores_validation_path)

# Test split: only the two text files are read; no scores file is loaded.
english_test_path = "test.enzh.src"
chinese_test_path = "test.enzh.mt"

raw_english_test = read_data(english_test_path)
raw_chinese_test = read_data(chinese_test_path)

## Sentence Embedding - SentenceTransformer

In [3]:
!pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/c9/91/c85ddef872d5bb39949386930c1f834ac382e145fcd30155b09d6fb65c5a/sentence-transformers-0.2.5.tar.gz (49kB)
[K     |████████████████████████████████| 51kB 9.7MB/s 
[?25hCollecting transformers==2.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 42.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 71.5MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |██████████

In [4]:
# Import sentence transformer for sentence embedding.
from sentence_transformers import SentenceTransformer
# Load the pretrained model by name. The cell output shows a ~504M download
# the first time this runs.
# NOTE(review): this single `model` instance is used to encode both the
# English and the Chinese text in the encoding cell.
model = SentenceTransformer('distiluse-base-multilingual-cased')

100%|██████████| 504M/504M [10:44<00:00, 782kB/s]


Using raw corpus:

In [0]:
# Encode the raw (not preprocessed) corpora into sentence embeddings.
# NOTE: the spelling `chninese_train_embeddings` is kept on purpose because
# the concatenation cell reads the variable under exactly that name.
english_train_embeddings, chninese_train_embeddings = (
    model.encode(raw_english_train),
    model.encode(raw_chinese_train),
)

english_val_embeddings, chinese_val_embeddings = (
    model.encode(raw_english_validation),
    model.encode(raw_chinese_validation),
)

#### Concatenate vectors

In [0]:
# Concatenate vectors
# Each sentence pair becomes one feature vector: the English embedding
# followed by the Chinese embedding. np.concatenate joins the whole split in
# one call and raises if the two languages do not contain the same number of
# sentences, instead of looping over every vector in Python and silently
# ignoring surplus rows.

# Concatenate train vectors
sentence_embeddings_train = np.concatenate(
    [np.asarray(english_train_embeddings), np.asarray(chninese_train_embeddings)],
    axis=1,
)

# Concatenate validation vectors
sentence_embeddings_val = np.concatenate(
    [np.asarray(english_val_embeddings), np.asarray(chinese_val_embeddings)],
    axis=1,
)

# No test embeddings are encoded in the visible cells, so the test features
# stay empty here.
sentence_embeddings_test = []

In [0]:
# Convert the concatenated embeddings to float NumPy arrays.
sentence_embeddings_train = np.asarray(sentence_embeddings_train).astype(float)
sentence_embeddings_val = np.asarray(sentence_embeddings_val).astype(float)
# NOTE(review): sentence_embeddings_test is still the empty list created in
# the concatenation cell, so this yields an empty array.
sentence_embeddings_test = np.asarray(sentence_embeddings_test).astype(float)

# The scores were read as lines of text by read_data; astype(float) parses
# each line into a number.
score_train = np.asarray(score_train).astype(float)
score_validation = np.asarray(score_validation).astype(float)

## Model: Feedforward Neural Network

Uses Sentence Embedding:

In [0]:
# Pair every feature vector with its score and wrap both splits in DataLoaders.
batch_size = 32

train_data = [
    [sentence_embeddings_train[idx], score_train[idx]]
    for idx in range(len(sentence_embeddings_train))
]
val_data = [
    [sentence_embeddings_val[idx], score_validation[idx]]
    for idx in range(len(sentence_embeddings_val))
]

# Training batches hold `batch_size` samples; the validation loader is built
# without a batch size, so DataLoader's default applies.
loader_train = DataLoader(train_data, batch_size=batch_size)
loader_val = DataLoader(val_data)


In [9]:
# Runtime configuration shared by the training and evaluation cells.
USE_GPU = True
dtype = torch.float32  # floating-point type the model inputs are cast to
print_every = 50  # print the training loss every `print_every` iterations

# Use the first CUDA device when requested and available, otherwise the CPU.
device = torch.device(
    'cuda:0' if USE_GPU and torch.cuda.is_available() else 'cpu'
)

print('using device:', device)

using device: cuda:0


In [0]:
class FFNN(nn.Module):
    """
    Fully Connected Feedforward Neural Network.
    4 linear hidden layers, each followed by a ReLU
    1 linear output layer producing one value per input row

    Args:
        input_dim: size of each input vector. Defaults to 1024, the width
            that was previously hard-coded in the first layer, so ``FFNN()``
            builds exactly the same network as before.

    Attributes:
        fc1: layer 1 (input_dim -> 512)
        fc2: layer 2 (512 -> 256)
        fc3: layer 3 (256 -> 128)
        fc4: layer 4 (128 -> 64)
        fc5: output layer (64 -> 1)

    """
    def __init__(self, input_dim=1024):
        super(FFNN, self).__init__()
        # Layers are created in the same order as before so parameter
        # initialisation consumes random numbers in the same sequence.
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 1)

    def forward(self, x):
        """
        Run a batch through the network.

        Args:
            x: tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1 (no activation on the output layer).
        """
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = F.relu(hidden(x))
        return self.fc5(x)

### Train

In [0]:
def rmse(predictions, targets):
    """
    Root-mean-square error between predictions and targets.

    Args:
        predictions: predicted values; a NumPy array or any array-like
            (for example a plain list).
        targets: reference values, broadcastable against ``predictions``.

    Returns:
        The square root of the mean squared difference.
    """
    # np.asarray lets plain Python sequences be passed in; existing arrays
    # pass through unchanged.
    errors = np.asarray(predictions) - np.asarray(targets)
    return np.sqrt((errors ** 2).mean())

def check_train_performance(loader, model):
    """
    Print the RMSE and Pearson correlation of the model on the data in `loader`.

    The reference scores are taken from the batches that `loader` yields
    rather than from a module-level array, so predictions and targets stay
    aligned whatever the loader contains or in whichever order it iterates.

    Args: 
        loader: the dataloader which contains the train data, yielding
            (features, score) batches.
        model: the model to evaluate; it is switched to evaluation mode.

    Raises:
        ValueError: if the loader yields no batches.
    
    """
    predictions = []
    real_scores = []
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)

            score = model(x)

            # Flatten each batch to 1-D so batches of different sizes can be
            # joined afterwards.
            predictions.append(score.detach().cpu().numpy().reshape(-1))
            real_scores.append(y.detach().cpu().numpy().reshape(-1))

    if not predictions:
        raise ValueError('loader yielded no batches to evaluate')

    predictions = np.concatenate(predictions)
    real_scores = np.concatenate(real_scores).astype(float)

    pearson = pearsonr(real_scores, predictions)
    print(f'RMSE: {rmse(predictions, real_scores)} Pearson {pearson[0]}')
    print()


def train_part(model, optimizer, scheduler=None, epochs=1):
    """
    Train `model` on the batches of the module-level `loader_train`.

    After every epoch the training metrics are printed through
    `check_train_performance`, and the scheduler (if any) is stepped once.

    Args: 
        model: the model to be trained; it is moved to `device` first.
        optimizer: the optimizer that updates the model parameters.
        scheduler: optional learning-rate scheduler, stepped once per epoch.
        epochs: how many passes over `loader_train` to run, default is 1.
    
    """
    model = model.to(device=device)
    for epoch in range(epochs):
        for step, (features, targets) in enumerate(loader_train):
            # The per-epoch evaluation switches the model to eval mode, so
            # training mode is re-enabled before every batch.
            model.train()
            features = features.to(device=device, dtype=dtype)
            targets = targets.to(device=device, dtype=torch.float)

            # Drop the trailing size-1 dimension so predictions line up with
            # the targets.
            predicted = model(features).squeeze(1)
            loss = F.mse_loss(predicted, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % print_every == 0:
                print('Epoch: %d, Iteration %d, loss = %.4f' % (epoch, step, loss.item()))
                print()

        check_train_performance(loader_train, model)
        # Adjust the learning rate once the epoch is finished.
        if scheduler is not None:
            scheduler.step()

In [12]:
# Initialisation of the model and train.
# Seed PyTorch's random number generator before the model is built so the
# initial weights are the same on every Restart & Run All.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
SEED = 0
torch.manual_seed(SEED)

baseline_model = FFNN()
print(baseline_model)
optimizer = optim.Adam(baseline_model.parameters())
# StepLR scales the learning rate by its default gamma every 10 scheduler
# steps; train_part steps the scheduler once per epoch.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10)
train_part(baseline_model, optimizer, scheduler, epochs=20)

FFNN(
  (fc1): Linear(in_features=1024, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=1, bias=True)
)
Epoch: 0, Iteration 0, loss = 0.8997

Epoch: 0, Iteration 50, loss = 0.9164

Epoch: 0, Iteration 100, loss = 0.7175

Epoch: 0, Iteration 150, loss = 0.7734

Epoch: 0, Iteration 200, loss = 0.7546

RMSE: 0.8531191006600798 Pearson 0.42970294767892236

Epoch: 1, Iteration 0, loss = 0.8022

Epoch: 1, Iteration 50, loss = 0.7442

Epoch: 1, Iteration 100, loss = 0.5506

Epoch: 1, Iteration 150, loss = 0.6270

Epoch: 1, Iteration 200, loss = 0.8567

RMSE: 0.7913576009851085 Pearson 0.5512048578689073

Epoch: 2, Iteration 0, loss = 0.8291

Epoch: 2, Iteration 50, loss = 0.7209

Epoch: 2, Iteration 100, loss = 0.4152

Epoch: 2, Iteration 150, loss = 0.4559

Epoch: 2, Iteration 200, lo

### Validation Performance

In [13]:
def check_performance(loader, model):
    """
    Print the RMSE and Pearson correlation of the model on the data in `loader`.

    The reference scores are taken from the batches that `loader` yields
    rather than from a module-level array, so the metrics are correct for any
    loader and any batch size.

    Args: 
        loader: the dataloader which contains the validation data, yielding
            (features, score) batches.
        model: the trained model to evaluate; it is switched to evaluation
            mode.

    Raises:
        ValueError: if the loader yields no batches.
    
    """
    predictions = []
    real_scores = []
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)

            score = model(x)

            # Flatten each batch to 1-D so batches of different sizes can be
            # joined afterwards.
            predictions.append(score.detach().cpu().numpy().reshape(-1))
            real_scores.append(y.detach().cpu().numpy().reshape(-1))

    if not predictions:
        raise ValueError('loader yielded no batches to evaluate')

    predictions = np.concatenate(predictions)
    real_scores = np.concatenate(real_scores).astype(float)

    pearson = pearsonr(real_scores, predictions)
    print(f'RMSE: {rmse(predictions, real_scores)} Pearson {pearson[0]}')
    print()


# Report RMSE and Pearson correlation of the trained baseline on the
# validation loader.
check_performance(loader_val, baseline_model)

RMSE: 0.9572063781921932 Pearson 0.2430570430092266

