# NLP Coursework

## Dowdload and Import Libraries

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

from scipy.stats.stats import pearsonr

## Import Data

In [0]:
def read_data(path):
    """
    Read data from the data path.

    Args: 
        path: the path of the dataset, normally in local folder.
    
    Returns:
        Loaded raw dataset. 
    """
    with open(path) as dataset:
        raw_data = dataset.readlines()

    return raw_data


# Define the path of the train dataset
english_train_path = "train.enzh.src"
chinese_train_path = "train.enzh.mt"   
scores_train_path = "train.enzh.scores"
# Define the path of the validatin dataset
english_validation_path = "dev.enzh.src"
chinese_validation_path = "dev.enzh.mt"   
scores_validation_path = "dev.enzh.scores"
# Define the path of the test dataset
english_test_path = "test.enzh.src"
chinese_test_path = "test.enzh.mt"


# Read train, validation, test data
raw_english_train = read_data(english_train_path)
raw_chinese_train = read_data(chinese_train_path)
raw_english_validation = read_data(english_validation_path)
raw_chinese_validation = read_data(chinese_validation_path)
raw_english_test = read_data(english_test_path)
raw_chinese_test = read_data(chinese_test_path)

# read scores for train and validation dataset 
score_train = read_data(scores_train_path)
score_validation = read_data(scores_validation_path)

## Pre-processing 

### English

Download and Import:

In [3]:
import spacy
from nltk import download
from nltk.corpus import stopwords

# RUN ONCE

# Downloading spacy models for english
!spacy download en_core_web_md
!spacy link en_core_web_md en300

# downloading stopwords from the nltk package
download('stopwords') # stopwords dictionary

Collecting en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K     |████████████████████████████████| 95.4MB 799kB/s 
[?25hBuilding wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.1.0-cp36-none-any.whl size=97126236 sha256=f882d519ae8fd9f841b53ffb3ff7cdd511c0cc9ed9709033b243aff4cf261397
  Stored in directory: /tmp/pip-ephem-wheel-cache-io9i_mdy/wheels/c1/2c/5f/fd7f3ec336bf97b0809c86264d2831c5dfb00fc2e239d1bb01
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_md -->
/usr/local/lib/py

True

In [0]:
# tokenizer model
nlp_en = spacy.load('en300')

stop_words_en = set(stopwords.words('english'))


def preprocess_en(raw_corpus):
    """
    Method to preprocesss raw English corpus, including lower casing, stop word 
    removal, etc.

    Args: 
        raw_corpus: the raw dataset needs to be processed.
    
    Returns:
        The processed corpus.
    """
    preprocessed_corpus = []
    for sentence in raw_corpus:
        text = sentence.lower()
        doc = [token.lemma_ for token in  nlp_en.tokenizer(text)]
        doc = [word for word in doc if word not in stop_words_en]
        doc = [word for word in doc if word.isalpha()] # restricts string to alphabetic characters only
        preprocessed_corpus.append(" ".join(doc))
    return preprocessed_corpus


# Preprocess the train, validation, test dataset.
preprocessed_english_train = preprocess_en(raw_english_train)
preprocessed_english_validation = preprocess_en(raw_english_validation)
preprocessed_english_test = preprocess_en(raw_english_test)

### Chinese

Download and Import:

In [5]:
# Download the package used to process Chinese
!wget -c https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt

import jieba

--2020-02-28 13:22:05--  https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt
Resolving github.com (github.com)... 192.30.253.112
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘chinese_stop_words.txt’

chinese_stop_words.     [   <=>              ] 419.57K   471KB/s    in 0.9s    

2020-02-28 13:22:07 (471 KB/s) - ‘chinese_stop_words.txt’ saved [429642]



In [6]:
stop_words = [ line.rstrip() for line in open('./chinese_stop_words.txt',"r", encoding="utf-8") ]


def processing_zh(raw_corpus):
    """
    Method to preprocesss Chinese corpus, tokenization, stop word removal, etc.

    Args: 
        raw_corpus: the raw dataset needs to be processed.
    
    Returns:
        The processed corpus.
    """
    preprocessed_corpus = []
    for sentence in raw_corpus:
        # seg_list = jieba.lcut(sentence,cut_all=True) # full mode
        seg_list = jieba.lcut(sentence) # precise mode
        doc = [word for word in seg_list if word not in stop_words]
        docs = [e for e in doc if e.isalnum()]
        preprocessed_corpus.append(" ".join(docs))
    return preprocessed_corpus

# Preprocess the train, validation, test dataset.
preprocessed_chinese_train = processing_zh(raw_chinese_train)
preprocessed_chinese_validation = processing_zh(raw_chinese_validation)
preprocessed_chinese_test = processing_zh(raw_chinese_test)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.912 seconds.
Prefix dict has been built successfully.


## Word Embedding - Word2Vec and GloVe

English:

In [7]:
import torchtext

glove = torchtext.vocab.GloVe(name='6B', dim=100)

def get_word_vector(word):
    """
    Method to get word vector from glove model.

    Args: 
        word: the original word.
    
    Returns:
        The word vector.
    """
    try:
        vec = glove.vectors[glove.stoi[word]]
        if vec is not None:
            return vec
    except KeyError:
      # print(f"Word {word} does not exist")
        return torch.zeros((100))

def get_sentence_vector(line):
    """
    Method to get sentence vector for each line.

    Args: 
        line: the original line.
    
    Returns:
        The sentence vector.
    """

    vectors = []
    for w in line:
        emb = get_word_vector(w)
        # do not add if the word is out of vocabulary
        if emb is not None:
            vectors.append(emb)
    
    return np.stack(vectors)


def get_embeddings_en(corpus):
    """
    Method to get English embedding for the corpus.

    Args: 
        corpus: the original line.
    
    Returns:
        The sentence vector.
    """
    sentences_vectors =[]
    for l in corpus:
        try:
            vec = get_sentence_vector(l)
            sentences_vectors.append(vec)
        except:
            sentences_vectors.append(0)

    return sentences_vectors

.vector_cache/glove.6B.zip: 862MB [06:31, 2.20MB/s]                           
100%|█████████▉| 398893/400000 [00:22<00:00, 18721.21it/s]

Chinese:

In [8]:
!wget -O zh.zip http://vectors.nlpl.eu/repository/20/35.zip
!unzip zh.zip

--2020-02-28 13:29:54--  http://vectors.nlpl.eu/repository/20/35.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1458485917 (1.4G) [application/zip]
Saving to: ‘zh.zip’

zh.zip                5%[>                   ]  73.57M  9.36MB/s    eta 4m 6s  

100%|█████████▉| 398893/400000 [00:40<00:00, 18721.21it/s]


2020-02-28 13:32:30 (9.02 MB/s) - ‘zh.zip’ saved [1458485917/1458485917]

Archive:  zh.zip
  inflating: LIST                    
  inflating: meta.json               
  inflating: model.bin               
  inflating: model.txt               
  inflating: README                  


In [9]:
import gensim 
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

wv_from_bin = KeyedVectors.load_word2vec_format("model.bin", binary=True) 

def get_sentence_vector_zh(line):
    """
    Method to get sentence vector for each line.

    Args: 
        line: the original line.
    
    Returns:
        The sentence vector.
    """
    vectors = []
    for w in line:
        try:
            emb = wv_from_bin[w]
            vectors.append(emb)
        except:
            emb = np.zeros(100)
            vectors.append(emb)
    if vectors:
        return np.stack(vectors)
    else:
        return np.zeros((100,))


def get_embeddings_zh(corpus):
    """
    Method to get English embedding for the corpus.

    Args: 
        corpus: the original line.
    
    Returns:
        The sentence vector.
    """
    sentences_vectors =[]
    for l in corpus:
        vec = get_sentence_vector_zh(l)
        if vec is not None:
            sentences_vectors.append(vec)
        else:
            print(l)
    return sentences_vectors

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Convert data into word embedding.
word_embeddings_train_en = get_embeddings_en(preprocessed_english_train) 
word_embeddings_train_zh = get_embeddings_zh(preprocessed_chinese_train) 

word_embeddings_val_en = get_embeddings_en(preprocessed_english_validation)
word_embeddings_val_zh = get_embeddings_zh(preprocessed_chinese_validation)

score_train = np.asarray(score_train).astype(float)
score_validation = np.asarray(score_validation).astype(float)

## Model: Recurrent Neural Network with LSTM

LSTM takes in word embeddings(Word2Vec and GloVe) to capture the order within a sentnce sequence.

In [0]:
# Feed data into data loader
lstm_train = []
lstm_val = []
for i in range(len(word_embeddings_train_en)):
    lstm_train.append([word_embeddings_train_en[i], word_embeddings_train_zh[i], score_train[i]])

for i in range(len(word_embeddings_val_en)):
    lstm_val.append([word_embeddings_val_en[i], word_embeddings_val_zh[i], score_validation[i]])

batch_size = 32
loader_train = torch.utils.data.DataLoader(lstm_train, batch_size=batch_size)
loader_val = torch.utils.data.DataLoader(lstm_val)

In [12]:
USE_GPU = True

dtype = torch.float32 # we will be using float throughout this tutorial

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Constant to control how frequently we print train loss
print_every = 50

print('using device:', device)

using device: cuda:0


In [0]:
from torch.autograd import Variable
class LSTM(nn.Module):
    """
    Recurrent Neural Network with LSTM.
    2 LSTM Layers 
    1 fully connected neural network
    1 output layer

    Attributes:
        lstm_en: hidden lstm layer for English
        lstm_cn: hidden lstm layer for Chinese
        fc1: fully connected layer for regression
        out: the output of the model

    """
    def __init__(self):
        super(LSTM, self).__init__()

        self.lstm_en = nn.LSTM(100, 32, batch_first=True)
        self.lstm_cn = nn.LSTM(100, 32, batch_first=True)
        self.fc1 = nn.Linear(64, 32)
        self.out = nn.Linear(32, 1)

    def forward(self, cn, en):
        lstm_out_en, _ = self.lstm_en(en.view(1,-1,100))
        lstm_out_cn, _ = self.lstm_cn(cn.view(1,-1,100))

        fc_en = lstm_out_en[:,-1,:]
        fc_cn = lstm_out_cn[:,-1,:]
        fc_input = torch.cat((fc_en,fc_cn),1)

        f1 = self.fc1(fc_input)
        out = self.out(f1)

        return out


### Train

In [0]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

def check_train_performance(model):
    """
    Method to check the performance of the train process.

    Args: 
        loader: the dataloder which contains the train data 
        model: the model to be trained.
    
    """
    predictions = []
    model.eval()
    with torch.no_grad():
        for i in range(len(word_embeddings_train_en)):
            x = torch.tensor(word_embeddings_train_en[i])
            y = torch.tensor(word_embeddings_train_zh[i])
            z = torch.tensor(score_train[i])
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype)
            z = z.to(device=device, dtype=torch.float)

            score = model(x, y)
            

            predictions.append(score.cpu().detach().numpy())

    predictions = np.asarray([i for item in predictions for i in item]).squeeze(1)
    
    pearson = pearsonr(score_train, predictions)
    print(f'RMSE: {rmse(predictions, score_train)} Pearson {pearson[0]}')
    print()

def train_part(model, optimizer, scheduler=None, epochs=1):
    """
    Method to train the model.

    Args: 
        model: the model to be trained.
        optimizer: the optimizer used for optimisation.
        scheduler: the scheduler used in this training process
        epochs: the number of epochs , default is 1
    
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        #for t, (x, y) in enumerate(loader_train):
        for i in range(len(word_embeddings_train_en)):
            x = torch.tensor(word_embeddings_train_en[i])
            y = torch.tensor(word_embeddings_train_zh[i])
            z = torch.tensor(score_train[i])
            model.train()  # put model to training mode
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype)
            z = z.to(device=device, dtype=torch.float)


            scores = model(x, y)

            scores = scores.squeeze(1)

            loss = F.mse_loss(scores, z)

            optimizer.zero_grad()

            loss.backward()

            optimizer.step()

            #if i % print_every == 0:
               # print('Epoch: %d, Iteration %d, loss = %.4f' % (e, i, loss.item()))
                #check_accuracy(loader_val, model)
               # print()
        check_train_performance(model)
        # Adjust the learning rate
        if scheduler is not None:
            scheduler.step()

In [15]:
lstm_model = LSTM()
print(lstm_model)
optimizer = optim.Adam(lstm_model.parameters())
scheduler = optim.lr_scheduler.StepLR(optimizer, 10)
train_part(lstm_model, optimizer, scheduler, epochs=20)

LSTM(
  (lstm_en): LSTM(100, 32, batch_first=True)
  (lstm_cn): LSTM(100, 32, batch_first=True)
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (out): Linear(in_features=32, out_features=1, bias=True)
)




RMSE: 0.9057440071183247 Pearson 0.2521790764573601

RMSE: 0.9000784628598647 Pearson 0.2847856949002749

RMSE: 0.897998146600625 Pearson 0.30976558223653033

RMSE: 0.8863858058103699 Pearson 0.3505853426084062

RMSE: 0.8713254110486406 Pearson 0.39863721281214903

RMSE: 0.8530323569733004 Pearson 0.4333885149137533

RMSE: 0.8404054320514607 Pearson 0.468044271124997

RMSE: 0.8336450881728089 Pearson 0.48850841347079493

RMSE: 0.8276255400554453 Pearson 0.5101086370945245

RMSE: 0.8176153942151702 Pearson 0.5306453914350168

RMSE: 0.724504688302701 Pearson 0.6268259808005847

RMSE: 0.710854920789603 Pearson 0.644266663610362

RMSE: 0.7001807537072474 Pearson 0.6573975606943294

RMSE: 0.6906660895339831 Pearson 0.6688136545528136

RMSE: 0.6817020411934557 Pearson 0.6793101024854478

RMSE: 0.6730130078652536 Pearson 0.6892299347939951

RMSE: 0.6644910433519302 Pearson 0.6987170443552866

RMSE: 0.6560631482526605 Pearson 0.7078689058337039

RMSE: 0.6476640637026668 Pearson 0.7167651787086

### Validation Performance

In [16]:
def val_performance_check(model):
    """
    Method to check the performance of the validation set.

    Args: 
        model: the model after training.
    
    """
    predictions = []
    model.eval()
    with torch.no_grad():
        for i in range(len(word_embeddings_val_en)):
            x = torch.tensor(word_embeddings_val_en[i])
            y = torch.tensor(word_embeddings_val_zh[i])
            z = torch.tensor(score_validation[i])
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype)
            z = z.to(device=device, dtype=torch.float)

            score = model(x, y)
            

            predictions.append(score.cpu().detach().numpy())

    predictions = np.asarray([i for item in predictions for i in item]).squeeze(1)
    
    pearson = pearsonr(score_validation, predictions)
    print(f'RMSE: {rmse(predictions, score_validation)} Pearson {pearson[0]}')
    print()

val_performance_check(lstm_model)

RMSE: 1.037653168591964 Pearson 0.15092400248407953

