# NLP Coursework

## Dowdload and Import Libraries

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

from scipy.stats.stats import pearsonr

## Import Data

In [0]:
def read_data(path):
    """
    Read data from the data path.

    Args: 
        path: the path of the dataset, normally in local folder.
    
    Returns:
        Loaded raw dataset. 
    """
    with open(path) as dataset:
        raw_data = dataset.readlines()

    return raw_data


# Define the path of the train dataset
english_train_path = "train.enzh.src"
chinese_train_path = "train.enzh.mt"   
scores_train_path = "train.enzh.scores"
# Define the path of the validatin dataset
english_validation_path = "dev.enzh.src"
chinese_validation_path = "dev.enzh.mt"   
scores_validation_path = "dev.enzh.scores"
# Define the path of the test dataset
english_test_path = "test.enzh.src"
chinese_test_path = "test.enzh.mt"


# Read train, validation, test data
raw_english_train = read_data(english_train_path)
raw_chinese_train = read_data(chinese_train_path)
raw_english_validation = read_data(english_validation_path)
raw_chinese_validation = read_data(chinese_validation_path)
raw_english_test = read_data(english_test_path)
raw_chinese_test = read_data(chinese_test_path)

# read scores for train and validation dataset 
score_train = read_data(scores_train_path)
score_validation = read_data(scores_validation_path)


## Pre-processing 

### English

Download and Import:

In [0]:
import spacy
from nltk import download
from nltk.corpus import stopwords

# RUN ONCE

# Downloading spacy models for english
!spacy download en_core_web_md
!spacy link en_core_web_md en300

# downloading stopwords from the nltk package
download('stopwords') # stopwords dictionary

Collecting en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K     |████████████████████████████████| 95.4MB 1.3MB/s 
[?25hBuilding wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.1.0-cp36-none-any.whl size=97126236 sha256=148c3e7bf235453bc92d415fdd04af8445434683009132b3f04b49cca3b9a17e
  Stored in directory: /tmp/pip-ephem-wheel-cache-ccx2r4ja/wheels/c1/2c/5f/fd7f3ec336bf97b0809c86264d2831c5dfb00fc2e239d1bb01
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_md -->
/usr/local/lib/py

True

In [0]:
# tokenizer model
nlp_en = spacy.load('en300')

stop_words_en = set(stopwords.words('english'))


def preprocess_en(raw_corpus):
    """
    Method to preprocesss raw English corpus, including lower casing, stop word 
    removal, etc.

    Args: 
        raw_corpus: the raw dataset needs to be processed.
    
    Returns:
        The processed corpus.
    """
    preprocessed_corpus = []
    for sentence in raw_corpus:
        text = sentence.lower()
        doc = [token.lemma_ for token in  nlp_en.tokenizer(text)]
        doc = [word for word in doc if word not in stop_words_en]
        doc = [word for word in doc if word.isalpha()] # restricts string to alphabetic characters only
        preprocessed_corpus.append(" ".join(doc))
    return preprocessed_corpus


# Preprocess the train, validation, test dataset.
preprocessed_english_train = preprocess_en(raw_english_train)
preprocessed_english_validation = preprocess_en(raw_english_validation)
preprocessed_english_test = preprocess_en(raw_english_test)

### Chinese

Download and Import:

In [0]:
# Download the package used to process Chinese
!wget -c https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt

import jieba

--2020-02-27 21:49:04--  https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt
Resolving github.com (github.com)... 13.250.177.223
Connecting to github.com (github.com)|13.250.177.223|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘chinese_stop_words.txt’

chinese_stop_words.     [<=>                 ]       0  --.-KB/s               chinese_stop_words.     [ <=>                ] 417.17K  --.-KB/s    in 0.03s   

2020-02-27 21:49:05 (15.3 MB/s) - ‘chinese_stop_words.txt’ saved [427178]



In [0]:
stop_words = [ line.rstrip() for line in open('./chinese_stop_words.txt',"r", encoding="utf-8") ]


def processing_zh(raw_corpus):
    """
    Method to preprocesss Chinese corpus, tokenization, stop word removal, etc.

    Args: 
        raw_corpus: the raw dataset needs to be processed.
    
    Returns:
        The processed corpus.
    """
    preprocessed_corpus = []
    for sentence in raw_corpus:
        # seg_list = jieba.lcut(sentence,cut_all=True) # full mode
        seg_list = jieba.lcut(sentence) # precise mode
        doc = [word for word in seg_list if word not in stop_words]
        docs = [e for e in doc if e.isalnum()]
        preprocessed_corpus.append(" ".join(docs))
    return preprocessed_corpus

# Preprocess the train, validation, test dataset.
preprocessed_chinese_train = processing_zh(raw_chinese_train)
preprocessed_chinese_validation = processing_zh(raw_chinese_validation)
preprocessed_chinese_test = processing_zh(raw_chinese_test)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.798 seconds.
Prefix dict has been built successfully.


## Word Embedding - Bert

English (preprocessed corpus):

In [0]:
# download pretrained BERT model cased_L-12_H-768_A-12
!wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip
!unzip cased_L-12_H-768_A-12.zip

!pip install bert-serving-server
!pip install bert-serving-client
!nohup bert-serving-start -model_dir=./cased_L-12_H-768_A-12 > out.file 2>&1 &

--2020-02-27 21:49:32--  https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.24.128, 2404:6800:4003:c00::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.24.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 404261442 (386M) [application/zip]
Saving to: ‘cased_L-12_H-768_A-12.zip’


2020-02-27 21:49:42 (38.9 MB/s) - ‘cased_L-12_H-768_A-12.zip’ saved [404261442/404261442]

Archive:  cased_L-12_H-768_A-12.zip
   creating: cased_L-12_H-768_A-12/
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: cased_L-12_H-768_A-12/vocab.txt  
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: cased_L-12_H-768_A-12/bert_config.json  
Collecting bert-serving-server
[?25l  Downloading https://files.pythonhosted.org/packages/b0/bd/cab677bbd0c5fb08b72

Collecting bert-serving-client
  Downloading https://files.pythonhosted.org/packages/1f/09/aae1405378a848b2e87769ad89a43d6d71978c4e15534ca48e82e723a72f/bert_serving_client-1.10.0-py2.py3-none-any.whl
Installing collected packages: bert-serving-client
Successfully installed bert-serving-client-1.10.0


In [0]:
from bert_serving.client import BertClient
bc_en = BertClient(show_server_config=False)
# encode English preprocessed corpus by word
english_validation_wbert = []
for i in range(len(preprocessed_english_validation)):
  sentence = preprocessed_english_validation[i].split()
  word_embedding = bc_en.encode(sentence)
  english_validation_wbert.append(word_embedding)
english_validation_wbert_np = np.array(english_validation_wbert)
np.save('english_validation_wbert.npy', english_validation_wbert_np)

english_train_wbert = []
for i in range(len(preprocessed_english_train)):
  sentence = preprocessed_english_train[i].split()
  word_embedding = bc_en.encode(sentence)
  english_train_wbert.append(word_embedding)
english_train_wbert_np = np.array(english_train_wbert)
np.save('english_train_wbert.npy', english_train_wbert_np)

english_test_wbert = []
for i in range(len(preprocessed_english_test)):
  sentence = preprocessed_english_test[i].split()
  word_embedding = bc_en.encode(sentence)
  english_test_wbert.append(word_embedding)
english_test_wbert_np = np.array(english_test_wbert)
np.save('english_test_wbert.npy', english_test_wbert_np)

bc_en.close()

Chinese (preprocessed corpus):

In [0]:
# download pretrained BERT model chinese_L-12_H-768_A-12
!wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
!unzip chinese_L-12_H-768_A-12.zip

!pip install bert-serving-server
!pip install bert-serving-client
!nohup bert-serving-start -model_dir=./chinese_L-12_H-768_A-12 > out.file 2>&1 &

--2020-02-27 21:53:39--  https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.68.128, 2404:6800:4003:c03::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.68.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 381892918 (364M) [application/zip]
Saving to: ‘chinese_L-12_H-768_A-12.zip’


2020-02-27 21:53:55 (26.0 MB/s) - ‘chinese_L-12_H-768_A-12.zip’ saved [381892918/381892918]

Archive:  chinese_L-12_H-768_A-12.zip
   creating: chinese_L-12_H-768_A-12/
  inflating: chinese_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: chinese_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: chinese_L-12_H-768_A-12/vocab.txt  
  inflating: chinese_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: chinese_L-12_H-768_A-12/bert_config.json  


In [0]:
from bert_serving.client import BertClient
bc_ch = BertClient(show_server_config=False)
# encode Chinese preprocessed corpus by word
chinese_validation_wbert = []
for i in range(len(preprocessed_chinese_validation)):
  sentence = preprocessed_chinese_validation[i].split()
  word_embedding = bc_ch.encode(sentence)
  chinese_validation_wbert.append(word_embedding)
chinese_validation_wbert_np = np.array(chinese_validation_wbert)
np.save('chinese_validation_wbert.npy', chinese_validation_wbert_np)

chinese_train_wbert = []
for i in range(len(preprocessed_chinese_train)):
  sentence = preprocessed_chinese_train[i].split()
  word_embedding = bc_ch.encode(sentence)
  chinese_train_wbert.append(word_embedding)
chinese_train_wbert_np = np.array(chinese_train_wbert)
np.save('chinese_train_wbert.npy', chinese_train_wbert_np)

chinese_test_wbert = []
for i in range(len(preprocessed_chinese_test)):
  sentence = preprocessed_chinese_test[i].split()
  word_embedding = bc_ch.encode(sentence)
  chinese_test_wbert.append(word_embedding)
chinese_test_wbert_np = np.array(chinese_test_wbert)
np.save('chinese_test_wbert.npy', chinese_test_wbert_np)

bc_ch.close()

## Model: LSTM (BERT word embedding)

In [0]:
from google.colab import drive
drive.mount('/content/drive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!ls "/content/drive/My Drive/BERT word embedding"
english_train_bert = np.load('/content/drive/My Drive/BERT word embedding/english_train_wbert.npy', allow_pickle=True)
chinese_train_bert = np.load('/content/drive/My Drive/BERT word embedding/chinese_train_wbert.npy', allow_pickle=True)
english_val_bert = np.load('/content/drive/My Drive/BERT word embedding/english_validation_wbert.npy', allow_pickle=True)
chinese_val_bert = np.load('/content/drive/My Drive/BERT word embedding/chinese_validation_wbert.npy', allow_pickle=True)
chinese_test_bert = np.load('/content/drive/My Drive/BERT word embedding/chinese_test_wbert.npy', allow_pickle=True)
english_test_bert = np.load('/content/drive/My Drive/BERT word embedding/english_test_wbert.npy', allow_pickle=True)


ls: cannot access '/content/drive/My Drive/BERT word embedding': No such file or directory


FileNotFoundError: ignored

In [0]:
class LSTM_with_BERT(nn.Module):
    """
    Recurrent Neural Network with LSTM.
    2 LSTM Layers 
    4 fully connected neural network
    1 output layer

    Attributes:
        lstm_en: hidden lstm layer for English
        lstm_cn: hidden lstm layer for Chinese
        fc1: FNN layer 1
        fc2: FNN layer 2
        fc3: FNN layer 3
        out: the output of the model

    """
    def __init__(self):
        super(LSTM_with_BERT, self).__init__()

        self.lstm_en = nn.LSTM(768, 256, batch_first=True)
        self.lstm_cn = nn.LSTM(768, 256, batch_first=True)
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.out = nn.Linear(32, 1)

    def forward(self, cn, en):
        lstm_out_en, _ = self.lstm_en(en.view(1,-1,768))
        lstm_out_cn, _ = self.lstm_cn(cn.view(1,-1,768))

        fc_en = lstm_out_en[:,-1,:]
        fc_cn = lstm_out_cn[:,-1,:]
        fc_input = torch.cat((fc_en,fc_cn),1)

        f1 = F.relu(self.fc1(fc_input))
        f1 = F.relu(self.fc2(f1))
        f1 = F.relu(self.fc3(f1))
        f1 = F.relu(self.fc4(f1))
        out = self.out(f1)

        return out

### Train

In [0]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

def check_train_performance(model):
    predictions = []
    model.eval()
    with torch.no_grad():
        for i in range(len(english_train_bert)):
            x = torch.tensor(english_train_bert[i])
            y = torch.tensor(chinese_train_bert[i])
            z = torch.tensor(score_train[i])
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype)
            z = z.to(device=device, dtype=torch.float)

            score = model(x, y)
            

            predictions.append(score.cpu().detach().numpy())

    predictions = np.asarray([i for item in predictions for i in item]).squeeze(1)
    
    pearson = pearsonr(score_train, predictions)
    print(f'RMSE: {rmse(predictions, score_train)} Pearson {pearson[0]}')
    print()

def train_part(model, optimizer, scheduler=None, epochs=1):
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        #for t, (x, y) in enumerate(loader_train):
        for i in range(len(english_train_bert)):
            x = torch.tensor(english_train_bert[i])
            y = torch.tensor(chinese_train_bert[i])
            z = torch.tensor(score_train[i])
            model.train()  # put model to training mode
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype)
            z = z.to(device=device, dtype=torch.float)


            scores = model(x, y)

            scores = scores.squeeze(1)

            loss = F.mse_loss(scores, z)

            optimizer.zero_grad()

            loss.backward()

            optimizer.step()

            #if i % print_every == 0:
               # print('Epoch: %d, Iteration %d, loss = %.4f' % (e, i, loss.item()))
                #check_accuracy(loader_val, model)
               # print()
        check_train_performance(model)
        # Adjust the learning rate
        if scheduler is not None:
            scheduler.step()

In [0]:
lstm_bert_model = LSTM_with_BERT()
print(lstm_bert_model)
optimizer = optim.Adam(lstm_bert_model.parameters())
scheduler = optim.lr_scheduler.StepLR(optimizer, 10)
train_part(lstm_bert_model, optimizer, scheduler, epochs=20)

### Validation Performance

In [0]:
def val_performance_check(model):
    predictions = []
    model.eval()
    with torch.no_grad():
        for i in range(len(english_val_bert)):
            x = torch.tensor(english_val_bert[i])
            y = torch.tensor(chinese_val_bert[i])
            z = torch.tensor(score_validation[i])
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype)
            z = z.to(device=device, dtype=torch.float)

            score = model(x, y)
            

            predictions.append(score.cpu().detach().numpy())

    predictions = np.asarray([i for item in predictions for i in item]).squeeze(1)
    
    pearson = pearsonr(score_validation, predictions)
    print(f'RMSE: {rmse(predictions, score_validation)} Pearson {pearson[0]}')
    print()

val_performance_check(lstm_bert_model)