In [1]:
import pandas as pd
import torch
import numpy as np

# **Dataset Build**

In [2]:
with open('/content/drive/MyDrive/research task/train.txt', encoding='UTF-8') as f:
  train_contents = f.read()


with open('/content/drive/MyDrive/research task/test.txt', encoding='UTF-8') as f:
  test_contents = f.read()

In [3]:
def get_dataset_from_contents(contents):
  result = []
  start = 0
  while start < len(contents):
    
    idx_source_end = contents.find('\n', start+1)
    idx_reference_end = contents.find('\n', idx_source_end+1)
    idx_candidate_end = contents.find('\n', idx_reference_end+1)
    idx_bleu_score_end = contents.find('\n', idx_candidate_end+1)
    idx_label_end = contents.find('\n', idx_bleu_score_end+1)
    
    source = contents[start:idx_source_end].strip('\n')
    reference = contents[idx_source_end:idx_reference_end].strip('\n')
    candidate = contents[idx_reference_end:idx_candidate_end].strip('\n')
    bleu_score = contents[idx_candidate_end:idx_bleu_score_end].strip('\n')
    label = contents[idx_bleu_score_end:idx_label_end].strip('\n')

    start = idx_label_end + len('\n')
    
    result.append((source, reference, candidate, bleu_score, label))

  return result

In [4]:
train_read_dataset = get_dataset_from_contents(train_contents)
test_read_dataset = get_dataset_from_contents(test_contents)

# **Dataset Exploration**

In [5]:
def get_dataset_statistics(dataset):
  max_len_source = 0
  max_len_reference = 0
  max_len_candidate = 0

  total_len_source = 0
  total_len_reference = 0
  total_len_candidate = 0
  total_machine_candidate = 0
  total_human_candidate = 0
  for (source, reference, candidate, bleu_score, label) in dataset:
    max_len_source = max(max_len_source, len(source))
    max_len_reference = max(max_len_reference, len(reference))
    max_len_candidate = max(max_len_candidate, len(candidate))

    total_len_source += len(source)
    total_len_reference += len(reference)
    total_len_candidate += len(candidate)
    total_machine_candidate += (label == 'M')
    total_human_candidate += (label == 'H')

  return max_len_source, max_len_reference, max_len_candidate, total_len_source/len(dataset), total_len_reference/len(dataset), total_len_candidate/len(dataset), total_machine_candidate, total_human_candidate

In [6]:
max_len_source, max_len_reference, max_len_candidate, av_len_source, av_len_reference, av_len_candidate, total_machine_candidate, total_human_candidate = get_dataset_statistics(train_read_dataset+test_read_dataset)
print('Max length of source:', max_len_source)
print('Max length of reference:', max_len_reference)
print('Max length of candidate:', max_len_candidate)
print('Average length of source:', av_len_source)
print('Average length of reference:', av_len_reference)
print('Average length of candidate:', av_len_candidate)
print('Total machine candidates:', total_machine_candidate)
print('Total human candidates:', total_human_candidate)

Max length of source: 238
Max length of reference: 637
Max length of candidate: 634
Average length of source: 76.10554089709763
Average length of reference: 174.7467018469657
Average length of candidate: 168.08839050131925
Total machine candidates: 354
Total human candidates: 404


On average, source sentences (in Chinese) are smaller in length than reference and candidate sentences (in English). 

47% of candidates are machine candidates and 53% are human candidates

# Model development

In [7]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download zh_core_web_sm
!python -m spacy download en_core_web_sm

2021-02-07 01:08:07.004315: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
Collecting zh-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.0.0/zh_core_web_sm-3.0.0-py3-none-any.whl (49.5 MB)
[K     |████████████████████████████████| 49.5 MB 48 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_sm')
2021-02-07 01:08:24.710141: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 101 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [8]:
import nltk
from collections import Counter
import spacy

nlp = spacy.load("en_core_web_sm")
chinese_nlp = spacy.load("zh_core_web_sm")
import json
import math
import os
from pathlib import Path
import random
import time
from tqdm.notebook import tqdm, trange
from typing import Dict, List, Set, Tuple

import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from tqdm.notebook import tqdm, trange
from time import time
from torch import autograd

In [9]:
get_device = lambda : "cuda:0" if torch.cuda.is_available() else "cpu"

In [10]:
def get_one_hot(one_loc, vector_size):
  vec = [0] * vector_size
  vec[one_loc] = 1
  return vec  

In [11]:
POS_TAGS_DICT = {'.': 0, ',': 1, '-LRB-': 2, '-RRB-': 3, '``': 4, '""': 5, "''": 6, '$': 7, '#': 8, 'AFX': 9, 'CC': 10, 'CD': 11, 'DT': 12, 'EX': 13, 'FW': 14, 'HYPH': 15, 'IN': 16, 'JJ': 17, 'JJR': 18, 'JJS': 19, 'LS': 20, 'MD': 21, 'NIL': 22, 'NN': 23, 'NNP': 24, 'NNPS': 25, 'NNS': 26, 'PDT': 27, 'POS': 28, 'PRP': 29, 'PRP$': 30, 'RB': 31, 'RBR': 32, 'RBS': 33, 'RP': 34, 'SP': 35, 'SYM': 36, 'TO': 37, 'UH': 38, 'VB': 39, 'VBD': 40, 'VBG': 41, 'VBN': 42, 'VBP': 43, 'VBZ': 44, 'WDT': 45, 'WP': 46, 'WP$': 47, 'WRB': 48, 'ADD': 49, 'NFP': 50, 'GW': 51, 'XX': 52, 'BES': 53, 'HVS': 54, '_SP': 55}

In [13]:
NER_TAGS_DICT = {'PERSON': 0, 'NORP': 1, 'FAC': 2, 'ORG': 3, 'GPE': 4, 'LOC': 5, 'PRODUCT': 6, 'EVENT': 7, 'WORK_OF_ART': 8, 'LAW': 9, 'LANGUAGE': 10, 'DATE': 11, 'TIME': 12, 'PERCENT': 13, 'MONEY': 14, 'QUANTITY': 15, 'ORDINAL': 16, 'CARDINAL': 17, '':18}

In [16]:
PAD = "PAD_TOK"

In [21]:
SOURCE_INPUT_SEQUENCE_LENGTH = 240
CAND_REF_INPUT_SEQUENCE_LENGTH = 640
INPUT_SIZE = 173 #96 (word embeddings) + 20 (one-hot NER_TAG) + 57 (one-hot POS_TAG)
PAD_VAL = 0.0

In [22]:
def get_input_representation(list_of_words, language):
  '''input is nlp(text)'''
  result = []
  
  for token in list_of_words:
    ner_vec = get_one_hot(NER_TAGS_DICT.get(token.ent_type_,len(NER_TAGS_DICT)),
                          len(NER_TAGS_DICT)+1)
    pos_vec = get_one_hot(POS_TAGS_DICT.get(token.tag_,len(POS_TAGS_DICT)),
                          len(POS_TAGS_DICT)+1)
    
    result.append(token.vector.tolist() + ner_vec + pos_vec)

  if language == 'Chinese':
    max_len = SOURCE_INPUT_SEQUENCE_LENGTH
  else: #language='English'
    max_len = CAND_REF_INPUT_SEQUENCE_LENGTH
  
  result += [[PAD_VAL]*INPUT_SIZE]*(max_len - len(result))

  return result


In [25]:
class Mt_Classification_Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):

        source, reference, candidate, bleu_score, label = self.data[index]

        source_input_representation = torch.tensor(get_input_representation(chinese_nlp(source), 'Chinese'))

        reference_input_representation = torch.tensor(get_input_representation(nlp(reference), 'English'))
        
        candidate_input_representation = torch.tensor(get_input_representation(nlp(candidate), 'English'))
        
        
        int_label = torch.tensor(float(label == 'M')) #M = 1, H = 0

        return source_input_representation,\
        reference_input_representation,\
        candidate_input_representation,\
        torch.tensor([float(bleu_score)]),\
        int_label


In [26]:
train_dataset = Mt_Classification_Dataset(train_read_dataset)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers = 0)#, drop_last=True)

test_dataset = Mt_Classification_Dataset(test_read_dataset)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True, num_workers = 0)#, drop_last=True)

In [33]:
class Classification_Model(nn.Module):
  def __init__(self): 

    super(Classification_Model, self).__init__()

    self.source_lstm = nn.LSTM(INPUT_SIZE, SOURCE_LSTM_HIDDEN_SIZE, SOURCE_LSTM_NUM_LAYERS, batch_first=True, dropout=0, bidirectional=True)
    self.reference_lstm = nn.LSTM(INPUT_SIZE, REFERENCE_LSTM_HIDDEN_SIZE, REFERENCE_LSTM_NUM_LAYERS, batch_first=True, dropout=0, bidirectional=True)
    self.candidate_lstm = nn.LSTM(INPUT_SIZE, CANDIDATE_LSTM_HIDDEN_SIZE, CANDIDATE_LSTM_NUM_LAYERS, batch_first=True, dropout=0, bidirectional=True)

    self.W1 = nn.Linear(1 + SOURCE_INPUT_SEQUENCE_LENGTH*SOURCE_LSTM_HIDDEN_SIZE*2 + CAND_REF_INPUT_SEQUENCE_LENGTH*REFERENCE_LSTM_HIDDEN_SIZE*2 + CAND_REF_INPUT_SEQUENCE_LENGTH*CANDIDATE_LSTM_HIDDEN_SIZE*2, FFNN_HIDDEN_SIZE)
    self.activation = nn.ReLU()
    self.W2 = nn.Linear(FFNN_HIDDEN_SIZE, 1)

    self.loss = nn.BCEWithLogitsLoss()
    self.softmax = nn.LogSoftmax(dim=1)

    if torch.cuda.is_available():
      self.source_lstm.cuda()
      self.reference_lstm.cuda()
      self.candidate_lstm.cuda()
      
  
  def compute_Loss(self, prediction, gold):
    return self.loss(prediction, gold)

  def forward(self, source, reference, candidate, bleu_score):

    this_batch_size = source.size()[0]

    hidden_source_layer = autograd.Variable(torch.randn(SOURCE_LSTM_NUM_LAYERS*2, this_batch_size, SOURCE_LSTM_HIDDEN_SIZE)).to(get_device())
    cell_source_layer = autograd.Variable(torch.randn(SOURCE_LSTM_NUM_LAYERS*2, this_batch_size, SOURCE_LSTM_HIDDEN_SIZE)).to(get_device())

    hidden_reference_layer = autograd.Variable(torch.randn(REFERENCE_LSTM_NUM_LAYERS*2, this_batch_size, REFERENCE_LSTM_HIDDEN_SIZE)).to(get_device())
    cell_reference_layer = autograd.Variable(torch.randn(REFERENCE_LSTM_NUM_LAYERS*2, this_batch_size, REFERENCE_LSTM_HIDDEN_SIZE)).to(get_device())

    hidden_candidate_layer = autograd.Variable(torch.randn(CANDIDATE_LSTM_NUM_LAYERS*2, this_batch_size, CANDIDATE_LSTM_HIDDEN_SIZE)).to(get_device())
    cell_candidate_layer = autograd.Variable(torch.randn(CANDIDATE_LSTM_NUM_LAYERS*2, this_batch_size, CANDIDATE_LSTM_HIDDEN_SIZE)).to(get_device())

    source_out, _ = self.source_lstm(source, (hidden_source_layer, cell_source_layer))
    source_out = source_out.reshape(source_out.shape[0], -1)

    reference_out, _ = self.reference_lstm(reference, (hidden_reference_layer, cell_reference_layer))
    reference_out = reference_out.reshape(reference_out.shape[0], -1)

    candidate_out, _ = self.candidate_lstm(candidate, (hidden_candidate_layer, cell_candidate_layer))
    candidate_out = candidate_out.reshape(candidate_out.shape[0], -1)

    cat_vector = torch.cat((bleu_score, source_out, reference_out, candidate_out), dim=1)

    z1 = self.W1(cat_vector)
    z2 = self.activation(z1)

    z3 = self.W2(z2)

    predicted_vector = self.softmax(z3)

    return predicted_vector

  def load_model(self, save_path):
    self.load_state_dict(torch.load(save_path))
  
  def save_model(self, save_path):
    torch.save(self.state_dict(), save_path)

In [34]:
def train_epoch(model, train_loader, optimizer):
  model.train()
  total_loss = 0
  batches = 0
  for (source, reference, candidate, bleu_score, label) in tqdm(train_loader, leave=False, desc="Training Batches"):
    optimizer.zero_grad()

    prediction = model(source.to(get_device()),
                       reference.to(get_device()),
                       candidate.to(get_device()),
                       bleu_score.to(get_device()))

    prediction = prediction.squeeze()
    
    loss = model.compute_Loss(prediction, label)
    loss.backward()
    # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()
    # print('Loss:',loss.item())
    total_loss += loss.item()

    batches += 1

  
  print('Loss on epoch: ', total_loss/batches)

def train_model(number_of_epochs, model, train_loader, lr):
  optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
  for epoch in trange(number_of_epochs, desc="Epochs"):
    train_epoch(model, train_loader, optimizer)

In [37]:
def test(model, test_loader):
  model.eval()
  total_correct = 0
  total_batches = 0
  # let positive be machine
  tp_machine = 0
  fp_machine = 0
  fn_machine = 0
  tp_human = 0
  fp_human = 0
  fn_human = 0
  
  for (source, reference, candidate, bleu_score, label_batch) in tqdm(test_loader, leave=False, desc="Test Batches"):

    prediction_batch = model(source, reference, candidate, bleu_score)
    prediction_batch = prediction_batch.squeeze()
    print(prediction_batch)
    # p = tp / tp + fp
    # r = tp / tp + fn
    for i, label in enumerate(label_batch):
      prediction = prediction_batch[i]
      if label == 1 and prediction == 1: #M = 1, H = 0
        tp_machine += 1
      elif label == 1 and prediction == 0:
        fn_machine += 1
        fp_human += 1
      elif label == 0 and prediction == 1:
        fp_machine += 1
        fn_human += 1
      else: #label == 0 and prediction == 0
        tp_human += 1

    total_batches += source.size(0)

  try:
    precision_machine = tp_machine / (tp_machine + fp_machine)
    recall_machine = tp_machine / (tp_machine + fn_machine)
    f1_machine = (2 * precision_machine * recall_machine) / (precision_machine + recall_machine)
  except ZeroDivisionError:
    f1_machine = 0

  try:
    precision_human = tp_human / (tp_human + fp_human)
    recall_human = tp_human / (tp_human + fn_human)
    f1_human = (2 * precision_human * recall_human) / (precision_human + recall_human)
  except ZeroDivisionError:
    f1_human = 0
  
  average_f1 = (f1_machine + f1_human) / 2

  return average_f1

In [38]:
SOURCE_LSTM_HIDDEN_SIZE = 100
REFERENCE_LSTM_HIDDEN_SIZE = 100
CANDIDATE_LSTM_HIDDEN_SIZE  = 100
FFNN_HIDDEN_SIZE = 100
SOURCE_LSTM_NUM_LAYERS = 1
REFERENCE_LSTM_NUM_LAYERS = 1
CANDIDATE_LSTM_NUM_LAYERS = 1

classification_model = Classification_Model()
train_model(2, classification_model, train_loader, 0.0001)

test(classification_model, test_loader)

HBox(children=(FloatProgress(value=0.0, description='Epochs', max=2.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=37.0, style=ProgressStyle(descript…

Loss on epoch:  0.6931471824645996


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=37.0, style=ProgressStyle(descript…

Loss on epoch:  0.6931471824645996



HBox(children=(FloatProgress(value=0.0, description='Test Batches', max=11.0, style=ProgressStyle(description_…

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SqueezeBackward0>)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SqueezeBackward0>)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SqueezeBackward0>)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SqueezeBackward0>)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SqueezeBackward0>)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SqueezeBackward0>)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SqueezeBackward0>)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SqueezeBackward0>)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SqueezeBackward0>)
tensor([0., 0., 0., 0., 0., 

0.3458646616541354

Average F1-Score: 0.346