In [None]:
# Install pinned versions of the third-party libraries via the shell.
# NOTE(review): `datasets` is not imported by any cell visible here — confirm it is still needed.
!pip install datasets==1.0.1
!pip install transformers==3.1.0



In [None]:
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [None]:
# Read the whole training file and split it into individual lines.
# The context manager closes the file handle deterministically (the bare
# open(...).read() left that to the garbage collector), and the explicit
# encoding stops the Chinese source sentences from depending on the
# platform's default codec.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open("//train.txt", "r", encoding="utf-8") as train_file:
    train = train_file.read()
train_sep = train.split("\n")


In [None]:
# Read the whole test file and split it into individual lines.
# The context manager closes the file handle deterministically, and the
# explicit encoding stops the Chinese source sentences from depending on the
# platform's default codec.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open("//test.txt", "r", encoding="utf-8") as test_file:
    test = test_file.read()
test_sep = test.split("\n")


In [None]:
# Every record occupies 6 consecutive lines of the file, so fold the flat list
# of lines into one row per record. Using -1 lets numpy infer the number of
# records instead of hard-coding it (it still raises if the line count is not
# a multiple of 6).
train_set = pd.DataFrame(train_sep)
train_res = pd.DataFrame(train_set.values.reshape(-1, 6))
# Column 5 is dropped before splitting; 20% of the records go to validation.
# A fixed random_state makes the split reproducible across kernel restarts.
train_dataset, val_dataset = train_test_split(
    train_res.drop(columns=[5]), test_size=0.2, random_state=42
)
# Renumber rows 0..n-1 in each split.
train_dataset = train_dataset.reset_index(drop=True)
val_dataset = val_dataset.reset_index(drop=True)

In [None]:
# Fold the flat list of lines into one row per 6-line record. Using -1 lets
# numpy infer the number of records instead of hard-coding it (it still raises
# if the line count is not a multiple of 6).
test_set = pd.DataFrame(test_sep)
test_res = pd.DataFrame(test_set.values.reshape(-1, 6))
# Column 5 is dropped, leaving columns 0-4.
test_dataset = test_res.drop(columns=[5])

In [None]:
#transforming the dataset
def transform_label(label):
    """Convert a textual class label into its integer class id.

    Args:
        label: The raw label, expected to be "H" or "M"
            (presumably human / machine translation — confirm against the data).

    Returns:
        0 for "H", 1 for "M".

    Raises:
        ValueError: If ``label`` is anything else. The previous bare ``raise``
            had no active exception to re-raise and therefore surfaced as an
            unrelated ``RuntimeError`` that did not mention the bad value.
    """
    if label == "H":
        return 0
    elif label == "M":
        return 1
    else:
        raise ValueError(
            "Unknown label {!r}: expected 'H' or 'M'".format(label)
        )
# Replace the textual labels in column 4 of every split with their integer ids.
for split_frame in (train_dataset, val_dataset, test_dataset):
    split_frame[4] = split_frame[4].apply(transform_label)


In [None]:
# Display the validation split (pandas truncates long frames when rendering).
val_dataset

Unnamed: 0,0,1,2,3,4
0,"布 斯 瑞 迪 和数 以千计 的 其他 灾民 一样 , 因为 海啸 而 失去 身份证 和 其...","busriadi , like thousands of other refugees , ...","like thousands of other victims , busriadi los...",0.6781,0
1,连续 六 年 减少 援外 预算 的 日本 也 迅速 以 行动 粉碎 了 国力 日 衰 的 说...,"japan , which has been reducing its foreign ai...","japan , which has cut its foreign aid budget i...",0.6304,0
2,"外务 省 发言人 说 , 小 泉 所 说 的 五 亿 美元 也 包括 这 三 千万 美元 .",a foreign ministry spokesman said that the amo...,a spokesman for the foreign ministry said that...,0.4194,0
3,联合国 秘书长 安南 委托 以前 哈佛 经济学家 沙 克 斯 为首 的 开发 专家 编撰 这...,un secretary general annan has commissioned fo...,un secretary general annan commissioned develo...,0.7114,0
4,"成千上万 的 尤 申 科 支持 者 已 在 首都 基辅 聚集 两 周 , 他们 包围 政府 ...",tens of thousands of yushchenko's supporters h...,thousands of supporters in the capital gathere...,0.4522,1
...,...,...,...,...,...
112,数 千 信徒 赶 往 伯 利 恒 参加 平安 夜 弥 撒,thousands of worshippers head to bethlehem for...,thousands of believers to bethlehem to partici...,0.4167,1
113,"南韩 大使馆 则 说 , 李 海 瓒 将 在 斯里兰卡 总理 陪同 下 , 访 视 西部 海...",the south korean embassy said in a statement t...,"south korean embassy , said that li hai , refl...",0.5828,1
114,"ZEW 指数 十一月 由 前 月 的 三十 一点 三 , 大 幅 跌 至 十三点 九 .",the zew index fell sharply to 13.9 in november...,the zew index plummeted to 13.9 in november fr...,0.8667,0
115,"欧盟 在 与 中国 总理 温 家宝 举行 峰 会 后 发布 的 联合声明 中 说 : "" 欧...",""" the eu side has confirmed its political will...",in a joint declaration released after the summ...,0.8611,0


In [None]:
class TranslationDataset(Dataset):
    """Tokenized sentence-pair view over a DataFrame of translation records.

    Each row of ``data`` is read through columns labelled 0-4: three sentences
    (0, 1, 2), a score (3, named ``bleu_score`` here) and a numeric label (4).
    Three sentence pairs are tokenized per item:

    * (column 0, column 1) with the multilingual BERT tokenizer,
    * (column 1, column 2) with the ``EtoEmodel`` tokenizer,
    * (column 0, column 2) with the multilingual BERT tokenizer.

    Args:
        data: DataFrame holding the records.
        token_length: Length every tokenized pair is padded/truncated to.
        EtoEmodel: Checkpoint name for the tokenizer used on the
            (column 1, column 2) pair.
    """

    def __init__(self, data, token_length, EtoEmodel='albert-base-v2'):
        self.data = data
        self.tokenizer1 = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        self.tokenizer2 = AutoTokenizer.from_pretrained(EtoEmodel)
        self.tokenlength = token_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def _encode_pair(self, tokenizer, first, second):
        """Tokenize one sentence pair; return (input_ids, attention_mask, token_type_ids)."""
        encoded = tokenizer(first, second,
                            padding='max_length',
                            truncation=True,
                            max_length=self.tokenlength,
                            return_tensors='pt')
        return encoded['input_ids'], encoded['attention_mask'], encoded['token_type_ids']

    def __getitem__(self, index):
        """Return the 11-tuple of tensors for the record at position ``index``.

        The tuple is (ids1, mask1, types1, ids2, mask2, types2, ids3, mask3,
        types3, bleu_score, label); the last two are 1-element float tensors.
        """
        # Select the row by position, not by index label: ``index`` runs over
        # 0..len-1, so a frame whose index was filtered or shuffled without
        # reset_index would raise KeyError (or return the wrong row) with .loc.
        row = self.data.iloc[index]
        sentence1 = str(row.loc[0])
        sentence2 = str(row.loc[1])
        sentence3 = str(row.loc[2])

        token_ids1, attn_masks1, token_type_ids1 = self._encode_pair(self.tokenizer1, sentence1, sentence2)
        token_ids2, attn_masks2, token_type_ids2 = self._encode_pair(self.tokenizer2, sentence2, sentence3)
        token_ids3, attn_masks3, token_type_ids3 = self._encode_pair(self.tokenizer1, sentence1, sentence3)

        bleu_score = row.loc[3]
        label = row.loc[4]
        return (token_ids1, attn_masks1, token_type_ids1,
                token_ids2, attn_masks2, token_type_ids2,
                token_ids3, attn_masks3, token_type_ids3,
                torch.Tensor([float(bleu_score)]), torch.Tensor([float(label)]))


In [None]:
class MachineHumanTranslationClassifier(nn.Module):
    """Binary classifier over three encoded sentence pairs plus a scalar score.

    Three pretrained encoders each produce a pooled vector for one sentence
    pair. Each pooled vector is projected to a single value, the three values
    are concatenated with the supplied score, passed through ReLU and a final
    linear layer, and squashed with a sigmoid.

    Args:
        EtoEmodel: Checkpoint name for the second encoder.
        EtoEhs: Hidden size of the second encoder's pooled output.
        freeze_bert: When True, the parameters of all three encoders are
            excluded from gradient updates.
    """

    def __init__(self, EtoEmodel="albert-base-v2", EtoEhs=768, freeze_bert=False):
        super().__init__()

        self.bert_layer1 = AutoModel.from_pretrained("bert-base-multilingual-cased")
        self.bert_layer2 = AutoModel.from_pretrained(EtoEmodel)
        self.bert_layer3 = AutoModel.from_pretrained("bert-base-multilingual-cased")
        if freeze_bert:
            for encoder in (self.bert_layer1, self.bert_layer2, self.bert_layer3):
                for p in encoder.parameters():
                    p.requires_grad = False
        self.hidden_layer1 = nn.Linear(768, 1)
        self.hidden_layer2 = nn.Linear(EtoEhs, 1)
        self.hidden_layer3 = nn.Linear(768, 1)
        # The final layer merges the three projected encoder outputs and the
        # score (4 features) into a single output.
        self.final_layer = nn.Linear(4, 1)
        self.dropout_reg = nn.Dropout(p=0.1)
        self.hidden_act = torch.nn.ReLU()
        self.final_activation = torch.nn.Sigmoid()

    @autocast()
    def forward(self, token_ids1, attn_masks1, token_type_ids1, token_ids2, attn_masks2, token_type_ids2, token_ids3, attn_masks3, token_type_ids3, bleu_score):
        """Return the sigmoid output, one row per sample (shape ``(batch, 1)``).

        ``bleu_score`` may be a Python number or a tensor holding one value
        per sample.
        """
        _, pooler_output1 = self.bert_layer1(token_ids1, attn_masks1, token_type_ids1, return_dict=False)
        _, pooler_output2 = self.bert_layer2(token_ids2, attn_masks2, token_type_ids2, return_dict=False)
        _, pooler_output3 = self.bert_layer3(token_ids3, attn_masks3, token_type_ids3, return_dict=False)
        res1 = self.hidden_layer1(self.dropout_reg(pooler_output1))
        res2 = self.hidden_layer2(self.dropout_reg(pooler_output2))
        res3 = self.hidden_layer3(self.dropout_reg(pooler_output3))
        # Shape the score as one column per sample on the same device/dtype as
        # the projections. The earlier torch.Tensor([[bleu_score]]) followed by
        # a dim-0 concat and transpose only worked for a single sample and
        # always allocated on the CPU.
        bleu_column = torch.as_tensor(bleu_score, dtype=res1.dtype, device=res1.device).reshape(-1, 1)
        res_fin = torch.cat([res1, res2, res3, bleu_column], dim=1)
        final_act = self.hidden_act(res_fin)
        final_out = self.final_layer(final_act)

        return self.final_activation(final_out)
        
             
        



In [None]:

def evaluate_loss(classifier, criterion, dataloader):
    """Return the mean of ``criterion`` over every item of ``dataloader``.

    The classifier is switched to eval mode and gradients are disabled.
    ``dataloader`` is indexed directly (``dataloader[i]``); each item is the
    tuple of model inputs followed by the label as its last element.
    """
    classifier.eval()

    running_loss = 0
    n_seen = 0
    with torch.no_grad():
        for idx in range(len(dataloader)):
            # Everything except the trailing label is forwarded to the model.
            *model_inputs, label = dataloader[idx]
            output = classifier(*model_inputs)
            running_loss += criterion(output.squeeze(-1), label.float()).item()
            n_seen += 1

    return running_loss / n_seen

def accuracy_score(classifier, criterion, dataloader):
    """Return the fraction of items whose thresholded prediction equals the label.

    An output of at least 0.5 counts as class 1, anything lower as class 0.
    ``criterion`` is accepted but not used. ``dataloader`` is indexed directly;
    each item is the tuple of model inputs followed by the label.
    """
    classifier.eval()

    hits = 0
    n_seen = 0
    with torch.no_grad():
        for idx in range(len(dataloader)):
            *model_inputs, label = dataloader[idx]
            output = classifier(*model_inputs)
            predicted_class = 1 if output.item() >= 0.5 else 0
            if predicted_class == int(label.item()):
                hits += 1
            n_seen += 1
    return hits / n_seen

def evaluate_loss_acc(classifier, criterion, dataloader):
    """Return ``(mean loss, accuracy)`` over every item of ``dataloader``.

    The classifier is switched to eval mode and gradients are disabled. An
    output of at least 0.5 counts as class 1, anything lower as class 0.
    ``dataloader`` is indexed directly; each item is the tuple of model inputs
    followed by the label. Progress is shown with a tqdm bar.
    """
    classifier.eval()

    loss_sum = 0
    hits = 0
    n_seen = 0
    with torch.no_grad():
        for idx in tqdm(range(len(dataloader))):
            *model_inputs, label = dataloader[idx]
            output = classifier(*model_inputs)
            loss_sum += criterion(output.squeeze(-1), label.float()).item()
            predicted_class = 1 if output.item() >= 0.5 else 0
            if predicted_class == int(label.item()):
                hits += 1
            n_seen += 1
    return loss_sum / n_seen, hits / n_seen



In [None]:
def train_bert_clf(model, opti, lr, lr_scheduler, train_loader, val_loader, epochs, mini_batch):
    """Train ``model`` with gradient accumulation, then evaluate it once.

    Items are fetched one at a time with ``train_loader[i]`` and gradients are
    accumulated over ``mini_batch`` items per optimizer step. After the last
    epoch the model is evaluated on both ``train_loader`` and ``val_loader``.

    Args:
        model: Classifier called with the first ten elements of each item.
        opti: Optimizer stepped once per ``mini_batch`` items.
        lr: Unused; the learning rate comes from ``opti`` and ``lr_scheduler``.
        lr_scheduler: Scheduler stepped together with the optimizer.
        train_loader: Indexable dataset of training items (label last).
        val_loader: Indexable dataset of validation items (label last).
        epochs: Number of passes over ``train_loader``.
        mini_batch: Number of items accumulated per optimizer step.

    Returns:
        ``(train_loss, train_acc, val_loss, val_acc, model)``.
    """
    total_iter = len(train_loader)
    # Report the running loss about 5 times per epoch. Clamp to 1 so datasets
    # with fewer than 5 items do not divide by zero in the modulo below.
    check = max(1, total_iter // 5)
    loss_fn = nn.BCELoss()
    check_loss = 0.0
    # Start from clean gradient buffers regardless of what ran before.
    opti.zero_grad()
    for ep in range(epochs):
        model.train()
        for i in tqdm(range(total_iter)):
            *model_inputs, label = train_loader[i]
            with autocast():
                result = model(*model_inputs)
            # Loss and backward run outside the autocast region, on float32
            # values, as the PyTorch AMP notes recommend.
            loss = loss_fn(result.squeeze(-1).float(), label.float())
            # Scale only the gradient contribution; the logged value below
            # stays the true per-item loss.
            (loss / mini_batch).backward()

            if (i + 1) % mini_batch == 0:
                opti.step()
                lr_scheduler.step()
                opti.zero_grad()

            # Previously the already-divided loss was accumulated here, so the
            # printed value under-reported the loss by a factor of mini_batch.
            check_loss += loss.item()

            if (i+1) % check == 0:
                print("Iteration {} / {} of epoch {} loss is: {}"
                      .format(i+1, total_iter, ep + 1, (check_loss / check)))
                check_loss = 0.0

        # When the dataset length is not a multiple of mini_batch, apply the
        # leftover gradients now instead of leaking them into the next epoch.
        # The scheduler is not stepped so its step count still matches a total
        # computed as (len // mini_batch) * epochs.
        if total_iter % mini_batch != 0:
            opti.step()
            opti.zero_grad()

    train_loss, train_acc = evaluate_loss_acc(model, loss_fn, train_loader)
    val_loss, val_acc = evaluate_loss_acc(model, loss_fn, val_loader)
    print("Train Loss: {}".format(train_loss))
    print("Train Accuracy: {}".format(train_acc))
    print("Validation Loss: {}".format(val_loss))
    print("Validation Accuracy: {}".format(val_acc))

    return train_loss, train_acc, val_loss, val_acc, model
        
        
            
            
            
                
                

In [None]:
# Run configuration shared by the cells below.
# Checkpoint name handed to TranslationDataset as its EtoEmodel argument.
bert_model = "albert-base-v2"
# Mirrors the classifier's freeze_bert option (False = encoders stay trainable).
freeze_bert = False
# max_length every tokenized sentence pair is padded/truncated to.
token_length = 128
# Number of items whose gradients are accumulated per optimizer step.
mini_batch = 2
# Learning rate given to AdamW.
lr = 2e-5
# Number of passes over the training data.
epochs = 4
# Criterion used for the standalone test-set evaluation; train_bert_clf builds its own.
loss_fn = nn.BCELoss()

In [None]:
# Wrap each split in a TranslationDataset. Despite the *_loader names these
# are indexable datasets, not torch DataLoaders; items are fetched one by one.
train_loader = TranslationDataset(train_dataset, token_length, bert_model)
val_loader = TranslationDataset(val_dataset, token_length, bert_model)
# NOTE: this rebinds `test_dataset` from the DataFrame to the wrapped dataset,
# so the cell cannot be re-run without first re-creating the DataFrame.
test_dataset = TranslationDataset(test_dataset, token_length, bert_model)
# Pass the configured checkpoint and freeze flag through explicitly; they were
# previously ignored, so editing the config cell had no effect on the model.
model = MachineHumanTranslationClassifier(EtoEmodel=bert_model, freeze_bert=freeze_bert)
opti = AdamW(model.parameters(), lr=lr, weight_decay=1e-2)
# One scheduler step per optimizer step: len // mini_batch steps per epoch.
total_steps = (len(train_loader) // mini_batch) * epochs
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=0, num_training_steps=total_steps)


In [None]:
# train_bert_clf returns (train_loss, train_acc, val_loss, val_acc, model);
# unpack into names that match that order.
train_loss, train_acc, val_loss, val_acc, model = train_bert_clf(model, opti, lr, lr_scheduler, train_loader, val_loader, epochs, mini_batch)
# Legacy bindings kept for cells that still read them. Their spelling does NOT
# match their content: they hold the four metrics in return order, e.g.
# `val_losses` is the training accuracy and `train_accuracies` is the
# validation loss.
train_losses, val_losses, train_accuracies, val_accuracies = train_loss, train_acc, val_loss, val_acc


 20%|█▉        | 93/467 [09:27<36:06,  5.79s/it]

Iteration 93 / 467 of epoch 1 loss is: 0.31444803489151824


 40%|███▉      | 186/467 [18:56<30:10,  6.44s/it]

Iteration 186 / 467 of epoch 1 loss is: 0.31135692471458065


 60%|█████▉    | 279/467 [28:17<17:58,  5.74s/it]

Iteration 279 / 467 of epoch 1 loss is: 0.2897777224900902


 80%|███████▉  | 372/467 [37:39<09:56,  6.27s/it]

Iteration 372 / 467 of epoch 1 loss is: 0.27636370615613076


100%|█████████▉| 465/467 [46:54<00:11,  5.68s/it]

Iteration 465 / 467 of epoch 1 loss is: 0.2648846418146164


100%|██████████| 467/467 [47:06<00:00,  6.05s/it]
 20%|█▉        | 93/467 [09:11<35:31,  5.70s/it]

Iteration 93 / 467 of epoch 2 loss is: 0.21232041396120543


 40%|███▉      | 186/467 [18:33<29:00,  6.19s/it]

Iteration 186 / 467 of epoch 2 loss is: 0.2234598145850243


 60%|█████▉    | 279/467 [27:52<18:06,  5.78s/it]

Iteration 279 / 467 of epoch 2 loss is: 0.23067563571917113


 80%|███████▉  | 372/467 [37:13<09:58,  6.30s/it]

Iteration 372 / 467 of epoch 2 loss is: 0.24865091011248608


100%|█████████▉| 465/467 [46:36<00:11,  5.70s/it]

Iteration 465 / 467 of epoch 2 loss is: 0.21095804337372062


100%|██████████| 467/467 [46:48<00:00,  6.01s/it]
 20%|█▉        | 93/467 [09:17<35:56,  5.77s/it]

Iteration 93 / 467 of epoch 3 loss is: 0.18188822021087012


 40%|███▉      | 186/467 [18:45<30:15,  6.46s/it]

Iteration 186 / 467 of epoch 3 loss is: 0.1945588959641354


 60%|█████▉    | 279/467 [28:53<19:54,  6.36s/it]

Iteration 279 / 467 of epoch 3 loss is: 0.20824040985235603


 80%|███████▉  | 372/467 [39:28<11:25,  7.22s/it]

Iteration 372 / 467 of epoch 3 loss is: 0.18624564494577148


100%|█████████▉| 465/467 [50:25<00:13,  6.75s/it]

Iteration 465 / 467 of epoch 3 loss is: 0.1825745945135432


100%|██████████| 467/467 [50:40<00:00,  6.51s/it]
 20%|█▉        | 93/467 [11:51<46:03,  7.39s/it]

Iteration 93 / 467 of epoch 4 loss is: 0.13970046949082165


 40%|███▉      | 186/467 [23:53<38:19,  8.18s/it]

Iteration 186 / 467 of epoch 4 loss is: 0.17743328197668956


 60%|█████▉    | 279/467 [35:52<23:28,  7.49s/it]

Iteration 279 / 467 of epoch 4 loss is: 0.1806336275992855


 80%|███████▉  | 372/467 [48:13<12:55,  8.16s/it]

Iteration 372 / 467 of epoch 4 loss is: 0.16907327243637654


100%|█████████▉| 465/467 [59:59<00:14,  7.10s/it]

Iteration 465 / 467 of epoch 4 loss is: 0.18393851980887432


100%|██████████| 467/467 [1:00:14<00:00,  7.74s/it]
100%|██████████| 467/467 [12:50<00:00,  1.65s/it]
100%|██████████| 117/117 [03:15<00:00,  1.67s/it]

Train Loss: 0.320592951492209
Train Accuracy: 0.9914346895074947
Validation Loss: 0.4257999953742211
Validation Accuracy: 0.9487179487179487





In [None]:
# Re-bind the metrics under names that match what they hold. train_bert_clf
# returns (train_loss, train_acc, val_loss, val_acc, model), so the four
# right-hand names carry those values in that order despite their spelling
# (e.g. `val_losses` holds the training accuracy).
train_loss, train_acc, val_loss, val_acc = train_losses, val_losses, train_accuracies, val_accuracies
train_loss, train_acc, val_loss, val_acc

(0.320592951492209, 0.9914346895074947, 0.4257999953742211, 0.9487179487179487)

In [None]:
# (mean loss, accuracy) of the trained model on the test split.
# `test_dataset` here is the TranslationDataset wrapper, not the raw DataFrame.
evaluate_loss_acc(model, loss_fn, test_dataset)

100%|██████████| 174/174 [04:53<00:00,  1.69s/it]


(0.4880423700201443, 0.867816091954023)

In [None]:
# Alias only: `optimal_model` and `model` refer to the same object (no copy is made).
optimal_model = model

In [None]:
def evaluateF1score(classifier, dataloader):
    """Return the F1 score of ``classifier`` on ``dataloader`` for class 1.

    The classifier is switched to eval mode and gradients are disabled. An
    output of at least 0.5 counts as class 1, anything lower as class 0.
    ``dataloader`` is indexed directly; each item is the tuple of model inputs
    followed by the label.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        there are no true positives. In that case precision and recall are
        zero or undefined, and the formula previously raised
        ``ZeroDivisionError``.
    """
    classifier.eval()
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    with torch.no_grad():
        for i in tqdm(range(len(dataloader))):
            *model_inputs, label = dataloader[i]
            result = classifier(*model_inputs)
            pred = 1 if result.item() >= 0.5 else 0
            if pred == int(label.item()):
                # Correct predictions: only the positive ones feed into F1.
                if pred == 1:
                    true_positives += 1
            else:
                if pred == 1:
                    false_positives += 1
                else:
                    false_negatives += 1
    # With zero true positives every denominator below can be zero.
    if true_positives == 0:
        return 0.0
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    F1 = 2 * (precision * recall) / (precision + recall)
    return F1

In [None]:
# Score the three splits in order (train, validation, test) and report each F1.
train_f1, val_f1, test_f1 = [
    evaluateF1score(optimal_model, split)
    for split in (train_loader, val_loader, test_dataset)
]
print("Train F1 score: {}".format(train_f1))
print("Validation F1 score: {}".format(val_f1))
print("Test F1 score: {}".format(test_f1))




  0%|          | 0/467 [00:00<?, ?it/s][A
  0%|          | 1/467 [00:01<13:38,  1.76s/it][A
  0%|          | 2/467 [00:03<13:31,  1.75s/it][A
  1%|          | 3/467 [00:05<13:29,  1.74s/it][A
  1%|          | 4/467 [00:06<13:25,  1.74s/it][A
  1%|          | 5/467 [00:08<13:21,  1.74s/it][A
  1%|▏         | 6/467 [00:10<13:17,  1.73s/it][A
  1%|▏         | 7/467 [00:12<13:14,  1.73s/it][A
  2%|▏         | 8/467 [00:13<13:08,  1.72s/it][A
  2%|▏         | 9/467 [00:15<13:11,  1.73s/it][A
  2%|▏         | 10/467 [00:17<13:07,  1.72s/it][A
  2%|▏         | 11/467 [00:19<13:06,  1.72s/it][A
  3%|▎         | 12/467 [00:20<13:04,  1.72s/it][A
  3%|▎         | 13/467 [00:22<13:02,  1.72s/it][A
  3%|▎         | 14/467 [00:24<13:02,  1.73s/it][A
  3%|▎         | 15/467 [00:25<12:59,  1.72s/it][A
  3%|▎         | 16/467 [00:27<12:56,  1.72s/it][A
  4%|▎         | 17/467 [00:29<12:51,  1.71s/it][A
  4%|▍         | 18/467 [00:31<12:48,  1.71s/it][A
  4%|▍         | 19/467 [00:3

Train F1 score: 0.9907407407407408
Validation F1 score: 0.9444444444444444
Test F1 score: 0.8413793103448277



