# Train Triple-based Model (Run only once to train the model)

## Dataset

In [None]:
import os
import pickle
import numpy as np
import multiprocessing
import pandas as pd
import pyarrow.feather as feather
from datetime import datetime, timedelta
from sklearn.metrics import accuracy_score

import torch
from torch import nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import RandomSampler, Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
# Seed the RNGs through pytorch_lightning's seed_everything so runs are repeatable.
seed_everything(2)
ngpus = 1  # fixed to a single GPU; torch.cuda.device_count() would use every visible one

# Maps the string labels stored in the data's "label" column to class indices.
# NOTE(review): later cells treat class 1 as the "good triple" class — confirm "Y" means good.
labels_transform_dict = {"X":0, "Y":1}

import warnings
# Silence FutureWarning / UserWarning messages whose text starts with "Passing".
warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)
warnings.filterwarnings("ignore", message=r"Passing", category=UserWarning)

In [None]:
class CreateDataset(Dataset):
    """Dataset that tokenizes one (question + answer, paragraph) pair per row.

    Args:
        data: table indexed positionally with ``.iloc``; each row must provide
            the ``"question"``, ``"answer"``, ``"para_text"`` and ``"label"``
            fields.
        tokenizer: object exposing ``encode_plus`` (the notebook passes a
            HuggingFace tokenizer).
        max_seq_length: length every encoded sample is padded/truncated to.
    """

    # Order of the encoded tensors in the dict returned by __getitem__.
    _ENCODED_FIELDS = ("input_ids", "token_type_ids", "attention_mask")

    def __init__(self, data, tokenizer, max_seq_length):
        self.data, self.tokenizer, self.max_seq_length = data, tokenizer, max_seq_length

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode row ``index`` and return its tensors plus the integer label.

        Returns:
            dict with ``input_ids``, ``token_type_ids``, ``attention_mask``
            (each flattened to 1-D) and ``labels`` (a 0-dim tensor holding the
            class index looked up in ``labels_transform_dict``).
        """
        row = self.data.iloc[index]
        # First segment is "<question> <answer>", second segment is the paragraph.
        segments = [row["question"] + " " + row["answer"], row["para_text"]]
        class_index = labels_transform_dict[row["label"]]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_seq_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(segments, **tokenizer_options)

        # return_tensors='pt' yields a leading batch axis; flatten() drops it.
        sample = {field: encoded[field].flatten() for field in self._ENCODED_FIELDS}
        sample["labels"] = torch.tensor(class_index)
        return sample

class CreatePlDataModule(pl.LightningDataModule):
    """Lightning data module that serves the train and validation loaders.

    Args:
        args: settings object providing ``total_data`` (a table with a
            ``"split"`` column), ``train_batch_size``, ``val_batch_size``,
            ``max_seq_length`` and ``tokenizer``.

    Rows whose ``split`` equals ``"train"`` are used for training; rows whose
    ``split`` equals ``"test"`` are used for validation.
    """

    def __init__(self, args):
        super().__init__()
        frame = args.total_data
        split_column = frame["split"]
        self.train_df = frame[split_column == "train"].reset_index(drop=True)
        self.valid_df = frame[split_column == "test"].reset_index(drop=True)
        self.tokenizer = args.tokenizer
        self.max_seq_length = args.max_seq_length
        self.train_batch_size = args.train_batch_size
        self.val_batch_size = args.val_batch_size

    def _as_dataset(self, frame):
        """Wrap ``frame`` in a CreateDataset using this module's tokenizer settings."""
        return CreateDataset(frame, self.tokenizer, self.max_seq_length)

    def train_dataloader(self):
        """Return a shuffled loader over the training rows."""
        return DataLoader(
            self._as_dataset(self.train_df),
            batch_size=self.train_batch_size,
            shuffle=True,
            num_workers=os.cpu_count(),
        )

    def val_dataloader(self):
        """Return an unshuffled loader over the validation rows."""
        return DataLoader(
            self._as_dataset(self.valid_df),
            batch_size=self.val_batch_size,
            num_workers=os.cpu_count(),
        )

## Model

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import AutoModel
from sklearn.metrics import accuracy_score
from pytorch_lightning.callbacks import ModelCheckpoint

In [None]:
class TriplebasedFinetuner(pl.LightningModule):
    """Binary classifier: a pretrained backbone followed by a linear head.

    The backbone is loaded with ``AutoModel.from_pretrained(model_name)`` and
    the head maps the hidden state at sequence position 0 to two logits.

    Args:
        model_name: identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.lg_model = AutoModel.from_pretrained(model_name)
        self.W = nn.Linear(self.lg_model.config.hidden_size, 2)
        self.num_classes = 2

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(logits, second_output)`` for a batch of encoded pairs.

        ``second_output`` is the second element of the backbone's tuple output.
        NOTE(review): the original code names it ``attn``, but for BERT-style
        models this slot is presumably the pooled output — confirm.
        """
        backbone_out = self.lg_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        hidden_states, second_output = backbone_out
        first_token = hidden_states[:, 0]
        return self.W(first_token), second_output

    def _logits_and_targets(self, batch):
        """Run the model on ``batch`` and return ``(logits, labels)``."""
        ids, type_ids, mask, targets = (
            batch[key]
            for key in ("input_ids", "token_type_ids", "attention_mask", "labels")
        )
        logits, _ = self(ids, mask, type_ids)
        return logits, targets

    def _loss_and_accuracy(self, batch):
        """Return the cross-entropy loss and batch accuracy as tensors."""
        logits, targets = self._logits_and_targets(batch)
        loss = F.cross_entropy(logits, targets)
        # torch.max returns (values, indices); the indices are the predicted classes.
        predicted = torch.max(logits, dim=1)[1]
        accuracy = torch.tensor(accuracy_score(predicted.cpu(), targets.cpu()))
        return loss, accuracy

    @staticmethod
    def _epoch_mean(outputs, key):
        """Average the per-batch tensors stored under ``key`` (unweighted)."""
        return torch.stack([step[key] for step in outputs]).mean()

    def training_step(self, batch, batch_idx):
        """Compute, log (``train_loss``) and return the training loss."""
        logits, targets = self._logits_and_targets(batch)
        loss = F.cross_entropy(logits, targets)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the loss and accuracy of one validation batch."""
        loss, val_acc = self._loss_and_accuracy(batch)
        return {'val_loss': loss, 'val_acc': val_acc}

    def validation_epoch_end(self, outputs):
        """Log the batch-averaged validation loss and accuracy."""
        self.log('avg_val_loss', self._epoch_mean(outputs, 'val_loss'), prog_bar=True)
        self.log('avg_val_acc', self._epoch_mean(outputs, 'val_acc'), prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Return the loss and accuracy of one test batch."""
        loss, test_acc = self._loss_and_accuracy(batch)
        return {'test_loss': loss, 'test_acc': test_acc}

    def test_epoch_end(self, outputs):
        """Log the batch-averaged test loss and accuracy."""
        self.log('avg_test_loss', self._epoch_mean(outputs, 'test_loss'), prog_bar=True)
        self.log('avg_test_acc', self._epoch_mean(outputs, 'test_acc'), prog_bar=True)

    def configure_optimizers(self):
        """Return Adam (lr 2e-05, eps 1e-08) over the parameters that require gradients."""
        trainable = [param for param in self.parameters() if param.requires_grad]
        return torch.optim.Adam(trainable, lr=2e-05, eps=1e-08)

    def train_dataloader(self):
        """Return the notebook-level ``gen_train_dataloader`` global."""
        return gen_train_dataloader

    def val_dataloader(self):
        """Return the notebook-level ``gen_val_dataloader`` global."""
        return gen_val_dataloader

def return_ModelCheckpoint(model_name):
    """Build the checkpoint callback that keeps the single best model.

    The callback tracks the logged ``avg_val_acc`` metric (higher is better)
    and writes the checkpoint under ``./models/triple-based_filtering`` using
    ``model_name`` as the file name.

    Args:
        model_name: name used for the checkpoint file.

    Returns:
        The configured ``ModelCheckpoint`` instance.
    """
    return ModelCheckpoint(
        dirpath="./models/triple-based_filtering",
        filename=f'{model_name}',
        monitor='avg_val_acc',
        mode='max',
        save_top_k=1,
    )

In [None]:
class Data_Args:
    """Data settings consumed by CreatePlDataModule.

    All values are class attributes; the notebook overrides some of them on an
    instance (batch sizes, tokenizer) before building the data module.
    """
    # NOTE: the feather file is read when this class body executes, not per instance.
    total_data = feather.read_feather('../Other_Data/triple-based_filtering/triple-based_filtering_data.feather')
    label_list = None  # not read by the visible code
    tokenizer = None  # set by the training cell before use
    train_batch_size = 16  # default; the training cell overrides it
    val_batch_size = 16  # default; the training cell overrides it
    max_seq_length = 512  # length every sample is padded/truncated to
    
class Model_Args:
    """Model settings placeholder.

    NOTE(review): none of these attributes are read by the visible notebook
    code — TriplebasedFinetuner takes only a model name. Confirm before use.
    """
    hidden_dropout_prob = 0.2
    hidden_size = 768
    num_labels = None
    language_model = None
    model_path = None
    output_dir = "./models/triple-based_filtering"  # same directory the checkpoint callback writes to

## Training (roberta-base)

In [None]:
model_name = "roberta-base"
data_args = Data_Args()
# Override the class-level default batch size of 16 for this run.
data_args.train_batch_size = 4
data_args.val_batch_size = 4

data_args.tokenizer = AutoTokenizer.from_pretrained(model_name)
Quality_DataLoader = CreatePlDataModule(data_args)
# These two globals are returned by TriplebasedFinetuner.train_dataloader / val_dataloader,
# so they must exist before trainer.fit(model) runs.
gen_train_dataloader = Quality_DataLoader.train_dataloader()
gen_val_dataloader = Quality_DataLoader.val_dataloader()

# Number of batches per epoch for training and validation.
print(len(gen_train_dataloader),len(gen_val_dataloader))

In [None]:
model = TriplebasedFinetuner(model_name)
# Keeps only the checkpoint with the highest logged 'avg_val_acc'.
checkpoint_callback = return_ModelCheckpoint(model_name)
# accumulate_grad_batches=2 accumulates gradients over two batches per optimizer step.
trainer = pl.Trainer(gpus=1, max_epochs=15, accumulate_grad_batches=2, callbacks=[checkpoint_callback])    
# No loaders are passed: the model supplies them through its train_dataloader / val_dataloader.
trainer.fit(model)

### Load Model And Test

In [None]:
model_name = "roberta-base"

model_file_path = f"./models/triple-based_filtering/{model_name}.ckpt"
# load_from_checkpoint is a classmethod that constructs the module itself
# (model_name is forwarded to __init__), so it is called on the class.
# Building an instance first only loaded the pretrained backbone a second time.
model = TriplebasedFinetuner.load_from_checkpoint(checkpoint_path=model_file_path, model_name=model_name)
trainer = pl.Trainer(gpus=1)
# Evaluate on the validation loader, which is built from the "test" split.
trainer.test(model, test_dataloaders=model.val_dataloader())

## Test Using a Probability Threshold (`thred`)

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import pyarrow.feather as feather
from sklearn.metrics import accuracy_score

import torch
from torch import nn
import pytorch_lightning as pl
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from pytorch_lightning import Trainer, seed_everything
# Seed the RNGs through pytorch_lightning's seed_everything so runs are repeatable.
seed_everything(2)
ngpus = 1  # fixed to a single GPU; torch.cuda.device_count() would use every visible one

# Maps the string labels stored in the data's "label" column to class indices.
labels_transform_dict = {"X":0, "Y":1}

In [None]:
class TriplebasedFinetuner(pl.LightningModule):
    """Binary classifier: a pretrained backbone followed by a linear head.

    The backbone is loaded with ``AutoModel.from_pretrained(model_name)`` and
    the head maps the hidden state at sequence position 0 to two logits.

    Args:
        model_name: identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.lg_model = AutoModel.from_pretrained(model_name)
        self.W = nn.Linear(self.lg_model.config.hidden_size, 2)
        self.num_classes = 2

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(logits, second_output)`` for a batch of encoded pairs.

        ``second_output`` is the second element of the backbone's tuple output.
        NOTE(review): the original code names it ``attn``, but for BERT-style
        models this slot is presumably the pooled output — confirm.
        """
        backbone_out = self.lg_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        hidden_states, second_output = backbone_out
        first_token = hidden_states[:, 0]
        return self.W(first_token), second_output

    def _logits_and_targets(self, batch):
        """Run the model on ``batch`` and return ``(logits, labels)``."""
        ids, type_ids, mask, targets = (
            batch[key]
            for key in ("input_ids", "token_type_ids", "attention_mask", "labels")
        )
        logits, _ = self(ids, mask, type_ids)
        return logits, targets

    def _loss_and_accuracy(self, batch):
        """Return the cross-entropy loss and batch accuracy as tensors."""
        logits, targets = self._logits_and_targets(batch)
        loss = F.cross_entropy(logits, targets)
        # torch.max returns (values, indices); the indices are the predicted classes.
        predicted = torch.max(logits, dim=1)[1]
        accuracy = torch.tensor(accuracy_score(predicted.cpu(), targets.cpu()))
        return loss, accuracy

    @staticmethod
    def _epoch_mean(outputs, key):
        """Average the per-batch tensors stored under ``key`` (unweighted)."""
        return torch.stack([step[key] for step in outputs]).mean()

    def training_step(self, batch, batch_idx):
        """Compute, log (``train_loss``) and return the training loss."""
        logits, targets = self._logits_and_targets(batch)
        loss = F.cross_entropy(logits, targets)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the loss and accuracy of one validation batch."""
        loss, val_acc = self._loss_and_accuracy(batch)
        return {'val_loss': loss, 'val_acc': val_acc}

    def validation_epoch_end(self, outputs):
        """Log the batch-averaged validation loss and accuracy."""
        self.log('avg_val_loss', self._epoch_mean(outputs, 'val_loss'), prog_bar=True)
        self.log('avg_val_acc', self._epoch_mean(outputs, 'val_acc'), prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Return the loss and accuracy of one test batch."""
        loss, test_acc = self._loss_and_accuracy(batch)
        return {'test_loss': loss, 'test_acc': test_acc}

    def test_epoch_end(self, outputs):
        """Log the batch-averaged test loss and accuracy."""
        self.log('avg_test_loss', self._epoch_mean(outputs, 'test_loss'), prog_bar=True)
        self.log('avg_test_acc', self._epoch_mean(outputs, 'test_acc'), prog_bar=True)

    def configure_optimizers(self):
        """Return Adam (lr 2e-05, eps 1e-08) over the parameters that require gradients."""
        trainable = [param for param in self.parameters() if param.requires_grad]
        return torch.optim.Adam(trainable, lr=2e-05, eps=1e-08)

    def train_dataloader(self):
        """Return the notebook-level ``gen_train_dataloader`` global."""
        return gen_train_dataloader

    def val_dataloader(self):
        """Return the notebook-level ``gen_val_dataloader`` global."""
        return gen_val_dataloader

def predict_labelAndpro(text, tokenizer, model):
    """Classify one input and return the predicted class with its probability.

    Args:
        text: input handed to ``tokenizer.encode_plus``; the notebook passes a
            two-item list ``[question + " " + answer, paragraph]``.
        tokenizer: object exposing ``model_max_length`` and ``encode_plus``.
        model: module called as ``model(input_ids, attention_mask,
            token_type_ids)`` whose first output holds the logits.

    Returns:
        ``(label, prob)``: the index of the largest logit and the softmax
        probability of that class.
    """
    # Every input is padded/truncated to the tokenizer's declared maximum length.
    max_len = tokenizer.model_max_length
    # The original body read an undefined name `text_pair` here (NameError);
    # the value to encode is the `text` parameter.
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt',
    )
    # Follow the model's own device instead of assuming cuda:0, and move the
    # existing tensors rather than re-wrapping them with torch.tensor().
    device = next(model.parameters()).device
    input_ids = inputs['input_ids'].to(device=device, dtype=torch.long)
    attention_mask = inputs['attention_mask'].to(device=device, dtype=torch.long)
    token_type_ids = inputs["token_type_ids"].to(device=device, dtype=torch.long)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        y_hat = model(input_ids, attention_mask, token_type_ids)[0]
    class_probs = F.softmax(y_hat, dim=1)[0]
    prob = class_probs.max().item()
    label = y_hat[0].argmax().item()
    return label, prob

In [None]:
class Triple_based_Filtering_Test:
    """Score labelled rows with the fine-tuned classifier for evaluation.

    Calling the instance with a DataFrame adds three columns to it:
    ``pred_label`` (predicted class index), ``prob`` (softmax probability of
    the predicted class) and ``true_label`` (class index of the ``label``
    column).

    Args:
        model_name: name used for both the tokenizer and the backbone.
        model_file_path: path of the Lightning checkpoint to load.
    """

    def __init__(
        self,
        model_name: str,
        model_file_path: str
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # load_from_checkpoint is a classmethod that constructs the module
        # itself; creating an instance first only loaded the backbone twice.
        model = TriplebasedFinetuner.load_from_checkpoint(
            checkpoint_path=model_file_path, model_name=model_name
        )
        self.model = model.eval().cuda(device=0)
        self.labels_transform_dict = {"X":0, "Y":1}

    def __call__(self, raw_results_df):
        """Annotate ``raw_results_df`` in place and return it."""
        self.results_df = raw_results_df
        self.append_labelAndpro()
        return self.results_df

    def predict_labelAndpro(self, text_pair):
        """Return ``(label, prob)`` for one ``[question_answer, paragraph]`` pair."""
        # Every input is padded/truncated to the tokenizer's declared maximum length.
        max_len = self.tokenizer.model_max_length
        inputs = self.tokenizer.encode_plus(
            text_pair,
            add_special_tokens=True,
            max_length=max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )
        # Move the tensors the tokenizer already produced to the model's
        # device; torch.tensor(existing_tensor) copies and emits a warning.
        device = next(self.model.parameters()).device
        input_ids = inputs['input_ids'].to(device=device, dtype=torch.long)
        attention_mask = inputs['attention_mask'].to(device=device, dtype=torch.long)
        token_type_ids = inputs["token_type_ids"].to(device=device, dtype=torch.long)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            y_hat = self.model(input_ids, attention_mask, token_type_ids)[0]
        class_probs = F.softmax(y_hat, dim=1)[0]
        prob = class_probs.max().item()
        label = y_hat[0].argmax().item()
        return label, prob

    def append_labelAndpro(self):
        """Fill ``pred_label``, ``prob`` and ``true_label`` for every row.

        Results are gathered in lists and assigned as whole columns. Writing
        into the rows yielded by ``iterrows()`` is not reliable, because
        those rows can be copies whose changes never reach the DataFrame.
        """
        frame = self.results_df
        pred_labels, probs, true_labels = [], [], []
        row_fields = zip(frame["question"], frame["answer"], frame["para_text"], frame["label"])
        for question, answer, para_text, raw_label in row_fields:
            true_labels.append(self.labels_transform_dict[raw_label])
            text_pair = [question + " " + answer, para_text]
            pred_label, prob = self.predict_labelAndpro(text_pair)
            pred_labels.append(pred_label)
            probs.append(prob)
        frame['pred_label'] = pred_labels
        frame['prob'] = probs
        frame['true_label'] = true_labels

In [None]:
model_name = "roberta-base"
# Checkpoint path: same directory and file name the training checkpoint callback uses.
model_file_path =f"./models/triple-based_filtering/{model_name}.ckpt"
Triple_Processing = Triple_based_Filtering_Test(model_name,model_file_path)

total_data = feather.read_feather('../Other_Data/triple-based_filtering/triple-based_filtering_data.feather')
# Evaluate on the rows marked "test" — the same rows training used for validation.
test_df = total_data[total_data["split"]=="test"].reset_index(drop=True)
print(len(test_df))

In [None]:
# Adds the pred_label / prob / true_label columns to test_df and returns it.
filtered_df = Triple_Processing(test_df)

In [None]:
# a: rows correctly predicted as class 0; b: rows correctly predicted as class 1.
a = len(filtered_df.loc[(filtered_df["pred_label"]==0) & (filtered_df["true_label"]==0)])
b = len(filtered_df.loc[(filtered_df["pred_label"]==1) & (filtered_df["true_label"]==1)])
# Accuracy = correct predictions / all rows.
accuracy = (a+b)/len(filtered_df) 
print(accuracy)  # model accuracy on the test split

In [None]:
thred = 0.98  # probability threshold; try different values here
# a: rows predicted as class 1 with prob >= thred that are truly class 1.
a = len(filtered_df.loc[(filtered_df["pred_label"]==1) & (filtered_df["prob"]>=thred) & (filtered_df["true_label"]==1)])
# b: all rows predicted as class 1 with prob >= thred.
b = len(filtered_df.loc[(filtered_df["pred_label"]==1) & (filtered_df["prob"]>=thred)])
# Precision of the rows kept at this threshold. A strict threshold can keep
# no rows at all, in which case a/b would raise ZeroDivisionError.
print(a/b if b else float("nan"))

In [None]:
thred = 0.99  # probability threshold; try different values here
# a: rows predicted as class 1 with prob >= thred that are truly class 1.
a = len(filtered_df.loc[(filtered_df["pred_label"]==1) & (filtered_df["prob"]>=thred) & (filtered_df["true_label"]==1)])
# b: all rows predicted as class 1 with prob >= thred.
b = len(filtered_df.loc[(filtered_df["pred_label"]==1) & (filtered_df["prob"]>=thred)])
# Precision of the rows kept at this threshold. A strict threshold can keep
# no rows at all, in which case a/b would raise ZeroDivisionError.
print(a/b if b else float("nan"))

# Classification and Filtering

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import pyarrow.feather as feather
from sklearn.metrics import accuracy_score

import torch
from torch import nn
import pytorch_lightning as pl
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from pytorch_lightning import Trainer, seed_everything
# Seed the RNGs through pytorch_lightning's seed_everything so runs are repeatable.
seed_everything(2)
ngpus = 1  # fixed to a single GPU; torch.cuda.device_count() would use every visible one

Global seed set to 2


In [2]:
class TriplebasedFinetuner(pl.LightningModule):
    """Inference-only definition of the classifier (backbone + linear head).

    The backbone is loaded with ``AutoModel.from_pretrained(model_name)`` and
    the head maps the hidden state at sequence position 0 to two logits.

    Args:
        model_name: identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.lg_model = AutoModel.from_pretrained(model_name)
        self.W = nn.Linear(self.lg_model.config.hidden_size, 2)
        self.num_classes = 2

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(logits, second_output)`` for a batch of encoded pairs.

        ``second_output`` is the second element of the backbone's tuple output.
        NOTE(review): the original code names it ``attn``, but for BERT-style
        models this slot is presumably the pooled output — confirm.
        """
        backbone_out = self.lg_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        hidden_states, second_output = backbone_out
        first_token = hidden_states[:, 0]
        return self.W(first_token), second_output

class Triple_based_Filtering:
    """Keep only the rows the classifier confidently assigns to class 1.

    Calling the instance with a DataFrame adds ``pred_label`` and ``prob``
    columns to it, then returns the rows with ``pred_label == 1`` and
    ``prob >= thred`` under a fresh index.

    Args:
        model_name: name used for both the tokenizer and the backbone.
        model_file_path: path of the Lightning checkpoint to load.
        thred: minimum probability for a class-1 row to be kept.
        paraid_text_dict: maps each row's ``para_id`` to its paragraph text.
    """

    def __init__(
        self,
        model_name: str,
        model_file_path: str,
        thred: float,
        paraid_text_dict: dict
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # load_from_checkpoint is a classmethod that constructs the module
        # itself; creating an instance first only loaded the backbone twice.
        model = TriplebasedFinetuner.load_from_checkpoint(
            checkpoint_path=model_file_path, model_name=model_name
        )
        self.model = model.eval().cuda(device=0)
        self.thred = thred
        self.paraid_text_dict = paraid_text_dict

    def __call__(self, raw_results_df):
        """Score ``raw_results_df`` in place and return the filtered rows."""
        self.results_df = raw_results_df
        self.append_labelAndpro()
        keep = (self.results_df["pred_label"]==1) & (self.results_df["prob"]>=self.thred)
        self.results_df = self.results_df.loc[keep].reset_index(drop=True)
        return self.results_df

    def predict_labelAndpro(self, text_pair):
        """Return ``(label, prob)`` for one ``[question_answer, paragraph]`` pair."""
        # Every input is padded/truncated to the tokenizer's declared maximum length.
        max_len = self.tokenizer.model_max_length
        inputs = self.tokenizer.encode_plus(
            text_pair,
            add_special_tokens=True,
            max_length=max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )
        # Move the tensors the tokenizer already produced to the model's
        # device; torch.tensor(existing_tensor) copies and emits a warning.
        device = next(self.model.parameters()).device
        input_ids = inputs['input_ids'].to(device=device, dtype=torch.long)
        attention_mask = inputs['attention_mask'].to(device=device, dtype=torch.long)
        token_type_ids = inputs["token_type_ids"].to(device=device, dtype=torch.long)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            y_hat = self.model(input_ids, attention_mask, token_type_ids)[0]
        class_probs = F.softmax(y_hat, dim=1)[0]
        prob = class_probs.max().item()
        label = y_hat[0].argmax().item()
        return label, prob

    def append_labelAndpro(self):
        """Fill the ``pred_label`` and ``prob`` columns for every row.

        Results are gathered in lists and assigned as whole columns. Writing
        into the rows yielded by ``iterrows()`` is not reliable, because
        those rows can be copies whose changes never reach the DataFrame.
        """
        frame = self.results_df
        pred_labels, probs = [], []
        row_fields = zip(frame["question"], frame["answer"], frame["para_id"])
        for question, answer, para_id in row_fields:
            text_pair = [question + " " + answer, self.paraid_text_dict[para_id]]
            pred_label, prob = self.predict_labelAndpro(text_pair)
            pred_labels.append(pred_label)
            probs.append(prob)
        frame['pred_label'] = pred_labels
        frame['prob'] = probs

In [3]:
thred = 0.99  # minimum class-1 probability for a row to survive filtering
model_name = "roberta-base"
model_file_path =f"./models/triple-based_filtering/{model_name}.ckpt"

# NOTE: pickle can execute arbitrary code while loading; only load files from a trusted source.
# The context manager closes the file handle, which pickle.load(open(...)) left open.
with open("data/examples.pickle", "rb") as examples_file:
    examples = pickle.load(examples_file)
# Item 0 of each example is used as the paragraph id and item 2 as its text.
paraid_text_dict={example[0]:example[2] for example in examples}
raw_results_df=feather.read_feather("data/raw_results_After_4thModule.feather")
print(len(raw_results_df))

66


In [4]:
# Loads the tokenizer and the fine-tuned checkpoint; the model is placed on GPU 0.
Triple_Processing = Triple_based_Filtering(model_name,model_file_path,thred,paraid_text_dict)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaMod

In [5]:
# Keeps only the rows predicted as class 1 with probability >= thred.
filtered_df = Triple_Processing(raw_results_df)
print(len(filtered_df))



35


In [6]:
# Keep only the listed columns; the pred_label / prob columns added by the filter are dropped.
filtered_df = filtered_df[["org_question", "trans_question", "org_answer", "trans_answer", "ans_pos", "para_id", "trans_que", "trans_ans"]]
filtered_df.to_feather("data/final_results_After_5thModule.feather")