# Train Specificity Model & Temporal Ambiguity Model

## Training Specificity Model (Run only once to train the model)

### Dataset

In [None]:
import pickle
import numpy as np
import pandas as pd
import multiprocessing
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from torch.utils.data import RandomSampler, DataLoader
from pytorch_lightning import seed_everything

seed_everything(42)

In [None]:
class SpecificityData(Dataset):
    """Dataset of (sentence, label) rows tokenized on access.

    ``dataframe`` must expose ``Sent`` and ``Label`` columns. Rows are looked
    up with ``series[index]``, so the frame is expected to carry a 0..n-1
    index (the dataloader builder resets the index before constructing this).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.text = dataframe.Sent
        self.targets = dataframe.Label
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return ``[input_ids, attention_mask, token_type_ids, label]`` as long tensors."""
        # Collapse every run of whitespace into a single space before tokenizing.
        sentence = " ".join(str(self.text[index]).split())
        encoded = self.tokenizer(
            sentence,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        fields = [
            encoded['input_ids'],
            encoded['attention_mask'],
            encoded["token_type_ids"],
            self.targets[index],
        ]
        return [torch.tensor(value, dtype=torch.long) for value in fields]

def generate_specificity_dataloaders(model_name, train_size,  train_batch_size, val_batch_size):
    """Build the train/validation dataloaders for the sentence-specificity task.

    Reads one sentence per line from ``data.txt`` and one integer label per
    line from ``label.txt``, shifts the labels down by one (0: general,
    1: specific), drops duplicate sentences (keeping the last occurrence) and
    splits the rows into a training fraction and its complement.

    Args:
        model_name: Hugging Face model id used to load the tokenizer.
        train_size: Fraction of rows sampled for training (``DataFrame.sample(frac=...)``).
        train_batch_size: Batch size of the shuffled training loader.
        val_batch_size: Batch size of the validation loader.

    Returns:
        ``(train_dataloader, val_dataloader)``.
    """
    # Context managers close the files; the original left both handles open.
    with open('../Other_Data/sentence_specificity/data.txt', 'r') as sentence_file:
        # The last element is dropped (presumably the empty string after a trailing newline).
        sentence_list = sentence_file.read().split("\n")[:-1]
    with open('../Other_Data/sentence_specificity/label.txt', 'r') as label_file:
        label_list = list(map(int, label_file.read().split("\n")))
    assert len(sentence_list)==len(label_list)

    data_df = pd.DataFrame({'Sent':sentence_list,'Label': label_list})
    data_df['Label'] -= 1  #0:general; 1:specific
    data_df=data_df[~data_df.duplicated(['Sent'],keep='last')].reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Every example is padded to this length.
    MAX_LEN = tokenizer.model_max_length

    # Fixed random_state keeps the split reproducible across runs.
    train_df=data_df.sample(frac=train_size, random_state=200)
    validate_df=data_df.drop(train_df.index).reset_index(drop=True)
    train_df = train_df.reset_index(drop=True)

    train_set = SpecificityData(train_df, tokenizer, MAX_LEN)
    val_set = SpecificityData(validate_df, tokenizer, MAX_LEN)

    worker_count = multiprocessing.cpu_count()
    specificity_train_dataloader = DataLoader(train_set, sampler=RandomSampler(train_set), num_workers=worker_count, batch_size=train_batch_size)
    specificity_val_dataloader = DataLoader(val_set, num_workers=worker_count, batch_size=val_batch_size)
    return specificity_train_dataloader, specificity_val_dataloader

### Model

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import AutoModel
from sklearn.metrics import accuracy_score
from pytorch_lightning.callbacks import ModelCheckpoint

In [None]:
class SpecificityFinetuner(pl.LightningModule):
    """Pretrained encoder plus a two-way linear head for sentence specificity.

    The head reads the encoder's hidden state at sequence position 0. The
    train/validation dataloaders come from the module-level globals
    ``gen_train_dataloader`` and ``gen_val_dataloader``, which must exist
    before the trainer asks for them.
    """

    def __init__(self, model_name):
        super().__init__()
        self.lg_model = AutoModel.from_pretrained(model_name)
        self.W = nn.Linear(self.lg_model.config.hidden_size, 2)
        self.num_classes = 2

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(logits, second_encoder_output)``.

        NOTE(review): the original code names the second tuple element
        ``attn``; for BERT/RoBERTa base models called with
        ``return_dict=False`` it is presumably the pooled output rather than
        attention weights - confirm before relying on it.
        """
        hidden_states, second_output = self.lg_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.W(hidden_states[:, 0]), second_output

    def _loss_and_accuracy(self, batch):
        """Compute cross-entropy loss and batch accuracy for one evaluation batch."""
        input_ids, attention_mask, token_type_ids, label = batch
        logits, _ = self(input_ids, attention_mask, token_type_ids)
        loss = F.cross_entropy(logits, label)
        # torch.max over dim=1 yields (values, indices); the indices are the predicted classes.
        _, predicted = torch.max(logits, dim=1)
        accuracy = torch.tensor(accuracy_score(predicted.cpu(), label.cpu()))
        return loss, accuracy

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, token_type_ids, label = batch
        logits, _ = self(input_ids, attention_mask, token_type_ids)
        loss = F.cross_entropy(logits, label)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, accuracy = self._loss_and_accuracy(batch)
        return {'val_loss': loss, 'val_acc': accuracy}

    def validation_epoch_end(self, outputs):
        # Unweighted mean over batches.
        losses = [step['val_loss'] for step in outputs]
        accuracies = [step['val_acc'] for step in outputs]
        self.log('avg_val_loss', torch.stack(losses).mean(), prog_bar=True)
        self.log('avg_val_acc', torch.stack(accuracies).mean(), prog_bar=True)

    def test_step(self, batch, batch_idx):
        loss, accuracy = self._loss_and_accuracy(batch)
        return {'test_loss': loss, 'test_acc': accuracy}

    def test_epoch_end(self, outputs):
        losses = [step['test_loss'] for step in outputs]
        accuracies = [step['test_acc'] for step in outputs]
        self.log('avg_test_loss', torch.stack(losses).mean(), prog_bar=True)
        self.log('avg_test_acc', torch.stack(accuracies).mean(), prog_bar=True)

    def configure_optimizers(self):
        trainable = [param for param in self.parameters() if param.requires_grad]
        return torch.optim.Adam(trainable, lr=2e-05, eps=1e-08)

    def train_dataloader(self):
        return gen_train_dataloader

    def val_dataloader(self):
        return gen_val_dataloader

def return_ModelCheckpoint(model_name):
    """Create the checkpoint callback for the specificity model.

    Keeps only the single checkpoint with the highest ``avg_val_acc`` and
    names the file after ``model_name``.
    """
    return ModelCheckpoint(
        dirpath="./models/sentence_specificity",
        filename=model_name,
        monitor='avg_val_acc',
        mode='max',
        save_top_k=1,
    )

### Training (roberta-base)

#### Train

In [None]:
# Settings for fine-tuning the specificity classifier.
# train_size is the fraction of rows sampled into the training split.
train_size = 0.9
train_batch_size = 4
val_batch_size = 4
model_name = "roberta-base"

# SpecificityFinetuner.train_dataloader()/val_dataloader() return these two globals.
gen_train_dataloader, gen_val_dataloader = generate_specificity_dataloaders(model_name, train_size, train_batch_size, val_batch_size)

In [None]:
# Fine-tune on one GPU for up to 10 epochs; the callback keeps the checkpoint
# with the best avg_val_acc under ./models/sentence_specificity.
model = SpecificityFinetuner(model_name)
checkpoint_callback = return_ModelCheckpoint(model_name)
trainer = pl.Trainer(gpus=1, max_epochs=10, callbacks=[checkpoint_callback], accumulate_grad_batches=2)    
# No dataloaders are passed here: the module's own train_dataloader()/val_dataloader() hooks supply them.
trainer.fit(model)

#### Load Model And Test

In [None]:
# Rebuild the dataloaders for evaluation. The split uses the same fraction and
# the fixed random_state inside generate_specificity_dataloaders; only the
# batch sizes differ from the training cell.
train_size = 0.9
train_batch_size = 8
val_batch_size = 8
model_name = "roberta-base"
gen_train_dataloader, gen_val_dataloader = generate_specificity_dataloaders(model_name, train_size, train_batch_size, val_batch_size)

In [None]:
model_name = "roberta-base"
model_file_path = f"./models/sentence_specificity/{model_name}.ckpt"
# load_from_checkpoint is a classmethod that constructs the module itself, so
# call it on the class. Building a throwaway instance first loaded the
# pretrained encoder twice. model_name is passed explicitly because __init__
# does not save its hyperparameters into the checkpoint.
model = SpecificityFinetuner.load_from_checkpoint(checkpoint_path=model_file_path,model_name=model_name)
trainer = pl.Trainer(gpus=1)
# The held-out validation split doubles as the test set here.
trainer.test(model, test_dataloaders=model.val_dataloader())

## Training Temporal Ambiguity Model (Run only once to train the model)

### Dataset

In [1]:
import pickle
import numpy as np
import pandas as pd
import multiprocessing
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from torch.utils.data import RandomSampler, DataLoader
from pytorch_lightning import seed_everything

seed_everything(42)

Global seed set to 42


42

In [2]:
class AmbiguityData(Dataset):
    """Dataset of (question, label) rows tokenized on access.

    ``dataframe`` must expose ``Sent`` and ``Label`` columns. Rows are looked
    up with ``series[index]``, so the frame is expected to carry a 0..n-1
    index (the dataloader builder resets the index before constructing this).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.text = dataframe.Sent
        self.targets = dataframe.Label
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return ``[input_ids, attention_mask, token_type_ids, label]`` as long tensors."""
        # Collapse every run of whitespace into a single space before tokenizing.
        question = " ".join(str(self.text[index]).split())
        encoded = self.tokenizer(
            question,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        fields = [
            encoded['input_ids'],
            encoded['attention_mask'],
            encoded["token_type_ids"],
            self.targets[index],
        ]
        return [torch.tensor(value, dtype=torch.long) for value in fields]

def generate_ambiguity_dataloaders(model_name, train_size,  train_batch_size, val_batch_size):
    """Build the train/validation dataloaders for the temporal-ambiguity task.

    Loads a pickled sequence of records whose first element is the question
    text and whose second element is the label (0: ambiguous,
    1: non-ambiguous), drops duplicate questions (keeping the last
    occurrence) and splits the rows into a training fraction and its
    complement.

    Args:
        model_name: Hugging Face model id used to load the tokenizer.
        train_size: Fraction of rows sampled for training (``DataFrame.sample(frac=...)``).
        train_batch_size: Batch size of the shuffled training loader.
        val_batch_size: Batch size of the validation loader.

    Returns:
        ``(train_dataloader, val_dataloader)``.
    """
    # NOTE: unpickling executes arbitrary code; only load this file from a trusted source.
    # The context manager closes the file; the original left the handle open.
    with open('../Other_Data/question_temporally_ambiguity/ambiguity_filtering_data.pickle', "rb") as data_file:
        question_label_list = pickle.load(data_file)
    sentence_list = [record[0] for record in question_label_list]
    label_list = [int(record[1]) for record in question_label_list]

    #0:ambiguous; 1:non-ambiguous
    data_df = pd.DataFrame({'Sent':sentence_list,'Label': label_list})
    data_df=data_df[~data_df.duplicated(['Sent'],keep='last')].reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Every example is padded to this length.
    MAX_LEN = tokenizer.model_max_length

    # Fixed random_state keeps the split reproducible across runs.
    train_df=data_df.sample(frac=train_size, random_state=200)
    validate_df=data_df.drop(train_df.index).reset_index(drop=True)
    train_df = train_df.reset_index(drop=True)

    train_set = AmbiguityData(train_df, tokenizer, MAX_LEN)
    val_set = AmbiguityData(validate_df, tokenizer, MAX_LEN)

    worker_count = multiprocessing.cpu_count()
    ambiguity_train_dataloader = DataLoader(train_set, sampler=RandomSampler(train_set), num_workers=worker_count, batch_size=train_batch_size)
    ambiguity_val_dataloader = DataLoader(val_set, num_workers=worker_count, batch_size=val_batch_size)
    return ambiguity_train_dataloader, ambiguity_val_dataloader

### Model

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import AutoModel
from sklearn.metrics import accuracy_score
from pytorch_lightning.callbacks import ModelCheckpoint

In [4]:
class AmbiguityFinetuner(pl.LightningModule):
    """Pretrained encoder plus a two-way linear head for question ambiguity.

    The head reads the encoder's hidden state at sequence position 0. The
    train/validation dataloaders come from the module-level globals
    ``gen_train_dataloader`` and ``gen_val_dataloader``, which must exist
    before the trainer asks for them.
    """

    def __init__(self, model_name):
        super().__init__()
        self.lg_model = AutoModel.from_pretrained(model_name)
        self.W = nn.Linear(self.lg_model.config.hidden_size, 2)
        self.num_classes = 2

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(logits, second_encoder_output)``.

        NOTE(review): the original code names the second tuple element
        ``attn``; for BERT-style base models called with
        ``return_dict=False`` it is presumably the pooled output rather than
        attention weights - confirm before relying on it.
        """
        hidden_states, second_output = self.lg_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.W(hidden_states[:, 0]), second_output

    def _loss_and_accuracy(self, batch):
        """Compute cross-entropy loss and batch accuracy for one evaluation batch."""
        input_ids, attention_mask, token_type_ids, label = batch
        logits, _ = self(input_ids, attention_mask, token_type_ids)
        loss = F.cross_entropy(logits, label)
        # torch.max over dim=1 yields (values, indices); the indices are the predicted classes.
        _, predicted = torch.max(logits, dim=1)
        accuracy = torch.tensor(accuracy_score(predicted.cpu(), label.cpu()))
        return loss, accuracy

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, token_type_ids, label = batch
        logits, _ = self(input_ids, attention_mask, token_type_ids)
        loss = F.cross_entropy(logits, label)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, accuracy = self._loss_and_accuracy(batch)
        return {'val_loss': loss, 'val_acc': accuracy}

    def validation_epoch_end(self, outputs):
        # Unweighted mean over batches.
        losses = [step['val_loss'] for step in outputs]
        accuracies = [step['val_acc'] for step in outputs]
        self.log('avg_val_loss', torch.stack(losses).mean(), prog_bar=True)
        self.log('avg_val_acc', torch.stack(accuracies).mean(), prog_bar=True)

    def test_step(self, batch, batch_idx):
        loss, accuracy = self._loss_and_accuracy(batch)
        return {'test_loss': loss, 'test_acc': accuracy}

    def test_epoch_end(self, outputs):
        losses = [step['test_loss'] for step in outputs]
        accuracies = [step['test_acc'] for step in outputs]
        self.log('avg_test_loss', torch.stack(losses).mean(), prog_bar=True)
        self.log('avg_test_acc', torch.stack(accuracies).mean(), prog_bar=True)

    def configure_optimizers(self):
        trainable = [param for param in self.parameters() if param.requires_grad]
        return torch.optim.Adam(trainable, lr=2e-05, eps=1e-08)

    def train_dataloader(self):
        return gen_train_dataloader

    def val_dataloader(self):
        return gen_val_dataloader

def return_ModelCheckpoint(model_name):
    """Create the checkpoint callback for the temporal-ambiguity model.

    Keeps only the single checkpoint with the highest ``avg_val_acc`` and
    names the file after ``model_name``.
    """
    return ModelCheckpoint(
        dirpath="./models/question_temporally_ambiguity",
        filename=f'{model_name}',
        monitor='avg_val_acc',
        mode='max',
        save_top_k=1,
    )

### Training (bert-base-uncased)

#### Train

In [None]:
# Settings for fine-tuning the temporal-ambiguity classifier.
# train_size is the fraction of rows sampled into the training split.
train_size = 0.9
train_batch_size = 4
val_batch_size = 4
model_name = "bert-base-uncased"

# AmbiguityFinetuner.train_dataloader()/val_dataloader() return these two globals.
gen_train_dataloader, gen_val_dataloader = generate_ambiguity_dataloaders(model_name, train_size, train_batch_size, val_batch_size)

In [None]:
# Fine-tune on one GPU for up to 10 epochs; the callback keeps the checkpoint
# with the best avg_val_acc under ./models/question_temporally_ambiguity.
model = AmbiguityFinetuner(model_name)
checkpoint_callback = return_ModelCheckpoint(model_name)
trainer = pl.Trainer(gpus=1, max_epochs=10, callbacks=[checkpoint_callback], accumulate_grad_batches=2)    
# No dataloaders are passed here: the module's own train_dataloader()/val_dataloader() hooks supply them.
trainer.fit(model)

#### Load Model And Test

In [5]:
# Rebuild the dataloaders for evaluation. The split uses the same fraction and
# the fixed random_state inside generate_ambiguity_dataloaders; only the
# batch sizes differ from the training cell.
train_size = 0.9
train_batch_size = 8
val_batch_size = 8
model_name = "bert-base-uncased"
gen_train_dataloader, gen_val_dataloader = generate_ambiguity_dataloaders(model_name, train_size, train_batch_size, val_batch_size)

In [6]:
model_name = "bert-base-uncased"
model_file_path = f"./models/question_temporally_ambiguity/{model_name}.ckpt"
# load_from_checkpoint is a classmethod that constructs the module itself, so
# call it on the class. Building a throwaway instance first loaded the
# pretrained encoder twice. model_name is passed explicitly because __init__
# does not save its hyperparameters into the checkpoint.
model = AmbiguityFinetuner.load_from_checkpoint(checkpoint_path=model_file_path,model_name=model_name)
trainer = pl.Trainer(gpus=1)
# The held-out validation split doubles as the test set here.
trainer.test(model, test_dataloaders=model.val_dataloader())

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.tr

HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'avg_test_acc': 0.794685959815979, 'avg_test_loss': 0.5140613913536072}
--------------------------------------------------------------------------------


[{'avg_test_loss': 0.5140613913536072, 'avg_test_acc': 0.794685959815979}]

# Classification and Filtering

In [1]:
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from sklearn.metrics import accuracy_score
from transformers import pipeline, AutoModel, AutoTokenizer
from pytorch_lightning import seed_everything
seed_everything(42)
import pickle
import pyarrow.feather as feather

class Specificity_Ambiguity_Model(pl.LightningModule):
    """Inference-only classifier: a pretrained encoder plus a two-way linear head.

    The attribute names (``lg_model``, ``W``) match the training modules in
    this notebook, so checkpoints saved by them load into this class.
    """

    def __init__(self, model_name):
        super().__init__()
        self.lg_model = AutoModel.from_pretrained(model_name)
        self.W = nn.Linear(self.lg_model.config.hidden_size, 2)
        self.num_classes = 2

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(logits, second_encoder_output)``.

        NOTE(review): the original code names the second tuple element
        ``attn``; for BERT/RoBERTa base models called with
        ``return_dict=False`` it is presumably the pooled output rather than
        attention weights - confirm before relying on it.
        """
        hidden_states, second_output = self.lg_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        # The classification head reads the hidden state at sequence position 0.
        return self.W(hidden_states[:, 0]), second_output

def predict_text(text, tokenizer, model):
    """Classify a single text and return the index of the highest logit.

    Args:
        text: Input string; runs of whitespace are collapsed before tokenizing.
        tokenizer: Callable tokenizer exposing ``model_max_length``.
        model: Callable taking ``(input_ids, attention_mask, token_type_ids)``
            and returning a tuple whose first element holds the logits.

    Returns:
        The predicted class index as a Python int.
    """
    text = " ".join(text.split())
    max_len = tokenizer.model_max_length
    inputs = tokenizer(
            [text],
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
    # Place the inputs on whatever device the model lives on instead of
    # assuming GPU 0; a model without parameters falls back to the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long).to(device)
    attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).to(device)
    token_type_ids = torch.tensor(inputs["token_type_ids"], dtype=torch.long).to(device)
    # Pure inference: skip autograd bookkeeping so no graph is kept per call.
    with torch.no_grad():
        y_hat = model(input_ids, attention_mask, token_type_ids)[0]
    return y_hat.argmax().item()

Global seed set to 42


In [2]:
class Specificity_Ambiguity_Filtering:
    """Filter generated question/answer rows with two binary classifiers.

    A row is kept only if (1) the sentence containing its answer is predicted
    specific and (2) its question is predicted non-ambiguous. Specificity
    predictions are cached per answer-sentence span in ``ans_sent_label``
    because many rows share the same sentence.
    """

    def __init__(
        self,
        specificity_model_name: str, 
        specificity_model_file_path: str,
        ambiguity_model_name: str, 
        ambiguity_model_file_path: str,
        paraid_text_dict: dict
    ):
        """Load both tokenizers and both fine-tuned checkpoints onto GPU 0.

        Args:
            specificity_model_name: Hugging Face id of the specificity backbone.
            specificity_model_file_path: Checkpoint path of the specificity model.
            ambiguity_model_name: Hugging Face id of the ambiguity backbone.
            ambiguity_model_file_path: Checkpoint path of the ambiguity model.
            paraid_text_dict: Maps a ``para_id`` to the paragraph text that
                answer-sentence positions index into.
        """
        # load_from_checkpoint is a classmethod that constructs the module
        # itself; calling it on the class avoids building (and loading the
        # pretrained weights of) a throwaway instance first.
        self.spe_tokenizer = AutoTokenizer.from_pretrained(specificity_model_name)
        spe_model = Specificity_Ambiguity_Model.load_from_checkpoint(
            checkpoint_path=specificity_model_file_path, model_name=specificity_model_name)
        self.spe_model = spe_model.eval().cuda(device=0)

        self.amb_tokenizer = AutoTokenizer.from_pretrained(ambiguity_model_name)
        amb_model = Specificity_Ambiguity_Model.load_from_checkpoint(
            checkpoint_path=ambiguity_model_file_path, model_name=ambiguity_model_name)
        self.amb_model = amb_model.eval().cuda(device=0)

        self.paraid_text_dict = paraid_text_dict
        # Cache: "<para_id>_<start>_<end>" -> specificity label.
        self.ans_sent_label = dict()

    def __call__(self, raw_results_df):
        """Return the rows of ``raw_results_df`` that pass both filters.

        The input frame is not copied, so it gains a ``specificity`` column as
        a side effect. The returned frame is a new, re-indexed frame carrying
        both the ``specificity`` and ``non_ambiguity`` columns.
        """
        self.results_df = raw_results_df
        self.append_labelAndpro(model_tag = "specificity")
        self.results_df = self.results_df.loc[(self.results_df["specificity"]==1)].reset_index(drop=True)
        # The ambiguity model only runs on rows that survived the first filter.
        self.append_labelAndpro(model_tag = "non_ambiguity")
        self.results_df = self.results_df.loc[(self.results_df["non_ambiguity"]==1)].reset_index(drop=True)
        return self.results_df

    def predict_text(self, text, tokenizer, model):
        """Classify a single text and return the index of the highest logit."""
        text = " ".join(text.split())
        max_len = tokenizer.model_max_length
        inputs = tokenizer(
                [text],
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_token_type_ids=True
            )
        # Follow the model's device instead of assuming GPU 0; a model without
        # parameters falls back to the CPU.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
        input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long).to(device)
        attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).to(device)
        token_type_ids = torch.tensor(inputs["token_type_ids"], dtype=torch.long).to(device)
        # Pure inference: skip autograd bookkeeping so no graph is kept per call.
        with torch.no_grad():
            y_hat = model(input_ids, attention_mask, token_type_ids)[0]
        return y_hat.argmax().item()

    def _specificity_label(self, para_id, ans_pos):
        """Return the cached or freshly predicted label of one answer sentence."""
        dict_key = f"{para_id}_{ans_pos[0]}_{ans_pos[1]}"
        if dict_key not in self.ans_sent_label:
            ans_sent = self.paraid_text_dict[para_id][ans_pos[0]:ans_pos[1]]
            #0:general; 1:specific
            self.ans_sent_label[dict_key] = self.predict_text(ans_sent, self.spe_tokenizer, self.spe_model)
        return self.ans_sent_label[dict_key]

    def append_labelAndpro(self, model_tag):
        """Add a ``model_tag`` column of predicted labels to ``self.results_df``.

        Args:
            model_tag: ``"specificity"`` (label the answer sentence) or
                ``"non_ambiguity"`` (label the question).

        Raises:
            ValueError: If ``model_tag`` is neither of the two supported tags.
        """
        # Labels are collected first and assigned as a whole column. Writing
        # into the Series yielded by iterrows() is not guaranteed to reach the
        # DataFrame (the row may be a copy), which would leave the column None.
        if model_tag == "specificity":
            labels = [
                self._specificity_label(para_id, ans_pos)
                for para_id, ans_pos in zip(self.results_df["para_id"], self.results_df["ans-sent_pos"])
            ]
        elif model_tag == "non_ambiguity":
            #0:ambiguous; 1:non-ambiguous
            labels = [
                self.predict_text(question, self.amb_tokenizer, self.amb_model)
                for question in self.results_df["question"]
            ]
        else:
            raise ValueError(f"Unknown model_tag: {model_tag!r}")
        # dtype=object matches the column the original created with `= None`.
        self.results_df[model_tag] = pd.Series(labels, index=self.results_df.index, dtype=object)

In [3]:
# Backbones and checkpoint paths produced by the two training sections.
specificity_model_name = "roberta-base"
specificity_model_file_path = f"./models/sentence_specificity/{specificity_model_name}.ckpt"
ambiguity_model_name = "bert-base-uncased"
ambiguity_model_file_path = f"./models/question_temporally_ambiguity/{ambiguity_model_name}.ckpt"

# NOTE: unpickling executes arbitrary code; only load this file from a trusted source.
# The context manager closes the file; the original left the handle open.
with open("data/examples.pickle", "rb") as examples_file:
    examples=pickle.load(examples_file)
# example[0] is used as the para_id key and example[2] as the paragraph text
# that answer-sentence positions slice into.
paraid_text_dict={example[0]:example[2] for example in examples}
raw_results_df=feather.read_feather("data/raw_results_After_3rdModule.feather")
print(len(raw_results_df))
raw_results_df.head(1)

76


Unnamed: 0,question,answer,org_answer,ans_pos,ans-sent_pos,para_id,trans_que,trans_ans
0,Which country's hard-line Revolutionary Guard ...,Iran,Iran,"[0, 4]","[0, 249]",1650014_0,,


In [4]:
# Build the filter once: this loads both tokenizers and both fine-tuned
# checkpoints and moves the models to GPU 0.
SpeAndAmb_Processing = Specificity_Ambiguity_Filtering(specificity_model_name,specificity_model_file_path,
                                                       ambiguity_model_name,ambiguity_model_file_path,
                                                       paraid_text_dict)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaMod

In [5]:
# Keep only rows whose answer sentence is labelled specific (1) and whose
# question is labelled non-ambiguous (1).
filtered_df = SpeAndAmb_Processing(raw_results_df)

In [6]:
# Number of rows that survived both filters, followed by a preview.
print(len(filtered_df))
filtered_df.head(3)

66


Unnamed: 0,question,answer,org_answer,ans_pos,ans-sent_pos,para_id,trans_que,trans_ans,specificity,non_ambiguity
0,Which country's hard-line Revolutionary Guard ...,Iran,Iran,"[0, 4]","[0, 249]",1650014_0,,,1,1
1,What group said the death sentence against Sal...,Revolutionary Guard,Revolutionary Guard,"[17, 36]","[0, 249]",1650014_0,,,1,1
2,On what day did Iran's Revolutionary Guard say...,Saturday,Saturday,"[45, 53]","[0, 249]",1650014_0,,,1,1


In [7]:
# Drop the helper columns ("ans-sent_pos", "specificity", "non_ambiguity")
# before writing the result for the next module.
filtered_df = filtered_df[["question","answer","org_answer","ans_pos","para_id","trans_que","trans_ans"]]
filtered_df.to_feather("data/raw_results_After_4thModule.feather")