# modeling5-nli-binaryclassifier-Trainer
- nli-binaryclassifier using Huggingface Trainer

In [7]:
# !pip install evaluate

In [10]:
import json
import os
import logging
import sys
import evaluate
import wandb
from copy import deepcopy
import transformers
import torch
import numpy as np
import random
from torch import nn
from torch.utils.data import Dataset
from transformers import (
    AutoModelForSequenceClassification, 
    AutoModel, 
    AutoConfig, 
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)
from util.arguments import ModelArguments, DataTrainingArguments 
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [11]:
def open_json(file):
    with open(file , 'r') as f: 
        data = json.load(f)
    return data

In [12]:
train = '/data/philhoon-relevance/KILT/kilt-dpr-retrieval/nq-train-multikilt.json'

In [13]:
train_data = open_json(train)

In [14]:
def preprocessing_data(json_file, sample_size:int, position:int):
    """
    sample_size : one to five
        e.g.)
            positive_sample = 1 positive passage + n-1 negative passage
            negative_sample = n negative passage
    cut_off : number of questions discarded when there is not enough negative passages
    position : position of positive passage (1 ~ n)
        e.g.) n = 2, position = 1
            instance = [negative passage, positive passage]
    """
    cut_off = 0
    instances = []
    sample_size = sample_size
    position = position
    total_questions = len(json_file) 
    
    for idx, samples in enumerate(json_file):
        answer = samples['answers'] 
        question = samples['question']
        negative_samples = []
    
        # 'hard_negative_ctxs' should be at least equal to sample_size
        # 'positive_ctx' which contains the answer should be at least one
        if len(samples['hard_negative_ctxs']) < sample_size or len(samples['positive_ctxs']) < 1:
            cut_off += 1
        else:
            cnt_negative_sample = 0
            for negative_sample in samples['hard_negative_ctxs']:
                if cnt_negative_sample > sample_size - 1:
                    break
                ng_s = negative_sample['text'].replace('\n', ' ')
                negative_samples.append(ng_s)
                cnt_negative_sample += 1
            
            # 'hard_negative_ctxs' sorted by its score, so shuffle them
            random.shuffle(negative_samples)
            
            # replace 1 negative_sample with one positive_sample in designated position
            positive_sample = samples['positive_ctxs'][0]['text'].replace('\n', ' ')
            positive_samples = deepcopy(negative_samples)
            positive_samples[position-1] = positive_sample 
            
            negative_template={
            'text' : negative_samples,
            'labels' : 0,
            'answer' : answer,
            'question' : question,
            }
            positive_template={
                'text' : positive_samples,
                'labels' : 1,
                'answer' : answer,
                'question' : question,
                'pos' : position,
            }
            instances.append(negative_template)
            instances.append(positive_template)
    
    return instances, cut_off, total_questions

In [16]:
instances, cut_off, total_questions = preprocessing_data(
            train_data,
            5,
            5)


In [18]:
from pprint import pprint

In [19]:
pprint(instances[0])

{'answer': ['Tracy McConnell'],
 'labels': 0,
 'question': 'how i.met your mother who is the mother',
 'text': ['Hopeless (How I Met Your Mother) "Hopeless" is the 21st episode of '
          'the sixth season of the CBS sitcom "How I Met Your Mother" and the '
          '133rd episode overall. It aired on April 18, 2011. Plot. The '
          'episode starts with Barney and his father, Jerry, parting in 1983. '
          "Back in the present, Barney is disappointed in Jerry's normal "
          'suburban lifestyle, and decides not to pursue any further contact. '
          'However, Barney is surprised by a call from Jerry, who',
          'Craig Thomas (screenwriter) Craig David Thomas is an American '
          'television writer who, along with writing partner Carter Bays, has '
          'written episodes of "American Dad!", "Oliver Beene", "Quintuplets" '
          'and the hit CBS sitcom "How I Met Your Mother", which they created '
          'in 2005. In 2012 "How I Met Your Mo

In [21]:
binary_train_file = '/data/philhoon-relevance/binary-classification/NQ-DEV-DPR/5-fold/1/binary_data/binary_ex_ctx100id_split_train_1.json'
binary_dev_file = 'binary_ex_ctx100id_split_dev_1.json'


In [22]:
binary_train_data = open_json(binary_train_file)

In [25]:
pprint(binary_train_data[0])

{'ctx': {'id': '533920',
         'text': 'he was unsatisfied with the book. Ellison ultimately wrote '
                 'more than 2,000 pages of this second novel but never '
                 'finished it. Ellison died on April 16, 1994 of pancreatic '
                 'cancer and was interred in a crypt at Trinity Church '
                 'Cemetery in the Washington Heights neighborhood of Upper '
                 'Manhattan. He was survived by his second wife, Fanny Ellison '
                 '(November 27, 1911 – November 19, 2005). "Invisible Man" won '
                 'the 1953 US National Book Award for Fiction. The award was '
                 'his ticket into the American literary establishment. He '
                 'eventually was admitted to the American Academy of Arts and '
                 "Letters, received two President's",
         'title': 'Ralph Ellison'},
 'em': '0',
 'id': 1,
 'question': 'how many pages is invisible man by ralph ellison'}


In [11]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, instances, tokenizer, max_length):
        self.instances = instances
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token
        self.max_length = max_length
    
    def __len__(self):
        return len(self.instances)
    
    def __getitem__(self, idx):
        input_ = [' ' + self.instances[idx]['question']] + self.instances[idx]['text']
        input_txt =  f' { self.sep_token } '.join(input_) + ' '
        
        output = self.tokenizer(
            input_txt, 
            # return_tensors="pt", will be applied later through collator
            # padding=True, will be padded later through collate
            truncation=True, 
            add_special_tokens=True, 
            max_length=self.max_length)
        
        item = {key : val for key, val in output.items()}
        item['labels'] = torch.tensor(self.instances[idx]['labels'])
        
        return item

In [12]:
parser = HfArgumentParser(
    (ModelArguments, DataTrainingArguments, TrainingArguments)
)
args = ["--model_name_or_path", 'allenai/longformer-large-4096', '--output_dir', './']
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args)


In [13]:
logger = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process the small summary:
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")

12/21/2022 17:17:10 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=3,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=

In [14]:
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
    if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )
    elif last_checkpoint is not None:
        logger.info(
            f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
            "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
        )


In [15]:
# Set seed before initializing model.
set_seed(training_args.seed)

In [16]:
config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=model_args.num_labels,
    )
tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    )
model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    config=config,
)

AttributeError: 'ModelArguments' object has no attribute 'num_labels'

In [17]:
if training_args.do_train:
    instances, cut_off, total_questions = preprocessing_data(
        data_args.train_file, 
        data_args.sample_size, 
        data_args.position)
    
    train_instance = instances[data_args.dev_size:]
    dev_instance = instances[:data_args.dev_size]
    
    train_dataset = CustomDataset(train_instance, 
                               tokenizer, 
                               model_args.max_seq_length)
    dev_dataset = CustomDataset(train_instance, 
                               tokenizer, 
                               model_args.max_seq_length)
    
    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

if training_args.do_eval:
    instances, cut_off, total_questions = preprocessing_data(
        data_args.test_file, 
        data_args.sample_size, 
        data_args.position)
    
    test_dataset = CustomDataset(instances, 
                               tokenizer, 
                               model_args.max_seq_length)
    
    


In [39]:
# Get the metric function
metric = evaluate.load("xnli")

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)


Downloading builder script:   0%|          | 0.00/2.95k [00:00<?, ?B/s]

In [None]:
 # Initialize Trainer
data_collator = DataCollatorWithPadding(
    tokenizer, 
    pad_to_multiple_of=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_train else None,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=30)]
)

# Training
if training_args.do_train:
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    max_train_samples = (
        data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
    )
    metrics["train_samples"] = min(max_train_samples, len(train_dataset))

    trainer.save_model()  # Saves the tokenizer too for easy upload

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    
# Evaluation
if training_args.do_eval:
    logger.info("*** Evaluate ***")
    metrics = trainer.evaluate(eval_dataset=eval_dataset)

    max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
    metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

In [40]:
metric

EvaluationModule(name: "xnli", module_type: "metric", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Computes XNLI score which is just simple accuracy.
Args:
    predictions: Predicted labels.
    references: Ground truth labels.
Returns:
    'accuracy': accuracy
Examples:

    >>> predictions = [0, 1]
    >>> references = [0, 1]
    >>> xnli_metric = evaluate.load("xnli")
    >>> results = xnli_metric.compute(predictions=predictions, references=references)
    >>> print(results)
    {'accuracy': 1.0}
""", stored examples: 0)

In [22]:
data_args.dataset_name = a
    

DataTrainingArguments(dataset_name='../data/train_dataset', overwrite_cache=False, max_seq_length=1024, pad_to_max_length=False)

In [None]:
data.max_seq_length

In [26]:
training_args.fp16

False

In [6]:
bb

'allenai/longformer-large-4096'

In [None]:
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    
    