In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import logging as log
log.basicConfig(level=log.DEBUG)

In [2]:
import sys
# sys.path.append('../')
# from baselines.utils import *
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

os.environ['WANDB_ENTITY'] = 'contract-nli-db'
os.environ['WANDB_PROJECT'] = 'contract-nli'
os.environ['WANDB_LOG_MODEL'] = 'end'

In [3]:
import torch

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
cfg = {
    "model_name": "distilbert/distilbert-base-uncased",
    "batch_size": 32,
    "train_path": "/kaggle/input/dataset-contract/train.json",
    "test_path": "/kaggle/input/dataset-contract/test.json",
    "dev_path": "/kaggle/input/dataset-contract/dev.json",
    "max_length": 512,
    "models_save_dir": "/kaggle/working/saved_model",
    "results_dir": "/kaggle/working/results",
    "dataset_dir": "/kaggle/working/dataset_dir"
}

In [5]:
# create dir if not exists
from pathlib import Path
Path(cfg["models_save_dir"]).mkdir(parents=True, exist_ok=True)
Path(cfg["dataset_dir"]).mkdir(parents=True, exist_ok=True)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(cfg['model_name'])

tokenizer.save_pretrained(cfg['models_save_dir'])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



('/kaggle/working/saved_model/tokenizer_config.json',
 '/kaggle/working/saved_model/special_tokens_map.json',
 '/kaggle/working/saved_model/vocab.txt',
 '/kaggle/working/saved_model/added_tokens.json',
 '/kaggle/working/saved_model/tokenizer.json')

In [7]:
tokenizer = AutoTokenizer.from_pretrained(cfg['models_save_dir'])


In [8]:
!pip install icecream
from icecream import ic

Collecting icecream
  Downloading icecream-2.1.3-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: icecream
Successfully installed icecream-2.1.3


In [9]:
def get_hypothesis_idx(hypothesis_name):
    return int(hypothesis_name.split('-')[-1])

In [10]:
from torch.utils.data import Dataset
import random
import torch

class NLIDataset(Dataset):
    def __init__(self, documents, tokenizer, hypothesis, context_sizes, surround_character_size):
        label_dict = get_labels()
        self.tokenizer = tokenizer

        self.tokenizer.add_special_tokens({'additional_special_tokens': ['[SPAN]']})

        data_points = []
        contexts = [{}]

        for context_size in context_sizes:
            for i, doc in enumerate(documents):
                char_idx = 0
                while char_idx < len(doc['text']):
                    ic(char_idx)
                    document_spans = doc['spans']
                    cur_context = {
                        'doc_id': i,
                        'start_char_idx': char_idx,
                        'end_char_idx': char_idx + context_size,
                        'spans' : [],
                    }

                    for j, (start, end) in enumerate(document_spans):
                        if end <= char_idx:
                            continue

                        cur_context['spans'].append({
                            'start_char_idx': max(start, char_idx),
                            'end_char_idx': min(end, char_idx + context_size),
                            'marked': start >= char_idx and end <= char_idx + context_size,
                            'span_id': j
                        })

                        if end > char_idx + context_size:
                            break

                    if cur_context == contexts[-1]:
                        char_idx = cur_context['end_char_idx'] - surround_character_size
                        continue

                    contexts.append(cur_context)
                    if len(cur_context['spans']) == 1 and not cur_context['spans'][0]['marked']:
                        char_idx = cur_context['end_char_idx'] - surround_character_size
                    else:
                        char_idx = cur_context['spans'][-1]['start_char_idx'] - surround_character_size

        contexts.pop(0)

        for nda_name, nda_desc in hypothesis.items():
            for i, context in enumerate(contexts):

                nli_label = label_dict[documents[context['doc_id']]['annotation_sets'][0]['annotations'][nda_name]['choice']]

                data_point = {}
                data_point['hypotheis'] = nda_desc
                cur_premise = ""
                data_point['marked_beg'] = context['spans'][0]['marked']
                data_point['marked_end'] = context['spans'][-1]['marked']
                doc_id = context['doc_id']
                hypothesis_id = get_hypothesis_idx(nda_name)
                span_ids = []

                if len(context['spans']) == 1:
                    data_point['marked_end'] = True

                span_labels = []

                for span in context['spans']:
                    val = int(span['span_id'] in documents[context['doc_id']]['annotation_sets'][0]['annotations'][nda_name]['spans'])

                    val = 2 * val - 1 # making 0 -> -1 and 1 -> 1

                    if span['marked']:
                        span_labels.append(val)
                        span_ids.append(span['span_id'])

                    cur_premise += ' [SPAN] '
                    cur_premise += documents[context['doc_id']]['text'][span['start_char_idx']:span['end_char_idx']]

                data_point['premise'] = cur_premise
                
                if nli_label == get_labels()['NotMentioned']:
                    span_labels = torch.zeros(len(span_labels), dtype=torch.long)

                data_point['nli_label'] = torch.tensor(nli_label, dtype=torch.long)
                data_point['span_labels'] = torch.tensor(span_labels, dtype=torch.long)
                data_point['doc_id'] = torch.tensor(doc_id, dtype=torch.long)
                data_point['hypothesis_id'] = torch.tensor(hypothesis_id, dtype=torch.long)
                data_point['span_ids'] = torch.tensor(span_ids, dtype=torch.long)

                data_points.append(data_point)

        self.data_points = data_points
        self.span_token_id = self.tokenizer.convert_tokens_to_ids('[SPAN]')

    def __len__(self):
        return len(self.data_points)

    def __getitem__(self, idx):
        tokenized_data = self.tokenizer(
            [self.data_points[idx]['hypotheis']],
            [self.data_points[idx]['premise']],
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        tokenized_data['input_ids'] = tokenized_data['input_ids'].squeeze()
        tokenized_data['attention_mask'] = tokenized_data['attention_mask'].squeeze()
        # tokenized_data['token_type_ids'] = tokenized_data['token_type_ids'].squeeze()

        span_indices = torch.where(tokenized_data['input_ids'] == self.span_token_id)[0]

        if not self.data_points[idx]['marked_beg']:
            span_indices = span_indices[1:]
        
        if not self.data_points[idx]['marked_end'] or tokenized_data['attention_mask'][-1] == 0:
            span_indices = span_indices[:-1]
        
        span_ids = self.data_points[idx]['span_ids']
        span_ids = span_ids[:len(span_indices)]

        return {
            'input_ids': tokenized_data['input_ids'],
            'attention_mask': tokenized_data['attention_mask'],
            # 'token_type_ids': tokenized_data['token_type_ids'],
            'span_indices': span_indices,
            'nli_label': self.data_points[idx]['nli_label'],
            'span_labels': self.data_points[idx]['span_labels'][:len(span_indices)],
            'data_for_metrics': {
                'doc_id': self.data_points[idx]['doc_id'],
                'hypothesis_id': self.data_points[idx]['hypothesis_id'],
                'span_ids': span_ids,
            }
        }

In [11]:
import os
import json
from nltk import word_tokenize 
import re

def load_data(path: str) -> json:
    with open(path, 'r') as f:
        data = json.load(f)
    return data

def get_labels() -> dict:
    return {
        'NotMentioned': 0,
        'Entailment': 1,
        'Contradiction': 2,
    }

def get_hypothesis(data: dict) -> list:
    hypothesis = {}
    for key, value in data['labels'].items():
        hypothesis[key] = clean_str(value['hypothesis'])
    return hypothesis

def tokenize(str: str) -> str:
    return ' '.join(word_tokenize(str))

def clean_str(str: str) -> str:
    # remove '\n' character
    str = str.replace('\n', ' ')
    # remove '\t' character
    str = re.sub(r'\\t', ' ', str)
    # remove '\r' character
    str = re.sub(r'\\r', ' ', str)
    # remove more than 2 consecutive occcurance of a character
    str = re.sub(r'(.)\1{2,}', r'\1', str)
    return str.strip().lower()

In [12]:
train_data = load_data(os.path.join(cfg['train_path']))
dev_data = load_data(os.path.join(cfg['dev_path']))
test_data = load_data(os.path.join(cfg['test_path']))

hypothesis = get_hypothesis(train_data)

train_data = train_data['documents']
dev_data = dev_data['documents']
test_data = test_data['documents']


train_data = train_data[:20]
dev_data = dev_data[:20]
test_data = test_data[:20]

ic.disable()

ic(len(train_data), len(dev_data), len(test_data))
train_dataset = NLIDataset(train_data, tokenizer, hypothesis, [1100], 50)
dev_dataset = NLIDataset(dev_data, tokenizer, hypothesis, [1100], 50)
test_dataset = NLIDataset(test_data, tokenizer, hypothesis, [1100], 50)

ic.enable()

del train_data
del dev_data
del test_data
del hypothesis

  data_point['span_labels'] = torch.tensor(span_labels, dtype=torch.long)


In [13]:
ic(len(train_dataset), len(dev_dataset), len(test_dataset))


ic| len(train_dataset): 4080
    len(dev_dataset): 5236
    len(test_dataset): 4250


(4080, 5236, 4250)

In [14]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

def get_class_weights(dataset):
    nli_labels = [x['nli_label'] for x in dataset]

    span_labels = []
    for x in dataset:
        span_labels.extend(x['span_labels'].tolist())

    nli_weights = compute_class_weight('balanced', classes=np.unique(nli_labels), y=np.array(nli_labels))

    nli_weights = nli_weights.tolist()

    span_labels = [x for x in span_labels if x != -1]
    span_labels = np.array(span_labels)
    span_weight = np.sum(span_labels == 0) / np.sum(span_labels)

    return nli_weights, span_weight

In [15]:
nli_weights, span_weight = get_class_weights(train_dataset)


In [16]:
ic(nli_weights, span_weight)


ic| nli_weights: [0.8871493803000652, 0.6776283009466866, 2.5185185185185186]
    span_weight: 28.243243243243242


([0.8871493803000652, 0.6776283009466866, 2.5185185185185186],
 28.243243243243242)

In [17]:
from transformers import PreTrainedModel, PretrainedConfig

class ContractNLIConfig(PretrainedConfig):
    # def __init__(self, lambda_ = 1, bert_model_name = cfg['model_name'], num_labels = len(get_labels()), ignore_span_label = 2, nli_weights = nli_weights, span_weight = span_weight, **kwargs):
    def __init__(self, nli_weights = [1, 1, 1], span_weight = 1, lambda_ = 1, bert_model_name = cfg['model_name'], num_labels = len(get_labels()), ignore_span_label = 2, **kwargs):
        super().__init__(**kwargs)
        self.bert_model_name = bert_model_name
        self.num_labels = num_labels
        self.lambda_ = lambda_
        self.ignore_span_label = ignore_span_label
        self.nli_weights = nli_weights
        self.span_weight = span_weight

In [18]:
from transformers import AutoModel
from torch import nn

class ContractNLI(PreTrainedModel):
    config_class = ContractNLIConfig

    def __init__(self, config):
        super().__init__(config)
        self.bert = AutoModel.from_pretrained(config.bert_model_name)
        self.bert.resize_token_embeddings(self.bert.config.vocab_size + 1, pad_to_multiple_of=8)
        # self.bert.eval()
        # for param in self.bert.parameters():
        #     param.requires_grad = False

        self.embedding_dim = self.bert.config.hidden_size
        self.num_labels = config.num_labels
        self.lambda_ = config.lambda_
        self.nli_criterion = nn.CrossEntropyLoss(weight=torch.tensor(self.config.nli_weights, dtype=torch.float32))
        self.span_criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(self.config.span_weight, dtype=torch.float32))

        self.span_classifier = nn.Sequential(
            nn.Linear(self.embedding_dim * 4, self.embedding_dim * 4),
            nn.ReLU(),
            nn.Linear(self.embedding_dim * 4, self.embedding_dim * 2),
            nn.ReLU(),
            nn.Linear(self.embedding_dim * 2, 1)
        )

        self.nli_classifier = nn.Sequential(
            nn.Linear(self.embedding_dim * 4, self.embedding_dim * 4),
            nn.ReLU(),
            nn.Linear(self.embedding_dim * 4, self.embedding_dim * 2),
            nn.ReLU(),
            nn.Linear(self.embedding_dim * 2, self.num_labels)
        )

        # initialize weights
        self.init_weights()

    def _init_weights(self, module):
        """ Initialize the weights """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # use the same initialization as bert
            module.weight.data.normal_(mean=0.0, std=self.bert.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(self, input_ids, attention_mask, span_indices):
        outputs = self.bert(input_ids, attention_mask, output_hidden_states=True).hidden_states[-4:]
        outputs = torch.stack(outputs, dim=0)
        outputs = outputs.permute([1, 2, 0, 3])
        outputs = outputs.reshape([outputs.shape[0], outputs.shape[1], -1])

        gather = torch.gather(outputs, 1, span_indices.unsqueeze(2).expand(-1, -1, outputs.shape[-1]))

        masked_gather = gather[span_indices != 0]
        span_logits = self.span_classifier(masked_gather)
        nli_logits = self.nli_classifier(outputs[:, 0, :])

        return span_logits, nli_logits

In [19]:
from transformers import Trainer
import torch
import torch.nn.functional as F

class ContractNLITrainer(Trainer):
    def __init__(self, *args, data_collator=None, **kwargs):
        super().__init__(*args, data_collator=data_collator, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        span_label = inputs.pop('span_labels')
        nli_label = inputs.pop('nli_label')
        inputs.pop('data_for_metrics')

        outputs = model(**inputs)
        span_logits, nli_logits = outputs[0], outputs[1]
        
        # span labels = -1, means ignore 
        
        mask = span_label != -1
        span_label = span_label[mask]
        span_logits = span_logits[mask]
        
        span_label = span_label.float()
        span_logits = span_logits.float()
        
        span_label = span_label.view(-1)
        span_logits = span_logits.view(-1)        

        # if len(true_span_labels) == 0 or len(pred_span_labels) != len(true_span_labels):
        #     span_loss = torch.tensor(0, dtype=torch.float32, device=DEVICE)
        # else:
        #     span_loss = self.model.span_criterion(pred_span_labels, true_span_labels)
        
        if len(span_label) == 0:
            span_loss = torch.tensor(0, dtype=torch.float32, device=DEVICE)
        else:
            span_loss = self.model.span_criterion(span_logits, span_label)

        nli_loss = self.model.nli_criterion(nli_logits, nli_label)

        if torch.isnan(nli_loss):
            nli_loss = torch.tensor(0, dtype=torch.float32, device=DEVICE)

        if torch.isnan(span_loss):
            span_loss = torch.tensor(0, dtype=torch.float32, device=DEVICE)

        loss = span_loss + self.model.lambda_ * nli_loss

        if loss.item() == 0:
            loss = torch.tensor(0, dtype=torch.float32, device=DEVICE, requires_grad=True)

        return (loss, outputs) if return_outputs else loss

    @staticmethod
    def collate_fn(features):
        # Get the span indices and pad them to the maximum length in the batch
        span_indices_list = [feature['span_indices'] for feature in features]
        max_len = max([len(span_indices) for span_indices in span_indices_list])
        span_indices_list = [torch.cat([span_indices, torch.zeros(max_len - len(span_indices), dtype=torch.long)]) for span_indices in span_indices_list]
    
        # Pad span_ids to the maximum length
        span_ids_list = [feature['data_for_metrics']['span_ids'] for feature in features]
        max_len_span_ids = max([len(span_ids) for span_ids in span_ids_list])
        span_ids_list = [torch.cat([span_ids, torch.full((max_len_span_ids - len(span_ids),), -1)]) for span_ids in span_ids_list]
    
        # Pad input_ids, attention_mask, and token_type_ids
        input_ids = [feature['input_ids'] for feature in features]
        attention_mask = [feature['attention_mask'] for feature in features]
        # token_type_ids = [feature['token_type_ids'] for feature in features]
        
        max_len_input = max([len(ids) for ids in input_ids])
        input_ids = [F.pad(ids, (0, max_len_input - len(ids))) for ids in input_ids]
        attention_mask = [F.pad(mask, (0, max_len_input - len(mask))) for mask in attention_mask]
        # token_type_ids = [F.pad(type_ids, (0, max_len_input - len(type_ids))) for type_ids in token_type_ids]
    
        # Stack all padded tensors
        input_ids = torch.stack(input_ids)
        attention_mask = torch.stack(attention_mask)
        # token_type_ids = torch.stack(token_type_ids)
        span_indices = torch.stack(span_indices_list)
        nli_label = torch.stack([feature['nli_label'] for feature in features])
        span_label = torch.cat([feature['span_labels'] for feature in features], dim=0)
        
        # Prepare the data for metrics
        data_for_metrics = {
            'doc_id': torch.stack([feature['data_for_metrics']['doc_id'] for feature in features]),
            'hypothesis_id': torch.stack([feature['data_for_metrics']['hypothesis_id'] for feature in features]),
            'span_ids': torch.stack(span_ids_list),
        }
    
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            # 'token_type_ids': token_type_ids,
            'span_indices': span_indices,
            'nli_label': nli_label,
            'span_labels': span_label,
            'data_for_metrics': data_for_metrics,
        }

In [20]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    auto_find_batch_size=True,
    output_dir=cfg['results_dir'],   # output directory
    num_train_epochs=10,            # total number of training epochs
    gradient_accumulation_steps=4,   # number of updates steps to accumulate before performing a backward/update pass
    logging_strategy='epoch',
    eval_steps=1,
    save_steps=1,
    logging_steps=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,
    label_names=['nli_label', 'span_labels', 'data_for_metrics'],
    report_to='none',
    # report_to='wandb',
)



In [21]:
def wandb_hp_space(trial):
    return {
        "method": "random",
        "metric": {
            "name": "eval/loss",
            "goal": "minimize"
        },
        "parameters": {
            "learning_rate": {
                "values": [1e-5, 3e-5, 5e-5]
            },
            "lambda_": {
                "values": [0.05, 0.1, 0.4]
            },
        }
    }

In [22]:
def model_init(trial):
    if trial is None:
        return ContractNLI(ContractNLIConfig(nli_weights=nli_weights, span_weight=span_weight))

    return ContractNLI(ContractNLIConfig(nli_weights=nli_weights, span_weight=span_weight, lambda_=trial['lambda_']))

In [23]:
# model = ContractNLI(config).to(DEVICE)
trainer = ContractNLITrainer(
    model=None,                          # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset,            # evaluation dataset
    data_collator=ContractNLITrainer.collate_fn,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.001)],
    model_init=model_init,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss
0,1.6796,3.938724


# Metric


In [24]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import logging as log
log.basicConfig(level=log.DEBUG)

In [25]:
import sys
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

os.environ['WANDB_ENTITY'] = 'contract-nli-db'
os.environ['WANDB_PROJECT'] = 'contract-nli-metric'
os.environ['WANDB_LOG_MODEL'] = 'end'

In [26]:
import torch

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [27]:
import json

cfg = {
    "train_path": "/kaggle/input/dataset-contract/train.json",
    "test_path": "/kaggle/input/dataset-contract/test.json",
    "dev_path": "/kaggle/input/dataset-contract/dev.json",
    "model_name": "distilbert/distilbert-base-uncased",
    "max_length": 512,
    "models_save_dir": "/kaggle/input/anlp-project-trained-model/checkpoint",
    "dataset_dir": "./scratch/shu7bh/contract_nli/dataset",
    "results_dir": "./scratch/shu7bh/contract_nli/results",
    "trained_model_dir": "/kaggle/input/anlp-project-trained-model/",
    "batch_size": 32
}

cfg

{'train_path': '/kaggle/input/dataset-contract/train.json',
 'test_path': '/kaggle/input/dataset-contract/test.json',
 'dev_path': '/kaggle/input/dataset-contract/dev.json',
 'model_name': 'distilbert/distilbert-base-uncased',
 'max_length': 512,
 'models_save_dir': '/kaggle/input/anlp-project-trained-model/checkpoint',
 'dataset_dir': './scratch/shu7bh/contract_nli/dataset',
 'results_dir': './scratch/shu7bh/contract_nli/results',
 'trained_model_dir': '/kaggle/input/anlp-project-trained-model/',
 'batch_size': 32}

In [28]:
# create dir if not exists
from pathlib import Path
Path(cfg["models_save_dir"]).mkdir(parents=True, exist_ok=True)
Path(cfg["dataset_dir"]).mkdir(parents=True, exist_ok=True)

OSError: [Errno 30] Read-only file system: '/kaggle/input/anlp-project-trained-model'

In [29]:
tokenizer = AutoTokenizer.from_pretrained(cfg['model_name'])



In [30]:
!pip install icecream
from icecream import ic

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [31]:
dev_data = load_data(os.path.join(cfg['dev_path']))
test_data = load_data(os.path.join(cfg['test_path']))

hypothesis = get_hypothesis(dev_data)

dev_data = dev_data['documents']
test_data = test_data['documents']

# dev_data = dev_data[:50]
# test_data = test_data[:50]

ic.disable()

ic(len(dev_data), len(test_data))
dev_dataset = NLIDataset(dev_data, tokenizer, hypothesis, [1100], 50)
test_dataset = NLIDataset(test_data, tokenizer, hypothesis, [1100], 50)

ic.enable()

del dev_data
del test_data
del hypothesis

  data_point['span_labels'] = torch.tensor(span_labels, dtype=torch.long)


In [32]:
print(len(dev_dataset))
print(len(test_dataset))

15385
28645


In [33]:
from sklearn.metrics import precision_recall_curve
import numpy as np
def get_micro_average_precision_at_recall(y_true, y_pred, recall_level):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return np.interp(recall_level, recall[::-1], precision[::-1])

In [34]:
# Import numpy and sklearn.metrics
import numpy as np
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
def calculate_micro_average_precision(y_true, y_pred):
    """Calculate the micro average precision score.

    Args:
        y_true (np.array): True labels.
        y_pred (np.array): Predicted labels.

    Returns:
        float: Micro average precision score.
    """
    # Get the number of classes
    num_classes = len(np.unique(y_true))
    
    if num_classes == 0:
        return 0.0

    # initialize the average precision score
    average_precision = 0.0

    # loop over all classes
    for class_idx in range(num_classes):
        # get the indices for this class
        y_true_indices = np.where(y_true == class_idx)
        # calculate the average precision score for this class
        average_precision += ic(precision_score(
            y_true[y_true_indices], y_pred[y_true_indices], average="micro"
        ))

    # return the average over all classes
    return average_precision / num_classes

In [35]:
from sklearn.metrics import f1_score
def calculate_f1_score_for_class(y_true, y_pred, class_idx):
    """Calculate the F1 score for a given class.

    Args:
        y_true (np.array): True labels.
        y_pred (np.array): Predicted labels.
        class_idx (int): Index of the class.

    Returns:
        float: F1 score for the given class.
    """
    # get the indices for the given class
    y_true_indices = np.where(y_true == class_idx)
    # calculate the F1 score for the given class
    return f1_score(
        y_true[y_true_indices], y_pred[y_true_indices], average="macro"
    )

In [36]:
def precision_at_recall(y_true, y_scores, recall_threshold):
    precision, recall, threshold = precision_recall_curve(y_true, y_scores)
    idx = (np.abs(recall - recall_threshold)).argmin()  # Find nearest recall value to threshold
    ic(threshold[idx])
    return precision[idx]

In [37]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    auto_find_batch_size=True,
    output_dir=cfg['results_dir'],   # output directory
    num_train_epochs=10,            # total number of training epochs
    gradient_accumulation_steps=4,   # number of updates steps to accumulate before performing a backward/update pass
    logging_strategy='epoch',
    # eval_steps=0.25,
    # save_steps=0.25,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    # fp16=True,
    label_names=['nli_label', 'span_labels', 'data_for_metrics'],
    report_to='none',
)



In [38]:
cfg['trained_model_dir']

'/kaggle/input/anlp-project-trained-model/'

In [39]:
# Define the directory where your model is stored locally.
artifact_dir = '/kaggle/input/full-finetune-bert-base/checkpoint-15242'  # Replace this with the actual path

# Load the model directly from the local directory
model = ContractNLI.from_pretrained(artifact_dir).to(DEVICE)

In [40]:
from transformers import Trainer
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from tqdm import tqdm
import numpy as np

class ContractNLIMetricTrainer(ContractNLITrainer):
    def __init__(self, *args, data_collator=None, **kwargs):
        super().__init__(*args, data_collator=data_collator, **kwargs)

    def evaluate(self, eval_dataset=None, ignore_keys=None):
        self.model.eval()
        self.dataloader = ic(self.get_eval_dataloader(eval_dataset))

        eval_nli_labels = []
        eval_nli_preds = []
        true_labels_per_span = {}
        probs_per_span = {}

        nli_metrics = {}

        for inputs in tqdm(self.dataloader):
            inputs = self._prepare_inputs(inputs)
            span_labels = inputs.pop('span_labels')
            nli_labels = inputs.pop('nli_label')
            data_for_metrics = inputs.pop('data_for_metrics')

            span_indices_to_consider = torch.where(span_labels != -1)[0]

            with torch.no_grad():
                outputs = self.model(**inputs)
                span_logits, nli_logits = outputs[0], outputs[1]

                span_labels = span_labels.float()
                span_logits = span_logits.float()
                
                span_labels = span_labels.view(-1)
                span_logits = span_logits.view(-1)

                # start_index = 0
                
                indices_considered = 0 # total number of span indices considered

                # find the corresponding span index in data_for_metrics['span_ids'] considering -1 to be padding index
                # ic(span_index)
                for i, span_index_row in enumerate(data_for_metrics['span_ids']):
                    current_index = 0 # current row's first -1 index
                    # ic(span_index_row)
                    first_minus_one_index = torch.where(span_index_row == -1)[0]
                    # ic(first_minus_one_index)
                    if len(first_minus_one_index) == 0:
                        first_minus_one_index = len(span_index_row)
                    else:
                        first_minus_one_index = first_minus_one_index[0].item()

                    key = str(data_for_metrics['doc_id'][i].item())+ '-' + str(data_for_metrics['hypothesis_id'][i].item())

                    # mask span_labels and span_logits for the current row
                    mask = span_labels[indices_considered:indices_considered+first_minus_one_index] != -1
                    span_logits_masked = span_logits[indices_considered:indices_considered+first_minus_one_index][mask]

                    spans_contribution = torch.sum(torch.sigmoid(span_logits_masked)) / (len(span_logits_masked)) 

                    if key in nli_metrics:
                        nli_metrics[key]['spans_contribution'].append(spans_contribution)
                        nli_metrics[key]['nli_logits'].append(nli_logits[i])
                    else:
                        nli_metrics[key] = {}
                        nli_metrics[key]['true_nli_labels'] = nli_labels[i]
                        nli_metrics[key]['spans_contribution'] = [spans_contribution]
                        nli_metrics[key]['nli_logits'] = [nli_logits[i]]
                    
                    current_index = first_minus_one_index
                    indices_considered += current_index
                    
                    # ic(indices_considered)
                    # ic(current_index)
                    cnt = 0 # count to keep track of the number of span indices added in dictionary
                    
                    for span_index in span_indices_to_consider:

                        if span_index < indices_considered:
                            cnt += 1
                            value_index = span_index - (indices_considered - current_index)
                            doc_id = data_for_metrics['doc_id'][i]
                            hypothesis_id = data_for_metrics['hypothesis_id'][i]
                            span_id = data_for_metrics['span_ids'][i][value_index]
                            key = str(doc_id)+ '-' + str(hypothesis_id)+ '-' + str(span_id)
                            true_labels_per_span[key] = span_labels[span_index]
                            if key in probs_per_span:
                                probs_per_span[key].append(torch.sigmoid(span_logits[span_index]))
                                # probs_per_span[key].append(span_logits[value_index])
                            else:
                                probs_per_span[key] = [torch.sigmoid(span_logits[span_index])]
                                # probs_per_span[key] = [span_logits[value_index]]
                        else: 
                            break 
                    
                    span_indices_to_consider = span_indices_to_consider[cnt:]

                # eval_span_preds = torch.tensor(eval_span_preds.squeeze(1), dtype=torch.long)

                nli_preds = torch.argmax(torch.softmax(nli_logits, dim=1), dim=1)
                eval_nli_labels.extend(nli_labels.cpu().numpy())
                eval_nli_preds.extend(nli_preds.cpu().numpy())
        
        eval_span_labels = []
        eval_span_preds = []
        span_ids_for_eval = []
        
        for key in true_labels_per_span:
            eval_span_labels.append(true_labels_per_span[key].item())
            eval_span_preds.append(torch.mean(torch.stack(probs_per_span[key])).item())
            span_id = key.split('-')[-1]  # Extract the span ID from the key (last part after '-')
            span_ids_for_eval.append(span_id)

        # Print the true labels, predicted labels, and span IDs

        eval_nli_acc = accuracy_score(eval_nli_labels, eval_nli_preds)

        ic.enable()
        # ic(list(zip(eval_span_labels, eval_span_preds, span_ids_for_eval)))
        
        # ic(list(zip(eval_span_labels, eval_span_preds)))
        # ic(len(eval_span_labels), len(eval_span_preds))
        # ic(sum(eval_span_labels), sum(eval_span_preds))

        # find threshold for 80% recall
        # precision, recall, thresholds = precision_recall_curve(eval_span_labels, eval_span_preds)


        mAP = (average_precision_score(eval_span_labels, eval_span_preds, pos_label=0) + average_precision_score(eval_span_labels, eval_span_preds, pos_label=1))/2

        # mAP = average_precision_score(torch.tensor(true_span_labels), torch.tensor(pred_span_labels))
        precision_at_80_recall = precision_at_recall(torch.tensor(eval_span_labels), torch.tensor(eval_span_preds), 0.8)
        f1_score_for_entailment = calculate_f1_score_for_class(torch.tensor(eval_nli_labels), torch.tensor(eval_nli_preds), get_labels()['Entailment'])
        f1_score_for_contradiction = calculate_f1_score_for_class(torch.tensor(eval_nli_labels), torch.tensor(eval_nli_preds), get_labels()['Contradiction'])
        
        return {
            'mAP' : mAP,
            'precision_at_80_recall' : precision_at_80_recall,
            'nli_acc': eval_nli_acc,
            'f1_score_for_entailment': f1_score_for_entailment,
            'f1_score_for_contradiction': f1_score_for_contradiction
        }

In [41]:
trainer = ContractNLIMetricTrainer(
    model=model,                          # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    # train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset,            # evaluation dataset
    data_collator=ContractNLIMetricTrainer.collate_fn,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [42]:
ic.disable()
# ic.enable()
results = trainer.evaluate()
results

100%|██████████| 1924/1924 [03:09<00:00, 10.17it/s]
ic| threshold[idx]: 0.99910885


{'mAP': 0.8378346383422673,
 'precision_at_80_recall': 0.6778009742519137,
 'nli_acc': 0.6693532661683458,
 'f1_score_for_entailment': 0.2646199888977385,
 'f1_score_for_contradiction': 0.25508741759816567}

# single Inference Code


In [None]:
dev_data = load_data(os.path.join(cfg['dev_path']))
test_data = load_data(os.path.join(cfg['test_path']))

hypothesis = get_hypothesis(dev_data)

dev_data = dev_data['documents']
test_data = test_data['documents']

dev_data = dev_data[:1]
test_data = test_data[:1]

ic.disable()

ic(len(dev_data), len(test_data))
dev_dataset = NLIDataset(dev_data, tokenizer, hypothesis, [1100], 50)
test_dataset = NLIDataset(test_data, tokenizer, hypothesis, [1100], 50)

print("---------------------------------------------------")
print ( test_data )
print("---------------------------------------------------")
print ( test_dataset[0])
print("---------------------------------------------------")

ic.enable()

del dev_data
del test_data
del hypothesis

In [None]:
print(len(dev_dataset))
print(len(test_dataset))

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    auto_find_batch_size=True,
    output_dir=cfg['results_dir'],   # output directory
    num_train_epochs=10,            # total number of training epochs
    gradient_accumulation_steps=4,   # number of updates steps to accumulate before performing a backward/update pass
    logging_strategy='epoch',
    # eval_steps=0.25,
    # save_steps=0.25,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    # fp16=True,
    label_names=['nli_label', 'span_labels', 'data_for_metrics'],
    report_to='none',
)

In [None]:
trainer = ContractNLIMetricTrainer(
    model=model,                          # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    # train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset,            # evaluation dataset
    data_collator=ContractNLIMetricTrainer.collate_fn,
)

In [None]:
ic.disable()
# ic.enable()
results = trainer.evaluate()
results

In [None]:
test_data = load_data(os.path.join(cfg['test_path']))
test_data = test_data['documents']
test_data = test_data[:1]
test_data