# Install Libraries

In [None]:
# Python dependencies for this notebook. No versions are pinned, so pip installs
# whatever is current at run time.
!pip install transformers[torch]
!pip install datasets
!pip install evaluate
!pip install wandb
!pip install sentencepiece

# Install Cloud Storage FUSE.
# Register the gcsfuse apt repository for this distribution's codename.
!echo "deb https://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
# Add the repository signing key.
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
# Refresh the package lists and install gcsfuse.
!apt -qq update && apt -qq install gcsfuse

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.8 MB/s

# Import libraries

In [None]:
from datetime import datetime
from dataclasses import dataclass, asdict, field
from typing import Union
from pydantic import BaseModel
import random
import os
import shutil

import numpy as np
import torch
import wandb
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    IntervalStrategy,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup,
    set_seed
)

import evaluate
import time

import warnings
# Silence every Python warning for the rest of the session; both calls below
# install a blanket "ignore" filter.
warnings.simplefilter(action='ignore')
warnings.filterwarnings("ignore")

import logging
# Disable all logging calls of severity WARNING and below.
logging.disable(logging.WARNING)

import inspect
from google.colab import auth
# Authenticate the Colab user (needed before the Cloud Storage bucket is mounted).
auth.authenticate_user()

# Never commit the W&B API key to the notebook: read it from the environment.
# When WANDB_API_KEY is unset, key=None lets wandb fall back to its own
# credential lookup (stored login or interactive prompt).
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Mount Google Drive at /content/drive; the test CSV and the accuracy log live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Mount a Cloud Storage bucket or location, without the gs:// prefix.
# The same string is used both as the bucket passed to gcsfuse and as the local
# directory (relative to the current working directory) it is mounted on.
mount_path = "x01-coliee_dir"  # or a location like "my-bucket/path/to/mount"
gs_local_path = f"{mount_path}"

# Create the mount point if it does not exist, then mount with implicit directories enabled.
!mkdir -p {gs_local_path}
!gcsfuse --implicit-dirs {mount_path} {gs_local_path}

{"time":"20/10/2023 10:29:48.509974","severity":"INFO","msg":"Start gcsfuse/1.2.0 (Go version go1.21.0) for app \"\" using mount point: /content/x01-coliee_dir\n"}


In [None]:
# Path fragments used when building file locations later in the notebook:
# data_local_path locates the test CSV and the accuracy log (Google Drive),
# model_local_path locates the model checkpoints (the mounted bucket).
data_local_path = "drive/MyDrive"
model_local_path = mount_path

# Adversarial Inference

In [None]:
# Wall-clock start; the total elapsed time is printed once the year/feature loop has finished.
start_time = time.time()

def get_key_from_value(d, value):
    """Return the first key in ``d`` whose value equals ``value``.

    Keys are scanned in the dictionary's iteration order. ``None`` is returned
    when no value compares equal.
    """
    matching_keys = (candidate for candidate, stored in d.items() if stored == value)
    return next(matching_keys, None)

# Feature-group name -> dataset column(s) whose values are appended to the hypothesis text.
# "ALL" and "NONE" map to the sentinel strings 'all' / "None" (not lists); the
# feature-count logic and the dataset classes compare against those strings directly.
feature_dict = {
    "SENTENCE_LENGTH": ['hyp_length'],
    "WORD_OVERLAP": ['overlap', 'is_word_overlap'],
    "HAS_CONTRADICTION_WORDS": ['has_negation'],
    "SUBSEQUENCE_HEURISTICS": ['is_subsequence_heuristic'],
    "ALL": 'all',
    "NONE": "None"
}

# Short alias -> model identifier; the chosen identifier is used for the tokenizer
# and as a path component of the checkpoint directory.
models = {
    'BERT_BASE': 'bert-base-uncased',
    'BERT_BASE_MNLI': 'gchhablani/bert-base-cased-finetuned-mnli',
    'ROBERTA_BASE': 'roberta-base',
    'ROBERTA_BASE_MNLI': 'textattack/roberta-base-MNLI',
    'LEGAL_BERT': 'nlpaueb/legal-bert-base-uncased',
    'ELECTRA_BASE_MNLI': 'howey/electra-base-mnli',
    'DEBERTA_BASE_NLI': 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli'
}

# X01 version: the selected feature values are appended to the hypothesis sentence
# as strings, each preceded by the literal "[SEP]" token.
# MAKE CHANGES TO THE CONFIG PARAMETERS HERE

# dataset_year: int = 2020 # CHANGE the dataset here
MODEL_TYPE = "hyp-only"  # the code below handles "hyp-only" and "full-context"
years = ["2018", "2019", "2020", "2021", "2022"]  # one checkpoint directory is evaluated per year
feature_name = ["NONE"]#["SENTENCE_LENGTH", "WORD_OVERLAP", "HAS_CONTRADICTION_WORDS", "SUBSEQUENCE_HEURISTICS", "ALL", "NONE"]
chosen_model = models['LEGAL_BERT']
seed = 42  # only used to build the checkpoint path and to tag the accuracy log in this cell

# Accuracy metric shared by compute_metrics and calculate_separate_acc.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the accuracy metric.

    The predicted class is the arg-max over the last axis of the logits.
    Returns whatever ``metric.compute`` returns.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

def calculate_separate_acc(dataset, adv_type):
    """Compute accuracy over only the rows whose 'Adv Type' equals ``adv_type``.

    ``dataset`` must expose ``filter`` and carry 'Adv Type', 'predictions' and
    'label' columns. Returns whatever ``metric.compute`` returns.
    """
    def _has_requested_type(row):
        return row['Adv Type'] == adv_type

    subset = dataset.filter(_has_requested_type)
    return metric.compute(predictions=subset['predictions'], references=subset['label'])


# Run inference once per (dataset_year, feature) combination. The year and the
# feature set select which checkpoint directory is evaluated; the test CSV is the
# same file for every combination.
for dataset_year in years:
    for feature in feature_name:
        run_num: int = 1 # CHANGE the run number here (a simple hack )
        # Reverse lookup: the first feature_dict key whose value equals feature_dict[feature].
        feat_str = get_key_from_value(feature_dict, feature_dict[feature])
        @dataclass
        class GlobalConfig:
            """Settings for one run; redefined on every iteration so that the
            defaults pick up the current loop variables."""
            # Column list for the current feature group, or the 'all' / 'None' sentinel string.
            features: list = field(default_factory=lambda: feature_dict[feature])
            # Evaluated once, when the class body runs.
            device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            test_file: str = f"/content/{data_local_path}/data/task 4/test/adversarial_test_set/adversarial_test_set.csv"
            num_labels: int = 2
            seed: int = seed
            model_name: str = chosen_model # CHANGE here for models
            run_name: str = f"X01-run-{dataset_year}-{run_num}-{MODEL_TYPE}-{feat_str}-features"
            max_length: int = 512 if MODEL_TYPE == "full-context" else 180 # Based on the hyp and prem sentence lengths in the dataset # CHANGE the max length here
            # Filled in after instantiation, once the model directory is known.
            output_dir: str = ""

        global_config = GlobalConfig()

        # Number of feature columns in play. The 'all' / 'None' sentinels are plain
        # strings, so they are handled before falling back to len() of the column list.
        if global_config.features == 'all':
            num_features = 5
        elif global_config.features == 'None':
            num_features = 0
        else:
            num_features = len(global_config.features)

        # Checkpoint directory family, keyed by (features used?, model type).
        model_dir_by_setup = {
            (True, "full-context"): "FullContextModelsWithFeatures",
            (True, "hyp-only"): "HypOnlyModelsWithFeatures",
            (False, "full-context"): "FullContextModels",
            (False, "hyp-only"): "HypOnlyModels",
        }
        try:
            model_dir = model_dir_by_setup[(bool(num_features), MODEL_TYPE)]
        except KeyError:
            # An unknown MODEL_TYPE would otherwise leave model_dir unbound (or stale).
            raise ValueError(f"Unsupported MODEL_TYPE: {MODEL_TYPE!r}") from None

        # [-1] keeps the part after the organisation prefix and also works for
        # model names without a "/" (e.g. 'bert-base-uncased').
        project_name = f"X01-{global_config.model_name.split('/')[-1]}_{model_dir}_seed-{global_config.seed}"
        global_config.output_dir = f"/content/{model_local_path}/X01-{model_dir}/seed-{global_config.seed}/{global_config.model_name}/{dataset_year}/{global_config.run_name}/"

        # Locate the checkpoint folder of this run. os.listdir returns entries in
        # arbitrary order, so the listing is sorted to make the choice deterministic.
        # NOTE(review): if a run directory holds several checkpoint-* folders, the
        # lexicographically last one is used - confirm that this is the intended one.
        checkpoint_folders = sorted(
            entry for entry in os.listdir(global_config.output_dir)
            if entry.startswith("checkpoint")
        )
        if not checkpoint_folders:
            # Fail loudly: without this, best_model_path would be unbound on the first
            # iteration or silently carry over the previous iteration's checkpoint.
            raise FileNotFoundError(f"No checkpoint folder found in {global_config.output_dir}")
        best_model_path = os.path.join(global_config.output_dir, checkpoint_folders[-1])

        # best_model_path = f"{global_config.output_dir}checkpoint-{step_number}"

        # Load the adversarial test CSV as a single "test" split. The header row is
        # skipped and the column names are supplied explicitly instead.
        csv_columns = [
            'id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens',
            'hyp_length', 'prem_tokens', 'prem_length', 'overlap',
            'is_word_overlap', 'negations', 'has_negation',
            'detected_subsequence', 'is_subsequence_heuristic',
            'Artefact Type', 'Adv Type',
        ]
        file_dict = dict(test=global_config.test_file)
        dataset = load_dataset(
            'csv',
            data_files=file_dict,
            delimiter=',',
            column_names=csv_columns,
            skiprows=1,
        ).remove_columns('detected_subsequence')

        # Show the split summary and the first row as a sanity check.
        print(dataset)
        print(dataset['test'][0])

        # Tokenizer matching the evaluated model family (slow implementation requested).
        tokenizer = AutoTokenizer.from_pretrained(global_config.model_name, use_fast=False)

        # Custom dataset
        class CustomDataset(torch.utils.data.Dataset):
            """Full-context dataset: tokenizes (premise, hypothesis + features) pairs.

            Args:
                dataset: indexable collection of row mappings that supports ``len``.
                selected_features: ``"all"``, ``"None"``, or a list of feature column
                    names whose values are appended to the hypothesis text.
            """

            def __init__(self, dataset, selected_features="None"):
                self.dataset = dataset
                self.selected_features = selected_features

            def __len__(self):
                return len(self.dataset)

            def extract_features(self, item):
                """Return the feature values of ``item`` chosen by ``selected_features``.

                ``"all"`` yields every feature in a fixed order, ``"None"`` yields an
                empty dict, and a list yields the named features in list order;
                names that are not known features are skipped.
                """
                # Keys must match the column names used in feature_dict. The negation
                # entry was previously keyed 'has_negations', which never matched the
                # selected name 'has_negation', so that feature was silently dropped.
                # NOTE(review): if the checkpoints were trained with a dataset class
                # that had the same key mismatch, they never saw this feature -
                # confirm against the training notebook.
                all_features = {
                    'hyp_length': item['hyp_length'],
                    'is_word_overlap': item['is_word_overlap'],
                    'overlap': item['overlap'],
                    'has_negation': item['has_negation'],
                    'is_subsequence_heuristic': item['is_subsequence_heuristic']
                }

                if self.selected_features == "all":
                    return all_features
                if self.selected_features == "None":
                    return {}
                return {k: all_features[k] for k in self.selected_features if k in all_features}

            def __getitem__(self, idx):
                """Return row ``idx`` as a dict with 'input_ids' and 'attention_mask' added."""
                item = dict(self.dataset[idx])

                prem_sentence = item['premise']
                hyp_sentence = item['hypothesis']
                # Each selected feature value is appended as the literal "[SEP]<value>".
                for feature_value in self.extract_features(item).values():
                    hyp_sentence += "[SEP]" + str(feature_value)

                # Tokenization: premise and hypothesis are encoded as a pair, padded to max_length.
                inputs = tokenizer(prem_sentence, hyp_sentence, return_tensors="pt", padding='max_length', truncation='longest_first', max_length=global_config.max_length, add_special_tokens=True)
                item['input_ids'] = inputs['input_ids'].squeeze()
                item['attention_mask'] = inputs['attention_mask'].squeeze()

                return item

        # Custom hyp only dataset
        class CustomHypOnlyDataset(CustomDataset):
            """Hypothesis-only variant: the premise is never passed to the tokenizer."""

            def __getitem__(self, idx):
                """Return row ``idx`` as a dict with 'input_ids' and 'attention_mask' added."""
                record = dict(self.dataset[idx])

                # Start from the raw hypothesis and append every selected feature
                # value as the literal "[SEP]<value>".
                hypothesis_text = record['hypothesis']
                for feature_value in self.extract_features(record).values():
                    hypothesis_text += "[SEP]" + str(feature_value)

                # Only the (augmented) hypothesis is encoded, padded to max_length.
                encoded = tokenizer(hypothesis_text, return_tensors="pt", padding='max_length', truncation=True, max_length=global_config.max_length, add_special_tokens=True)
                record['input_ids'] = encoded['input_ids'].squeeze()
                record['attention_mask'] = encoded['attention_mask'].squeeze()

                return record

        # Wrap the raw test split in the dataset class that matches MODEL_TYPE.
        test_datasets = dataset['test']

        data_prep_variants = {
            "full-context": ("Entering Full Context Data prep", CustomDataset),
            "hyp-only": ("Entering Hyp Only Data prep", CustomHypOnlyDataset),
        }
        # Any other MODEL_TYPE leaves test_data untouched, as there is no fallback branch.
        if MODEL_TYPE in data_prep_variants:
            banner, dataset_cls = data_prep_variants[MODEL_TYPE]
            print(banner)
            test_data = dataset_cls(test_datasets, selected_features=global_config.features)
            print("Done")

        # Inference
        # Load the fine-tuned classifier from the checkpoint folder and move it to the configured device.
        # NOTE(review): ignore_mismatched_sizes=True lets a checkpoint whose weight shapes
        # differ from the requested configuration load without an error - confirm the
        # checkpoints always match num_labels, otherwise the reported accuracy is meaningless.
        model = AutoModelForSequenceClassification.from_pretrained(best_model_path, num_labels=global_config.num_labels, ignore_mismatched_sizes=True).to(global_config.device)
        model.eval()
        # Set up the Trainer and TrainingArguments for evaluation
        # (the Trainer is only used for predict(); no training is started here).
        test_args = TrainingArguments(
            output_dir = "./model_output",
            per_device_eval_batch_size=16,
            logging_dir="./test_logs/test",
        )

        tester = Trainer(
            model=model,
            args=test_args,
            compute_metrics=compute_metrics,
        )

        # Evaluate the model on the test dataset
        # results.predictions holds the per-example scores; results.metrics holds the
        # aggregate values, including the 'test_accuracy' entry read further below.
        results = tester.predict(test_data)
        print("Normal test data")
        print(results.metrics)

        # Get instance level predictions: arg-max over the class axis, stored next to
        # the original rows and exported as a CSV into the run's checkpoint directory.
        predictions = results.predictions.argmax(1)
        test_datasets = test_datasets.add_column(name="predictions", column=predictions)
        test_datasets.to_csv(f"{global_config.output_dir}/{global_config.run_name}-adversarial_instance_predictions.csv")

        # Accuracy restricted to each adversarial type.
        against_acc = calculate_separate_acc(test_datasets, "Against")
        for_acc = calculate_separate_acc(test_datasets, "For")

        test_acc_file_path = f"{data_local_path}/{model_dir}_adversarial_test_accuracy_{global_config.seed}.txt"

        # One shared prefix and one code path for every record, so first-time writes
        # and later appends produce the same format (every line carries a subset tag).
        record_prefix = f"{global_config.model_name}---{global_config.run_name}---{dataset_year}---{global_config.seed}"
        # The separator gets its own line; without the newline it fuses with the
        # first record and with the separator of the next run.
        separator = "*" * 50 + "\n"

        # Append mode creates the file when it does not exist yet.
        with open(test_acc_file_path, "a") as f:
            f.write(separator)
            f.write(f"{record_prefix}---ALL---{results.metrics['test_accuracy']}\n")
            f.write(f"{record_prefix}---AGAINST---{against_acc}\n")
            f.write(f"{record_prefix}---FOR---{for_acc}\n")
            f.write(separator)

# Report how long the whole year/feature sweep took, in minutes.
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print(f"Total time taken {elapsed_minutes} mins")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic', 'Artefact Type', 'Adv Type', '__index_level_0__'],
        num_rows: 42
    })
})
{'id': 'H25-26-5', 'label': 0, 'premise': "Article 650\n(1) If the mandatary has expended costs found to be necessary for the administration of the mandated business, the mandatary may claim reimbursement of those costs from the mandator and any interest on the same from the day the costs were expended.\n(2) If the mandatary has borne any obligation found to be necessary for the administration of the mandated business, the mandatary may demand that the mandator perform the obligation on the mandatary's behalf. In such cases, if the obligation has not yet fallen due, the mandatary may require the mandator to tender reasonable security.\n(3) If a mandatary incurs 

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Entering Hyp Only Data prep
Done


Normal test data
{'test_loss': 2.5935206413269043, 'test_accuracy': 0.30952380952380953, 'test_runtime': 3.4272, 'test_samples_per_second': 12.255, 'test_steps_per_second': 0.875}


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic', 'Artefact Type', 'Adv Type', '__index_level_0__'],
        num_rows: 42
    })
})
{'id': 'H25-26-5', 'label': 0, 'premise': "Article 650\n(1) If the mandatary has expended costs found to be necessary for the administration of the mandated business, the mandatary may claim reimbursement of those costs from the mandator and any interest on the same from the day the costs were expended.\n(2) If the mandatary has borne any obligation found to be necessary for the administration of the mandated business, the mandatary may demand that the mandator perform the obligation on the mandatary's behalf. In such cases, if the obligation has not yet fallen due, the mandatary may require the mandator to tender reasonable security.\n(3) If a mandatary incurs 

Normal test data
{'test_loss': 0.7579184770584106, 'test_accuracy': 0.5476190476190477, 'test_runtime': 0.5591, 'test_samples_per_second': 75.117, 'test_steps_per_second': 5.365}


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic', 'Artefact Type', 'Adv Type', '__index_level_0__'],
        num_rows: 42
    })
})
{'id': 'H25-26-5', 'label': 0, 'premise': "Article 650\n(1) If the mandatary has expended costs found to be necessary for the administration of the mandated business, the mandatary may claim reimbursement of those costs from the mandator and any interest on the same from the day the costs were expended.\n(2) If the mandatary has borne any obligation found to be necessary for the administration of the mandated business, the mandatary may demand that the mandator perform the obligation on the mandatary's behalf. In such cases, if the obligation has not yet fallen due, the mandatary may require the mandator to tender reasonable security.\n(3) If a mandatary incurs 

Normal test data
{'test_loss': 3.5723211765289307, 'test_accuracy': 0.3333333333333333, 'test_runtime': 0.6368, 'test_samples_per_second': 65.958, 'test_steps_per_second': 4.711}


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic', 'Artefact Type', 'Adv Type', '__index_level_0__'],
        num_rows: 42
    })
})
{'id': 'H25-26-5', 'label': 0, 'premise': "Article 650\n(1) If the mandatary has expended costs found to be necessary for the administration of the mandated business, the mandatary may claim reimbursement of those costs from the mandator and any interest on the same from the day the costs were expended.\n(2) If the mandatary has borne any obligation found to be necessary for the administration of the mandated business, the mandatary may demand that the mandator perform the obligation on the mandatary's behalf. In such cases, if the obligation has not yet fallen due, the mandatary may require the mandator to tender reasonable security.\n(3) If a mandatary incurs 

Normal test data
{'test_loss': 0.6931537389755249, 'test_accuracy': 0.5, 'test_runtime': 0.6309, 'test_samples_per_second': 66.572, 'test_steps_per_second': 4.755}


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic', 'Artefact Type', 'Adv Type', '__index_level_0__'],
        num_rows: 42
    })
})
{'id': 'H25-26-5', 'label': 0, 'premise': "Article 650\n(1) If the mandatary has expended costs found to be necessary for the administration of the mandated business, the mandatary may claim reimbursement of those costs from the mandator and any interest on the same from the day the costs were expended.\n(2) If the mandatary has borne any obligation found to be necessary for the administration of the mandated business, the mandatary may demand that the mandator perform the obligation on the mandatary's behalf. In such cases, if the obligation has not yet fallen due, the mandatary may require the mandator to tender reasonable security.\n(3) If a mandatary incurs 

Normal test data
{'test_loss': 3.590916872024536, 'test_accuracy': 0.3333333333333333, 'test_runtime': 0.6129, 'test_samples_per_second': 68.527, 'test_steps_per_second': 4.895}


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

Total time taken 16.788704212506612 mins
