# Install libraries

In [None]:
# Python dependencies. None of these installs is version-pinned; the run whose
# output is recorded below resolved ray 2.7.0 and transformers 4.34.0.
!pip install ray[tune]
!pip install transformers[torch]
!pip install datasets
!pip install evaluate
!pip install wandb
!pip install sentencepiece

# Install Cloud Storage FUSE.
# 1) register the gcsfuse apt source for this machine's release codename,
# 2) add the repository signing key, 3) refresh the package index and install.
!echo "deb https://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
!apt -qq update && apt -qq install gcsfuse

Collecting ray[tune]
  Downloading ray-2.7.0-cp310-cp310-manylinux2014_x86_64.whl (62.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX, ray
Successfully installed ray-2.7.0 tensorboardX-2.6.2.2
Collecting transformers[torch]
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m21.8 MB/s[0m eta [36m0:00

# Import libraries

In [None]:
from datetime import datetime
from dataclasses import dataclass, asdict, field
from typing import Union
from pydantic import BaseModel
import random
import os

import numpy as np
import torch
import wandb
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    IntervalStrategy,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup,
    set_seed
)

import evaluate
import time

import warnings
warnings.simplefilter(action='ignore')
warnings.filterwarnings("ignore")

import logging
logging.disable(logging.WARNING)

import inspect
from google.colab import auth
auth.authenticate_user()

# SECURITY: never hard-code the W&B API key in the notebook; it leaks through
# version control and every shared copy. Read it from the WANDB_API_KEY
# environment variable instead. When the variable is unset, key=None lets
# wandb fall back to its own credential lookup / interactive prompt.
# NOTE(review): the key that used to be committed on this line must be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# !sudo echo -ne '\n' | sudo add-apt-repository ppa:alessandro-strada/ppa >/dev/null 2>&1 # note: >/dev/null 2>&1 is used to supress printing
# !sudo apt update >/dev/null 2>&1
# !sudo apt install google-drive-ocamlfuse >/dev/null 2>&1
# !google-drive-ocamlfuse
# !sudo apt-get install w3m >/dev/null 2>&1 # to act as web browser
# !xdg-settings set default-web-browser w3m.desktop >/dev/null 2>&1 # to set default browser
# %cd /content
# !mkdir drive
# %cd drive
# !mkdir "MyDrive"
# !google-drive-ocamlfuse "/content/drive/MyDrive"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Mount a Cloud Storage bucket or location, without the gs:// prefix.
mount_path = "x02-coliee_dir"  # or a location like "my-bucket/path/to/mount"
gs_local_path = f"{mount_path}"

!mkdir -p {gs_local_path}
!gcsfuse --implicit-dirs {mount_path} {gs_local_path}

{"time":"04/10/2023 11:59:39.187855","severity":"INFO","msg":"Start gcsfuse/1.2.0 (Go version go1.21.0) for app \"\" using mount point: /content/x02-coliee_dir\n"}


In [None]:
data_local_path = "drive/MyDrive"
model_local_path = "models"

# Run Model

In [None]:
#X01- Version --> changes --> features are added in the hyp sentence as strings using SEP token
# MAKE CHANGES TO THE CONFIG PARAMETERS HERE

# dataset_year: int = 2020 # CHANGE the dataset here
MODEL_TYPE = "hyp-only"
do_train = True

# track time
start_time = time.time()

def get_key_from_value(d, value):
    """Return the first key of ``d`` whose value equals ``value``.

    Keys are examined in the mapping's iteration order; ``None`` is returned
    when no stored value compares equal to ``value``.
    """
    matching_keys = (candidate for candidate, stored in d.items() if stored == value)
    return next(matching_keys, None)

feature_dict = {
    "SENTENCE_LENGTH": ['hyp_length'],
    "WORD_OVERLAP": ['overlap', 'is_word_overlap'],
    "HAS_CONTRADICTION_WORDS": ['has_negation'],
    "SUBSEQUENCE_HEURISTICS": ['is_subsequence_heuristic'],
    "ALL": 'all',
    "NONE": "None"
}

models = {
    'BERT_BASE': 'bert-base-uncased',
    'BERT_BASE_MNLI': 'gchhablani/bert-base-cased-finetuned-mnli',
    'ROBERTA_BASE': 'roberta-base',
    'ROBERTA_BASE_MNLI': 'textattack/roberta-base-MNLI',
    'LEGAL_BERT': 'nlpaueb/legal-bert-base-uncased',
    'ELECTRA_BASE_MNLI': 'howey/electra-base-mnli',
    'DEBERTA_BASE_NLI': 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli'
}


years = ["2018", "2019", "2020", "2021", "2022"]
feature_name = ["SENTENCE_LENGTH", "WORD_OVERLAP", "HAS_CONTRADICTION_WORDS", "SUBSEQUENCE_HEURISTICS", "ALL"]
chosen_model = models['LEGAL_BERT']

for dataset_year in years:
    for feature in feature_name:
        run_num: int = 1 # CHANGE the run number here (a simple hack )
        feat_str = get_key_from_value(feature_dict, feature_dict[feature])
        @dataclass
        class GlobalConfig:
            """Run-level settings for one (dataset_year, feature) iteration of the sweep.

            The class is declared inside the loop body, so the f-string defaults
            below are evaluated with the current values of ``dataset_year``,
            ``run_num``, ``feat_str``, ``MODEL_TYPE`` and ``data_local_path``.
            """
            # Entry of feature_dict for the current `feature`. The lambda is only
            # called when an instance is created. Note that the "ALL" and "NONE"
            # entries of feature_dict are plain strings ('all' / "None"), not lists.
            features: list = field(default_factory=lambda: feature_dict[feature])
            # CUDA when available, CPU otherwise.
            device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            # Per-year CSV files under the mounted data directory.
            train_file: str = f"/content/{data_local_path}/data/task 4/train/coliee_train_{dataset_year}.csv"
            test_file: str = f"/content/{data_local_path}/data/task 4/test/coliee_test_{dataset_year}.csv"
            # Fraction of the train file kept for training; the remainder is used for evaluation.
            train_split: float = 0.9
            num_labels: int = 2
            seed: int = 5
            model_name: str = chosen_model # CHANGE here for models
            # Used as the W&B run name and as the last component of the output directory.
            run_name: str = f"X02-run-{dataset_year}-{run_num}-{MODEL_TYPE}-{feat_str}-features"
            max_length: int = 512 if MODEL_TYPE == "full-context" else 180 # Based on the hyp and prem sentence lengths in the dataset # CHANGE the max length here

        # Global Configuration
        global_config = GlobalConfig()

        @dataclass
        class BaseModelConfig:
            """Training hyper-parameters shared by every model family.

            The model-family subclasses below add ``learning_rate`` and
            ``weight_decay``. An instance is serialised with ``asdict`` for the
            W&B run config and its fields are read when the training arguments
            are built.
            """
            model_name: str = global_config.model_name
            num_train_epochs: int = 15 # CHANGE the epochs here
            train_batch_size: int = 8
            eval_batch_size: int = 8
            gradient_accumulation_steps: int = 4
            logging_dir: str = './logs'
            logging_steps: int = 10
            # Evaluate and checkpoint once per epoch.
            evaluation_strategy: str = "epoch"
            save_strategy: str = "epoch"
            load_best_model_at_end: bool = True
            metric_for_best_model: str = "accuracy"
            push_to_hub: bool = False
            # Placeholder; overwritten after construction once the run directory is known.
            output_dir: str = ""
            num_labels: int = 2
            # NOTE(review): same value as logging_steps; not read anywhere in the visible code.
            log_steps: int = 10
            seed: int = global_config.seed
            warmup_steps: int = 0 # Adjust this based on your dataset size
            fp16: bool = True

        @dataclass
        class BertConfig(BaseModelConfig):
            """Optimiser settings returned for the BERT-style checkpoints."""
            learning_rate: float = 5e-6
            weight_decay: float = 0.01

        @dataclass
        class RoBertaConfig(BaseModelConfig):
            """Optimiser settings returned for the RoBERTa checkpoints."""
            learning_rate: float = 5e-5
            weight_decay: float = 0.01

        @dataclass
        class ElectraConfig(BaseModelConfig):
            """Optimiser settings returned for the ELECTRA checkpoint."""
            learning_rate: float = 5e-6
            weight_decay: float = 0.01

        @dataclass
        class DebertaConfig(BaseModelConfig):
            """Optimiser settings returned for the DeBERTa checkpoint."""
            learning_rate: float = 3e-05
            weight_decay: float = 0.06


        # Set the seed for reproducibility
        def set_random_seed(seed: int):
            """Seed Python's ``random``, NumPy, PyTorch (CPU and, if present, CUDA) and transformers.

            Args:
                seed: integer applied to every random number generator.
            """
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            cuda_present = torch.cuda.is_available()
            if cuda_present:
                torch.cuda.manual_seed_all(seed)
            # transformers' own helper is called last, as in the original sequence.
            set_seed(seed)

        set_random_seed(global_config.seed)

        # Model Configuration

        # Define a function to return model-specific configuration based on the model name
        def get_model_config(model_name: str):
            """Return a fresh hyper-parameter config for the given checkpoint name.

            Args:
                model_name: one of the hub identifiers stored in ``models``.

            Returns:
                A ``BertConfig``, ``RoBertaConfig``, ``ElectraConfig`` or
                ``DebertaConfig`` instance, depending on the model family.

            Raises:
                ValueError: if ``model_name`` is not one of the known checkpoints.
            """
            # Membership tests instead of a chained `or`: the previous
            # `... or models['LEGAL_BERT']` was always truthy (a non-empty
            # string), so every model was handed BertConfig and the remaining
            # branches, including the error, were unreachable.
            bert_family = (models['BERT_BASE'], models['BERT_BASE_MNLI'], models['LEGAL_BERT'])
            roberta_family = (models['ROBERTA_BASE'], models['ROBERTA_BASE_MNLI'])

            if model_name in bert_family:
                return BertConfig()
            if model_name in roberta_family:
                return RoBertaConfig()
            if model_name == models['ELECTRA_BASE_MNLI']:
                return ElectraConfig()
            if model_name == models['DEBERTA_BASE_NLI']:
                return DebertaConfig()
            raise ValueError(f"Unknown model name: {model_name}. Please check the Models dataclass for valid model names.")

        # Use the function to get the model configuration
        model_config = get_model_config(global_config.model_name)
        model_config.seed = global_config.seed

        # Number of hand-crafted feature values per example.
        if global_config.features == 'all':
            num_features = 5
        elif global_config.features == 'None':
            num_features = 0
        else:
            num_features = len(global_config.features)

        # Output folder name, keyed by (model type, whether any features are used).
        model_dir_by_setup = {
            ("full-context", True): "FullContextModelsWithFeatures",
            ("hyp-only", True): "HypOnlyModelsWithFeatures",
            ("full-context", False): "FullContextModels",
            ("hyp-only", False): "HypOnlyModels",
        }
        setup_key = (MODEL_TYPE, bool(num_features))
        if setup_key not in model_dir_by_setup:
            # Previously an unknown MODEL_TYPE left `model_dir` undefined and the
            # run died with a NameError a few lines further down.
            raise ValueError(f"Unknown MODEL_TYPE: {MODEL_TYPE!r}. Expected 'full-context' or 'hyp-only'.")
        model_dir = model_dir_by_setup[setup_key]

        # Take the LAST path component of the hub id: ids without an organisation
        # prefix (e.g. 'bert-base-uncased', 'roberta-base') have no '/' and
        # indexing [1] raised IndexError for them.
        project_name = f"X02-{global_config.model_name.split('/')[-1]}_{model_dir}_seed-{global_config.seed}"
        model_config.output_dir = f"/content/{model_local_path}/X02-{model_dir}/seed-{global_config.seed}/{global_config.model_name}/{dataset_year}/{global_config.run_name}/"

        # initialise model and exp tracking WandB
        if do_train:
            wandb.init(
                project=project_name,
                name=global_config.run_name,
                config=asdict(model_config)
            )
        # Custom Model definition
        class FeatureProjector(torch.nn.Module):
            """A single linear layer mapping ``input_dim`` feature values to ``output_dim`` values."""

            def __init__(self, input_dim, output_dim):
                super().__init__()
                self.layer = torch.nn.Linear(in_features=input_dim, out_features=output_dim)

            def forward(self, x):
                """Apply the linear projection to ``x``."""
                projected = self.layer(x)
                return projected

        class ResidualInteraction(torch.nn.Module):
            """Adds a linearly transformed feature vector to a hidden-state tensor."""

            def __init__(self, bert_output_dim):
                super().__init__()
                self.residual_transform = torch.nn.Linear(bert_output_dim, bert_output_dim)

            def forward(self, model_output, projected_feature):
                """Return ``model_output`` plus the transformed, broadcast ``projected_feature``."""
                # Insert a length-1 axis at position 1, then expand it to the
                # shape of model_output before applying the linear transform.
                broadcast_feature = projected_feature[:, None].expand_as(model_output)
                return model_output + self.residual_transform(broadcast_feature)

        class CustomModelForSequenceClassification(PreTrainedModel):
            """Pretrained encoder whose hidden states are shifted by projected custom features.

            The feature vector is projected to the encoder width, passed through
            ``ResidualInteraction`` together with the encoder's last hidden state,
            and the vector at sequence position 0 is fed to a linear classifier.

            Args:
                model_name: checkpoint identifier passed to ``AutoConfig`` / ``AutoModel``.
                num_features: number of custom feature values per example.
                num_labels: number of output classes (defaults to 2, the previous fixed value).
            """

            def __init__(self, model_name, num_features, num_labels=2):
                config = AutoConfig.from_pretrained(model_name)
                super().__init__(config)
                self.model = AutoModel.from_pretrained(model_name)
                # Take the encoder width from the checkpoint's config rather than
                # assuming 768, so checkpoints of other sizes build correctly.
                self.hidden_size = self.config.hidden_size
                self.feature_projector = FeatureProjector(num_features, self.hidden_size)
                self.residual_interaction = ResidualInteraction(self.hidden_size)
                self.classifier = torch.nn.Linear(self.hidden_size, num_labels)

            def forward(self, input_ids, attention_mask=None, token_type_ids=None, custom_features=None, label=None):
                """Run the encoder and classify.

                Returns:
                    ``logits`` when ``label`` is None, otherwise ``(loss, logits)``
                    where ``loss`` is the cross-entropy between ``logits`` and ``label``.

                Raises:
                    ValueError: if ``custom_features`` is not provided.
                """
                if custom_features is None:
                    # Fail with a clear message instead of an AttributeError on `.float()`.
                    raise ValueError("custom_features is required by CustomModelForSequenceClassification.forward")

                model_output = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                projected_feature = self.feature_projector(custom_features.float())
                # Extract the last_hidden_state from the model's output
                last_hidden_state = model_output.last_hidden_state
                # Add the residual to the last_hidden_state
                interacted_output = self.residual_interaction(last_hidden_state, projected_feature)
                # Take the hidden state at sequence position 0 (the [CLS] token)
                cls_output = interacted_output[:, 0, :]
                logits = self.classifier(cls_output)

                # Return logits if labels are not provided
                if label is None:
                    return logits

                # Compute the loss if labels are provided
                loss_fn = torch.nn.CrossEntropyLoss()
                loss = loss_fn(logits, label)
                return loss, logits

        # Load the dataset
        # Data prep
        file_dict = {
            "train" : global_config.train_file,
            "test" : global_config.test_file,
        }

        dataset = load_dataset(
            'csv',
            data_files=file_dict,
            delimiter=',',
            column_names=['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens','hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'detected_subsequence', 'is_subsequence_heuristic'],
            skiprows=1
        )

        dataset = dataset.remove_columns('detected_subsequence')


        # view dataset
        print(dataset)
        print(dataset['train'][0])

        # Tokenizer
        tokenizer = AutoTokenizer.from_pretrained(global_config.model_name, use_fast=False)

        # Custom dataset
        class CustomDataset(torch.utils.data.Dataset):
            """Wraps a dataset split, tokenising premise + hypothesis and attaching a feature tensor.

            Args:
                dataset: indexable collection of row mappings.
                selected_features: ``"all"`` or an iterable of feature column names.
            """

            # Feature columns, in the order used when selected_features == "all".
            FEATURE_COLUMNS = ('hyp_length', 'is_word_overlap', 'overlap', 'has_negation', 'is_subsequence_heuristic')

            def __init__(self, dataset, selected_features="all"):
                self.dataset = dataset
                self.selected_features = selected_features

            def __len__(self):
                return len(self.dataset)

            def extract_features(self, item):
                """Return the selected feature values of ``item`` stacked into one 1-D tensor.

                Raises:
                    ValueError: if no known feature is selected, or the values
                        cannot be turned into a tensor. Previously such errors
                        were printed and ``None`` was returned, which only
                        failed later inside the collate function.
                """
                if self.selected_features == "all":
                    extracted_features = {k: item[k] for k in self.FEATURE_COLUMNS}
                else:
                    extracted_features = {k: item[k] for k in self.selected_features if k in self.FEATURE_COLUMNS}

                if not extracted_features:
                    raise ValueError(f"No known feature columns selected: {self.selected_features!r}")

                # Stack tensors together along the last dimension
                try:
                    return torch.stack([torch.tensor(value) for value in extracted_features.values()], dim=-1)
                except Exception as e:
                    raise ValueError(f"Could not build a feature tensor from {extracted_features}") from e

            def _encode(self, item):
                """Tokenise the (premise, hypothesis) pair of one row."""
                return tokenizer(item['premise'], item['hypothesis'], return_tensors="pt", padding=True, truncation=True, max_length=global_config.max_length)

            def __getitem__(self, idx):
                item = dict(self.dataset[idx])

                # Tokenization; drop only the leading batch axis added by return_tensors="pt"
                inputs = self._encode(item)
                item['input_ids'] = inputs['input_ids'].squeeze(0)
                item['attention_mask'] = inputs['attention_mask'].squeeze(0)

                # Extract custom features
                item['custom_features'] = self.extract_features(item)

                return item

        # Custom hyp only dataset
        class CustomHypOnlyDataset(CustomDataset):
            """Same as ``CustomDataset`` but only the hypothesis text is tokenised."""

            def _encode(self, item):
                """Tokenise the hypothesis of one row, ignoring the premise."""
                return tokenizer(item['hypothesis'], return_tensors="pt", padding=True, truncation=True, max_length=global_config.max_length)

        # data collator
        def combined_collate_fn(batch, pad_token_id=None):
            """Collate dataset items into padded batch tensors.

            Args:
                batch: list of items, each holding 'input_ids', 'attention_mask',
                    'label' and 'custom_features'.
                pad_token_id: value used to right-pad ``input_ids``. When None,
                    the tokenizer's pad token id is used, falling back to 0.

            Returns:
                Dict with 'input_ids', 'attention_mask', 'label' and
                'custom_features' tensors, batch dimension first.
            """
            if pad_token_id is None:
                # Pad with the tokenizer's own pad id: 0 is not the pad token for
                # every vocabulary (RoBERTa, for instance, uses 1).
                pad_token_id = tokenizer.pad_token_id
                if pad_token_id is None:
                    pad_token_id = 0

            # Extract and pad sequences
            input_ids = pad_sequence([item['input_ids'] for item in batch], batch_first=True, padding_value=pad_token_id)
            # Padded positions must be masked out, so the mask is always padded with 0.
            attention_mask = pad_sequence([item['attention_mask'] for item in batch], batch_first=True, padding_value=0)
            labels = torch.tensor([item['label'] for item in batch])

            # Extract custom features
            features = torch.stack([item['custom_features'] for item in batch], dim=0)

            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'label': labels,
                'custom_features': features
            }


        # Create datasets
        # Determine the split indices
        num_train = int(global_config.train_split * len(dataset['train']))
        num_eval = len(dataset['train']) - num_train

        # Split the dataset
        train_datasets = dataset['train'].select(range(num_train))
        eval_datasets = dataset['train'].select(range(num_train, num_train + num_eval))
        test_datasets = dataset['test']

        # Dataset wrapper and log banner for each supported model type.
        dataset_variants = {
            "full-context": (CustomDataset, "Entering Full Context Data prep"),
            "hyp-only": (CustomHypOnlyDataset, "Entering Hyp Only Data prep"),
        }
        variant = dataset_variants.get(MODEL_TYPE)
        if variant is not None:
            dataset_cls, banner = variant
            print(banner)
            # Wrap the train / eval / test splits with the same feature selection.
            train_data, eval_data, test_data = (
                dataset_cls(split, selected_features=global_config.features)
                for split in (train_datasets, eval_datasets, test_datasets)
            )
            print("Done")

        print(train_data[1])

        # metrics
        metric = evaluate.load("accuracy")

        def compute_metrics(eval_pred):
            """Compute the loaded metric from a ``(logits, labels)`` pair.

            The predicted class is the index of the largest logit along the
            last axis.
            """
            raw_scores, gold = eval_pred
            predicted_classes = np.argmax(raw_scores, axis=-1)
            return metric.compute(predictions=predicted_classes, references=gold)

        # Initialize the model
        def load_model():
            """Build the custom classifier for the configured checkpoint, move it to the configured device and print it."""
            classifier = CustomModelForSequenceClassification(global_config.model_name, num_features)
            classifier = classifier.to(global_config.device)
            print("Model Architecture")
            print(classifier)
            return classifier

        # Training arguments

        # `results` stays None when no training/prediction happens, so the export
        # step below can be skipped instead of failing with a NameError.
        results = None

        if do_train:
            model = load_model()
            training_args = TrainingArguments(
                output_dir=model_config.output_dir,
                per_device_train_batch_size=model_config.train_batch_size,
                per_device_eval_batch_size=model_config.eval_batch_size,
                num_train_epochs=model_config.num_train_epochs,
                logging_dir=model_config.logging_dir,
                logging_steps=model_config.logging_steps,
                evaluation_strategy=model_config.evaluation_strategy,
                save_strategy=model_config.save_strategy,
                load_best_model_at_end=model_config.load_best_model_at_end,
                metric_for_best_model=model_config.metric_for_best_model,
                push_to_hub=model_config.push_to_hub,
                fp16=model_config.fp16,
                save_total_limit = 1,
                gradient_accumulation_steps=model_config.gradient_accumulation_steps,
                # Forward the per-model optimiser settings and the seed. They were
                # declared in the config (and logged to W&B) but never passed on,
                # so the Trainer silently used its own defaults.
                learning_rate=model_config.learning_rate,
                weight_decay=model_config.weight_decay,
                warmup_steps=model_config.warmup_steps,
                seed=model_config.seed,
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_data,
                eval_dataset=eval_data,
                data_collator=combined_collate_fn,
                compute_metrics=compute_metrics,
                # callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
            )

            # Train the model
            trainer.train()

            # Evaluate the best model on the test dataset
            print("Normal test data")
            print(trainer.evaluate(test_data))

            results = trainer.predict(test_data)

            wandb.finish()

        else:
            model = load_model()

        if results is None:
            print("do_train is False: no predictions were produced, skipping the prediction export.")
        else:
            # Get instance level predictions
            predictions = results.predictions.argmax(1)
            test_datasets = test_datasets.add_column(name="predictions", column=predictions)
            os.makedirs(model_config.output_dir, exist_ok=True)
            test_datasets.to_csv(f"{model_config.output_dir}/{global_config.run_name}-instance_predictions.csv")

            test_acc_file_path = f"{data_local_path}/X02_{MODEL_TYPE}_test_accuracy_{global_config.seed}.txt"

            # Append mode creates the file when it does not exist yet, so no
            # separate "create" branch is needed.
            with open(test_acc_file_path, "a") as f:
                f.write(f"{global_config.model_name}---{global_config.run_name}---{dataset_year}---{global_config.seed}---{results.metrics['test_accuracy']}\n")

        source_path = model_config.output_dir
        destination_path = f"gs://x02-coliee_dir/X02-{model_dir}/seed-{global_config.seed}/{global_config.model_name}/{dataset_year}/"

        !gsutil -m cp -r {source_path} {destination_path}

        !rm -rf {model_local_path}


[34m[1mwandb[0m: Currently logged in as: [33mvenkateshdas05[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic'],
        num_rows: 567
    })
    test: Dataset({
        features: ['id', 'label', 'premise', 'hypothesis', 'labels', 'hyp_tokens', 'hyp_length', 'prem_tokens', 'prem_length', 'overlap', 'is_word_overlap', 'negations', 'has_negation', 'is_subsequence_heuristic'],
        num_rows: 58
    })
})
{'id': 'H18-1-1', 'label': 1, 'premise': 'Article 572\nEven if the seller makes a special agreement to the effect that the seller does not warrant in the case prescribed in the main clause of Article 562, paragraph (1) or Article 565, the seller may not be released from that responsibility with respect to any fact that the seller knew but did not disclose, and with respect to any right that the seller personally created for or assigned to a third part

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Entering Hyp Only Data prep
Done
{'id': 'H18-1-2', 'label': 0, 'premise': 'Article 565\nThe provisions of the preceding three Articles apply mutatis mutandis if the right transferred by the seller to the buyer does not conform to the terms of the contract (including the case in which the seller fails to transfer part of a right that belongs to another person).\nArticle 566\nIf the subject matter delivered by the seller to the buyer does not conform to the terms of the contract with respect to the kind or quality, and the buyer fails to notify the seller of the non-conformity within one year from the time when the buyer becomes aware of it, the buyer may not demand cure of the non-conformity of performance, demand a reduction of the price, claim compensation for loss or damage, or cancel the contract, on the grounds of the non-conformity;provided, however, that this does not apply if the seller knew or did not know due to gross negligence the non-conformity at the time of the delivery..

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Model Architecture
CustomModelForSequenceClassification(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

Epoch,Training Loss,Validation Loss,Accuracy
1,8.3574,4.072123,0.438596


Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.

KeyboardInterrupt

Using eos_token, but it is not set yet.
