# Requirements

In [None]:
!pip install pytorch_lightning
!pip install transformers

Collecting pytorch_lightning
  Downloading pytorch_lightning-1.5.4-py3-none-any.whl (524 kB)
[K     |████████████████████████████████| 524 kB 5.2 MB/s 
[?25hCollecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 72.8 MB/s 
Collecting torchmetrics>=0.4.1
  Downloading torchmetrics-0.6.0-py3-none-any.whl (329 kB)
[K     |████████████████████████████████| 329 kB 65.3 MB/s 
[?25hCollecting PyYAML>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.4 MB/s 
[?25hCollecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 31.5 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.m

In [None]:
import argparse
import os

import pandas as pd
import pytorch_lightning as pl
import torch
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig, AutoModelForMultipleChoice, AutoTokenizer
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup
)


# Dataset Classes

## MC Dataset

In [None]:
class MCQDataset(Dataset):
    """Multiple-choice dataset read from a CSV file.

    Columns read from the CSV: ``sent1``, optionally ``sent2``, one ``ending<i>``
    column per answer option (numbered from 0) and ``label``. Every item encodes
    ``sent1`` against each option as a sentence pair, so the returned tensors hold
    one row per option.
    """

    def __init__(self, tokenizer, df_file_name, input_max_len=512, max_samples=-1):
        """
        Args:
            tokenizer: callable (Hugging Face style) tokenizer used to encode sentence pairs.
            df_file_name: path of the CSV file to load with ``pandas.read_csv``.
            input_max_len: ``max_length`` passed to the tokenizer for every pair.
            max_samples: keep only the first ``max_samples`` rows; ``-1`` keeps all rows.
        """
        self.tokenizer = tokenizer
        self.df = pd.read_csv(df_file_name)
        if max_samples != -1:
            self.df = self.df.head(max_samples)
        self.input_max_len = input_max_len
        # One answer option per "ending*" column.
        self.num_options = len([name for name in self.df.columns if name.startswith("ending")])

    def __getitem__(self, index):
        """Encode row ``index`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (one row per option),
            ``labels`` (``torch.tensor`` of the ``label`` cell) and, when the
            tokenizer produces them, ``token_type_ids``.
        """
        row = self.df.iloc[index]
        sentence1 = str(row["sent1"])

        # Only prepend ``sent2`` when the column exists; formatting a missing value
        # would otherwise put the literal text "None" in front of every option.
        prefix = f"{str(row['sent2'])} " if "sent2" in self.df.columns else ""
        options = [f"{prefix}{row['ending' + str(i)]}" for i in range(self.num_options)]

        label = row["label"]

        instance_encoding = self._get_encoding(
            sentence1=[sentence1] * self.num_options,
            sentence2=options,
            add_special_tokens=True,
            truncation=True,
            max_length=self.input_max_len,
            padding='max_length',
        )
        item = {
            "input_ids": instance_encoding["input_ids"],
            "attention_mask": instance_encoding["attention_mask"],
            "labels": torch.tensor(label),
        }
        # Forward the segment ids as well so pair-aware models can tell the two sentences apart.
        if "token_type_ids" in instance_encoding:
            item["token_type_ids"] = instance_encoding["token_type_ids"]
        return item

    def _get_encoding(self, sentence1, sentence2, add_special_tokens=False, truncation=True, max_length=-1,
                      padding=None):
        """Tokenize ``sentence1``/``sentence2`` as pairs and return a dict of tensors.

        The result always has ``input_ids`` and ``attention_mask`` (``None`` when the
        tokenizer does not return a mask); ``token_type_ids`` is added only when the
        tokenizer returns it.
        """
        encoded_input = self.tokenizer(
            sentence1,
            sentence2,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            max_length=max_length,
            padding=padding,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )

        # squeeze(0) drops the leading dimension only when it has size 1 (a single encoded pair).
        data_input = {
            "input_ids": encoded_input["input_ids"].squeeze(0),
            "attention_mask": (
                encoded_input["attention_mask"].squeeze(0) if "attention_mask" in encoded_input else None
            ),
        }
        if "token_type_ids" in encoded_input:
            data_input["token_type_ids"] = encoded_input["token_type_ids"].squeeze(0)
        return data_input

    def __len__(self):
        """Number of rows kept from the CSV."""
        return self.df.shape[0]





## SWAG Dataset

In [None]:
class SWAGDataset(Dataset):
    """SWAG-style multiple-choice dataset read from a CSV file.

    Columns read from the CSV: ``sent1``, optionally ``sent2``, one ``ending<i>``
    column per answer option (numbered from 0) and ``labels``. Every item encodes
    ``sent1`` against each option as a sentence pair, so the returned tensors hold
    one row per option.
    """

    def __init__(self, tokenizer, df_file_name, input_max_len=150, max_samples=-1):
        """
        Args:
            tokenizer: callable (Hugging Face style) tokenizer used to encode sentence pairs.
            df_file_name: path of the CSV file to load with ``pandas.read_csv``.
            input_max_len: ``max_length`` passed to the tokenizer for every pair.
            max_samples: keep only the first ``max_samples`` rows; ``-1`` keeps all rows.
        """
        self.tokenizer = tokenizer
        self.df = pd.read_csv(df_file_name)
        if max_samples != -1:
            self.df = self.df.head(max_samples)
        self.input_max_len = input_max_len
        # One answer option per "ending*" column.
        self.num_options = len([name for name in self.df.columns if name.startswith("ending")])

    def __getitem__(self, index):
        """Encode row ``index`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (one row per option),
            ``labels`` (``torch.tensor`` of the ``labels`` cell) and, when the
            tokenizer produces them, ``token_type_ids``.
        """
        row = self.df.iloc[index]
        sentence1 = str(row["sent1"])

        # Only prepend ``sent2`` when the column exists; formatting a missing value
        # would otherwise put the literal text "None" in front of every option.
        prefix = f"{str(row['sent2'])} " if "sent2" in self.df.columns else ""
        options = [f"{prefix}{row['ending' + str(i)]}" for i in range(self.num_options)]

        label = row["labels"]

        instance_encoding = self._get_encoding(
            sentence1=[sentence1] * self.num_options,
            sentence2=options,
            add_special_tokens=True,
            truncation=True,
            max_length=self.input_max_len,
            padding='max_length',
        )
        item = {
            "input_ids": instance_encoding["input_ids"],
            "attention_mask": instance_encoding["attention_mask"],
            "labels": torch.tensor(label),
        }
        # Forward the segment ids as well so pair-aware models can tell the two sentences apart.
        if "token_type_ids" in instance_encoding:
            item["token_type_ids"] = instance_encoding["token_type_ids"]
        return item

    def _get_encoding(self, sentence1, sentence2, add_special_tokens=False, truncation=True, max_length=-1,
                      padding=None):
        """Tokenize ``sentence1``/``sentence2`` as pairs and return a dict of tensors.

        The result always has ``input_ids`` and ``attention_mask`` (``None`` when the
        tokenizer does not return a mask); ``token_type_ids`` is added only when the
        tokenizer returns it.
        """
        encoded_input = self.tokenizer(
            sentence1,
            sentence2,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            max_length=max_length,
            padding=padding,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )

        # squeeze(0) drops the leading dimension only when it has size 1 (a single encoded pair).
        data_input = {
            "input_ids": encoded_input["input_ids"].squeeze(0),
            "attention_mask": (
                encoded_input["attention_mask"].squeeze(0) if "attention_mask" in encoded_input else None
            ),
        }
        if "token_type_ids" in encoded_input:
            data_input["token_type_ids"] = encoded_input["token_type_ids"].squeeze(0)
        return data_input

    def __len__(self):
        """Number of rows kept from the CSV."""
        return self.df.shape[0]





## Commonsense QA

In [None]:
class CommonsenseQADataset(Dataset):
    """CommonsenseQA-style dataset read from a CSV file.

    Columns read from the CSV: ``stem`` (the question), the five choice columns
    ``A``..``E`` and ``answerKey``. Every item encodes the question against each
    choice as a sentence pair, so the returned tensors hold five rows.
    """

    # Choice columns, in the order their encodings appear in the returned tensors.
    _CHOICE_KEYS = ("A", "B", "C", "D", "E")

    def __init__(self, tokenizer, df_file_name, input_max_len=100, max_samples=-1):
        """
        Args:
            tokenizer: callable (Hugging Face style) tokenizer used to encode sentence pairs.
            df_file_name: path of the CSV file to load with ``pandas.read_csv``.
            input_max_len: ``max_length`` passed to the tokenizer for every pair.
            max_samples: keep only the first ``max_samples`` rows; ``-1`` keeps all rows.
        """
        self.tokenizer = tokenizer
        self.df = pd.read_csv(df_file_name)
        if max_samples != -1:
            self.df = self.df.head(max_samples)
        self.input_max_len = input_max_len
        self.num_options = len(self._CHOICE_KEYS)

    def __getitem__(self, index):
        """
        Items are encoded in the format:
        [CLS] question [SEP] choice

        Returns a dict with ``input_ids`` and ``attention_mask`` (one row per
        choice), ``labels`` and, when the tokenizer produces them, ``token_type_ids``.
        """
        row = self.df.iloc[index]
        question = str(row["stem"])

        # Multiple Choice Options
        choice = [str(row[key]) for key in self._CHOICE_KEYS]

        # Label for this dataset. A letter key ("A".."E") cannot be turned into a tensor
        # directly, so map it to the position of the matching choice; numeric keys pass through.
        label = row["answerKey"]
        if isinstance(label, str):
            label = self._CHOICE_KEYS.index(label.strip().upper())

        instance_encoding = self._get_encoding(
            sentence1=[question] * self.num_options,
            sentence2=choice,
            add_special_tokens=True,
            truncation=True,
            max_length=self.input_max_len,
            padding='max_length',
        )

        item = {
            "input_ids": instance_encoding["input_ids"],
            "attention_mask": instance_encoding["attention_mask"],
            "labels": torch.tensor(label),
        }
        # Forward the segment ids as well so pair-aware models can tell question from choice.
        if "token_type_ids" in instance_encoding:
            item["token_type_ids"] = instance_encoding["token_type_ids"]
        return item

    def _get_encoding(self, sentence1, sentence2, add_special_tokens=False, truncation=True, max_length=-1,
                      padding=None):
        """Tokenize ``sentence1``/``sentence2`` as pairs and return a dict of tensors.

        The result always has ``input_ids`` and ``attention_mask`` (``None`` when the
        tokenizer does not return a mask); ``token_type_ids`` is added only when the
        tokenizer returns it.
        """
        encoded_input = self.tokenizer(
            sentence1,
            sentence2,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            max_length=max_length,
            padding=padding,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )
        if "num_truncated_tokens" in encoded_input and encoded_input["num_truncated_tokens"] > 0:
            print("Attention! you are cropping tokens")

        # squeeze(0) drops the leading dimension only when it has size 1 (a single encoded pair).
        data_input = {
            "input_ids": encoded_input["input_ids"].squeeze(0),
            "attention_mask": (
                encoded_input["attention_mask"].squeeze(0) if "attention_mask" in encoded_input else None
            ),
        }
        if "token_type_ids" in encoded_input:
            data_input["token_type_ids"] = encoded_input["token_type_ids"].squeeze(0)
        return data_input

    def __len__(self):
        """Number of rows kept from the CSV."""
        return self.df.shape[0]

## Abductive NLI

In [None]:
class AbductiveNLIDataset(Dataset):
    """Abductive-NLI-style dataset read from a CSV file.

    Columns read from the CSV: ``obs1``, ``obs2``, one ``hyp<i>`` column per
    hypothesis (numbered from 1) and ``labels``. Every item encodes the two
    observations (joined by a space) against each hypothesis as a sentence pair,
    so the returned tensors hold one row per hypothesis.
    """

    def __init__(self, tokenizer, df_file_name, input_max_len=100, max_samples=-1):
        """
        Args:
            tokenizer: callable (Hugging Face style) tokenizer used to encode sentence pairs.
            df_file_name: path of the CSV file to load with ``pandas.read_csv``.
            input_max_len: ``max_length`` passed to the tokenizer for every pair.
            max_samples: keep only the first ``max_samples`` rows; ``-1`` keeps all rows.
        """
        self.tokenizer = tokenizer
        self.df = pd.read_csv(df_file_name)
        if max_samples != -1:
            self.df = self.df.head(max_samples)
        self.input_max_len = input_max_len
        # One candidate hypothesis per "hyp*" column.
        self.num_options = len([name for name in self.df.columns if name.startswith("hyp")])

    def __getitem__(self, index):
        """Encode row ``index`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (one row per hypothesis),
            ``labels`` (``torch.tensor`` of the ``labels`` cell) and, when the
            tokenizer produces them, ``token_type_ids``.
        """
        row = self.df.iloc[index]
        observation1 = str(row["obs1"])
        observation2 = str(row["obs2"])

        # Hypothesis columns are numbered from 1 (hyp1, hyp2, ...).
        options = [f"{row['hyp' + str(i)]}" for i in range(1, self.num_options + 1)]

        # NOTE(review): the value is used as-is; confirm the CSV stores labels in the
        # index convention the model's loss expects.
        label = row["labels"]

        instance_encoding = self._get_encoding(
            sentence1=[f"{observation1} {observation2}"] * self.num_options,
            sentence2=options,
            add_special_tokens=True,
            truncation=True,
            max_length=self.input_max_len,
            padding='max_length',
        )
        item = {
            "input_ids": instance_encoding["input_ids"],
            "attention_mask": instance_encoding["attention_mask"],
            "labels": torch.tensor(label),
        }
        # Forward the segment ids as well so pair-aware models can tell observations from hypothesis.
        if "token_type_ids" in instance_encoding:
            item["token_type_ids"] = instance_encoding["token_type_ids"]
        return item

    def _get_encoding(self, sentence1, sentence2, add_special_tokens=False, truncation=True, max_length=-1,
                      padding=None):
        """Tokenize ``sentence1``/``sentence2`` as pairs and return a dict of tensors.

        The result always has ``input_ids`` and ``attention_mask`` (``None`` when the
        tokenizer does not return a mask); ``token_type_ids`` is added only when the
        tokenizer returns it.
        """
        encoded_input = self.tokenizer(
            sentence1,
            sentence2,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            max_length=max_length,
            padding=padding,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )

        # squeeze(0) drops the leading dimension only when it has size 1 (a single encoded pair).
        data_input = {
            "input_ids": encoded_input["input_ids"].squeeze(0),
            "attention_mask": (
                encoded_input["attention_mask"].squeeze(0) if "attention_mask" in encoded_input else None
            ),
        }
        if "token_type_ids" in encoded_input:
            data_input["token_type_ids"] = encoded_input["token_type_ids"].squeeze(0)
        return data_input

    def __len__(self):
        """Number of rows kept from the CSV."""
        return self.df.shape[0]





## Social IQA Dataset

In [None]:
class SocialIQADataset(Dataset):
    """Social-IQA-style dataset read from a CSV file.

    Columns read from the CSV: ``context``, ``question``, the answer columns
    ``answerA``/``answerB``/``answerC`` and ``labels``. Every item encodes the
    context against "question + answer" for each answer as a sentence pair.
    """

    def __init__(self, tokenizer, df_file_name, input_max_len=100, max_samples=-1):
        """
        Args:
            tokenizer: callable (Hugging Face style) tokenizer used to encode sentence pairs.
            df_file_name: path of the CSV file to load with ``pandas.read_csv``.
            input_max_len: ``max_length`` passed to the tokenizer for every pair.
            max_samples: keep only the first ``max_samples`` rows; ``-1`` keeps all rows.
        """
        self.tokenizer = tokenizer
        self.df = pd.read_csv(df_file_name)
        if max_samples != -1:
            self.df = self.df.head(max_samples)
        self.input_max_len = input_max_len
        # Counted from the "answer*" columns; __getitem__ reads answerA/answerB/answerC,
        # so the CSV is expected to contain exactly those three.
        self.num_options = len([name for name in self.df.columns if name.startswith("answer")])

    def __getitem__(self, index):
        """
        Items are encoded in the format:
        [CLS] context [SEP] question + multiple_choice_option

        Returns a dict with ``input_ids`` and ``attention_mask`` (one row per
        answer), ``labels`` and, when the tokenizer produces them, ``token_type_ids``.
        """
        row = self.df.iloc[index]
        context = str(row["context"])
        question = str(row["question"])

        # Multiple Choice Options, each prefixed with the question.
        choice = [" ".join([question, str(row[column])]) for column in ("answerA", "answerB", "answerC")]

        # Label for this dataset
        label = row["labels"]

        instance_encoding = self._get_encoding(
            sentence1=[context] * self.num_options,
            sentence2=choice,
            add_special_tokens=True,
            truncation=True,
            max_length=self.input_max_len,
            padding='max_length',
        )

        item = {
            "input_ids": instance_encoding["input_ids"],
            "attention_mask": instance_encoding["attention_mask"],
            "labels": torch.tensor(label),
        }
        # Forward the segment ids as well so pair-aware models can tell context from question/answer.
        if "token_type_ids" in instance_encoding:
            item["token_type_ids"] = instance_encoding["token_type_ids"]
        return item

    def _get_encoding(self, sentence1, sentence2, add_special_tokens=False, truncation=True, max_length=-1,
                      padding=None):
        """Tokenize ``sentence1``/``sentence2`` as pairs and return a dict of tensors.

        The result always has ``input_ids`` and ``attention_mask`` (``None`` when the
        tokenizer does not return a mask); ``token_type_ids`` is added only when the
        tokenizer returns it.
        """
        encoded_input = self.tokenizer(
            sentence1,
            sentence2,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            max_length=max_length,
            padding=padding,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )
        if "num_truncated_tokens" in encoded_input and encoded_input["num_truncated_tokens"] > 0:
            print("Attention! you are cropping tokens")

        # squeeze(0) drops the leading dimension only when it has size 1 (a single encoded pair).
        data_input = {
            "input_ids": encoded_input["input_ids"].squeeze(0),
            "attention_mask": (
                encoded_input["attention_mask"].squeeze(0) if "attention_mask" in encoded_input else None
            ),
        }
        if "token_type_ids" in encoded_input:
            data_input["token_type_ids"] = encoded_input["token_type_ids"].squeeze(0)
        return data_input

    def __len__(self):
        """Number of rows kept from the CSV."""
        return self.df.shape[0]

# MC Model

In [None]:
def compute_accuracy(logits, labels):
    """Score a batch of per-option scores against the gold labels.

    Args:
        logits: 2-D tensor; the highest value along ``dim=1`` is the prediction.
        labels: tensor compared element-wise with the predicted indices.

    Returns:
        ``(acc, predicted_label)``: the mean of the element-wise matches as a
        float tensor, and the tensor of predicted indices.
    """
    _, predicted_label = torch.max(logits, dim=1)
    matches = predicted_label == labels
    return matches.float().mean(), predicted_label

class MCQModel(pl.LightningModule):
    """LightningModule that fine-tunes an ``AutoModelForMultipleChoice`` checkpoint.

    The three argument namespaces are stored as given:
    ``training_arguments`` (warmup_steps, weight_decay, adam_epsilon, n_gpu, num_workers),
    ``model_arguments`` (model_name_or_path, hidden_dropout_prob, max_input_seq_length) and
    ``other_arguments`` (file paths, batch sizes, learning rate, epochs, prediction output).
    """

    def __init__(self, training_arguments, model_arguments, other_arguments):
        super(MCQModel, self).__init__()

        self.training_arguments = training_arguments
        self.model_arguments = model_arguments
        self.other_arguments = other_arguments
        self.tokenizer = AutoTokenizer.from_pretrained(model_arguments.model_name_or_path)
        config = AutoConfig.from_pretrained(model_arguments.model_name_or_path,
                                            hidden_dropout_prob=model_arguments.hidden_dropout_prob)

        self.model = AutoModelForMultipleChoice.from_pretrained(model_arguments.model_name_or_path, config=config)
        self.save_hyperparameters("training_arguments")
        self.save_hyperparameters("model_arguments")

    def is_logger(self):
        """Return True when the attached trainer's process rank is <= 0."""
        # NOTE(review): recent Lightning trainers expose the rank as ``global_rank``;
        # ``proc_rank`` is kept as a fallback for releases that still use the old name.
        rank = getattr(self.trainer, "global_rank", None)
        if rank is None:
            rank = self.trainer.proc_rank
        return rank <= 0

    def forward(self,
                input_ids=None,
                inputs_embeds=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None):
        """Delegate to the wrapped multiple-choice model and return its output object."""
        return self.model(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )

    def _step(self, batch):
        """Run one forward pass; return ``(loss, softmax over dim=1 of the logits)``."""
        outputs = self(**batch)
        loss = outputs.loss
        logits = outputs.logits
        softmax_logits = F.softmax(logits, dim=1)
        return loss, softmax_logits

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and accuracy for one batch."""
        # ``logits`` here are the softmax probabilities returned by ``_step``.
        loss, logits = self._step(batch)
        acc, predicted_label = compute_accuracy(logits, batch["labels"])
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        return {"loss": loss, "acc": acc}

    def training_epoch_end(self, outputs):
        """Print the mean of the per-batch training losses and accuracies."""
        avg_loss = torch.cat([x['loss'].view(-1) for x in outputs]).mean()
        avg_acc = torch.cat([x['acc'].view(-1) for x in outputs]).mean()

        print("--------------------")
        print("Train avg_loss: ", avg_loss)
        print("Train avg_acc: ", avg_acc)
        print("--------------------")

    def validation_step(self, batch, batch_idx):
        """Compute and log validation loss/accuracy; return per-example predictions as lists."""
        loss, logits = self._step(batch)
        logits = logits.squeeze(1)
        acc, predicted_label = compute_accuracy(logits, batch["labels"])
        self.log('val_loss', loss, on_epoch=True)
        self.log('val_acc', acc, on_epoch=True)
        return {
            "val_loss": loss,
            "val_acc": acc,
            "softmax_logits": logits.tolist(),
            "labels": batch["labels"].tolist(),
            "predictions": predicted_label.tolist(),
        }

    def validation_epoch_end(self, outputs):
        """Print mean validation metrics and optionally write per-example predictions to CSV.

        When ``other_arguments.write_dev_predictions`` is set, the file
        ``epoch_<current_epoch>_<predictions_file>`` is written inside
        ``other_arguments.output_dir``.
        """
        avg_loss = torch.cat([x['val_loss'].view(-1) for x in outputs]).mean()
        avg_acc = torch.cat([x['val_acc'].view(-1) for x in outputs]).mean()

        all_labels = []
        all_predictions = []
        all_softmax_logits = []

        # validation_step already returns plain Python lists, so they can be concatenated directly.
        for x in outputs:
            all_predictions.extend(x["predictions"])
            all_softmax_logits.extend(x["softmax_logits"])
            all_labels.extend(x["labels"])

        softmax_logits_df = pd.DataFrame(all_softmax_logits)
        print("--------------------")
        print("Validation avg_loss: ", avg_loss)
        print("Validation avg_acc: ", avg_acc)

        result_df = pd.DataFrame({
            "label": all_labels,
            "prediction": all_predictions,
        })

        result_df = pd.concat([result_df, softmax_logits_df], axis=1)

        if self.other_arguments.write_dev_predictions:
            # os.path.join keeps the file inside output_dir whether or not the directory
            # was given with a trailing separator.
            output_path = os.path.join(
                self.other_arguments.output_dir,
                "epoch_" + str(self.trainer.current_epoch) + "_" + self.other_arguments.predictions_file,
            )
            print(f"Writing predictions for {self.other_arguments.DEV_FILE} to {output_path}")
            result_df.to_csv(output_path, index=False)
        print("--------------------")

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        # Parameters whose name contains one of these substrings get no weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.training_arguments.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.other_arguments.learning_rate,
                          eps=self.training_arguments.adam_epsilon)
        # Kept on the instance because train_dataloader() builds the LR scheduler from it.
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch=None, batch_idx=None, optimizer=None, optimizer_idx=None, optimizer_closure=None,
                       on_tpu=None, using_native_amp=None, using_lbfgs=None):
        """Step the optimizer, clear gradients, then advance the LR scheduler by one step."""
        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        # ``self.lr_scheduler`` is created in train_dataloader().
        self.lr_scheduler.step()

    def train_dataloader(self):
        """Build the training DataLoader and the linear warmup/decay LR scheduler.

        The dataset class is fixed to ``AbductiveNLIDataset``; swap it here to train
        on one of the other dataset classes.
        """
        train_dataset = AbductiveNLIDataset(
            tokenizer=self.tokenizer,
            df_file_name=self.other_arguments.TRAIN_FILE,
            input_max_len=self.model_arguments.max_input_seq_length,
            max_samples=self.other_arguments.max_train_samples,
        )
        dataloader = DataLoader(
            train_dataset,
            self.other_arguments.train_batch_size,
            drop_last=True, shuffle=True,
            num_workers=self.training_arguments.num_workers)

        # Total optimizer steps: batches per epoch (per device, after gradient accumulation)
        # times the number of epochs. max(1, n_gpu) treats a non-positive n_gpu as one device.
        t_total = (
                (len(dataloader.dataset) // (
                        self.other_arguments.train_batch_size * max(1, self.training_arguments.n_gpu)))
                // self.other_arguments.gradient_accumulation_steps
                * float(self.other_arguments.num_train_epochs)
        )
        # Relies on configure_optimizers() having stored the optimizer in ``self.opt``.
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.training_arguments.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Build the validation DataLoader over ``other_arguments.DEV_FILE`` (all rows, no shuffling)."""
        val_dataset = AbductiveNLIDataset(
            tokenizer=self.tokenizer,
            df_file_name=self.other_arguments.DEV_FILE,
            input_max_len=self.model_arguments.max_input_seq_length,
        )

        return DataLoader(val_dataset,
                          batch_size=self.other_arguments.eval_batch_size,
                          num_workers=self.training_arguments.num_workers)

# Arg Definitions

In [None]:
# Command-line style configuration, split into three named argument groups so the
# parsed values can later be separated again by group title.
parser = argparse.ArgumentParser()

# --- Training arguments: optimisation and hardware settings ---
training_arguments = parser.add_argument_group("training_arguments")
training_arguments.add_argument("--opt_level", default="O1")
training_arguments.add_argument("--warmup_steps", type=int, default=0)
training_arguments.add_argument("--weight_decay", type=float, default=0.0)
training_arguments.add_argument("--adam_epsilon", type=float, default=1e-8)
training_arguments.add_argument("--max_grad_norm", type=float, default=1.0)
training_arguments.add_argument("--early_stop_callback", action="store_true", default=False)
training_arguments.add_argument("--fp_16", action="store_true", default=False)
training_arguments.add_argument("--n_gpu", type=int, default=-1)
training_arguments.add_argument("--num_workers", type=int, default=8)
training_arguments.add_argument("--distributed_backend", default=None)

# --- Model arguments: checkpoint and input length ---
model_arguments = parser.add_argument_group("model_arguments")
model_arguments.add_argument("--model_name_or_path", default=None)
model_arguments.add_argument("--max_input_seq_length", type=int, default=512)
model_arguments.add_argument("--hidden_dropout_prob", type=float, default=0.15)

# --- Other arguments: data files, batch sizes, checkpointing, debugging ---
# A value of -1 for max_train_samples / limit_*_batches means "no limit".
other_arguments = parser.add_argument_group("other_arguments")
other_arguments.add_argument("--output_dir", default="./")
other_arguments.add_argument("--predictions_file", default="predictions.csv")
other_arguments.add_argument("--TRAIN_FILE", default=None)
other_arguments.add_argument("--DEV_FILE", default=None)
other_arguments.add_argument("--train_batch_size", type=int, default=2)
other_arguments.add_argument("--eval_batch_size", type=int, default=2)
other_arguments.add_argument("--max_train_samples", type=int, default=-1)
other_arguments.add_argument("--num_train_epochs", type=int, default=2)
other_arguments.add_argument("--gradient_accumulation_steps", type=int, default=1)
other_arguments.add_argument("--seed", type=int, default=42)
other_arguments.add_argument("--save_top_k", type=int, default=-1)
other_arguments.add_argument("--save_last", action="store_true", default=False)
other_arguments.add_argument("--write_dev_predictions", action="store_true", default=False)
other_arguments.add_argument("--learning_rate", type=float, default=3e-4)
other_arguments.add_argument("--do_fast_dev_run", action="store_true", default=False)
other_arguments.add_argument("--limit_train_batches", type=int, default=-1)
other_arguments.add_argument("--limit_val_batches", type=int, default=-1)

_StoreAction(option_strings=['--limit_val_batches'], dest='limit_val_batches', nargs=None, const=None, default=-1, type=<class 'int'>, choices=None, help=None, metavar=None)

# Arguments + Execution

In [None]:
# Parse the settings for this run from an inline argument string.
args = parser.parse_args(
"""
--model_name_or_path bert-base-uncased  
--max_input_seq_length 100  
--TRAIN_FILE /content/drive/MyDrive/NLP/data/abductivenli/segments/mixed/mixed.csv
--output_dir /content/drive/MyDrive/NLP/data/abductivenli/segments/mixed/
--DEV_FILE /content/drive/MyDrive/NLP/data/abductivenli/dev.csv
--train_batch_size 16
--eval_batch_size 16
--max_train_samples 5000
--num_train_epochs 5 
--gradient_accumulation_steps 1 
--save_top_k 0
--learning_rate 5e-5
--write_dev_predictions
""".split()
)

# Split the flat namespace back into one namespace per argument group, keyed by group title.
namespaces_by_group = {}
for group in parser._action_groups:
    namespaces_by_group[group.title] = argparse.Namespace(
        **{action.dest: getattr(args, action.dest, None) for action in group._group_actions}
    )
training_arguments = namespaces_by_group["training_arguments"]
model_arguments = namespaces_by_group["model_arguments"]
other_arguments = namespaces_by_group["other_arguments"]

for heading, namespace in (
    ("Training arguments", training_arguments),
    ("Model arguments", model_arguments),
    ("Other arguments", other_arguments),
):
    print(heading, namespace)
    print("--------------------")

# Seed before the model is built so its construction happens under the fixed seed.
pl.seed_everything(other_arguments.seed)
model = MCQModel(training_arguments=training_arguments,
                 model_arguments=model_arguments,
                 other_arguments=other_arguments)

# Checkpoints are ranked by validation accuracy (higher is better).
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=other_arguments.output_dir,
    monitor="val_acc",
    save_top_k=other_arguments.save_top_k,
    save_last=other_arguments.save_last,
    mode='max'
)

train_params = {
    "accumulate_grad_batches": other_arguments.gradient_accumulation_steps,
    "gpus": training_arguments.n_gpu,
    "deterministic": True,
    "max_epochs": other_arguments.num_train_epochs,
    "precision": 16 if training_arguments.fp_16 else 32,
    # amp_level=training_arguments.opt_level,
    "amp_backend": "apex",
    "gradient_clip_val": training_arguments.max_grad_norm,
    "callbacks": checkpoint_callback,
    "fast_dev_run": other_arguments.do_fast_dev_run,
}

# Optional Trainer settings are only passed when they were set away from their sentinel values.
for limit_name in ("limit_train_batches", "limit_val_batches"):
    limit_value = getattr(other_arguments, limit_name)
    if limit_value != -1:
        train_params[limit_name] = limit_value

if training_arguments.distributed_backend is not None:
    train_params["distributed_backend"] = training_arguments.distributed_backend

trainer = pl.Trainer(**train_params)
trainer.fit(model)

Global seed set to 42


Training arguments Namespace(adam_epsilon=1e-08, distributed_backend=None, early_stop_callback=False, fp_16=False, max_grad_norm=1.0, n_gpu=-1, num_workers=8, opt_level='O1', warmup_steps=0, weight_decay=0.0)
--------------------
Model arguments Namespace(hidden_dropout_prob=0.15, max_input_seq_length=100, model_name_or_path='bert-base-uncased')
--------------------
Other arguments Namespace(DEV_FILE='/content/drive/MyDrive/NLP/data/abductivenli/dev.csv', TRAIN_FILE='/content/drive/MyDrive/NLP/data/abductivenli/segments/mixed/mixed.csv', do_fast_dev_run=False, eval_batch_size=16, gradient_accumulation_steps=1, learning_rate=5e-05, limit_train_batches=-1, limit_val_batches=-1, max_train_samples=5000, num_train_epochs=5, output_dir='/content/drive/MyDrive/NLP/data/abductivenli/segments/mixed/', predictions_file='predictions.csv', save_last=False, save_top_k=0, seed=42, train_batch_size=16, write_dev_predictions=True)
--------------------


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

Validation sanity check: 0it [00:00, ?it/s]

  cpuset_checked))
Global seed set to 42


--------------------
Validation avg_loss:  tensor(0.6966, device='cuda:0')
Validation avg_acc:  tensor(0.4375, device='cuda:0')
Writing predictions for /content/drive/MyDrive/NLP/data/abductivenli/dev.csv to /content/drive/MyDrive/NLP/data/abductivenli/segments/mixed/epoch_0_predictions.csv
--------------------


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

--------------------
Validation avg_loss:  tensor(0.7785, device='cuda:0')
Validation avg_acc:  tensor(0.5234, device='cuda:0')
Writing predictions for /content/drive/MyDrive/NLP/data/abductivenli/dev.csv to /content/drive/MyDrive/NLP/data/abductivenli/segments/mixed/epoch_0_predictions.csv
--------------------
--------------------
Train avg_loss:  tensor(0.6556, device='cuda:0')
Train avg_acc:  tensor(0.5845, device='cuda:0')
--------------------


Validating: 0it [00:00, ?it/s]

--------------------
Validation avg_loss:  tensor(1.3149, device='cuda:0')
Validation avg_acc:  tensor(0.5371, device='cuda:0')
Writing predictions for /content/drive/MyDrive/NLP/data/abductivenli/dev.csv to /content/drive/MyDrive/NLP/data/abductivenli/segments/mixed/epoch_1_predictions.csv
--------------------
--------------------
Train avg_loss:  tensor(0.3715, device='cuda:0')
Train avg_acc:  tensor(0.8325, device='cuda:0')
--------------------


Validating: 0it [00:00, ?it/s]

--------------------
Validation avg_loss:  tensor(1.6774, device='cuda:0')
Validation avg_acc:  tensor(0.5401, device='cuda:0')
Writing predictions for /content/drive/MyDrive/NLP/data/abductivenli/dev.csv to /content/drive/MyDrive/NLP/data/abductivenli/segments/mixed/epoch_2_predictions.csv
--------------------
--------------------
Train avg_loss:  tensor(0.1300, device='cuda:0')
Train avg_acc:  tensor(0.9507, device='cuda:0')
--------------------


Validating: 0it [00:00, ?it/s]

--------------------
Validation avg_loss:  tensor(2.0034, device='cuda:0')
Validation avg_acc:  tensor(0.5447, device='cuda:0')
Writing predictions for /content/drive/MyDrive/NLP/data/abductivenli/dev.csv to /content/drive/MyDrive/NLP/data/abductivenli/segments/mixed/epoch_3_predictions.csv
--------------------
--------------------
Train avg_loss:  tensor(0.0680, device='cuda:0')
Train avg_acc:  tensor(0.9712, device='cuda:0')
--------------------


Validating: 0it [00:00, ?it/s]

--------------------
Validation avg_loss:  tensor(2.1304, device='cuda:0')
Validation avg_acc:  tensor(0.5506, device='cuda:0')
Writing predictions for /content/drive/MyDrive/NLP/data/abductivenli/dev.csv to /content/drive/MyDrive/NLP/data/abductivenli/segments/mixed/epoch_4_predictions.csv
--------------------
--------------------
Train avg_loss:  tensor(0.0318, device='cuda:0')
Train avg_acc:  tensor(0.9888, device='cuda:0')
--------------------


# Model Arguments

## SWAG

In [None]:
"""
--model_name_or_path bert-base-uncased  
--max_input_seq_length 150  
--TRAIN_FILE /content/drive/MyDrive/NLP/data/swag/segments/easy/easy.csv
--output_dir /content/drive/MyDrive/NLP/data/swag/segments/easy/
--DEV_FILE /content/drive/MyDrive/NLP/data/swag/val_full.csv
--train_batch_size 16
--eval_batch_size 16
--max_train_samples 2500 
--num_train_epochs 5 
--gradient_accumulation_steps 1 
--save_top_k 0
--learning_rate 5e-5
--write_dev_predictions
"""

## Commonsense QA

In [None]:
"""
--model_name_or_path bert-base-uncased  
--max_input_seq_length 100  
--TRAIN_FILE /content/drive/MyDrive/NLP/data/commonsense/segments/easy/easy.csv
--output_dir /content/drive/MyDrive/NLP/data/commonsense/segments/easy/
--DEV_FILE /content/drive/MyDrive/NLP/data/commonsense/dev.csv
--train_batch_size 16
--eval_batch_size 16
--max_train_samples 5000 
--num_train_epochs 5 
--gradient_accumulation_steps 1 
--save_top_k 0
--learning_rate 5e-5
--write_dev_predictions
"""

## Abductive NLI

In [None]:
"""
--model_name_or_path bert-base-uncased  
--max_input_seq_length 100  
--TRAIN_FILE /content/drive/MyDrive/NLP/data/abductivenli/train.csv
--output_dir /content/drive/MyDrive/NLP/data/abductivenli/datamaps/
--DEV_FILE /content/drive/MyDrive/NLP/data/abductivenli/train.csv
--train_batch_size 16
--eval_batch_size 16
--max_train_samples 5000
--num_train_epochs 5 
--gradient_accumulation_steps 1 
--save_top_k 0
--learning_rate 5e-5
--write_dev_predictions
"""

## Social IQA

In [None]:
"""
--model_name_or_path bert-base-uncased  
--max_input_seq_length 100   
--TRAIN_FILE /content/drive/MyDrive/NLP/data/socialiqa/segments/easy/easy.csv
--output_dir /content/drive/MyDrive/NLP/data/socialiqa/segments/easy
--DEV_FILE /content/drive/MyDrive/NLP/data/socialiqa/dev.csv
--train_batch_size 16
--eval_batch_size 16
--max_train_samples 2500 
--num_train_epochs 5 
--gradient_accumulation_steps 1 
--save_top_k 0
--learning_rate 5e-5
--write_dev_predictions
"""