In [1]:
import transformers
import os
import math
print(transformers.__version__)
import torch
model_checkpoint = "bert-base-cased"
#model_checkpoint = "allenai/scibert_scivocab_cased"
#model_checkpoint = "roberta-large"
batch_size = 16
from transformers import AutoConfig,AutoModel
import transformers
from transformers import AutoModelForSequenceClassification,AutoTokenizer,DataCollatorWithPadding,\
                                        TrainingArguments, Trainer,default_data_collator,AdamW,\
                                        BertModelWithHeads,PfeifferConfig,PfeifferInvConfig
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True)
from torch.utils.data import DataLoader,RandomSampler
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss
from torch.optim.lr_scheduler import ExponentialLR
from transformers.models.bert.modeling_bert import BertPooler
#tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint,add_prefix_space=True)
import transformers.adapters.composition as ac

4.11.3


In [2]:
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase

In [3]:
from collections import Counter
fct = CrossEntropyLoss()

In [4]:
from datasets import load_dataset,load_metric
import datasets
import numpy as np
import random

In [5]:
f1_metric = load_metric("f1")
pr_metric = load_metric('precision')
re_metric = load_metric('recall')

In [6]:
class PretrainedSequenceModel(torch.nn.Module):
    def __init__(self,labels):
        super().__init__()
        self.num_labels = labels
        self.base_model = AutoModel.from_pretrained(model_checkpoint,output_hidden_states=False,add_pooling_layer=False)
        #Pretrained Using MLM and saved 
        self.base_model.load_state_dict(torch.load("mlm_books/books_amazon.bin"))
        self.dropout = torch.nn.Dropout(self.base_model.config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(self.base_model.config.hidden_size,self.num_labels)
        self._init_weights(self.classifier)
        self.pooler = BertPooler(self.base_model.config)
        self._init_weights(self.pooler)
        
    def _init_weights(self, modules):
        """Initialize the weights"""
        for module in modules.modules():
                if isinstance(module, torch.nn.Linear):
                    module.weight.data.normal_(mean=0.0, std=self.base_model.config.initializer_range)
                    if module.bias is not None:
                        module.bias.data.zero_()
                elif isinstance(module, torch.nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
        

    def forward(self,data):
        out = self.base_model(input_ids=data['input_ids'], \
                               attention_mask=data['attention_mask'])
        out = self.pooler(out.last_hidden_state)
        clf_out = self.classifier(self.dropout(out))
        return clf_out


In [7]:
class PretrainedAdapterSequenceModel(torch.nn.Module):
    def __init__(self,labels):
        super().__init__()
        self.num_labels = labels
        self.task_config = PfeifferConfig()
        self.base_model = BertModelWithHeads.from_pretrained(model_checkpoint)
        #Pretrained Using MLM and saved 
        self.base_model.load_adapter("mlm_electronics/checkpoint-22071/mlm/",set_active=False,overwrite_ok=True,with_head=False)
        self.base_model.add_adapter("sentiment",set_active=False,overwrite_ok=True,config=self.task_config)
        self.base_model.add_classification_head("sentiment",num_labels=self.num_labels,overwrite_ok=True)
        #self.base_model.delete_head("mlm")
        self.base_model.set_active_adapters(ac.Stack("mlm","sentiment"))
        self.base_model.train_adapter(['sentiment'])


    def forward(self,data):
        clf_out = self.base_model(input_ids=data['input_ids'], \
                               attention_mask=data['attention_mask'])
        return clf_out

In [8]:
def fix_all_seeds(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def preprocess_function(examples):
        result = tokenizer(examples['text'], padding="max_length", max_length=512, truncation=True)
        return result

def compute_metrics(p):
    f1_metric = load_metric("f1")
    pr_metric = load_metric('precision')
    re_metric = load_metric('recall')
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    results1 = f1_metric.compute(predictions=predictions, references=labels,average="weighted")
    results2 = pr_metric.compute(predictions=predictions, references=labels,average="weighted")
    results3 = re_metric.compute(predictions=predictions, references=labels,average="weighted")
    return {
        "precision": results2["precision"],
        "recall": results3["recall"],
        "f1": results1["f1"],
    }

In [9]:
np.random.randint(0,100000)

63529

In [10]:
def baseline1():   
            
            seed = np.random.randint(0,1000000)
            fix_all_seeds(seed)
            dataset = load_dataset('csv',delimiter="\t",data_files='electronics/review_labels.csv')
            dataset = datasets.concatenate_datasets([dataset['train']])
            dataset_src = dataset.train_test_split(0.2,shuffle=False)
            
            dataset = load_dataset('csv',delimiter="\t",data_files='books/review_labels.csv')
            dataset = datasets.concatenate_datasets([dataset['train']])
            dataset_trg = dataset.train_test_split(0.2,shuffle=False)
            
            processed_datasets_src = dataset_src.map(preprocess_function,batched=True,\
                                      desc="Running tokenizer on dataset",)

            processed_datasets_trg = dataset_trg.map(preprocess_function,batched=True,\
                                      desc="Running tokenizer on dataset",)
            
            processed_datasets_src.remove_columns_(["text"])
            processed_datasets_trg.remove_columns_(["text"])
            
            config = AutoConfig.from_pretrained(model_checkpoint,num_labels=2,)
            model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,config=config,)
            
            args = TrainingArguments(
                "sanity-chunk",
                evaluation_strategy = "epoch",
                learning_rate=5e-5,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=5,
                weight_decay=0.01,
                save_strategy="epoch",
                logging_steps=100,
                overwrite_output_dir=True,
                load_best_model_at_end=True,
                metric_for_best_model = "eval_f1",
                seed = seed,
            )
            
            trainer = Trainer(
                    model,
                    args,
                    train_dataset=processed_datasets_src['train'],
                    eval_dataset= processed_datasets_src['test'],
                    data_collator=default_data_collator,
                    tokenizer=tokenizer,
                    compute_metrics=compute_metrics
                )
            
            trainer.train(resume_from_checkpoint=None)
            #p = trainer.predict(processed_datasets_src['test'])
            p = trainer.predict(processed_datasets_trg['test'])
            
            y_hat = np.argmax(p.predictions,1)
            y = p.label_ids
            out = f1_metric.compute(predictions=y_hat,references=y)
            return out['f1']

In [11]:
def run_train(final_train_loader,final_eval_loader):
            model = PretrainedAdapterSequenceModel(2)
            model.cuda()
            no_decay = ["bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                    {
                            "params": [p for n, p in model.named_parameters() \
                                       if not any(nd in n for nd in no_decay)],
                            "weight_decay": 1e-2,
                    },
                    {
                        "params": [p for n, p in model.named_parameters() \
                                   if any(nd in n for nd in no_decay)],
                        "weight_decay": 0.0,
                    },
                        ]
            optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
            fct_loss = CrossEntropyLoss()
            scheduler = ExponentialLR(optimizer=optimizer,gamma=0.99,last_epoch=-1,verbose=True)
            
            best_loss = 1e5
            best_f1 = -1
            for epoch in range(10):
                print(f'EPOCH NO: {epoch}')
                model.eval()
                val_loss = 0.0
                token_predictions_store = []
                token_gold_store = []
                for step, batch in enumerate(final_eval_loader):
                    with torch.no_grad():
                        data = {'input_ids':batch['input_ids'].cuda(),\
                               'attention_mask':batch['attention_mask'].cuda(),\
                               'labels':batch['labels'].cuda()}
                        out = model(data)
                        token_predictions_store.append(out)
                        token_gold_store.append(batch['labels'])
                        loss = fct(out.view(-1,model.num_labels),\
                                                   data['labels'].view(-1))
                        val_loss = val_loss + loss.item()

                predictions = torch.vstack(token_predictions_store)
                references = torch.hstack(token_gold_store)
                predictions = torch.argmax(predictions,dim=-1)
                print(predictions.shape,references.shape)
                y_pred = predictions.detach().cpu().clone().numpy()
                y_true = references.detach().cpu().clone().numpy()
                print(y_pred.shape,y_true.shape)
                eval_f1 = f1_metric.compute(predictions=y_pred, references=y_true)
                print('-'*100)
                print(eval_f1)
                print(f'Epoch {epoch} val loss {val_loss/len(final_eval_loader)}')
                if val_loss/len(final_eval_loader) < best_loss:
                    best_loss = val_loss/len(final_eval_loader)
                    best_f1 = eval_f1['f1']
                    torch.save(model.state_dict(),"saved_model/adapter_pretrained_amazon.bin")
                print('-'*100)
        
                model.train()
                epoch_loss = 0.0
                for step, batch in enumerate(final_train_loader):
                    data = {'input_ids':batch['input_ids'].cuda(),\
                               'attention_mask':batch['attention_mask'].cuda(),\
                               'labels':batch['labels'].cuda()}
                    optimizer.zero_grad()
                    out = model(data)
                    loss = fct(out.view(-1,model.num_labels),\
                                               data['labels'].view(-1))
                    epoch_loss = epoch_loss + loss.item()
                    loss.backward()
                    optimizer.step()
                scheduler.step()
                print(f'Epoch {epoch} training loss {epoch_loss/len(final_train_loader)}')
                print('**************************************************************************')
            print(f'Best F1 score{best_f1},{best_loss}')

In [12]:
def run_test(data):
    
        model = PretrainedAdapterSequenceModel(2,False)
        model.cuda()
        model.load_state_dict(torch.load("Test/adapter_pretrained_amazon.bin"))
        model.eval()
        token_predictions_store = []
        token_gold_store = []
        for step, batch in enumerate(data):
                    with torch.no_grad():
                        data = {'input_ids':batch['input_ids'].cuda(),\
                               'attention_mask':batch['attention_mask'].cuda(),\
                               'labels':batch['labels'].cuda()}
                        out = model(data)
                        token_predictions_store.append(out)
                        token_gold_store.append(data['labels'])
                        loss = fct(out.view(-1,model.num_labels),\
                                                   data['labels'].view(-1))
                        loss = loss + loss.item()

        predictions = torch.vstack(token_predictions_store)
        references = torch.hstack(token_gold_store)
        predictions = torch.argmax(predictions,dim=-1)
        print(predictions.shape,references.shape)
        y_pred = predictions.detach().cpu().clone().numpy()
        y_true = references.detach().cpu().clone().numpy()
        test_f1 = f1_metric.compute(predictions=y_pred, references=y_true)
        print(f'Test F1 score {test_f1}')
        return test_f1['f1']

In [13]:
def run_train_adapter(final_train_loader,final_eval_loader):
            model = PretrainedAdapterSequenceModel(2)
            model.cuda()
            no_decay = ["bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                    {
                            "params": [p for n, p in model.named_parameters() \
                                       if not any(nd in n for nd in no_decay)],
                            "weight_decay": 1e-2,
                    },
                    {
                        "params": [p for n, p in model.named_parameters() \
                                   if any(nd in n for nd in no_decay)],
                        "weight_decay": 0.0,
                    },
                        ]
            optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
            fct_loss = CrossEntropyLoss()
            scheduler = ExponentialLR(optimizer=optimizer,gamma=0.99,last_epoch=-1,verbose=True)
            
            best_loss = 1e5
            best_f1 = -1
            for epoch in range(10):
                print(f'EPOCH NO: {epoch}')
                model.eval()
                val_loss = 0.0
                token_predictions_store = []
                token_gold_store = []
                for step, batch in enumerate(final_eval_loader):
                    with torch.no_grad():
                        data = {'input_ids':batch['input_ids'].cuda(),\
                               'attention_mask':batch['attention_mask'].cuda(),\
                               'labels':batch['labels'].cuda()}
                        out = model(data)
                        out_preds = torch.argmax(out.logits,dim=1)
                        token_predictions_store.append(out_preds)
                        token_gold_store.append(data['labels'])
                        loss = fct(out.logits,data['labels'])
                        val_loss = val_loss + loss.item()

                predictions = torch.hstack(token_predictions_store)
                references = torch.hstack(token_gold_store)
                #predictions = torch.argmax(predictions,dim=-1)
                print(predictions.shape,references.shape)
                y_pred = predictions.detach().cpu().clone().numpy()
                y_true = references.detach().cpu().clone().numpy()
                print(y_pred.shape,y_true.shape)
                eval_f1 = f1_metric.compute(predictions=y_pred, references=y_true)
                print('-'*100)
                print(eval_f1)
                print(f'Epoch {epoch} val loss {val_loss/len(final_eval_loader)}')
                if best_f1 < eval_f1['f1']:
                    best_loss = val_loss/len(final_eval_loader)
                    best_f1 = eval_f1['f1']
                    torch.save(model.state_dict(),"Test/adapter_pretrained_amazon.bin")
                print('-'*100)
        
                model.train()
                epoch_loss = 0.0
                for step, batch in enumerate(final_train_loader):
                    data = {'input_ids':batch['input_ids'].cuda(),\
                               'attention_mask':batch['attention_mask'].cuda(),\
                               'labels':batch['labels'].cuda()}
                    optimizer.zero_grad()
                    out = model(data)
                    loss = fct(out.logits,data['labels'])
                    epoch_loss = epoch_loss + loss.item()
                    loss.backward()
                    optimizer.step()
                scheduler.step()
                print(f'Epoch {epoch} training loss {epoch_loss/len(final_train_loader)}')
                print('**************************************************************************')
            print(f'Best F1 score{best_f1},{best_loss}')
            return model

def run_test_adapter(data):
    
        model = PretrainedAdapterSequenceModel(2)
        model.cuda()
        model.load_state_dict(torch.load("Test/adapter_pretrained_amazon.bin"))
        model.eval()
        token_predictions_store = []
        token_gold_store = []
        for step, batch in enumerate(data):
                    with torch.no_grad():
                        data = {'input_ids':batch['input_ids'].cuda(),\
                               'attention_mask':batch['attention_mask'].cuda(),\
                               'labels':batch['labels'].cuda()}
                        out = model(data)
                        out_preds = torch.argmax(out.logits,dim=1)
                        token_predictions_store.append(out_preds)
                        token_gold_store.append(data['labels'])
                        loss = fct(out.logits,data['labels'])
                        loss = loss + loss.item()

        predictions = torch.hstack(token_predictions_store)
        references = torch.hstack(token_gold_store)
        print(predictions.shape,references.shape)
        y_pred = predictions.detach().cpu().clone().numpy()
        y_true = references.detach().cpu().clone().numpy()
        test_f1 = f1_metric.compute(predictions=y_pred, references=y_true)
        print(f'Test F1 score {test_f1}')
        return test_f1['f1']

In [14]:
def baseline2():
        seed = np.random.randint(0,1000000)
        fix_all_seeds(seed)
        dataset = load_dataset('csv',delimiter="\t",data_files='books/review_labels.csv')
        dataset = datasets.concatenate_datasets([dataset['train']])
        dataset_src = dataset.train_test_split(0.2,shuffle=False)

        dataset = load_dataset('csv',delimiter="\t",data_files='electronics/review_labels.csv')
        dataset = datasets.concatenate_datasets([dataset['train']])
        dataset_trg = dataset.train_test_split(0.2,shuffle=False)

        processed_datasets_src = dataset_src.map(preprocess_function,batched=True,\
                                  desc="Running tokenizer on dataset",)

        processed_datasets_trg = dataset_trg.map(preprocess_function,batched=True,\
                                  desc="Running tokenizer on dataset",)

        processed_datasets_src.remove_columns_(["text"])
        processed_datasets_trg.remove_columns_(["text"])


        train_dataloader_src =DataLoader(processed_datasets_src['train'],\
                                                     collate_fn=default_data_collator,\
                                                     batch_size =16,drop_last=True)
        eval_dataloader_src = DataLoader(processed_datasets_src['test'],\
                                         collate_fn=default_data_collator,\
                                         batch_size = 16,drop_last=True)
        test_dataloader_tgt = DataLoader(processed_datasets_trg['test'],\
                                         collate_fn=default_data_collator,\
                                         batch_size = 16,drop_last=True)
        m = run_train_adapter(train_dataloader_src,eval_dataloader_src)
        return run_test_adapter(test_dataloader_tgt)

In [15]:
#m = baseline2()

In [16]:
#m.base_model.save_all_adapters("Test/")

In [17]:
#base_model = BertModelWithHeads.from_pretrained(model_checkpoint)

In [18]:
#lang_config = PfeifferInvConfig(inv_adapter='nice')
#task_config = PfeifferConfig()

In [19]:
#base_model.load_adapter("Test/mlm/",set_active=False,overwrite_ok=True,with_head=False)

In [20]:
#base_model.load_adapter("Test/sentiment/",set_active=False,overwrite_ok=True,with_head=False)

In [21]:
#base_model.load_head("Test/sentiment/sentiment/")

In [22]:
#m.base_model.save_all_heads("Test/sentiment/")

In [24]:
output = []
for i in range(5):
            output.append(baseline2())

print(np.mean(output),np.std(output))

Using custom data configuration default-34360c82fb13fe72
Reusing dataset csv (/ukp-storage-1/sarkar/.cache/huggingface/datasets/csv/default-34360c82fb13fe72/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-6ce991164e62b0c8
Reusing dataset csv (/ukp-storage-1/sarkar/.cache/huggingface/datasets/csv/default-6ce991164e62b0c8/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Adjusting learning rate of group 0 to 1.0000e-04.
Adjusting learning rate of group 1 to 1.0000e-04.
EPOCH NO: 0
torch.Size([400]) torch.Size([400])
(400,) (400,)
----------------------------------------------------------------------------------------------------
{'f1': 0.0}
Epoch 0 val loss 0.7054719877243042
----------------------------------------------------------------------------------------------------
Adjusting learning rate of group 0 to 9.9000e-05.
Adjusting learning rate of group 1 to 9.9000e-05.
Epoch 0 training loss 0.4834526439756155
**************************************************************************
EPOCH NO: 1
torch.Size([400]) torch.Size([400])
(400,) (400,)
----------------------------------------------------------------------------------------------------
{'f1': 0.8781725888324872}
Epoch 1 val loss 0.31869212925434115
----------------------------------------------------------------------------------------------------
Adjusting learning rate of group 0 to 9.8010

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([400]) torch.Size([400])
Test F1 score {'f1': 0.8804347826086957}


Using custom data configuration default-34360c82fb13fe72
Reusing dataset csv (/ukp-storage-1/sarkar/.cache/huggingface/datasets/csv/default-34360c82fb13fe72/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-6ce991164e62b0c8
Reusing dataset csv (/ukp-storage-1/sarkar/.cache/huggingface/datasets/csv/default-6ce991164e62b0c8/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Adjusting learning rate of group 0 to 1.0000e-04.
Adjusting learning rate of group 1 to 1.0000e-04.
EPOCH NO: 0
torch.Size([400]) torch.Size([400])
(400,) (400,)
----------------------------------------------------------------------------------------------------
{'f1': 0.6885245901639345}
Epoch 0 val loss 0.6939820575714112
----------------------------------------------------------------------------------------------------
Adjusting learning rate of group 0 to 9.9000e-05.
Adjusting learning rate of group 1 to 9.9000e-05.
Epoch 0 training loss 0.4965819142758846
**************************************************************************
EPOCH NO: 1
torch.Size([400]) torch.Size([400])
(400,) (400,)
----------------------------------------------------------------------------------------------------
{'f1': 0.8526315789473684}
Epoch 1 val loss 0.374863141477108
----------------------------------------------------------------------------------------------------
Adjusting learning rate of grou

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([400]) torch.Size([400])
Test F1 score {'f1': 0.8668555240793202}


Using custom data configuration default-34360c82fb13fe72
Reusing dataset csv (/ukp-storage-1/sarkar/.cache/huggingface/datasets/csv/default-34360c82fb13fe72/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-6ce991164e62b0c8
Reusing dataset csv (/ukp-storage-1/sarkar/.cache/huggingface/datasets/csv/default-6ce991164e62b0c8/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Adjusting learning rate of group 0 to 1.0000e-04.
Adjusting learning rate of group 1 to 1.0000e-04.
EPOCH NO: 0
torch.Size([400]) torch.Size([400])
(400,) (400,)
----------------------------------------------------------------------------------------------------
{'f1': 0.6885245901639345}
Epoch 0 val loss 0.7037584352493286
----------------------------------------------------------------------------------------------------
Adjusting learning rate of group 0 to 9.9000e-05.
Adjusting learning rate of group 1 to 9.9000e-05.
Epoch 0 training loss 0.49502692818641664
**************************************************************************
EPOCH NO: 1
torch.Size([400]) torch.Size([400])
(400,) (400,)
----------------------------------------------------------------------------------------------------
{'f1': 0.8737373737373737}
Epoch 1 val loss 0.3176766559481621
----------------------------------------------------------------------------------------------------
Adjusting learning rate of gr

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([400]) torch.Size([400])
Test F1 score {'f1': 0.8538681948424068}


Using custom data configuration default-34360c82fb13fe72
Reusing dataset csv (/ukp-storage-1/sarkar/.cache/huggingface/datasets/csv/default-34360c82fb13fe72/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-6ce991164e62b0c8
Reusing dataset csv (/ukp-storage-1/sarkar/.cache/huggingface/datasets/csv/default-6ce991164e62b0c8/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Adjusting learning rate of group 0 to 1.0000e-04.
Adjusting learning rate of group 1 to 1.0000e-04.
EPOCH NO: 0
torch.Size([400]) torch.Size([400])
(400,) (400,)
----------------------------------------------------------------------------------------------------
{'f1': 0.046511627906976744}
Epoch 0 val loss 0.6933588981628418
----------------------------------------------------------------------------------------------------
Adjusting learning rate of group 0 to 9.9000e-05.
Adjusting learning rate of group 1 to 9.9000e-05.
Epoch 0 training loss 0.4717094188928604
**************************************************************************
EPOCH NO: 1
torch.Size([400]) torch.Size([400])
(400,) (400,)
----------------------------------------------------------------------------------------------------
{'f1': 0.8608247422680413}
Epoch 1 val loss 0.3308543482422829
----------------------------------------------------------------------------------------------------
Adjusting learning rate of g

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([400]) torch.Size([400])
Test F1 score {'f1': 0.8427299703264095}


Using custom data configuration default-34360c82fb13fe72
Reusing dataset csv (/ukp-storage-1/sarkar/.cache/huggingface/datasets/csv/default-34360c82fb13fe72/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-6ce991164e62b0c8
Reusing dataset csv (/ukp-storage-1/sarkar/.cache/huggingface/datasets/csv/default-6ce991164e62b0c8/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Adjusting learning rate of group 0 to 1.0000e-04.
Adjusting learning rate of group 1 to 1.0000e-04.
EPOCH NO: 0
torch.Size([400]) torch.Size([400])
(400,) (400,)
----------------------------------------------------------------------------------------------------
{'f1': 0.6885245901639345}
Epoch 0 val loss 0.7010597085952759
----------------------------------------------------------------------------------------------------
Adjusting learning rate of group 0 to 9.9000e-05.
Adjusting learning rate of group 1 to 9.9000e-05.
Epoch 0 training loss 0.49464855559170245
**************************************************************************
EPOCH NO: 1
torch.Size([400]) torch.Size([400])
(400,) (400,)
----------------------------------------------------------------------------------------------------
{'f1': 0.8737373737373737}
Epoch 1 val loss 0.32422421902418136
----------------------------------------------------------------------------------------------------
Adjusting learning rate of g

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([400]) torch.Size([400])
Test F1 score {'f1': 0.8603351955307262}
0.8608447334775116 0.012625181130485587


In [None]:
0.8636363636363636

In [None]:
((0.926-0.82786)/0.926)*100