# Load Valid Data
For validation data, we will use the 200 train samples. We load it from my 60k dataset [here][1].

[1]: https://www.kaggle.com/datasets/cdeotte/60k-data-with-context-v2

In [23]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

from typing import Optional, Union
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoTokenizer, AutoModelForMultipleChoice, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

VER=12
LOAD_DATASET_FROM_DISK = True
TRAIN_MODEL = True
USE_CONTEXT = True

# TRAIN WITH SUBSET OF MMLU
# NUM_MMLU_TRAIN_SAMPLES = 2_048
# PARAMETER EFFICIENT FINE TUNING
# PEFT REQUIRES 1XP100 GPU NOT 2XT4
USE_PEFT = True
# NUMBER OF LAYERS TO FREEZE 
# DEBERTA LARGE HAS TOTAL OF 24 LAYERS
FREEZE_LAYERS = 16
# BOOLEAN TO FREEZE EMBEDDINGS
FREEZE_EMBEDDINGS = True
# LENGTH OF CONTEXT PLUS QUESTION ANSWER
MAX_INPUT = 256
# HUGGING FACE MODEL
MODEL = 'microsoft/deberta-v3-large'

# MODEL = 'checkpoints_6/checkpoint-1200/'

In [24]:
df_valid = pd.read_csv('dataset/train_with_context2.csv')
if not USE_CONTEXT:
    df_valid['context'] = ''
print('Validation data size:', df_valid.shape )
df_valid.head()

Validation data size: (200, 8)


Unnamed: 0,prompt,context,A,B,C,D,E,answer
0,Which of the following statements accurately d...,The presence of a clustered thick disk-like co...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,Which of the following is an accurate definiti...,Many of these systems evolve in a self-similar...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,Which of the following statements accurately d...,It is possible that this usage is related with...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,What is the significance of regularization in ...,Renormalization is distinct from regularizatio...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,Which of the following statements accurately d...,Several qualitative observations can be made o...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


In [25]:
df_train = pd.concat([
    pd.read_csv('dataset/train_cdeotte.csv'), 
    pd.read_csv('dataset/6000_train_examples_with_context.csv', 
                usecols=['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'])],
                     ignore_index=True,
)                     

if not USE_CONTEXT:
    df_train['context'] = ''

df_train.dropna(inplace=True)
df_train.shape

(13108, 8)

# Data Loader
Code is from Radek's notebook [here][1] with modifications to the tokenization process.

[1]: https://www.kaggle.com/code/radek1/new-dataset-deberta-v3-large-training

In [26]:
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
index_to_option = {v: k for k,v in option_to_index.items()}

def preprocess(example):
    assert example['context'] is not None, f"Context is None for example {example}"
    assert example['prompt'] is not None, f"Prompt is None for example {example}"
    
    first_sentence = ["[CLS] " + example['context']] * 5
    
    for option in 'ABCDE':
        assert example[option] is not None, f"Option {option} is None for example {example}"
        
    second_sentences = [" #### " + example['prompt'] + " [SEP] " + example[option] + " [SEP]" for option in 'ABCDE']
    
    tokenized_example = tokenizer(first_sentence, second_sentences, truncation='only_first', 
                                  max_length=MAX_INPUT, add_special_tokens=False)
    tokenized_example['label'] = option_to_index[example['answer']]
    
    return tokenized_example

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [27]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
dataset_valid = Dataset.from_pandas(df_valid)
dataset = Dataset.from_pandas(df_train)
#dataset = dataset.remove_columns(["__index_level_0__"])
dataset

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Dataset({
    features: ['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer', '__index_level_0__'],
    num_rows: 13108
})

In [28]:
dataset_valid_path = 'dataset/tokenized_dataset_valid'
dataset_path = 'dataset/tokenized_dataset'

if LOAD_DATASET_FROM_DISK:
    tokenized_dataset_valid = Dataset.from_file(dataset_valid_path + '/data-00000-of-00001.arrow')
    tokenized_dataset = Dataset.from_file(dataset_path + '/data-00000-of-00001.arrow')
else:
    tokenized_dataset_valid = dataset_valid.map(preprocess, remove_columns=df_valid.columns.tolist())
    tokenized_dataset = dataset.map(preprocess, remove_columns=df_train.columns.tolist())
    
    # Save the tokenized datasets
    tokenized_dataset_valid.save_to_disk(dataset_valid_path)
    tokenized_dataset.save_to_disk(dataset_path)
tokenized_dataset

Dataset({
    features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 13108
})

# Build Model
We will use a Hugging Face AutoModelForMultipleChoice. For the list of possible models, see Hugging Face's repository [here][1]. We can optionally use PEFT to accelerate training and use less memory. However i have noticed that validation accuracy is less. (Note that PEFT requires us to use 1xP100 not 2xT4 GPU. I'm not sure why). We can also optionally freeze layers. This also accelerates training and uses less memory. However validation accuracy may become less.

[1]: https://huggingface.co/models

In [29]:
model = AutoModelForMultipleChoice.from_pretrained(MODEL)

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# # NOTE PEFT REQUIRES US TO USE 1XP100 NOT 2XT4. I'M NOT SURE WHY.
# if USE_PEFT:
#     !pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl

In [31]:
if USE_PEFT:
    print('We are using PEFT.')
    peft_config = LoraConfig(
        r=8, lora_alpha=4, task_type=TaskType.SEQ_CLS, lora_dropout=0.1, 
        bias="none", inference_mode=False, 
        target_modules=["query_proj", "value_proj"],
        modules_to_save=['classifier','pooler'],
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

We are using PEFT.
trainable params: 2887682 || all params: 436899842 || trainable%: 0.6609482820549979


In [32]:
if FREEZE_EMBEDDINGS:
    print('Freezing embeddings.')
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False
if FREEZE_LAYERS>0:
    print(f'Freezing {FREEZE_LAYERS} layers.')
    for layer in model.deberta.encoder.layer[:FREEZE_LAYERS]:
        for param in layer.parameters():
            param.requires_grad = False

Freezing embeddings.
Freezing 16 layers.


# MAP@3 Metric
The competition metric is MAP@3 therefore we will make a custom code to add to Hugging Face's trainer. Discussion [here][1]

[1]: https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/435602

In [33]:
def map_at_3(predictions, labels):
    map_sum = 0
    pred = np.argsort(-1*np.array(predictions),axis=1)[:,:3]
    for x,y in zip(pred,labels):
        z = [1/i if y==j else 0 for i,j in zip([1,2,3],x)]
        map_sum += np.sum(z)
    return map_sum / len(predictions)

def compute_metrics(p):
    predictions = p.predictions.tolist()
    labels = p.label_ids.tolist()
    return {"map@3": map_at_3(predictions, labels)}

# Train and Save 
We will now train and save our model using Hugging Face's easy to use trainer. By adjusting the parameters in this notebook, we can achieve `CV MAP@3 = 0.915+` and corresponding single model `LB MAP@3 = 0.830+` wow!

In we run this notebook outside of Kaggle then we can train longer and with more RAM. If we run this notebook on Kaggle, then we need to use tricks to train models efficiently. Here are some ideas:
* use fp16 (this speeds up T4 not P100)
* use gradient_accumlation_steps (this simulates larger batch sizes)
* use gradient_checkpointing (this uses disk to save RAM)
* use 2xT4 instead of 1xP100 (this doubles GPUs)
* freeze model embeddings (this reduces weights to train)
* freeze some model layers (this reduces weights to train)
* use PEFT (this reduces weights to train)
* increase LR and decrease epochs (this reduces work)
* use smaller models (this reduces weights to train)

In [34]:
per_device_train_batch_size = 1
gradient_accumulation_steps = int(8 / per_device_train_batch_size)

training_args = TrainingArguments(
    # resume_from_checkpoint="checkpoints_6/checkpoint-1200/",
    warmup_ratio=0.1, 
    learning_rate=2e-5,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    report_to='none',
    output_dir = f'./checkpoints_{VER}',
    overwrite_output_dir=True,
    fp16=True,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_steps=75,
    evaluation_strategy='steps',
    eval_steps=75,
    save_strategy="steps",
    save_steps=75,
    load_best_model_at_end=False,
    metric_for_best_model='map@3',
    lr_scheduler_type='cosine_with_restarts', # ['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup', 'inverse_sqrt', 'reduce_lr_on_plateau']
    weight_decay=0.01,
    save_total_limit=2,
)

In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_valid,
    compute_metrics = compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

if TRAIN_MODEL:
    trainer.train()
    trainer.save_model(f'model_v{VER}')

  self.comm = Comm(**args)


  0%|          | 0/3276 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.6178, 'learning_rate': 4.573170731707318e-06, 'epoch': 0.05}


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6093310117721558, 'eval_map@3': 0.43166666666666664, 'eval_runtime': 21.6174, 'eval_samples_per_second': 9.252, 'eval_steps_per_second': 4.626, 'epoch': 0.05}
{'loss': 1.618, 'learning_rate': 9.146341463414635e-06, 'epoch': 0.09}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.609350562095642, 'eval_map@3': 0.4741666666666666, 'eval_runtime': 21.5004, 'eval_samples_per_second': 9.302, 'eval_steps_per_second': 4.651, 'epoch': 0.09}
{'loss': 1.6121, 'learning_rate': 1.3719512195121953e-05, 'epoch': 0.14}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.609316349029541, 'eval_map@3': 0.4574999999999998, 'eval_runtime': 21.5036, 'eval_samples_per_second': 9.301, 'eval_steps_per_second': 4.65, 'epoch': 0.14}
{'loss': 1.6186, 'learning_rate': 1.829268292682927e-05, 'epoch': 0.18}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6092138290405273, 'eval_map@3': 0.5024999999999998, 'eval_runtime': 21.529, 'eval_samples_per_second': 9.29, 'eval_steps_per_second': 4.645, 'epoch': 0.18}
{'loss': 1.6098, 'learning_rate': 1.998745935961641e-05, 'epoch': 0.23}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6090478897094727, 'eval_map@3': 0.49833333333333313, 'eval_runtime': 21.5122, 'eval_samples_per_second': 9.297, 'eval_steps_per_second': 4.649, 'epoch': 0.23}
{'loss': 1.6096, 'learning_rate': 1.9915603851891577e-05, 'epoch': 0.27}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6090868711471558, 'eval_map@3': 0.5133333333333331, 'eval_runtime': 21.4987, 'eval_samples_per_second': 9.303, 'eval_steps_per_second': 4.651, 'epoch': 0.27}
{'loss': 1.6163, 'learning_rate': 1.9780440827456854e-05, 'epoch': 0.32}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6092822551727295, 'eval_map@3': 0.47833333333333317, 'eval_runtime': 21.487, 'eval_samples_per_second': 9.308, 'eval_steps_per_second': 4.654, 'epoch': 0.32}
{'loss': 1.611, 'learning_rate': 1.9582833252961143e-05, 'epoch': 0.37}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6092578172683716, 'eval_map@3': 0.5099999999999999, 'eval_runtime': 21.4827, 'eval_samples_per_second': 9.31, 'eval_steps_per_second': 4.655, 'epoch': 0.37}
{'loss': 1.6063, 'learning_rate': 1.932404278074668e-05, 'epoch': 0.41}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6094481945037842, 'eval_map@3': 0.49583333333333307, 'eval_runtime': 21.4667, 'eval_samples_per_second': 9.317, 'eval_steps_per_second': 4.658, 'epoch': 0.41}
{'loss': 1.6123, 'learning_rate': 1.9005721693658642e-05, 'epoch': 0.46}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6094141006469727, 'eval_map@3': 0.4966666666666664, 'eval_runtime': 21.4957, 'eval_samples_per_second': 9.304, 'eval_steps_per_second': 4.652, 'epoch': 0.46}
{'loss': 1.6096, 'learning_rate': 1.862990235582139e-05, 'epoch': 0.5}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6094141006469727, 'eval_map@3': 0.4858333333333331, 'eval_runtime': 21.4783, 'eval_samples_per_second': 9.312, 'eval_steps_per_second': 4.656, 'epoch': 0.5}
{'loss': 1.6148, 'learning_rate': 1.8198984236734246e-05, 'epoch': 0.55}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6090087890625, 'eval_map@3': 0.5233333333333332, 'eval_runtime': 22.4021, 'eval_samples_per_second': 8.928, 'eval_steps_per_second': 4.464, 'epoch': 0.55}
{'loss': 1.6075, 'learning_rate': 1.7715718591533285e-05, 'epoch': 0.6}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.608373999595642, 'eval_map@3': 0.5433333333333331, 'eval_runtime': 22.1045, 'eval_samples_per_second': 9.048, 'eval_steps_per_second': 4.524, 'epoch': 0.6}
{'loss': 1.6079, 'learning_rate': 1.718319089522999e-05, 'epoch': 0.64}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6074072122573853, 'eval_map@3': 0.5674999999999999, 'eval_runtime': 23.9778, 'eval_samples_per_second': 8.341, 'eval_steps_per_second': 4.171, 'epoch': 0.64}
{'loss': 1.609, 'learning_rate': 1.660480114307789e-05, 'epoch': 0.69}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6058349609375, 'eval_map@3': 0.5974999999999997, 'eval_runtime': 23.8827, 'eval_samples_per_second': 8.374, 'eval_steps_per_second': 4.187, 'epoch': 0.69}
{'loss': 1.6046, 'learning_rate': 1.5984242142842003e-05, 'epoch': 0.73}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.6027148962020874, 'eval_map@3': 0.6299999999999999, 'eval_runtime': 23.6119, 'eval_samples_per_second': 8.47, 'eval_steps_per_second': 4.235, 'epoch': 0.73}
{'loss': 1.6041, 'learning_rate': 1.5325475937567185e-05, 'epoch': 0.78}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.597314476966858, 'eval_map@3': 0.6683333333333334, 'eval_runtime': 25.2978, 'eval_samples_per_second': 7.906, 'eval_steps_per_second': 3.953, 'epoch': 0.78}
{'loss': 1.5965, 'learning_rate': 1.4642150015469674e-05, 'epoch': 0.82}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.586645483970642, 'eval_map@3': 0.7191666666666667, 'eval_runtime': 24.6824, 'eval_samples_per_second': 8.103, 'eval_steps_per_second': 4.051, 'epoch': 0.82}
{'loss': 1.5874, 'learning_rate': 1.3920168858753208e-05, 'epoch': 0.87}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.5698046684265137, 'eval_map@3': 0.7391666666666669, 'eval_runtime': 22.7698, 'eval_samples_per_second': 8.784, 'eval_steps_per_second': 4.392, 'epoch': 0.87}
{'loss': 1.5733, 'learning_rate': 1.3183263001517224e-05, 'epoch': 0.92}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.5391796827316284, 'eval_map@3': 0.7600000000000002, 'eval_runtime': 20.3826, 'eval_samples_per_second': 9.812, 'eval_steps_per_second': 4.906, 'epoch': 0.92}
{'loss': 1.5479, 'learning_rate': 1.2416231689203853e-05, 'epoch': 0.96}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.4852807521820068, 'eval_map@3': 0.7725000000000003, 'eval_runtime': 21.1386, 'eval_samples_per_second': 9.461, 'eval_steps_per_second': 4.731, 'epoch': 0.96}
{'loss': 1.4946, 'learning_rate': 1.1633773618185302e-05, 'epoch': 1.01}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.3719189167022705, 'eval_map@3': 0.7850000000000001, 'eval_runtime': 20.6646, 'eval_samples_per_second': 9.678, 'eval_steps_per_second': 4.839, 'epoch': 1.01}
{'loss': 1.4182, 'learning_rate': 1.0840884498066934e-05, 'epoch': 1.05}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.2560268640518188, 'eval_map@3': 0.7833333333333335, 'eval_runtime': 21.9426, 'eval_samples_per_second': 9.115, 'eval_steps_per_second': 4.557, 'epoch': 1.05}
{'loss': 1.3815, 'learning_rate': 1.0042626636900857e-05, 'epoch': 1.1}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.182978630065918, 'eval_map@3': 0.7833333333333334, 'eval_runtime': 21.6541, 'eval_samples_per_second': 9.236, 'eval_steps_per_second': 4.618, 'epoch': 1.1}
{'loss': 1.2987, 'learning_rate': 9.244096620194059e-06, 'epoch': 1.14}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.1382423639297485, 'eval_map@3': 0.7900000000000001, 'eval_runtime': 20.442, 'eval_samples_per_second': 9.784, 'eval_steps_per_second': 4.892, 'epoch': 1.14}
{'loss': 1.2893, 'learning_rate': 8.450392771067463e-06, 'epoch': 1.19}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.1178051233291626, 'eval_map@3': 0.7900000000000001, 'eval_runtime': 20.5437, 'eval_samples_per_second': 9.735, 'eval_steps_per_second': 4.868, 'epoch': 1.19}
{'loss': 1.25, 'learning_rate': 7.666582599320962e-06, 'epoch': 1.24}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.1105586290359497, 'eval_map@3': 0.7933333333333334, 'eval_runtime': 21.4155, 'eval_samples_per_second': 9.339, 'eval_steps_per_second': 4.67, 'epoch': 1.24}
{'loss': 1.2679, 'learning_rate': 6.897670447230263e-06, 'epoch': 1.28}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.1072546243667603, 'eval_map@3': 0.7900000000000001, 'eval_runtime': 20.5908, 'eval_samples_per_second': 9.713, 'eval_steps_per_second': 4.857, 'epoch': 1.28}
{'loss': 1.2289, 'learning_rate': 6.148565538645375e-06, 'epoch': 1.33}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.0986436605453491, 'eval_map@3': 0.7950000000000002, 'eval_runtime': 20.1884, 'eval_samples_per_second': 9.907, 'eval_steps_per_second': 4.953, 'epoch': 1.33}
{'loss': 1.2385, 'learning_rate': 5.4335287439452264e-06, 'epoch': 1.37}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.088010549545288, 'eval_map@3': 0.7925000000000001, 'eval_runtime': 20.8197, 'eval_samples_per_second': 9.606, 'eval_steps_per_second': 4.803, 'epoch': 1.37}
{'loss': 1.1922, 'learning_rate': 4.7378104194550485e-06, 'epoch': 1.42}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.0778840780258179, 'eval_map@3': 0.7933333333333334, 'eval_runtime': 20.4076, 'eval_samples_per_second': 9.8, 'eval_steps_per_second': 4.9, 'epoch': 1.42}
{'loss': 1.2686, 'learning_rate': 4.075689257555974e-06, 'epoch': 1.46}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.0785820484161377, 'eval_map@3': 0.7941666666666667, 'eval_runtime': 20.1303, 'eval_samples_per_second': 9.935, 'eval_steps_per_second': 4.968, 'epoch': 1.46}
{'loss': 1.2414, 'learning_rate': 3.4513926605471504e-06, 'epoch': 1.51}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.0782490968704224, 'eval_map@3': 0.7941666666666667, 'eval_runtime': 20.3466, 'eval_samples_per_second': 9.83, 'eval_steps_per_second': 4.915, 'epoch': 1.51}
{'loss': 1.255, 'learning_rate': 2.8689065346666047e-06, 'epoch': 1.56}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.075830101966858, 'eval_map@3': 0.7941666666666667, 'eval_runtime': 20.2452, 'eval_samples_per_second': 9.879, 'eval_steps_per_second': 4.939, 'epoch': 1.56}
{'loss': 1.2049, 'learning_rate': 2.331949841532636e-06, 'epoch': 1.6}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.0737513303756714, 'eval_map@3': 0.7950000000000002, 'eval_runtime': 20.5775, 'eval_samples_per_second': 9.719, 'eval_steps_per_second': 4.86, 'epoch': 1.6}
{'loss': 1.2483, 'learning_rate': 1.843950853929314e-06, 'epoch': 1.65}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.075360894203186, 'eval_map@3': 0.7925000000000001, 'eval_runtime': 20.3213, 'eval_samples_per_second': 9.842, 'eval_steps_per_second': 4.921, 'epoch': 1.65}
{'loss': 1.2472, 'learning_rate': 1.408025267534242e-06, 'epoch': 1.69}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.0764895677566528, 'eval_map@3': 0.7925000000000001, 'eval_runtime': 20.4076, 'eval_samples_per_second': 9.8, 'eval_steps_per_second': 4.9, 'epoch': 1.69}
{'loss': 1.2067, 'learning_rate': 1.0269563083371991e-06, 'epoch': 1.74}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.0754904747009277, 'eval_map@3': 0.7925000000000001, 'eval_runtime': 20.2049, 'eval_samples_per_second': 9.899, 'eval_steps_per_second': 4.949, 'epoch': 1.74}
{'loss': 1.2158, 'learning_rate': 7.031769627565944e-07, 'epoch': 1.79}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.0751270055770874, 'eval_map@3': 0.7925000000000001, 'eval_runtime': 20.4476, 'eval_samples_per_second': 9.781, 'eval_steps_per_second': 4.891, 'epoch': 1.79}
{'loss': 1.2777, 'learning_rate': 4.387544439079239e-07, 'epoch': 1.83}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.07462739944458, 'eval_map@3': 0.7925000000000001, 'eval_runtime': 21.135, 'eval_samples_per_second': 9.463, 'eval_steps_per_second': 4.731, 'epoch': 1.83}
{'loss': 1.2468, 'learning_rate': 2.3537699320144493e-07, 'epoch': 1.88}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.074399471282959, 'eval_map@3': 0.7925000000000001, 'eval_runtime': 21.6223, 'eval_samples_per_second': 9.25, 'eval_steps_per_second': 4.625, 'epoch': 1.88}
{'loss': 1.1263, 'learning_rate': 9.434310153608073e-08, 'epoch': 1.92}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.07358717918396, 'eval_map@3': 0.7925000000000001, 'eval_runtime': 22.1328, 'eval_samples_per_second': 9.036, 'eval_steps_per_second': 4.518, 'epoch': 1.92}
{'loss': 1.2382, 'learning_rate': 1.6553218908286207e-08, 'epoch': 1.97}


  self.comm = Comm(**args)


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 1.0734565258026123, 'eval_map@3': 0.7925000000000001, 'eval_runtime': 22.5172, 'eval_samples_per_second': 8.882, 'eval_steps_per_second': 4.441, 'epoch': 1.97}
{'train_runtime': 5886.6272, 'train_samples_per_second': 4.453, 'train_steps_per_second': 0.557, 'train_loss': 1.4269542542714921, 'epoch': 2.0}


# Verify Saved Model
During training, we see the MAP@3 validation score above. Let's load the saved model and compute it again here to verify that our model is saved correctly.

In [36]:
del model, trainer

In [37]:
if USE_PEFT:
    model = AutoModelForMultipleChoice.from_pretrained(MODEL)
    model = get_peft_model(model, peft_config)
    checkpoint = torch.load(f'model_v{VER}/adapter_model.bin')
    model.load_state_dict(checkpoint)
else:
    model = AutoModelForMultipleChoice.from_pretrained(f'model_v{VER}')
trainer = Trainer(model=model)

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Error(s) in loading state_dict for PeftModelForSequenceClassification:
	Missing key(s) in state_dict: "base_model.model.deberta.embeddings.word_embeddings.weight", "base_model.model.deberta.embeddings.LayerNorm.weight", "base_model.model.deberta.embeddings.LayerNorm.bias", "base_model.model.deberta.encoder.layer.0.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.0.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.0.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.0.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.0.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.0.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.0.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.0.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.0.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.0.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.0.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.0.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.0.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.0.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.0.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.0.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.0.output.dense.weight", "base_model.model.deberta.encoder.layer.0.output.dense.bias", "base_model.model.deberta.encoder.layer.0.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.0.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.1.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.1.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.1.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.1.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.1.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.1.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.1.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.1.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.1.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.1.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.1.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.1.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.1.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.1.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.1.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.1.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.1.output.dense.weight", "base_model.model.deberta.encoder.layer.1.output.dense.bias", "base_model.model.deberta.encoder.layer.1.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.1.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.2.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.2.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.2.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.2.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.2.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.2.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.2.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.2.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.2.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.2.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.2.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.2.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.2.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.2.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.2.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.2.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.2.output.dense.weight", "base_model.model.deberta.encoder.layer.2.output.dense.bias", "base_model.model.deberta.encoder.layer.2.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.2.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.3.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.3.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.3.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.3.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.3.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.3.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.3.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.3.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.3.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.3.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.3.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.3.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.3.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.3.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.3.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.3.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.3.output.dense.weight", "base_model.model.deberta.encoder.layer.3.output.dense.bias", "base_model.model.deberta.encoder.layer.3.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.3.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.4.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.4.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.4.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.4.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.4.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.4.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.4.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.4.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.4.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.4.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.4.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.4.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.4.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.4.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.4.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.4.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.4.output.dense.weight", "base_model.model.deberta.encoder.layer.4.output.dense.bias", "base_model.model.deberta.encoder.layer.4.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.4.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.5.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.5.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.5.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.5.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.5.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.5.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.5.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.5.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.5.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.5.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.5.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.5.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.5.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.5.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.5.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.5.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.5.output.dense.weight", "base_model.model.deberta.encoder.layer.5.output.dense.bias", "base_model.model.deberta.encoder.layer.5.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.5.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.6.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.6.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.6.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.6.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.6.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.6.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.6.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.6.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.6.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.6.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.6.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.6.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.6.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.6.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.6.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.6.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.6.output.dense.weight", "base_model.model.deberta.encoder.layer.6.output.dense.bias", "base_model.model.deberta.encoder.layer.6.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.6.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.7.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.7.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.7.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.7.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.7.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.7.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.7.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.7.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.7.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.7.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.7.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.7.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.7.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.7.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.7.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.7.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.7.output.dense.weight", "base_model.model.deberta.encoder.layer.7.output.dense.bias", "base_model.model.deberta.encoder.layer.7.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.7.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.8.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.8.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.8.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.8.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.8.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.8.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.8.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.8.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.8.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.8.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.8.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.8.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.8.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.8.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.8.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.8.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.8.output.dense.weight", "base_model.model.deberta.encoder.layer.8.output.dense.bias", "base_model.model.deberta.encoder.layer.8.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.8.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.9.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.9.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.9.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.9.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.9.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.9.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.9.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.9.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.9.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.9.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.9.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.9.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.9.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.9.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.9.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.9.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.9.output.dense.weight", "base_model.model.deberta.encoder.layer.9.output.dense.bias", "base_model.model.deberta.encoder.layer.9.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.9.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.10.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.10.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.10.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.10.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.10.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.10.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.10.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.10.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.10.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.10.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.10.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.10.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.10.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.10.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.10.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.10.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.10.output.dense.weight", "base_model.model.deberta.encoder.layer.10.output.dense.bias", "base_model.model.deberta.encoder.layer.10.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.10.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.11.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.11.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.11.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.11.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.11.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.11.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.11.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.11.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.11.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.11.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.11.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.11.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.11.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.11.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.11.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.11.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.11.output.dense.weight", "base_model.model.deberta.encoder.layer.11.output.dense.bias", "base_model.model.deberta.encoder.layer.11.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.11.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.12.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.12.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.12.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.12.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.12.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.12.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.12.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.12.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.12.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.12.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.12.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.12.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.12.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.12.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.12.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.12.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.12.output.dense.weight", "base_model.model.deberta.encoder.layer.12.output.dense.bias", "base_model.model.deberta.encoder.layer.12.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.12.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.13.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.13.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.13.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.13.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.13.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.13.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.13.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.13.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.13.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.13.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.13.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.13.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.13.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.13.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.13.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.13.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.13.output.dense.weight", "base_model.model.deberta.encoder.layer.13.output.dense.bias", "base_model.model.deberta.encoder.layer.13.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.13.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.14.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.14.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.14.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.14.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.14.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.14.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.14.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.14.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.14.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.14.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.14.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.14.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.14.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.14.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.14.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.14.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.14.output.dense.weight", "base_model.model.deberta.encoder.layer.14.output.dense.bias", "base_model.model.deberta.encoder.layer.14.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.14.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.15.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.15.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.15.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.15.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.15.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.15.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.15.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.15.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.15.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.15.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.15.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.15.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.15.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.15.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.15.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.15.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.15.output.dense.weight", "base_model.model.deberta.encoder.layer.15.output.dense.bias", "base_model.model.deberta.encoder.layer.15.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.15.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.16.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.16.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.16.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.16.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.16.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.16.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.16.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.16.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.16.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.16.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.16.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.16.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.16.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.16.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.16.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.16.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.16.output.dense.weight", "base_model.model.deberta.encoder.layer.16.output.dense.bias", "base_model.model.deberta.encoder.layer.16.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.16.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.17.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.17.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.17.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.17.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.17.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.17.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.17.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.17.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.17.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.17.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.17.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.17.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.17.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.17.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.17.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.17.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.17.output.dense.weight", "base_model.model.deberta.encoder.layer.17.output.dense.bias", "base_model.model.deberta.encoder.layer.17.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.17.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.18.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.18.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.18.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.18.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.18.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.18.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.18.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.18.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.18.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.18.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.18.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.18.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.18.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.18.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.18.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.18.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.18.output.dense.weight", "base_model.model.deberta.encoder.layer.18.output.dense.bias", "base_model.model.deberta.encoder.layer.18.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.18.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.19.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.19.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.19.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.19.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.19.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.19.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.19.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.19.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.19.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.19.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.19.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.19.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.19.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.19.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.19.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.19.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.19.output.dense.weight", "base_model.model.deberta.encoder.layer.19.output.dense.bias", "base_model.model.deberta.encoder.layer.19.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.19.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.20.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.20.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.20.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.20.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.20.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.20.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.20.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.20.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.20.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.20.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.20.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.20.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.20.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.20.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.20.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.20.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.20.output.dense.weight", "base_model.model.deberta.encoder.layer.20.output.dense.bias", "base_model.model.deberta.encoder.layer.20.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.20.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.21.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.21.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.21.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.21.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.21.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.21.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.21.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.21.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.21.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.21.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.21.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.21.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.21.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.21.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.21.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.21.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.21.output.dense.weight", "base_model.model.deberta.encoder.layer.21.output.dense.bias", "base_model.model.deberta.encoder.layer.21.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.21.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.22.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.22.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.22.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.22.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.22.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.22.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.22.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.22.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.22.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.22.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.22.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.22.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.22.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.22.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.22.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.22.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.22.output.dense.weight", "base_model.model.deberta.encoder.layer.22.output.dense.bias", "base_model.model.deberta.encoder.layer.22.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.22.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.23.attention.self.query_proj.weight", "base_model.model.deberta.encoder.layer.23.attention.self.query_proj.bias", "base_model.model.deberta.encoder.layer.23.attention.self.query_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.23.attention.self.query_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.23.attention.self.key_proj.weight", "base_model.model.deberta.encoder.layer.23.attention.self.key_proj.bias", "base_model.model.deberta.encoder.layer.23.attention.self.value_proj.weight", "base_model.model.deberta.encoder.layer.23.attention.self.value_proj.bias", "base_model.model.deberta.encoder.layer.23.attention.self.value_proj.lora_A.default.weight", "base_model.model.deberta.encoder.layer.23.attention.self.value_proj.lora_B.default.weight", "base_model.model.deberta.encoder.layer.23.attention.output.dense.weight", "base_model.model.deberta.encoder.layer.23.attention.output.dense.bias", "base_model.model.deberta.encoder.layer.23.attention.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.23.attention.output.LayerNorm.bias", "base_model.model.deberta.encoder.layer.23.intermediate.dense.weight", "base_model.model.deberta.encoder.layer.23.intermediate.dense.bias", "base_model.model.deberta.encoder.layer.23.output.dense.weight", "base_model.model.deberta.encoder.layer.23.output.dense.bias", "base_model.model.deberta.encoder.layer.23.output.LayerNorm.weight", "base_model.model.deberta.encoder.layer.23.output.LayerNorm.bias", "base_model.model.deberta.encoder.rel_embeddings.weight", "base_model.model.deberta.encoder.LayerNorm.weight", "base_model.model.deberta.encoder.LayerNorm.bias", "base_model.model.pooler.original_module.dense.weight", "base_model.model.pooler.original_module.dense.bias", "base_model.model.pooler.modules_to_save.default.dense.weight", "base_model.model.pooler.modules_to_save.default.dense.bias", "base_model.model.classifier.original_module.weight", "base_model.model.classifier.original_module.bias", "base_model.model.classifier.modules_to_save.default.weight", "base_model.model.classifier.modules_to_save.default.bias". 
	Unexpected key(s) in state_dict: "base_model.model.deberta.encoder.layer.0.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.0.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.0.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.0.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.1.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.1.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.1.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.1.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.2.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.2.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.2.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.2.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.3.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.3.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.3.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.3.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.4.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.4.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.4.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.4.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.5.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.5.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.5.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.5.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.6.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.6.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.6.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.6.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.7.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.7.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.7.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.7.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.8.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.8.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.8.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.8.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.9.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.9.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.9.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.9.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.10.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.10.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.10.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.10.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.11.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.11.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.11.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.11.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.12.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.12.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.12.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.12.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.13.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.13.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.13.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.13.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.14.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.14.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.14.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.14.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.15.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.15.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.15.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.15.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.16.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.16.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.16.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.16.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.17.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.17.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.17.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.17.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.18.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.18.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.18.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.18.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.19.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.19.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.19.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.19.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.20.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.20.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.20.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.20.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.21.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.21.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.21.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.21.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.22.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.22.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.22.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.22.attention.self.value_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.23.attention.self.query_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.23.attention.self.query_proj.lora_B.weight", "base_model.model.deberta.encoder.layer.23.attention.self.value_proj.lora_A.weight", "base_model.model.deberta.encoder.layer.23.attention.self.value_proj.lora_B.weight", "base_model.model.pooler.dense.weight", "base_model.model.pooler.dense.bias", "base_model.model.classifier.weight", "base_model.model.classifier.bias". 

In [None]:
test_df = pd.read_csv('dataset/train_with_context2.csv')
tokenized_test_dataset = Dataset.from_pandas(test_df).map(
        preprocess, remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E'])

test_predictions = trainer.predict(tokenized_test_dataset).predictions
predictions_as_ids = np.argsort(-test_predictions, 1)
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]
predictions_as_string = test_df['prediction'] = [
    ' '.join(row) for row in predictions_as_answer_letters[:, :3]
]

  self.comm = Comm(**args)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  0%|          | 0/25 [00:00<?, ?it/s]

# Compute Validation Score

In [None]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
import numpy as np
def precision_at_k(r, k):
    """Precision at k"""
    assert k <= len(r)
    assert k != 0
    return sum(int(x) for x in r[:k]) / k

def MAP_at_3(predictions, true_items):
    """Score is mean average precision at 3"""
    U = len(predictions)
    map_at_3 = 0.0
    for u in range(U):
        user_preds = predictions[u].split()
        user_true = true_items[u]
        user_results = [1 if item == user_true else 0 for item in user_preds]
        for k in range(min(len(user_preds), 3)):
            map_at_3 += precision_at_k(user_results, k+1) * user_results[k]
    return map_at_3 / U

In [None]:
m = MAP_at_3(test_df.prediction.values, test_df.answer.values)
print( 'CV MAP@3 =',m )

CV MAP@3 = 0.8533333333333334


In [None]:
# Save parameters, constants and MAP@3 score to config{SIMULATION}/ folder
import json
with open(f'config{VER}.json', 'w') as f:
    json.dump({
        'SIMULATION': VER,
        'LOAD_DATASET_FROM_DISK': LOAD_DATASET_FROM_DISK,
        'USE_PEFT': USE_PEFT,
        'FREEZE_LAYERS': FREEZE_LAYERS,
        'FREEZE_EMBEDDINGS': FREEZE_EMBEDDINGS,
        'MAX_INPUT': MAX_INPUT,
        'MODEL': MODEL,
        'MAP@3 test': m,
        'training args': training_args.to_dict(),
    }, f, indent=4)

# Analyse erros

In [None]:
import textwrap

def printer(text, width=150):
    text_lines = textwrap.wrap(text, width=width)
    for line in text_lines:
        print(f"\t{line}")
              
def print_row(row):
    to_print_part1 = row['context']
    to_print_part2 = 'Prompt:\n' + '\t' + row['prompt']
    to_print_part3 = 'Answers:\n'
    for col in 'ABCDE':
        to_print_part3 += f"\t- {row[col]}\n"
    to_print_part4 = f"True Answer: {row['answer']}\n"
    # remaining_width = MAX_INPUT - len(to_print_part2) - len(to_print_part3) - len(to_print_part4) - len('Prompt:Answers:True Answer: ')
    context = textwrap.shorten(to_print_part1, width=1000, placeholder="...")
    
    print("Context:")
    printer(context)
    print(to_print_part2)
    print(to_print_part3)
    print(to_print_part4)
    print()
    
_ = test_df[test_df['prediction'].str[0] != test_df['answer']].apply(print_row, axis=1)

Context:
	It is possible that this usage is related with the Greek name of the island of Sicily, Trinacria (Τρινακρία "having three headlands").Liddell and
	Scott’s Greek-English Lexicon (A Lexicon Abridged from), Oxford, 1944, p.27, Cassell's Latin Dictionary, Marchant, J.R.V, & Charles, Joseph F.,
	(Eds.), Revised Edition, 1928 The Sicilian triskeles is shown with the head of Medusa at the center.Matthews, Jeff (2005) Symbols of Naples The
	ancient symbol has been re-introduced in modern flags of Sicily since 1848. An early flag of Sicily, proposed in 1848, included the Sicilian triskeles
	or "Trinacria symbol". The triskeles was adopted as emblem by the rulers of Syracuse. The oldest find of a triskeles in Sicily is a vase dated to 700
	BCE, for which researchers assume a Minoan-Mycenaean origin. ===Roman period and Late Antiquity=== Late examples of the triple spiral symbols are
	found in Iron Age Europe, e.g. carved in rock in Castro Culture settlement in Galicia, Asturias and Nor

In [None]:
test_df[test_df['prediction'].str[0] != test_df['answer']]

Unnamed: 0,prompt,context,A,B,C,D,E,answer,prediction
2,Which of the following statements accurately d...,It is possible that this usage is related with...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,D A C
5,Which of the following statements accurately d...,"For a closed Gaussian surface, electric flux i...",Gauss's law holds only for situations involvin...,"Gauss's law holds in all cases, but it is most...","Gauss's law, which applies equally to all elec...",Gauss's law only holds for electric fields wit...,"Gauss's law, which holds for all situations, i...",B,C B E
13,What is the Roche limit?,"In celestial mechanics, the Roche limit, also ...",The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,D,E D C
17,What is the butterfly effect?,The butterfly effect describes a phenomenon in...,The butterfly effect is a physical cause that ...,The butterfly effect is a distributed causalit...,The butterfly effect is a proportionality betw...,The butterfly effect is a small push that is n...,The butterfly effect is a phenomenon that high...,E,D E B
23,What did Newton's manuscripts of the 1660s show?,Newton was well-versed in both classics and mo...,Newton learned about tangential motion and rad...,Newton's manuscripts did not show any evidence...,Newton combined tangential motion with the eff...,Newton's manuscripts showed that he learned ab...,Newton's manuscripts showed that he was indebt...,C,D C A
24,What is the decay energy for the free neutron ...,The following diagram gives a summary sketch o...,0.013343 MeV,0.013 MeV,"1,000 MeV",0.782 MeV,0.782343 MeV,E,D E A
33,What are amorphous ferromagnetic metallic allo...,Amorphous metals can be grouped in two categor...,Amorphous ferromagnetic metallic alloys are cr...,Amorphous ferromagnetic metallic alloys are no...,Amorphous ferromagnetic metallic alloys are cr...,Amorphous ferromagnetic metallic alloys are no...,Amorphous ferromagnetic metallic alloys are no...,D,E D B
34,What is the Penrose process?,The Penrose process (also called Penrose mecha...,The Penrose process is a mechanism through whi...,The Penrose process is a mechanism through whi...,The Penrose process is a mechanism through whi...,The Penrose process is a mechanism through whi...,The Penrose process is a mechanism through whi...,C,B C E
35,What was the aim of the Gravity Probe B (GP-B)...,Gravity Probe B (GP-B) was a satellite-based e...,To prove that pressure contributes equally to ...,"To measure spacetime curvature near Earth, wit...",To measure the distribution of Fe and Al on th...,To confirm the relatively large geodetic effec...,To measure the discrepancy between active and ...,B,D E B
38,What is the spin quantum number?,"In physics, the spin quantum number is a quant...",The spin quantum number is a measure of the di...,The spin quantum number is a measure of the si...,The spin quantum number is a measure of the ch...,The spin quantum number is a measure of the sp...,The spin quantum number is a dimensionless qua...,E,D E C


# Submission

In [None]:
predictions_as_string

['D C B',
 'A D E',
 'D A C',
 'C B D',
 'D E A',
 'C B E',
 'A D B',
 'D E B',
 'C B A',
 'A E C',
 'E D A',
 'A B E',
 'C A E',
 'E D C',
 'B A D',
 'B D E',
 'E A C',
 'D E B',
 'A B C',
 'E B D',
 'D B C',
 'D C E',
 'C D A',
 'D C A',
 'D E A',
 'E D A',
 'A C E',
 'D C B',
 'E B C',
 'C B A',
 'B A E',
 'E C A',
 'E B A',
 'E D B',
 'B C E',
 'D E B',
 'E B D',
 'A C D',
 'D E C',
 'A E D',
 'E D A',
 'A B D',
 'D C E',
 'D B C',
 'E C D',
 'A E B',
 'B E C',
 'C E B',
 'A D E',
 'B A C',
 'B D E',
 'E D C',
 'B C A',
 'D C A',
 'B D A',
 'B D E',
 'C A E',
 'C B A',
 'D E C',
 'A B E',
 'B E D',
 'E D B',
 'C A B',
 'A E C',
 'A B D',
 'C D E',
 'C A D',
 'E A B',
 'A D C',
 'D E A',
 'C A B',
 'A D E',
 'C A D',
 'B E D',
 'D A E',
 'A C B',
 'D B E',
 'B C D',
 'C E B',
 'E A C',
 'C E B',
 'A D E',
 'B D C',
 'A D C',
 'E C D',
 'C E D',
 'D A C',
 'B A E',
 'E C A',
 'D E C',
 'B D E',
 'B D C',
 'B E C',
 'E B D',
 'E A B',
 'C D A',
 'C D B',
 'B D E',
 'D C B',
 'D C E',
