In [1]:
import random
from typing import Union

import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import transformers

In [2]:
# ######################## PART 1: PROVIDED CODE ########################


def load_datasets(data_directory: str) -> Union[dict, dict]:
    """
    Reads the training and validation splits from disk and load
    them into memory.

    Parameters
    ----------
    data_directory: str
        The directory where the data is stored.

    Returns
    -------
    train: dict
        The train dictionary with keys 'premise', 'hypothesis', 'label'.
    validation: dict
        The validation dictionary with keys 'premise', 'hypothesis', 'label'.
    """
    import json
    import os

    with open(os.path.join(data_directory, "train.json"), "r") as f:
        train = json.load(f)

    with open(os.path.join(data_directory, "validation.json"), "r") as f:
        valid = json.load(f)

    return train, valid


class NLIDataset(torch.utils.data.Dataset):
    def __init__(self, data_dict: dict):
        self.data_dict = data_dict
        dd = data_dict

        if len(dd["premise"]) != len(dd["hypothesis"]) or len(dd["premise"]) != len(
                dd["label"]
        ):
            raise AttributeError("Incorrect length in data_dict")

    def __len__(self):
        return len(self.data_dict["premise"])

    def __getitem__(self, idx):
        dd = self.data_dict
        return dd["premise"][idx], dd["hypothesis"][idx], dd["label"][idx]


def train_distilbert(model, loader, device, optimizer):
    model.train()
    criterion = model.get_criterion()
    total_loss = 0.0

    for premise, hypothesis, target in tqdm(loader):
        optimizer.zero_grad()

        inputs = model.tokenize(premise, hypothesis).to(device)
        target = target.to(device, dtype=torch.float32)

        pred = model(inputs)

        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)


@torch.no_grad()
def eval_distilbert(model, loader, device):
    model.eval()

    targets = []
    preds = []

    for premise, hypothesis, target in loader:
        preds.append(model(model.tokenize(premise, hypothesis).to(device)))

        targets.append(target)

    return torch.cat(preds), torch.cat(targets)


# ######################## PART 1: YOUR WORK STARTS HERE ########################
'''
1. Finetune DistilBERT for classification (40 pts)
In this part, you will use the NLI training data (same as A1) to finetune a DistilBERT model and
predict whether a premise entails a hypothesis or not. Just like the first assignment, you will
have to implement various parts of a custom nn.Module that loads a pretrained DistilBERT from
Huggingface transformers. You can learn more about DistilBERT here, but you can just assume
it’s a smaller version of BERT that remains fairly accurate.
'''


# You will have to implement the init function of the CustomDistilBert class. You will need to
# initialize the following attributes:
# ● self.distilbert
# ● self.tokenizer
# ● self.pred_layer
# ● self.sigmoid
# ● self.criterion
# For distilbert and tokenizer, you will need to use transformers, whereas pred_layer, sigmoid,
# and criterion require torch and correspond to questions you have previously answered in A1.


class CustomDistilBert(nn.Module):
    def __init__(self):
        """
        CustomDistilBert.__init__
        Note:
        ● Load the DistilBERT model's pretrained "base uncased" weights from the Huggingface
        repository. We want the bare encoder outputting hidden-states without any specific head
        on top.
        ● Load the corresponding pre-trained tokenizer using the same method.
        ● self.pred_layer takes the output of the model and predicts a single score (binary, 1 or 0),
        then pass the output to the sigmoid layer
        ● self.sigmoid should return torch's sigmoid activation.
        ● self.criterion should be the binary cross-entropy loss. You may use torch.nn here.
        """
        super().__init__()

        # TODO: your work below
        self.distilbert = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.pred_layer = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.BCELoss()

    # vvvvv DO NOT CHANGE BELOW THIS LINE vvvvv
    def get_distilbert(self):
        return self.distilbert

    def get_tokenizer(self):
        return self.tokenizer

    def get_pred_layer(self):
        return self.pred_layer

    def get_sigmoid(self):
        return self.sigmoid

    def get_criterion(self):
        return self.criterion

    # ^^^^^ DO NOT CHANGE ABOVE THIS LINE ^^^^^

    def assign_optimizer(self, **kwargs):
        """
        CustomDistilBert.assign_optimizer
        This assigns the Adam optimizer to this model's parameters (self) and returns the optimizer.
        :param kwargs:
        :return:
        """
        # TODO: your work below
        return torch.optim.Adam(self.parameters(), **kwargs)

    def slice_cls_hidden_state(
            self, x
    ) -> torch.Tensor:
        """
        Edit the method CustomDistilBert.slice_cls_hidden_state. This is a helper method that will be
        used inside forward, and will convert the output of your transformer model to something that can
        be input in the prediction layer.
        CustomDistilBert.slice_cls_hidden_state
        Using the output of the model, return the last hidden state of the CLS token.
        ParameterTypeDescription
        xBaseModelOutputThe output of the distilbert model. You need to retrieve
        the hidden state of the last output layer, then slice it to
        obtain the hidden representation. The last hidden state
        has shape: [batch_size, sequence_length,
        hidden_size]
        ReturnsDescription
        Tensor[batch_size,
        hidden_size]The last layer's hidden state representing the [CLS] token.
        Usually, CLS is the first token in the sequence.
        :param x:
        :return:
        """
        # TODO: your work below
        return x.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]

    def tokenize(
            self,
            premise: "list[str]",
            hypothesis: "list[str]",
            max_length: int = 128,
            truncation: bool = True,
            padding: bool = True,
    ):
        """
        Use the get_tokenizer function implemented in 2.1 to write the method
        CustomDistilBert.tokenize. That method is specifically to help you understand how the
        tokenizer works, and should be fairly straightforward.
        This function will be applied to the premise and hypothesis (list of str) to obtain the inputs for
        your model. You will need to use the Huggingface tokenizer returned by get_tokenizer().
        ParameterTypeDescription
        premiselist of strThe first text to be input in your model.
        hypothesislist of strThe second text to be input in your model.
        For the remaining params, see documentations.
        ReturnsDescription
        BatchEncodingA dictionary-like object that can be given to the model (you
        can find out how by reading the docs)
        :param premise:
        :param hypothesis:
        :param max_length:
        :param truncation:
        :param padding:
        :return:
        """
        # TODO: your work below
        return self.tokenizer(
            premise,
            hypothesis,
            max_length=max_length,
            truncation=truncation,
            padding=padding,
            return_tensors="pt",
        )

    def forward(self, inputs):
        """
        Given the output of your tokenizer (a BatchEncoding object), you will have to pass through your
        custom DistilBert model and output a score between 0 and 1 for each element in your batch;
        this score represents whether there’s an entailment or not.
        CustomDistilBert.forward
        Note: In the original BERT paper, the output representation of CLS is used for classification.
        You will need to slice the output of your DistilBERT to obtain the representation before giving it
        to the last layer with sigmoid activation.
        :param inputs:
        :return:
        """
        # TODO: your work below
        x = self.distilbert(**inputs, return_dict=True)
        x = self.slice_cls_hidden_state(x)
        x = self.pred_layer(x)
        x = self.sigmoid(x)
        x = x.squeeze(1)
        return x


# ######################## PART 2: YOUR WORK HERE ########################
def freeze_params(model):
    """
    Before starting, you will need to freeze all the parameters (including the embedding!). This is
    because prompt tuning relies on tuning a very small number of fixed parameters (aka “prompts”,
    since they are inserted as input embeddings to the model). Thus, everything else, including the
    input word embeddings, are not trainable.
    :param model:
    :return:
    """
    # TODO: your work below
    for param in model.parameters():
        param.requires_grad = False


def pad_attention_mask(mask, p):
    """
    Pad the start of the sequence p times of the attention_mask (which is one of the various
    outputs of a Huggingface tokenizer) because the sequence length has changed. Find the
    correct value based on Huggingface documentations.
    :param mask:
    :param p:
    :return:
    """
    # TODO: your work below
    return F.pad(mask, (p, 0), value=1)


class SoftPrompting(nn.Module):
    def __init__(self, p: int, e: int):
        super().__init__()
        self.p = p
        self.e = e

        self.prompts = torch.randn((p, e), requires_grad=True)

    def forward(self, embedded):
        """
        This takes the output of model.embeddings and adds the soft prompts, as described in the
        paper. The prompts must be added at the start of the sequence.
        ParameterTypeDescription
        embeddedTensor[B, L, E]This corresponds to model.embeddings (where model is
        a Huggingface transformer)
        ● B: Batch size
        ● L: Sequence Length
        ● E: Embedding dimension (same as e)ReturnsDescription
        Tensor[B, L+p, E]The input_embed to be given to the model, but with the added
        :param embedded:
        :return:
        """
        # TODO: your work below
        p = self.prompts.unsqueeze(0).repeat(embedded.size(0), 1, 1).to(embedded.device)
        return torch.cat([p, embedded], dim=1)

In [4]:
# get data of validation set of index 2, 6, 9
train_raw, valid_raw = load_datasets("data/nli")
valid_raw = {k: [v[i] for i in [2, 6, 9]] for k, v in valid_raw.items()}
# print them
print(valid_raw)
# get from train set
train_raw = {k: [v[i] for i in [2, 6, 9]] for k, v in train_raw.items()}
# print them
print(train_raw)

{'premise': ['Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.', 'Two young boys of opposing teams play football, while wearing full protection uniforms and helmets.', 'A man in a blue shirt standing in front of a garage-like structure painted with geometric designs.'], 'hypothesis': ['Two kids in numbered jerseys wash their hands.', 'boys play football', 'A man is wearing a black shirt'], 'label': [0, 0, 1]}
{'premise': ['Two brown dogs barking at each other.', 'four friends cheerfully jumping off the flight stairs.', 'Two construction workers are working on a scaffold.'], 'hypothesis': ['The animals are making noise.', 'Four people jumping off stairs.', 'The two workers are working on a car.'], 'label': [0, 0, 1]}


In [8]:
class CustomBert(nn.Module):
    def __init__(self):
        """
        CustomDistilBert.__init__
        Note:
        ● Load the DistilBERT model's pretrained "base uncased" weights from the Huggingface
        repository. We want the bare encoder outputting hidden-states without any specific head
        on top.
        ● Load the corresponding pre-trained tokenizer using the same method.
        ● self.pred_layer takes the output of the model and predicts a single score (binary, 1 or 0),
        then pass the output to the sigmoid layer
        ● self.sigmoid should return torch's sigmoid activation.
        ● self.criterion should be the binary cross-entropy loss. You may use torch.nn here.
        """
        super().__init__()

        # TODO: your work below
        self.distilbert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
        self.pred_layer = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.BCELoss()

    # vvvvv DO NOT CHANGE BELOW THIS LINE vvvvv
    def get_distilbert(self):
        return self.distilbert

    def get_tokenizer(self):
        return self.tokenizer

    def get_pred_layer(self):
        return self.pred_layer

    def get_sigmoid(self):
        return self.sigmoid

    def get_criterion(self):
        return self.criterion

    # ^^^^^ DO NOT CHANGE ABOVE THIS LINE ^^^^^

    def assign_optimizer(self, **kwargs):
        """
        CustomDistilBert.assign_optimizer
        This assigns the Adam optimizer to this model's parameters (self) and returns the optimizer.
        :param kwargs:
        :return:
        """
        # TODO: your work below
        return torch.optim.Adam(self.parameters(), **kwargs)

    def slice_cls_hidden_state(
            self, x
    ) -> torch.Tensor:
        """
        Edit the method CustomDistilBert.slice_cls_hidden_state. This is a helper method that will be
        used inside forward, and will convert the output of your transformer model to something that can
        be input in the prediction layer.
        CustomDistilBert.slice_cls_hidden_state
        Using the output of the model, return the last hidden state of the CLS token.
        ParameterTypeDescription
        xBaseModelOutputThe output of the distilbert model. You need to retrieve
        the hidden state of the last output layer, then slice it to
        obtain the hidden representation. The last hidden state
        has shape: [batch_size, sequence_length,
        hidden_size]
        ReturnsDescription
        Tensor[batch_size,
        hidden_size]The last layer's hidden state representing the [CLS] token.
        Usually, CLS is the first token in the sequence.
        :param x:
        :return:
        """
        # TODO: your work below
        return x.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]

    def tokenize(
            self,
            premise: "list[str]",
            hypothesis: "list[str]",
            max_length: int = 128,
            truncation: bool = True,
            padding: bool = True,
    ):
        """
        Use the get_tokenizer function implemented in 2.1 to write the method
        CustomDistilBert.tokenize. That method is specifically to help you understand how the
        tokenizer works, and should be fairly straightforward.
        This function will be applied to the premise and hypothesis (list of str) to obtain the inputs for
        your model. You will need to use the Huggingface tokenizer returned by get_tokenizer().
        ParameterTypeDescription
        premiselist of strThe first text to be input in your model.
        hypothesislist of strThe second text to be input in your model.
        For the remaining params, see documentations.
        ReturnsDescription
        BatchEncodingA dictionary-like object that can be given to the model (you
        can find out how by reading the docs)
        :param premise:
        :param hypothesis:
        :param max_length:
        :param truncation:
        :param padding:
        :return:
        """
        # TODO: your work below
        return self.tokenizer(
            premise,
            hypothesis,
            max_length=max_length,
            truncation=truncation,
            padding=padding,
            return_tensors="pt",
        )

    def forward(self, inputs):
        """
        Given the output of your tokenizer (a BatchEncoding object), you will have to pass through your
        custom DistilBert model and output a score between 0 and 1 for each element in your batch;
        this score represents whether there’s an entailment or not.
        CustomDistilBert.forward
        Note: In the original BERT paper, the output representation of CLS is used for classification.
        You will need to slice the output of your DistilBERT to obtain the representation before giving it
        to the last layer with sigmoid activation.
        :param inputs:
        :return:
        """
        # TODO: your work below
        x = self.distilbert(**inputs, return_dict=True)
        x = self.slice_cls_hidden_state(x)
        x = self.pred_layer(x)
        x = self.sigmoid(x)
        x = x.squeeze(1)
        return x

In [9]:
class CustomRobertaModel(nn.Module):
    def __init__(self):
        """
        CustomDistilBert.__init__
        Note:
        ● Load the DistilBERT model's pretrained "base uncased" weights from the Huggingface
        repository. We want the bare encoder outputting hidden-states without any specific head
        on top.
        ● Load the corresponding pre-trained tokenizer using the same method.
        ● self.pred_layer takes the output of the model and predicts a single score (binary, 1 or 0),
        then pass the output to the sigmoid layer
        ● self.sigmoid should return torch's sigmoid activation.
        ● self.criterion should be the binary cross-entropy loss. You may use torch.nn here.
        """
        super().__init__()

        # TODO: your work below
        self.distilbert = transformers.RobertaModel.from_pretrained('roberta-base')
        self.tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base')
        self.pred_layer = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.BCELoss()

    # vvvvv DO NOT CHANGE BELOW THIS LINE vvvvv
    def get_distilbert(self):
        return self.distilbert

    def get_tokenizer(self):
        return self.tokenizer

    def get_pred_layer(self):
        return self.pred_layer

    def get_sigmoid(self):
        return self.sigmoid

    def get_criterion(self):
        return self.criterion

    # ^^^^^ DO NOT CHANGE ABOVE THIS LINE ^^^^^

    def assign_optimizer(self, **kwargs):
        """
        CustomDistilBert.assign_optimizer
        This assigns the Adam optimizer to this model's parameters (self) and returns the optimizer.
        :param kwargs:
        :return:
        """
        # TODO: your work below
        return torch.optim.Adam(self.parameters(), **kwargs)

    def slice_cls_hidden_state(
            self, x
    ) -> torch.Tensor:
        """
        Edit the method CustomDistilBert.slice_cls_hidden_state. This is a helper method that will be
        used inside forward, and will convert the output of your transformer model to something that can
        be input in the prediction layer.
        CustomDistilBert.slice_cls_hidden_state
        Using the output of the model, return the last hidden state of the CLS token.
        ParameterTypeDescription
        xBaseModelOutputThe output of the distilbert model. You need to retrieve
        the hidden state of the last output layer, then slice it to
        obtain the hidden representation. The last hidden state
        has shape: [batch_size, sequence_length,
        hidden_size]
        ReturnsDescription
        Tensor[batch_size,
        hidden_size]The last layer's hidden state representing the [CLS] token.
        Usually, CLS is the first token in the sequence.
        :param x:
        :return:
        """
        # TODO: your work below
        return x.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]

    def tokenize(
            self,
            premise: "list[str]",
            hypothesis: "list[str]",
            max_length: int = 128,
            truncation: bool = True,
            padding: bool = True,
    ):
        """
        Use the get_tokenizer function implemented in 2.1 to write the method
        CustomDistilBert.tokenize. That method is specifically to help you understand how the
        tokenizer works, and should be fairly straightforward.
        This function will be applied to the premise and hypothesis (list of str) to obtain the inputs for
        your model. You will need to use the Huggingface tokenizer returned by get_tokenizer().
        ParameterTypeDescription
        premiselist of strThe first text to be input in your model.
        hypothesislist of strThe second text to be input in your model.
        For the remaining params, see documentations.
        ReturnsDescription
        BatchEncodingA dictionary-like object that can be given to the model (you
        can find out how by reading the docs)
        :param premise:
        :param hypothesis:
        :param max_length:
        :param truncation:
        :param padding:
        :return:
        """
        # TODO: your work below
        return self.tokenizer(
            premise,
            hypothesis,
            max_length=max_length,
            truncation=truncation,
            padding=padding,
            return_tensors="pt",
        )

    def forward(self, inputs):
        """
        Given the output of your tokenizer (a BatchEncoding object), you will have to pass through your
        custom DistilBert model and output a score between 0 and 1 for each element in your batch;
        this score represents whether there’s an entailment or not.
        CustomDistilBert.forward
        Note: In the original BERT paper, the output representation of CLS is used for classification.
        You will need to slice the output of your DistilBERT to obtain the representation before giving it
        to the last layer with sigmoid activation.
        :param inputs:
        :return:
        """
        # TODO: your work below
        x = self.distilbert(**inputs, return_dict=True)
        x = self.slice_cls_hidden_state(x)
        x = self.pred_layer(x)
        x = self.sigmoid(x)
        x = x.squeeze(1)
        return x

In [10]:
import pandas as pd
from sklearn.metrics import accuracy_score  # Make sure sklearn is installed

random.seed(2022)
torch.manual_seed(2022)

# Parameters (you can change them)
sample_size = 2500  # Change this if you want to take a subset of data for testing
batch_size = 64
n_epochs = 10
num_words = 50000

# If you use GPUs, use the code below:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ###################### PART 1: TEST CODE ######################
# Prefilled code showing you how to use the helper functions
train_raw, valid_raw = load_datasets("data/nli")
if sample_size is not None:
    for key in ["premise", "hypothesis", "label"]:
        train_raw[key] = train_raw[key][:sample_size]
        valid_raw[key] = valid_raw[key][:sample_size]

full_text = (
        train_raw["premise"]
        + train_raw["hypothesis"]
        + valid_raw["premise"]
        + valid_raw["hypothesis"]
)

print("=" * 80)
print("Running test code for part 1")
print("-" * 80)

train_loader = torch.utils.data.DataLoader(
    NLIDataset(train_raw), batch_size=batch_size, shuffle=True
)
valid_loader = torch.utils.data.DataLoader(
    NLIDataset(valid_raw), batch_size=batch_size, shuffle=False
)

model = CustomDistilBert().to(device)
optimizer = model.assign_optimizer(lr=1e-4)

# ###################### PART 2: TEST CODE ######################
freeze_params(model.get_distilbert())  # Now, model should have no trainable parameters

sp = SoftPrompting(p=5, e=model.get_distilbert().embeddings.word_embeddings.embedding_dim).to(device)
batch = model.tokenize(
    ["This is a premise.", "This is another premise."],
    ["This is a hypothesis.", "This is another hypothesis."],
).to(device)
batch.input_embedded = sp(model.get_distilbert().embeddings(batch.input_ids))
batch.attention_mask = pad_attention_mask(batch.attention_mask, 5)

# Get other two models
model2 = CustomBert().to(device)
model3 = CustomRobertaModel().to(device)

# Get optimizer for the other two models
optimizer2 = model2.assign_optimizer(lr=1e-4)
optimizer3 = model3.assign_optimizer(lr=1e-4)

# Get soft prompting for the other two models
sp2 = SoftPrompting(p=5, e=model2.get_distilbert().embeddings.word_embeddings.embedding_dim).to(device)
sp3 = SoftPrompting(p=5, e=model3.get_distilbert().embeddings.word_embeddings.embedding_dim).to(device)

# In this section, you implemented soft prompt tuning and froze the model.
# Just like the previous question, train your model for 10 epochs using the same training loop,
# but this time the model itself is frozen and only the soft prompts are updated.
# You will see the results are different from full finetuning.

# Using the prompt tuning method described in the paper, train DistilBERT
# Then, upload a plot of the validation loss, and another of the validation accuracy over each of the 10 epochs you trained the model for.
# The legend should show the name of each model, which should all have different colors.

valid_accs = []
valid_losses = []
valid_accs2 = []
valid_losses2 = []
valid_accs3 = []
valid_losses3 = []
for epoch in range(n_epochs):
    # Train models with soft prompting
    loss = train_distilbert(sp, train_loader, device, optimizer)
    loss2 = train_distilbert(model2, train_loader, device, optimizer2)
    loss3 = train_distilbert(model3, train_loader, device, optimizer3)
    valid_preds, valid_targets = eval_distilbert(sp, valid_loader, device)
    valid_preds = valid_preds.round()
    valid_preds2, valid_targets2 = eval_distilbert(model2, valid_loader, device)
    valid_preds2 = valid_preds2.round()
    valid_preds3, valid_targets3 = eval_distilbert(model3, valid_loader, device)
    valid_preds3 = valid_preds3.round()
    valid_accs.append(accuracy_score(valid_targets.cpu(), valid_preds.cpu()))
    valid_loss = sp.get_criterion()(valid_preds, valid_targets.to(device, dtype=torch.float32)).item()
    valid_losses.append(valid_loss)
    valid_accs2.append(accuracy_score(valid_targets2.cpu(), valid_preds2.cpu()))
    valid_loss2 = model2.get_criterion()(valid_preds2, valid_targets2.to(device, dtype=torch.float32)).item()
    valid_losses2.append(valid_loss2)
    valid_accs3.append(accuracy_score(valid_targets3.cpu(), valid_preds3.cpu()))
    valid_loss3 = model3.get_criterion()(valid_preds3, valid_targets3.to(device, dtype=torch.float32)).item()
    valid_losses3.append(valid_loss3)
    print(f"Epoch {epoch + 1}: train loss {loss:.4f}, valid loss {valid_loss:.4f}, valid acc {valid_accs[-1]:.4f}")
    print(f"Epoch {epoch + 1}: train loss {loss2:.4f}, valid loss {valid_loss2:.4f}, valid acc {valid_accs2[-1]:.4f}")
    print(f"Epoch {epoch + 1}: train loss {loss3:.4f}, valid loss {valid_loss3:.4f}, valid acc {valid_accs3[-1]:.4f}")

# plot
import matplotlib.pyplot as plt
plt.plot(valid_losses, label='valid loss of distilbert')
plt.plot(valid_losses2, label='valid loss of bert')
plt.plot(valid_losses3, label='valid loss of roberta')
plt.xlabel('epoch')
plt.ylabel('validation loss')
plt.title('Validation Loss of DistilBERT, BERT, and Roberta')
plt.legend()
plt.savefig('validation_loss.png')
plt.show()

plt.plot(valid_accs, label='valid acc of distilbert')
plt.plot(valid_accs2, label='valid acc of bert')
plt.plot(valid_accs3, label='valid acc of roberta')
plt.xlabel('epoch')
plt.ylabel('validation accuracy')
plt.title('Validation Accuracy of DistilBERT, BERT, and Roberta')
plt.legend()
plt.savefig('validation_accuracy.png')
plt.show()



Running test code for part 1
--------------------------------------------------------------------------------


Downloading: 100%|██████████| 481/481 [00:00<00:00, 1.15MB/s]
Downloading: 100%|██████████| 501M/501M [00:19<00:00, 25.5MB/s] 
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 14.2MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 9.89MB/s]


ModuleAttributeError: 'SoftPrompting' object has no attribute 'get_criterion'