# Load Dataset

In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [2]:
raw_datasets['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

# Preprocess Dataset with Tokenizer

In [3]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])



In [4]:
tokenized_sentences_1['input_ids']
tokenized_sentences_1['token_type_ids']
tokenized_sentences_1['attention_mask']

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1

In [5]:
tokenizer.decode(tokenized_sentences_1['input_ids'][0])

'[CLS] amrozi accused his brother, whom he called " the witness ", of deliberately distorting his evidence. [SEP]'

In [6]:
# Load the data all in once

tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

""" 
{ 
  'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102, 0, ...], ....
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], ........
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], ........
}
"""
tokenized_dataset["input_ids"][0]


[101,
 2572,
 3217,
 5831,
 5496,
 2010,
 2567,
 1010,
 3183,
 2002,
 2170,
 1000,
 1996,
 7409,
 1000,
 1010,
 1997,
 9969,
 4487,
 23809,
 3436,
 2010,
 3350,
 1012,
 102,
 7727,
 2000,
 2032,
 2004,
 2069,
 1000,
 1996,
 7409,
 1000,
 1010,
 2572,
 3217,
 5831,
 5496,
 2010,
 2567,
 1997,
 9969,
 4487,
 23809,
 3436,
 2010,
 3350,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

# Dynanmic Padding

## Fixed Padding

In [7]:
# For the previous example, the padding length stays fixed
[len(x) for x in tokenized_dataset["input_ids"]]

[103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103,
 103

## Dynanmic Padding

However, we can achieved dynamic padding by loading the data in batch.
  
So that, the maximum padding length will be the max length in each of the batches

In [8]:
# Define the tokenizer function
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# Apply the tokenizer
tokenized_datasets = raw_datasets["train"].map(tokenize_function, batched=True)

# We see that most of the sequence as length less than 103
# So fixed padding is a waste of resources
[len(x) for x in tokenized_datasets["input_ids"]]

# NO Padding Yet

[50,
 59,
 47,
 67,
 59,
 50,
 62,
 32,
 45,
 60,
 51,
 47,
 42,
 61,
 53,
 44,
 53,
 79,
 57,
 70,
 63,
 35,
 54,
 64,
 52,
 47,
 68,
 58,
 60,
 35,
 43,
 34,
 48,
 65,
 27,
 73,
 31,
 50,
 36,
 61,
 57,
 54,
 41,
 64,
 53,
 38,
 68,
 45,
 57,
 39,
 36,
 68,
 63,
 47,
 37,
 62,
 59,
 58,
 50,
 33,
 61,
 34,
 71,
 64,
 74,
 30,
 54,
 53,
 72,
 70,
 44,
 58,
 78,
 40,
 60,
 50,
 55,
 31,
 62,
 46,
 58,
 70,
 49,
 49,
 42,
 34,
 70,
 50,
 34,
 65,
 49,
 39,
 53,
 37,
 28,
 70,
 66,
 68,
 62,
 62,
 72,
 39,
 67,
 57,
 52,
 75,
 76,
 79,
 64,
 52,
 43,
 55,
 54,
 60,
 41,
 50,
 64,
 53,
 63,
 37,
 32,
 30,
 53,
 40,
 60,
 67,
 64,
 33,
 38,
 62,
 69,
 53,
 61,
 58,
 42,
 38,
 64,
 55,
 54,
 52,
 65,
 38,
 48,
 40,
 64,
 47,
 38,
 45,
 46,
 75,
 59,
 47,
 62,
 52,
 65,
 80,
 42,
 41,
 61,
 55,
 61,
 67,
 53,
 55,
 76,
 69,
 46,
 46,
 31,
 37,
 53,
 32,
 52,
 54,
 35,
 43,
 70,
 48,
 51,
 49,
 65,
 67,
 54,
 58,
 47,
 58,
 42,
 52,
 70,
 54,
 39,
 51,
 66,
 42,
 64,
 65,
 73,
 73,
 43,
 44,


In [9]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load tiny dataset
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize WITHOUT padding
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Slice 8 samples
samples = tokenized_datasets["train"][:8]
# samples = {Tokenized_input: Attention_mask}
samples = {k: v for k, v in samples.items() if k in ["input_ids", "attention_mask"]}

# BEFORE padding
print("Before padding:")
for i, input_ids in enumerate(samples["input_ids"]):
    print(f"Sample {i} length: {len(input_ids)}")

# Create collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Apply collator to the sample
batch = data_collator([dict(zip(samples.keys(), values)) for values in zip(*samples.values())])

# AFTER padding
print("\nAfter padding:")
print(f"Batch input_ids shape: {batch['input_ids'].shape}")
for i in range(batch["input_ids"].shape[0]):
    print(f"Sample {i} padded length: {batch['input_ids'].shape[1]}")

Before padding:
Sample 0 length: 50
Sample 1 length: 59
Sample 2 length: 47
Sample 3 length: 67
Sample 4 length: 59
Sample 5 length: 50
Sample 6 length: 62
Sample 7 length: 32

After padding:
Batch input_ids shape: torch.Size([8, 67])
Sample 0 padded length: 67
Sample 1 padded length: 67
Sample 2 padded length: 67
Sample 3 padded length: 67
Sample 4 padded length: 67
Sample 5 padded length: 67
Sample 6 padded length: 67
Sample 7 padded length: 67


We see that they all pad to length 67, which is the max length in this batch `tokenized_datasets["train"][:8]`.

Less than the original fixed padding length of 103.

### Data Collator: DataCollatorWithPadding()

#### Input: A list of dictionaries, where each dictionary is one tokenized sample.
```
[
  {
    "input_ids": [101, 2040, 2001, 1996, 2034, 2343, 1029],
    "attention_mask": [1, 1, 1, 1, 1, 1, 1]
  },
  {
    "input_ids": [101, 2054, 2001, 1996, 2171, 1997, 1996, 2088, 1029],
    "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]
  }
]
```

#### Output: A single batch dictionary with padded tensors for each field
```
{
  "input_ids": tensor([
    [101, 2040, 2001, 1996, 2034, 2343, 1029, 0, 0],      # padded
    [101, 2054, 2001, 1996, 2171, 1997, 1996, 2088, 1029]  # longer original
  ]),
  "attention_mask": tensor([
    [1, 1, 1, 1, 1, 1, 1, 0, 0],
    [1, 1, 1, 1, 1, 1, 1, 1, 1]
  ])
}
```

# Training 

In [10]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """用来tokenize data set的, 这样可以"""
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# 一次只 load 10 个 data into the mememory & conduct tokenization -> 存到disk中
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, batch_size=1000)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 408/408 [00:00<00:00, 18160.82 examples/s]


In [11]:
from transformers import TrainingArguments
# The only required training argument is the saving path
training_args = TrainingArguments("test-trainer")

In [12]:
from transformers import AutoModelForSequenceClassification
# 因为bert的head不是一个classification head, 这里assign了一个新的head for classification task
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers import Trainer
trainer = Trainer(
    model,  # The checkpoint model
    training_args,  # 一些和training有关的arg
    train_dataset=tokenized_datasets["train"],   # 已经tokenized的data
    eval_dataset=tokenized_datasets["validation"],   # 已经tokenized的data
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),   # 用来padding的data collator
    tokenizer=tokenizer,
)

In [14]:
# NOTE: 这里没法看到 model 的 performance, 因为我们没有设置我们的customized metrics
trainer.train()

 12%|█▏        | 168/1377 [00:23<02:34,  7.84it/s]

KeyboardInterrupt: 

# Adding Evaluation

In [None]:
def metrics_interface(eval_pred_object:tuple[object, object]):
    """
    Return:
        A Dictionary: {"metric_name": metric_value}

    Parameters:
        - EvalPrediction: a named tuple: (predictions, label_ids)
    """
    pass

In [None]:
predictions = trainer.predict(tokenized_datasets["validation"])

100%|██████████| 51/51 [00:01<00:00, 25.77it/s]


In [None]:
# The prediction of the validation set
predictions

PredictionOutput(predictions=array([[-2.926512  ,  3.2317708 ],
       [ 2.3525238 , -2.953093  ],
       [ 1.0655872 , -0.71593904],
       [-2.415072  ,  2.773419  ],
       [ 2.5661287 , -3.1980178 ],
       [-2.8888903 ,  3.2404342 ],
       [-2.4374127 ,  2.7860684 ],
       [-3.0608406 ,  3.4244444 ],
       [-2.7522972 ,  3.1081219 ],
       [-3.0153694 ,  3.3926883 ],
       [-3.0899205 ,  3.4723876 ],
       [ 2.4825735 , -3.1180208 ],
       [ 2.4746685 , -3.1707475 ],
       [ 0.9286237 , -1.6033851 ],
       [-3.0957446 ,  3.4786286 ],
       [ 1.7035819 , -2.1303492 ],
       [-3.1010995 ,  3.4987435 ],
       [ 2.4557345 , -3.0789382 ],
       [-3.0981827 ,  3.490522  ],
       [ 1.923224  , -1.9429361 ],
       [ 2.2091608 , -2.7967875 ],
       [-2.7863698 ,  3.1637783 ],
       [ 2.084103  , -2.582215  ],
       [-2.9796214 ,  3.2960677 ],
       [-3.0499878 ,  3.430943  ],
       [-2.4036381 ,  2.8164806 ],
       [-1.8851238 ,  1.7961915 ],
       [-3.0853906 ,  3.50

In [None]:
import numpy as np
# Convert the predicted logits to {0,1} using softmax
preds = np.argmax(predictions.predictions, axis=-1)
preds

array([1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,

In [18]:
import evaluate

# The metrics function
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# `eval_strategy`: gives when do we evaluate?
training_args = TrainingArguments("test-trainer")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


# `compute_metrics`: pass in the customized metrics function
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 12%|█▏        | 168/1377 [02:27<17:37,  1.14it/s]
 18%|█▊        | 250/1377 [00:30<02:16,  8.23it/s]

KeyboardInterrupt: 

# Trainer with Pytorch

## Data Loading and Tokenizing

In [4]:
# Loading & Tokenize Data
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



## Transform the dataset to only left with related columns

In [5]:
# Removed the untokenized features
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
# Rename the column
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Set the data to return tensor
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

## Use torch's data loader to load the model

In [6]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

# NOTE: `collate_fn` defines how those individual samples are combined into a batch.

## Small try on pre-trained model

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
for batch in train_dataloader:
    print({k: v.shape for k, v in batch.items()})
    outputs = model(**batch)
    break

{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 70]), 'token_type_ids': torch.Size([8, 70]), 'attention_mask': torch.Size([8, 70])}


In [9]:
print(outputs.loss, outputs.logits.shape)

tensor(0.7852, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


## Defining Training Loop

#### AdamW optimizer

In [10]:
from torch.optim import AdamW

# Record the model's parameters to the optimizer 
optimizer = AdamW(model.parameters(), lr=5e-5)

#### Learning Rate Scheduler: Linear Decaying Scheduler

In [11]:
from transformers import get_scheduler

# Since we are using a Linear Decaying Scheduler, we need to know the training steps
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

# Define scheduler
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,  # the linear decaying range
)
print(num_training_steps)

1377


#### Put the model to the GPU

In [12]:
import torch

device = torch.device("mps") if torch.mps else torch.device("cpu")
model.to(device)
device

device(type='mps')

## Starts Training

In [13]:
# NOTE: the process can be accelerate to utilize multiple GPUs or TPUs

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

# Activate the model's training mode
model.train()

# Starts updates the model's parameters
for epoch in range(num_epochs):
    for batch in train_dataloader:  # Load the data

        # Put the data into the form that can be processed by the Huggingface model
        batch = {k: v.to(device) for k, v in batch.items()}

        # Predict -> Compute Loss -> bp -> update -> reschedule the lr -> empty the gradient
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Update the progress bar
        progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

## The eval loop

In [15]:
import evaluate

metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

FileNotFoundError: Couldn't find a module script at /Users/yifanyu/Desktop/LLM finetuning pipeline/glue/glue.py. Module 'glue' doesn't exist on the Hugging Face Hub either.