In [1]:
import os
import numpy as np
import evaluate
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from datasets import load_dataset

In [2]:
GLUE_TESTS = ["mnli", "qqp", "qnli", "sst2", "cola", "stsb", "mrpc", "rte", "wnli"]

In [3]:
tokenizer = BertTokenizerFast.from_pretrained("../tokenizers/tokenizer_WordPiece")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'BertTokenizerFast'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
task = "cola"
dataset = load_dataset("glue", task)  # mrpc rte stsb wnli cola ax
print(dataset)
print("train labels :", dataset["train"]["label"])
print("val labels :", dataset["validation"]["label"])
print("test labels :", dataset["test"]["label"])

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})
train labels : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 

In [5]:
print(max(dataset["test"]["idx"]))
print(min(dataset["test"]["idx"]))

1062
0


In [6]:
def tokenize_function(dataset_dict):
    return tokenizer(
        dataset_dict["sentence"],
        # dataset_dict["sentence2"],
        padding="max_length",
        max_length=512,
        truncation=True,
    )


tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [7]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    save_strategy="no",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,  # default 5e-5
    num_train_epochs=3,
    weight_decay=3e-5,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/3207 [00:00<?, ?it/s]

{'loss': 0.6815, 'grad_norm': 6.433343887329102, 'learning_rate': 4.985968194574369e-05, 'epoch': 0.01}
{'loss': 0.624, 'grad_norm': 4.4218831062316895, 'learning_rate': 4.970377299657e-05, 'epoch': 0.02}
{'loss': 0.5778, 'grad_norm': 1.5742686986923218, 'learning_rate': 4.954786404739633e-05, 'epoch': 0.03}
{'loss': 0.664, 'grad_norm': 5.95218563079834, 'learning_rate': 4.939195509822264e-05, 'epoch': 0.04}
{'loss': 0.5782, 'grad_norm': 12.891599655151367, 'learning_rate': 4.923604614904896e-05, 'epoch': 0.05}
{'loss': 0.6608, 'grad_norm': 1.400235652923584, 'learning_rate': 4.908013719987528e-05, 'epoch': 0.06}
{'loss': 0.5993, 'grad_norm': 3.501857280731201, 'learning_rate': 4.892422825070159e-05, 'epoch': 0.07}
{'loss': 0.6808, 'grad_norm': 4.939202308654785, 'learning_rate': 4.876831930152791e-05, 'epoch': 0.07}
{'loss': 0.6199, 'grad_norm': 1.7586559057235718, 'learning_rate': 4.861241035235422e-05, 'epoch': 0.08}
{'loss': 0.6839, 'grad_norm': 1.7730178833007812, 'learning_rate':

TrainOutput(global_step=3207, training_loss=0.611810452625095, metrics={'train_runtime': 184.6017, 'train_samples_per_second': 138.964, 'train_steps_per_second': 17.373, 'total_flos': 6749587903150080.0, 'train_loss': 0.611810452625095, 'epoch': 3.0})

In [8]:
predictions = trainer.predict(tokenized_datasets["validation"])
probs, labels, _ = predictions

  0%|          | 0/131 [00:00<?, ?it/s]

In [9]:
print(probs)
print(labels)
preds = np.argmax(probs, axis=1)
print(np.all(preds == 1))

[[-0.5229492   0.5       ]
 [-0.52246094  0.5       ]
 [-0.5229492   0.5       ]
 ...
 [-0.5229492   0.5       ]
 [-0.5229492   0.5       ]
 [-0.52246094  0.5       ]]
[1 1 1 ... 0 1 1]
True


In [10]:
import evaluate

accuracy = evaluate.load("accuracy")
results = accuracy.compute(predictions=probs.argmax(-1), references=labels)
print(results)

{'accuracy': 0.6912751677852349}


In [12]:
acc = np.mean(preds == labels)
print("accuracy:", acc)
trainer.evaluate(tokenized_datasets["validation"])

accuracy: 0.6912751677852349


  0%|          | 0/131 [00:00<?, ?it/s]

{'eval_loss': 0.6230014562606812,
 'eval_runtime': 2.3406,
 'eval_samples_per_second': 445.607,
 'eval_steps_per_second': 55.968,
 'epoch': 3.0}

In [14]:
f1 = evaluate.load("f1")
person = evaluate.load("spearmanr")

Downloading builder script:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

In [14]:
filenames = {
    "cola": "CoLA.tsv",
    "sst2": "SST-2.tsv",
    "mrpc": "MRPC.tsv",
    "qqp": "QQP.tsv",
    "stsb": "STS-B.tsv",
    "mnli_matched": "MNLI-m.tsv",
    "mnli_mismatched": "MNLI-mm.tsv",
    "qnli": "QNLI.tsv",
    "rte": "RTE.tsv",
    "wnli": "WNLI.tsv",
    "ax": "AX.tsv",
}

labelnames = {
    "mnli_matched": ["entailment", "neutral", "contradiction"],
    "mnli_mismatched": ["entailment", "neutral", "contradiction"],
    "ax": ["entailment", "neutral", "contradiction"],
    "qnli": ["entailment", "not_entailment"],
    "rte": ["entailment", "not_entailment"],
}


In [19]:
ordered_preds = [None] * (max(dataset['test']['idx']) + 1)
for i, pred in zip(dataset['test']['idx'], preds):
    ordered_preds[i] = pred
preds = np.array(ordered_preds)

In [20]:
submission_directory = "glue_submissions"
if not os.path.exists(submission_directory):
    os.makedirs(submission_directory)
filename = submission_directory + "/" + filenames[task]
labelname = labelnames.get(task)
print(labelname)
print("filename :", filename)

None
filename : glue_submissions/MRPC.tsv


In [21]:
with open(filename, "w") as f:
    f.write("index\tprediction\n")
    for idx, pred in enumerate(preds):
        if labelname:
            pred = labelname[int(pred)]
        f.write(f"{idx}\t{pred}\n")

False