In [2]:
import warnings

# Suppress all Python warnings for the rest of the session. The filter is
# installed before the `datasets` import so it is already active during it.
warnings.filterwarnings(action="ignore")

from datasets import load_dataset

# Load the "sst2" configuration of the "glue" dataset and display its splits.
data = load_dataset(path="glue", name="sst2")
data

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Pretrained checkpoint name; the same name is used for the tokenizer here and
# for the classification model when it is loaded further down.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
def tokenize_func(examples):
    """Tokenize the ``"sentence"`` field of ``examples``.

    The notebook-level ``tokenizer`` is called with truncation enabled and no
    padding argument. The tokenizer's return value is passed back unchanged.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [10]:
# Run tokenize_func over the loaded dataset in batched mode, then display the
# result so the added tokenizer columns are visible.
tokenized_datasets = data.map(function=tokenize_func, batched=True)
tokenized_datasets

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [11]:
from transformers import DataCollatorWithPadding

# Batch collator built on the same tokenizer used for tokenization.
# NOTE(review): presumably pads each batch at collation time, which would be
# why tokenize_func requests no padding — confirm against the transformers docs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
from transformers import Trainer, TrainingArguments
import numpy as np
from datasets import load_metric

# NOTE(review): the captured run log shows this call falling back to a locally
# cached copy of the "accuracy" module because it could not be found locally at
# "accuracy" or remotely on the Hugging Face Hub, so a machine without that
# cache may fail here. `load_metric` is also deprecated upstream — confirm it
# still exists in the installed `datasets` version.
metric = load_metric("accuracy")

def compute_metrics(eval_preds):
    """Compute classification accuracy for one evaluation pass.

    Parameters
    ----------
    eval_preds : tuple
        A ``(predictions, labels)`` pair. ``predictions`` holds one row of
        per-class scores per example (classes along axis 1); ``labels`` holds
        the reference class id for each example.

    Returns
    -------
    dict
        ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose highest-scoring class equals the label, as a Python ``float``.
    """
    predictions, labels = eval_preds
    predicted_classes = np.argmax(predictions, axis=1)
    # Accuracy is computed directly with NumPy so this function does not depend
    # on a separately loaded metric object; the result keeps the
    # {"accuracy": float} shape.
    matches = predicted_classes == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}
# Training configuration: outputs go to "out_files", one epoch, batch size 16
# per device. Logging happens every 50 steps and evaluation follows a
# step-based schedule (the captured results table has a row every 50 steps).
training_args = TrainingArguments(
    output_dir="out_files",
    per_device_train_batch_size=16,
    num_train_epochs=1,
    logging_steps=50,
    evaluation_strategy="steps",
)

# Sequence-classification model loaded from the same checkpoint as the
# tokenizer, configured with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Wire model, arguments, data splits, collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

Using the latest cached version of the module from C:\Users\admin\.cache\huggingface\modules\datasets_modules\metrics\accuracy\9756d5fa4a0f9da966341741fc3926eafdc604b8276add51d5abbaa8958a25f9 (last modified on Mon May 13 15:29:26 2024) since it couldn't be found locally at accuracy, or remotely on the Hugging Face Hub.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,0.5527,0.378965,0.83945
100,0.4332,0.401424,0.836009
150,0.31,0.330325,0.885321
200,0.3658,0.288142,0.883028
250,0.3089,0.449862,0.861239
300,0.3262,0.30504,0.87844
350,0.3259,0.292856,0.875
400,0.3119,0.261431,0.894495
450,0.2923,0.360653,0.858945
500,0.2974,0.272124,0.904817


TrainOutput(global_step=4210, training_loss=0.22395948551046593, metrics={'train_runtime': 591.8381, 'train_samples_per_second': 113.796, 'train_steps_per_second': 7.113, 'total_flos': 1216670228308500.0, 'train_loss': 0.22395948551046593, 'epoch': 1.0})