In [1]:
!pip install transformers datasets torch scikit-learn pandas



In [2]:
from datasets import load_dataset

# Fetch the SST-2 configuration of the GLUE benchmark; the returned object
# is indexed by split name.
dataset = load_dataset(path="glue", name="sst2")
train_data = dataset["train"]
# Show the first training record as the cell output.
train_data[0]


{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [3]:
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, set_seed
from transformers import DataCollatorWithPadding

# Seed the Python, NumPy and PyTorch RNGs *before* the model is built.
# The sequence-classification head on top of the pretrained encoder is newly
# (randomly) initialised, so without a seed here every run starts from
# different head weights and the reported accuracy is not reproducible.
# 42 matches the default `seed` of TrainingArguments.
SEED = 42
set_seed(SEED)

# Load tokenizer and model
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# num_labels=2: SST-2 labels are binary (0 / 1).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Tokenize data
def tokenize_function(example):
    """Tokenize the "sentence" field of `example` with the module-level tokenizer.

    Truncation is enabled; no padding is applied here.

    Args:
        example: Mapping with a "sentence" entry (a single string, or a list
            of strings when called through a batched `map`).

    Returns:
        Whatever the tokenizer call returns for those sentences.
    """
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True)

# Collator built from the same tokenizer; padding is left to it because
# tokenize_function only truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Apply the tokenizer to every split, processing examples in batches.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

# Define compute_metrics
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        dict with a single key, "accuracy", holding the fraction of examples
        whose arg-max class over the last axis of the logits equals the label.
    """
    scores, gold = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    return {"accuracy": accuracy}

# Training setup
training_args = TrainingArguments(
    # Where the Trainer writes its outputs.
    output_dir="./results",
    # Optimisation hyper-parameters.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device).
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch; log the training loss every 10 steps.
    eval_strategy="epoch",
    logging_steps=10,
)

# Wire the model, tokenized splits, collator and metric into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated on Trainer.__init__ (it emits a FutureWarning
    # and is slated for removal); the tokenizer is now passed as
    # `processing_class`.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1522,0.288708,0.899083


TrainOutput(global_step=4210, training_loss=0.2268380498376425, metrics={'train_runtime': 522.8602, 'train_samples_per_second': 128.809, 'train_steps_per_second': 8.052, 'total_flos': 612551760494100.0, 'train_loss': 0.2268380498376425, 'epoch': 1.0})

In [4]:
# Run evaluation with the Trainer's configured eval dataset and show the
# resulting metrics dict as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics


{'eval_loss': 0.2887084186077118,
 'eval_accuracy': 0.8990825688073395,
 'eval_runtime': 2.3823,
 'eval_samples_per_second': 366.027,
 'eval_steps_per_second': 23.087,
 'epoch': 1.0}

In [6]:
# Save the trained model
# Model weights/config and tokenizer files go into the same directory.
save_dir = "initialModel"
model.save_pretrained(save_dir)
# Last expression: the tuple of written tokenizer file paths is displayed.
tokenizer.save_pretrained(save_dir)


('initialModel\\tokenizer_config.json',
 'initialModel\\special_tokens_map.json',
 'initialModel\\vocab.txt',
 'initialModel\\added_tokens.json',
 'initialModel\\tokenizer.json')