In [1]:
import random
import numpy as np
import torch
from transformers import set_seed

In [2]:
seed = random.randrange(2**32)
print(f"🔢 Using random seed: {seed}")

# Seed all RNGs
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
set_seed(seed)

🔢 Using random seed: 954644565


In [3]:
!pip install transformers datasets evaluate box

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting box
  Downloading box-0.1.5-py3-none-any.whl.metadata (1.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting columnar==1.3.1 (from box)
  Downloading Columnar-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting executing==0.8.2 (from box)
  Downloading executing-0.8.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting loguru (from box)
  

In [4]:
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import numpy as np

In [7]:
args = {
    "model_name_or_path":          "distilbert-base-uncased",
    "output_dir":                  "./output/imdb",
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size":  16,
    "learning_rate":               2e-5,
    "num_train_epochs":            2,
    "weight_decay":                0.01,
    "logging_steps":               200,
    "save_steps":                  500,
    "seed":                        seed,
}

In [8]:
raw_datasets = load_dataset("imdb")       # train/test split
accuracy_metric = evaluate.load("accuracy")

tokenizer = AutoTokenizer.from_pretrained(
    args["model_name_or_path"], use_fast=True
)

def preprocess(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length"
    )

train_ds = raw_datasets["train"].map(preprocess, batched=True, remove_columns=["text"])
eval_ds  = raw_datasets["test"].map(preprocess,  batched=True, remove_columns=["text"])


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(
    args["model_name_or_path"], num_labels=2
)
data_collator = DataCollatorWithPadding(tokenizer)

training_args = TrainingArguments(
    output_dir=args["output_dir"],
    seed=args["seed"],
    per_device_train_batch_size=args["per_device_train_batch_size"],
    per_device_eval_batch_size=args["per_device_eval_batch_size"],
    learning_rate=args["learning_rate"],
    num_train_epochs=args["num_train_epochs"],
    logging_steps=args["logging_steps"],
    save_steps=args["save_steps"],
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    overwrite_output_dir=True,
)

def compute_metrics(p):
    logits, labels = p
    preds = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=preds, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
trainer.train()
metrics = trainer.evaluate()
print("📊 Test accuracy:", metrics["eval_accuracy"])

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msunnysolomon8880[0m ([33msunnysolomon8880-cornell-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
