In [1]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

Finetuning BERT(`distilbert-base-cased`) for sentiment analysis

In [2]:
def truncate(example):
    return {
        'text': ' '.join(example['text'].split()[:50]),
        'label': example['label']
    }

In [3]:
from datasets import load_dataset, DatasetDict

imdb = load_dataset('stanfordnlp/imdb')

imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
imdb = DatasetDict(
    train=imdb['train'].shuffle(5525).map(truncate),
    val=imdb['test'].shuffle(5525).map(truncate)
)
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [5]:
print(imdb['train'][0])

{'text': "It seems Hal Hartley's films are kind of hit or miss with most audiences. This film will be no exception to that rule. Fay Grim acts as a sequel to Hartley's 'Henry Foole' from 1998. The focus this time is on Henry's ex wife (played to perfection by the always", 'label': 1}


In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased", trust_remote_code=True)

In [7]:
dataset = imdb.map(
    lambda example: tokenizer(example['text'], padding=True, truncation=True),
    batched=True,
    batch_size=32
)
print(dataset)
print(dataset['train'][0])
print('-' * 100)

dataset = dataset.remove_columns(['text'])
print(dataset)
print('-' * 100)

dataset = dataset.rename_column('label', 'labels')
print(dataset)
print(dataset['train'][0])

dataset.set_format('torch')

from torch.utils.data import DataLoader

train_loader = DataLoader(dataset['train'], batch_size=32)
eval_loader = DataLoader(dataset['val'], batch_size=32)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
})
{'text': "It seems Hal Hartley's films are kind of hit or miss with most audiences. This film will be no exception to that rule. Fay Grim acts as a sequel to Hartley's 'Henry Foole' from 1998. The focus this time is on Henry's ex wife (played to perfection by the always", 'label': 1, 'input_ids': [101, 1135, 3093, 12193, 23053, 112, 188, 2441, 1132, 1912, 1104, 1855, 1137, 5529, 1114, 1211, 9569, 119, 1188, 1273, 1209, 1129, 1185, 5856, 1106, 1115, 3013, 119, 26630, 144, 10205, 4096, 1112, 170, 8047, 1106, 23053, 112, 188, 112, 1985, 21935, 1162, 112, 1121, 1772, 119, 1109, 2817, 1142, 1159, 1110, 1113, 1985, 112, 188, 4252, 1676, 113, 1307, 1106, 17900, 1118, 1103, 1579, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import DistilBertForSequenceClassification

from tqdm.notebook import tqdm

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)
model.to('cuda')

num_epochs = 3
num_training_steps = len(train_loader)
optimizer = AdamW(model.parameters(), lr=2e-3, weight_decay=0.01)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
import torch
import os

best_val_loss = float('inf')
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    print(f'epoch={epoch}')
    # training
    model.train()
    for batch_id, batch in enumerate(train_loader):
        # batch = ([text1, text2], [0, 1])
        batch = {k: v.to('cuda') for k, v in batch.items()}
        output = model(**batch)

        optimizer.zero_grad()
        output.loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)

    # validation
    model.eval()
    total_loss = 0
    for batch_id, batch in enumerate(eval_loader):
        batch = {k: v.to('cuda') for k, v in batch.items()}
        with torch.no_grad():
            output = model(**batch)
        total_loss += output.loss.item()

    avg_val_loss = total_loss / len(eval_loader)
    print(f'validation loss={avg_val_loss}')

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # 检查目录是否存在，不存在则创建
        if not os.path.exists('checkpoints/'):
            os.makedirs('checkpoints/')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': best_val_loss,
        }, f"checkpoints/epoch_{epoch}.pt")

  0%|          | 0/782 [00:00<?, ?it/s]

epoch=0
validation loss=0.39201114532511555
epoch=1
validation loss=0.39201114532511555
epoch=2
validation loss=0.39201114532511555


In [32]:
from transformers import AutoModelForSequenceClassification

# To load our saved model, we can pass the path to the checkpoint into the `from_pretrained` method:
test_str = "wtf"

# 加载保存的模型
checkpoint = torch.load("checkpoints/epoch_0.pt", map_location='cuda')
# 从预训练模型初始化
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-cased")
# 将保存的 state_dict 加载到模型中
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()


model_inputs = tokenizer(test_str, return_tensors="pt")
print(model(**model_inputs).logits)
print()

prediction = torch.argmax(model(**model_inputs).logits)
print(prediction)
print()

print(["NEGATIVE", "POSITIVE"][prediction])

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[-0.0656, -0.1659]], grad_fn=<AddmmBackward0>)

tensor(0)

NEGATIVE


Use `Trainer`

In [43]:
from transformers import TrainingArguments, Trainer
import numpy as np

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)

arguments = TrainingArguments(
    output_dir="sample_hf_trainer",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    learning_rate=2e-5,
    load_best_model_at_end=True,
    seed=5525
)


def compute_metrics(eval_pred):
    """Called at the end of validation. Gives accuracy"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # calculates the accuracy
    return {"accuracy": np.mean(predictions == labels)}


trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'], # change to test when you do your final evaluation!
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# train the model
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4733,0.411927,0.80892
2,0.3136,0.39321,0.82556
3,0.2427,0.431751,0.82504


TrainOutput(global_step=2346, training_loss=0.3349299353610118, metrics={'train_runtime': 146.1567, 'train_samples_per_second': 513.148, 'train_steps_per_second': 16.051, 'total_flos': 2532991921825536.0, 'train_loss': 0.3349299353610118, 'epoch': 3.0})

In [51]:
test_encoding = tokenizer("im sad", padding=True, truncation=True, return_tensors="pt")
# 将字典包装到一个列表中
test_dataset = [ {k: v.squeeze(0) for k, v in test_encoding.items()} ]
prediction_output = trainer.predict(test_dataset)
print(prediction_output)
print()

predicted_indices = np.argmax(prediction_output.predictions, axis=-1)
# 如果只预测一个样本，取第一个索引
prediction = predicted_indices[0]
print(["NEGATIVE", "POSITIVE"][prediction])

PredictionOutput(predictions=array([[ 0.15588348, -0.03970203]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.0156, 'test_samples_per_second': 64.124, 'test_steps_per_second': 64.124})

NEGATIVE
