In [1]:
# Point the kernel's proxy environment variables at a proxy on 127.0.0.1:7890.
# NOTE(review): machine-specific setting — presumably needed to reach the Hugging Face Hub
# from this network; skip or edit this cell if no proxy is listening on that port.
%env ALL_PROXY=http://127.0.0.1:7890
%env HTTP_PROXY=http://127.0.0.1:7890
%env HTTPS_PROXY=http://127.0.0.1:7890

env: ALL_PROXY=http://127.0.0.1:7890
env: HTTP_PROXY=http://127.0.0.1:7890
env: HTTPS_PROXY=http://127.0.0.1:7890


In [2]:
# Set the Hugging Face Hub cache location to a directory relative to the notebook's
# working directory. Must run before anything is downloaded from the Hub.
%env HF_HUB_CACHE=./data/hf_cache

env: HF_HUB_CACHE=./data/hf_cache


# Text classification example

## Step 01. Import the required packages

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Step 02. Load data

In [4]:
# Read the hotel-review CSV as a single 'train' split and drop rows whose review is missing.
dataset = (
    load_dataset('csv', data_files='./data/train/ChnSentiCorp_htl_all.csv', split='train')
    .filter(lambda row: row['review'] is not None)
)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step 03. Split dataset

In [5]:
# Hold out 20% of the rows for validation. The split shuffles the data, so a fixed seed
# is passed to keep the partition (and therefore the reported metrics) reproducible
# across kernel restarts.
splited_ds_dict = dataset.train_test_split(test_size=0.2, seed=42)
splited_ds_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 1553
    })
})

## Step 04. Tokenize the data and create DataLoaders

In [6]:
import torch

tokenizer = AutoTokenizer.from_pretrained('hfl/rbt3')


def process_function(examples):
    """Tokenize a batch of reviews and attach the matching labels.

    Args:
        examples: Batch handed over by ``map(..., batched=True)``; its
            'review' and 'label' entries are read.

    Returns:
        The tokenizer output for the reviews (truncated to at most 128
        tokens) with an extra 'labels' entry copied from ``examples['label']``.
    """
    encoded = tokenizer(examples['review'], truncation=True, max_length=128)
    encoded['labels'] = examples['label']
    return encoded


# Tokenize both splits and drop the original columns so only model inputs remain.
tokenized_ds = splited_ds_dict.map(
    process_function,
    batched=True,
    remove_columns=splited_ds_dict['train'].column_names,
)
tokenized_ds

Map: 100%|██████████| 6212/6212 [00:00<00:00, 7288.59 examples/s]
Map: 100%|██████████| 1553/1553 [00:00<00:00, 9887.15 examples/s] 


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1553
    })
})

In [7]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

train_set = tokenized_ds['train']
valid_set = tokenized_ds['test']

# Examples are not padded at tokenization time, so each batch is padded by the collator.
# Only the training loader shuffles.
train_loader = DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    collate_fn=DataCollatorWithPadding(tokenizer),
)
valid_loader = DataLoader(
    valid_set,
    batch_size=32,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer),
)

In [8]:
# Peek at the first collated validation batch.
next(iter(valid_loader))

{'input_ids': tensor([[ 101, 6983, 2421,  ..., 6586, 4638,  102],
        [ 101, 2791, 7313,  ...,    0,    0,    0],
        [ 101,  868,  711,  ..., 3121, 6822,  102],
        ...,
        [ 101, 7674, 1044,  ..., 7716, 3446,  102],
        [ 101, 2487, 4164,  ..., 1456, 7309,  102],
        [ 101, 5811, 3805,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0])}

## Step 05. Create the model & optimizer

In [9]:
from torch.optim import Adam

model = AutoModelForSequenceClassification.from_pretrained('hfl/rbt3')

# Use the GPU when one is available; otherwise keep the model where it was loaded.
model = model.cuda() if torch.cuda.is_available() else model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Adam over all model parameters with a learning rate of 2e-5.
optimizer = Adam(params=model.parameters(), lr=2e-5)

## Step 07. Define the evaluation and training loops

In [11]:
# Import the metrics library under an alias: a function named `evaluate` is defined
# later in this notebook, and a plain `import evaluate` would fight over the same
# global name (re-running this cell would replace the function with the module).
import evaluate as hf_evaluate

# Combined metric that reports accuracy and F1 from a single compute() call.
clf_metrics = hf_evaluate.combine(['accuracy', 'f1'])

In [12]:
def evaluate():
    """Score the global ``model`` on the validation set.

    Switches the model to eval mode, runs it over every batch of
    ``valid_loader`` under ``torch.inference_mode()``, feeds the argmax
    predictions and the reference labels into ``clf_metrics``, and returns
    the result of ``clf_metrics.compute()``.

    Note: defining this function binds the global name ``evaluate``, so a
    module imported under that exact name is no longer reachable through it.
    """
    model.eval()
    use_gpu = torch.cuda.is_available()
    with torch.inference_mode():
        for batch in valid_loader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim=-1)
            clf_metrics.add_batch(
                predictions=predictions.long(),
                references=batch['labels'].long(),
            )
    return clf_metrics.compute()


def train(epoch=3, log_step=100):
    """Fine-tune the global ``model`` and validate after every epoch.

    Args:
        epoch: Number of passes over ``train_loader``.
        log_step: The loss is printed at step 0 and whenever the running
            step counter is a multiple of this value.
    """
    use_gpu = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        # evaluate() puts the model in eval mode, so training mode is restored each epoch.
        model.train()
        for batch in train_loader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            should_log = global_step == 0 or global_step % log_step == 0
            if should_log:
                print(f"ep: {ep}, Step {global_step}: loss = {output.loss.item()}")
            global_step += 1
        # Validation metrics at the end of the epoch.
        clf = evaluate()
        print(f"ep: {ep}, {clf}")

## Step 08. Train the model

In [13]:
train(epoch=3, log_step=100)

ep: 0, Step 0: loss = 0.9596479535102844
ep: 0, Step 100: loss = 0.42726534605026245
ep: 0, {'accuracy': 0.8834513844172569, 'f1': 0.9114914425427872}
ep: 1, Step 200: loss = 0.38055431842803955
ep: 1, Step 300: loss = 0.21395260095596313
ep: 1, {'accuracy': 0.8950418544752092, 'f1': 0.9226388229710489}
ep: 2, Step 400: loss = 0.12374890595674515
ep: 2, Step 500: loss = 0.16590812802314758
ep: 2, {'accuracy': 0.8860270444301352, 'f1': 0.9193621867881548}


## Step 09. Model prediction

In [14]:
# Alternative inputs to try:
# sentence = "I Think this hotel is good!"
# sentence = "I think this hotel is terrible..."
sentence = "这个酒店好！！"
id2label = {0: 'bad', 1: 'good'}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sentence, max_length=128, padding='max_length', truncation=True, return_tensors='pt')
    # Send the inputs to whatever device the model lives on instead of calling .cuda()
    # unconditionally, so this cell also runs on CPU-only machines.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"Input: {sentence}\nModel Prediction: {id2label.get(pred.item())}")

Input: 这个酒店好！！
Model Prediction: good


In [15]:
from transformers import pipeline

# Store the readable label names on the model config before building the pipeline.
model.config.id2label = id2label
pipe = pipeline(
    task='text-classification',
    model=model,
    tokenizer=tokenizer,
)

Device set to use cuda:0


In [16]:
# Classify the same sentence through the pipeline.
pipe(sentence)

[{'label': 'good', 'score': 0.8459985852241516}]