In [2]:
# Set the proxy environment variables to a proxy listening on 127.0.0.1:7890.
# NOTE(review): this is a machine-specific setting, presumably needed so that
# model/tokenizer downloads can reach the network from this host; adjust or
# remove these lines when running elsewhere.
%env ALL_PROXY=http://127.0.0.1:7890
%env HTTP_PROXY=http://127.0.0.1:7890
%env HTTPS_PROXY=http://127.0.0.1:7890

env: ALL_PROXY=http://127.0.0.1:7890
env: HTTP_PROXY=http://127.0.0.1:7890
env: HTTPS_PROXY=http://127.0.0.1:7890


In [3]:
# Point HF_HUB_CACHE at a directory relative to the notebook's working directory.
# NOTE(review): presumably this has to run before `transformers`/`datasets` are
# imported for the Hub client to pick it up; confirm before reordering cells.
%env HF_HUB_CACHE=./data/hf_cache

env: HF_HUB_CACHE=./data/hf_cache


# Text classification example

## Step 01. Import related packages

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Step 02. Load data

In [5]:
# Read the hotel-review CSV as a single 'train' split and drop every row whose
# 'review' field is missing, so the tokenizer never receives None.
dataset = load_dataset(
    'csv',
    data_files='./data/train/ChnSentiCorp_htl_all.csv',
    split='train',
).filter(lambda row: row['review'] is not None)
dataset

Generating train split: 7766 examples [00:00, 94445.87 examples/s]
Filter: 100%|██████████| 7766/7766 [00:00<00:00, 173167.42 examples/s]


Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step 03. Split dataset

In [6]:
# Hold out 20% of the examples as the validation ('test') split.
# A fixed seed keeps the split identical across kernel restarts, so validation
# accuracy is comparable between runs instead of being measured on a different
# random subset each time.
splited_ds_dict = dataset.train_test_split(test_size=0.2, seed=42)
splited_ds_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 1553
    })
})

## Step 04. Create Dataloaders

In [7]:
import torch

tokenizer = AutoTokenizer.from_pretrained('hfl/rbt3')


def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch as passed by ``map(..., batched=True)``; its
            'review' and 'label' columns are read.

    Returns:
        The tokenizer output for the reviews (truncated to at most 128
        tokens, not padded) with the batch's labels stored under 'labels'.
    """
    encoded = tokenizer(examples['review'], truncation=True, max_length=128)
    # The labels are stored under 'labels', the key the training and
    # evaluation loops later read from each batch.
    encoded['labels'] = examples['label']
    return encoded


# Tokenize both splits and drop the original columns so that only the
# tokenizer outputs and 'labels' remain.
tokenized_ds = splited_ds_dict.map(
    process_function,
    batched=True,
    remove_columns=splited_ds_dict['train'].column_names,
)
tokenized_ds

Map: 100%|██████████| 6212/6212 [00:00<00:00, 7552.06 examples/s] 
Map: 100%|██████████| 1553/1553 [00:00<00:00, 9850.65 examples/s] 


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1553
    })
})

In [8]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

train_set = tokenized_ds['train']
valid_set = tokenized_ds['test']

# Padding is applied per batch by the collator; only the training loader
# shuffles its examples.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=32,
    shuffle=True,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
)
valid_loader = DataLoader(
    dataset=valid_set,
    batch_size=32,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
)

In [9]:
# Peek at the first collated validation batch to sanity-check the tensors.
next(iter(valid_loader))

{'input_ids': tensor([[ 101, 4384, 1862,  ...,    0,    0,    0],
        [ 101, 1114, 1906,  ...,  102,    0,    0],
        [ 101, 2791, 7313,  ..., 2523, 7770,  102],
        ...,
        [ 101, 3683, 6772,  ...,    0,    0,    0],
        [ 101, 5401,  796,  ...,  679, 6639,  102],
        [ 101, 4796, 6118,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
        0, 1, 1, 1, 1, 1, 1, 0])}

## Step 05. Create the model & optimizer

In [10]:
from torch.optim import Adam

# Load the pretrained checkpoint with a sequence-classification head, then
# place it on the GPU whenever one is available (it stays on the CPU otherwise).
model = AutoModelForSequenceClassification.from_pretrained('hfl/rbt3')
model = model.cuda() if torch.cuda.is_available() else model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Optimize every model parameter with Adam at a learning rate of 2e-5.
optimizer = Adam(params=model.parameters(), lr=2e-5)

## Step 07. Define the training and validation loops

In [12]:
def evaluate():
    """Compute the model's accuracy over the validation loader.

    Puts the global ``model`` in eval mode, runs every batch of
    ``valid_loader`` under ``torch.inference_mode()`` and counts predictions
    (argmax over the logits) that equal the batch's 'labels'.

    Returns:
        The number of correct predictions divided by ``len(valid_set)``.
    """
    model.eval()
    use_gpu = torch.cuda.is_available()
    correct = 0
    with torch.inference_mode():
        for batch in valid_loader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim=-1)
            matches = predictions.long() == batch['labels'].long()
            correct += matches.float().sum()
    return correct / len(valid_set)


def train(epoch=3, log_step=100):
    """Fine-tune the global ``model`` and report validation accuracy per epoch.

    Args:
        epoch: Number of full passes over ``train_loader``.
        log_step: The training loss is printed on the first step and on every
            step whose running index is a multiple of this value.

    The step counter runs across epochs (it is not reset at epoch
    boundaries). After each epoch ``evaluate()`` is called and its result
    printed.
    """
    use_gpu = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        # evaluate() leaves the model in eval mode, so switch back every epoch.
        model.train()
        for batch in train_loader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            should_log = global_step == 0 or global_step % log_step == 0
            if should_log:
                print(f"ep: {ep}, Step {global_step}: loss = {loss.item()}")
            global_step += 1
        print(f"ep: {ep}, Validation Accuracy = {evaluate()}")

## Step 08. Train the model

In [13]:
# Fine-tune for 10 epochs; the training loss is printed every 100 steps and
# the validation accuracy after each epoch.
train(epoch=10, log_step=100)

ep: 0, Step 0: loss = 0.839937150478363
ep: 0, Step 100: loss = 0.2079557627439499
ep: 0, Validation Accuracy = 0.8886027336120605
ep: 1, Step 200: loss = 0.4322376847267151
ep: 1, Step 300: loss = 0.18867860734462738
ep: 1, Validation Accuracy = 0.8982614874839783
ep: 2, Step 400: loss = 0.21903608739376068
ep: 2, Step 500: loss = 0.17979544401168823
ep: 2, Validation Accuracy = 0.9104958772659302
ep: 3, Step 600: loss = 0.11133500188589096
ep: 3, Step 700: loss = 0.1292022466659546
ep: 3, Validation Accuracy = 0.9040567278862
ep: 4, Step 800: loss = 0.05941382050514221
ep: 4, Step 900: loss = 0.12862108647823334
ep: 4, Validation Accuracy = 0.905344545841217
ep: 5, Step 1000: loss = 0.06392557919025421
ep: 5, Step 1100: loss = 0.011220557615160942
ep: 5, Validation Accuracy = 0.8989053964614868
ep: 6, Step 1200: loss = 0.06360702216625214
ep: 6, Step 1300: loss = 0.10478026419878006
ep: 6, Validation Accuracy = 0.9001932144165039
ep: 7, Step 1400: loss = 0.040136899799108505
ep: 7, S

## Step 09. Model prediction

In [21]:
# Alternative inputs to try:
# sentence = "I Think this hotel is good!"
# sentence = "I think this hotel is terrible..."
sentence = "这个酒店好！！"
# Display names for the two class ids produced by the classifier.
id2label = {0: 'bad', 1: 'good'}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sentence, max_length=128, padding='max_length', truncation=True, return_tensors='pt')
    # Move the inputs to whichever device the model is on rather than calling
    # .cuda() unconditionally, so this cell also runs on a CPU-only machine
    # (the model itself is only moved to the GPU when one is available).
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"Input: {sentence}\nModel Prediction: {id2label.get(pred.item())}")

Input: 这个酒店好！！
Model Prediction: good


In [22]:
from transformers import pipeline

# Store the display names on the model config so the pipeline reports
# 'bad'/'good' as its labels, then wrap the fine-tuned model and its tokenizer
# in a text-classification pipeline.
model.config.id2label = id2label
pipe = pipeline(
    task='text-classification',
    model=model,
    tokenizer=tokenizer,
)

Device set to use cuda:0


In [23]:
# Classify the same example sentence through the pipeline; the result lists
# the predicted label together with its score.
pipe(sentence)

[{'label': 'good', 'score': 0.9956009387969971}]