### Tokenizer 使用

In [145]:
from transformers import AutoTokenizer

tokenizer_name = "uer/roberta-base-finetuned-jd-full-chinese"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

loading configuration file config.json from cache at C:\Users\jimmy/.cache\huggingface\hub\models--uer--roberta-base-finetuned-jd-full-chinese\snapshots\001c14a6ad8498465b0d7a2be435c30e856507a8\config.json
Model config BertConfig {
  "_name_or_path": "uer/roberta-base-finetuned-jd-full-chinese",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "star 1",
    "1": "star 2",
    "2": "star 3",
    "3": "star 4",
    "4": "star 5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "star 1": 0,
    "star 2": 1,
    "star 3": 2,
    "star 4": 3,
    "star 5": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers

In [146]:
string_arr = [
    "飲料很好喝",
    "環境很髒，還有老鼠"
]
inputs = tokenizer(string_arr, padding=True, truncation=True, return_tensors="pt")
print(inputs)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': tensor([[ 101, 7614, 3160, 2523, 1962, 1600,  102,    0,    0,    0,    0],
        [ 101, 4472, 1862, 2523, 7766, 8024, 6917, 3300, 5439, 7962,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


### Transformers model使用

In [147]:
from transformers import AutoModelForSequenceClassification

model_name = "uer/roberta-base-finetuned-jd-full-chinese"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
outputs = model(**inputs)

loading configuration file config.json from cache at C:\Users\jimmy/.cache\huggingface\hub\models--uer--roberta-base-finetuned-jd-full-chinese\snapshots\001c14a6ad8498465b0d7a2be435c30e856507a8\config.json
Model config BertConfig {
  "_name_or_path": "uer/roberta-base-finetuned-jd-full-chinese",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "star 1",
    "1": "star 2",
    "2": "star 3",
    "3": "star 4",
    "4": "star 5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "star 1": 0,
    "star 2": 1,
    "star 3": 2,
    "star 4": 3,
    "star 5": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers

In [148]:
# print output's logits
print(outputs.logits)

tensor([[-2.2921, -0.9786, -0.0671,  0.8580,  0.9130],
        [ 0.7283,  0.8047,  0.2435, -1.3489, -2.3895]],
       grad_fn=<AddmmBackward0>)


### 將結果過 softmax

In [149]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.0161, 0.0600, 0.1493, 0.3766, 0.3979],
        [0.3491, 0.3768, 0.2150, 0.0437, 0.0154]], grad_fn=<SoftmaxBackward0>)


### 查看model label

In [150]:
model.config.id2label

{0: 'star 1', 1: 'star 2', 2: 'star 3', 3: 'star 4', 4: 'star 5'}

### 以上程式碼簡化

In [151]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
classifier(
    [
    "飲料很好喝",
    "環境很髒，還有老鼠"
    ]
)

[{'label': 'star 5', 'score': 0.3979003429412842},
 {'label': 'star 2', 'score': 0.37679576873779297}]

# Tensorflow 模型加載

In [152]:
from transformers import TFAutoModel

tf_model = TFAutoModel.from_pretrained(model_name)

loading configuration file config.json from cache at C:\Users\jimmy/.cache\huggingface\hub\models--uer--roberta-base-finetuned-jd-full-chinese\snapshots\001c14a6ad8498465b0d7a2be435c30e856507a8\config.json
Model config BertConfig {
  "_name_or_path": "uer/roberta-base-finetuned-jd-full-chinese",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "star 1",
    "1": "star 2",
    "2": "star 3",
    "3": "star 4",
    "4": "star 5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "star 1": 0,
    "star 2": 1,
    "star 3": 2,
    "star 4": 3,
    "star 5": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers

### 載入dataset

In [153]:
import pandas as pd
from datasets import Dataset, DatasetDict

In [154]:
dict0 = {
    1:0,
    2:1,
    3:2,
    4:3,
    5:4
}

In [155]:
train_df = pd.read_csv('train.csv')
train_df = train_df.drop(columns=['Unnamed: 0'])
train_df = train_df[train_df['review_text'].str.len() < 500]
train_df['label'] = train_df['label'].map(dict0)

In [156]:
test_df = pd.read_csv('test.csv')
test_df = test_df.drop(columns=['Unnamed: 0'])
test_df = test_df[test_df['review_text'].str.len() < 500]
test_df['label'] = test_df['label'].map(dict0)

In [157]:
validation_df = pd.read_csv('validation.csv')
validation_df = validation_df.drop(columns=['Unnamed: 0'])
validation_df = validation_df[validation_df['review_text'].str.len() < 500]
validation_df['label'] = validation_df['label'].map(dict0)

In [158]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
validation_dataset = Dataset.from_pandas(validation_df)
my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"validation":validation_dataset ,"test":test_dataset})

In [159]:
my_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['review_text', 'label', '__index_level_0__'],
        num_rows: 496
    })
    validation: Dataset({
        features: ['review_text', 'label', '__index_level_0__'],
        num_rows: 101
    })
    test: Dataset({
        features: ['review_text', 'label', '__index_level_0__'],
        num_rows: 100
    })
})

In [160]:
my_dataset_dict.set_format(type="pandas")
df = my_dataset_dict["train"][:]
df.head()

Unnamed: 0,review_text,label,__index_level_0__
0,純素鹽酥雞很好吃！猴頭菇的口感很紮實\n\n,4,0
1,中規中矩，水果沙拉水果很多令人印象深刻\n\n,3,1
2,東西好吃\n\n,4,2
3,很喜歡餐點好吃\n\n,4,3
4,義大利麵醬汁很濃郁，堅果類給蠻多，但菇的存在感有點低~臭豆腐麵所謂的拉麵有點像泡麵，跟預期不...,2,4


In [161]:
labels = ["star 1","star 2","star 3","star 4","star 5"]

In [162]:
my_dataset_dict.reset_format()

### 分詞

In [163]:
from transformers import AutoTokenizer

model_name = "uer/roberta-base-finetuned-jd-full-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file config.json from cache at C:\Users\jimmy/.cache\huggingface\hub\models--uer--roberta-base-finetuned-jd-full-chinese\snapshots\001c14a6ad8498465b0d7a2be435c30e856507a8\config.json
Model config BertConfig {
  "_name_or_path": "uer/roberta-base-finetuned-jd-full-chinese",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "star 1",
    "1": "star 2",
    "2": "star 3",
    "3": "star 4",
    "4": "star 5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "star 1": 0,
    "star 2": 1,
    "star 3": 2,
    "star 4": 3,
    "star 5": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers

In [164]:
def tokenize(batch):
    return tokenizer(batch["review_text"], padding=True, truncation=True)

In [165]:
my_dataset_dict_encoded = my_dataset_dict.map(tokenize, batched=True, batch_size=None)
next(iter(my_dataset_dict_encoded["train"]))

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

{'review_text': '純素鹽酥雞很好吃！猴頭菇的口感很紮實\n\n',
 'label': 4,
 '__index_level_0__': 0,
 'input_ids': [101,
  5155,
  5162,
  7921,
  6989,
  7430,
  2523,
  1962,
  1391,
  8013,
  4347,
  7531,
  5823,
  4638,
  1366,
  2697,
  2523,
  5167,
  2179,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [166]:
valid_ds = my_dataset_dict["validation"]
valid_ds["label"][:]

[4,
 4,
 4,
 4,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4]

### 訓練模型

In [167]:
from transformers import AutoModelForSequenceClassification
import torch

num_labels = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = (AutoModelForSequenceClassification
        .from_pretrained(model_name, num_labels=num_labels
        ,id2label={"1": "star 1",
                    "2": "star 2",
                    "3": "star 3",
                    "4": "star 4",
                    "5": "star 5"}
        ,label2id={"star 1": "1",
                    "star 2": "2",
                    "star 3": "3",
                    "star 4": "4",
                    "star 5": "5" })
         .to(device))

loading configuration file config.json from cache at C:\Users\jimmy/.cache\huggingface\hub\models--uer--roberta-base-finetuned-jd-full-chinese\snapshots\001c14a6ad8498465b0d7a2be435c30e856507a8\config.json
Model config BertConfig {
  "_name_or_path": "uer/roberta-base-finetuned-jd-full-chinese",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "1": "star 1",
    "2": "star 2",
    "3": "star 3",
    "4": "star 4",
    "5": "star 5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "star 1": "1",
    "star 2": "2",
    "star 3": "3",
    "star 4": "4",
    "star 5": "5"
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tr

In [168]:
from transformers import Trainer, TrainingArguments

batch_size = 32
logging_steps = len(my_dataset_dict_encoded["train"]) // batch_size
model_name = "test_model"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=1,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  label_names= labels,
#                                   report_to = "mlflow",
                                  logging_steps=logging_steps)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [169]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [None]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=my_dataset_dict_encoded["train"],
                  eval_dataset=my_dataset_dict_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, review_text. If __index_level_0__, review_text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 496
  Num Epochs = 40
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 320
  Number of trainable parameters = 102271493
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
