In [None]:
# Install the Hugging Face libraries this notebook needs (run once per Colab session).
# NOTE(review): versions are not pinned, so a fresh session may resolve different
# releases than the ones listed in the saved output of this cell.
!pip install transformers sentencepiece datasets

# Mount Google Drive; the dataset CSV and model checkpoints used later live under it.
from google.colab import drive
drive.mount('/content/gdrive/')

# Switch the working directory to the project folder on Drive.
import os
os.chdir('/content/gdrive/MyDrive/Individual Project')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.7 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 43.8 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 50.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 6.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 5.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp3

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import AutoTokenizer, AutoModelWithLMHead
from tokenizers import BertWordPieceTokenizer

# Start building classifier

In [None]:
from transformers import AutoTokenizer

# Hub id of the pretrained checkpoint whose tokenizer is used for all text below.
model_checkpoint = "hfl/chinese-xlnet-base"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/675k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/202 [00:00<?, ?B/s]

In [None]:
# Sanity check: tokenize one short sentence and display the resulting encoding
# (input_ids, token_type_ids and attention_mask).
encoded_str = tokenizer("我而家好嬲")
encoded_str

{'input_ids': [7397, 52, 95, 453, 28783, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [None]:
from transformers import AutoModelForSequenceClassification

# Number of classes the classification head predicts.
num_labels = 4
# Load a sequence-classification model from a checkpoint directory on Drive.
# NOTE(review): the directory is named "XLNetLarge", but the saved log of this cell
# reports weights from hfl/chinese-xlnet-base — confirm which checkpoint is intended.
model = AutoModelForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/Individual Project/EmotionClassification/XLNet/XLNetLarge", num_labels=num_labels)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at hfl/chinese-xlnet-base were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-xlnet-base and are newly initialized: ['logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for p

In [None]:
import torch

from transformers import TrainingArguments


# Training configuration for the Hugging Face Trainer, gathered in a single
# mapping (grouped by concern) and unpacked into TrainingArguments.
training_options = {
    # Output location on Drive
    "output_dir": "/content/gdrive/MyDrive/Individual Project/Model",
    "overwrite_output_dir": False,
    # Optimisation
    "num_train_epochs": 10,
    "learning_rate": 1e-5,
    "adam_epsilon": 1e-06,
    "per_device_train_batch_size": 10,
    "per_device_eval_batch_size": 10,
    # Evaluation and checkpointing cadence
    "evaluation_strategy": "epoch",
    "save_strategy": 'epoch',
    "eval_steps": 500,
    "save_total_limit": 2,
    # Progress bars and logging
    "disable_tqdm": False,
    "logging_steps": 500,
    "log_level": 'error',
    # Best-model selection at the end of training
    "load_best_model_at_end": True,
    "metric_for_best_model": 'accuracy',
}
args = TrainingArguments(**training_options)


def compute_metrics(p):
    """Compute evaluation metrics for the Trainer.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds one row of
            per-class scores per example; ``labels`` holds the true class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 are macro-averaged over the classes.
    """
    pred, labels = p
    # Turn per-class scores into predicted class ids.
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # Macro averaging is used on purpose: for single-label multiclass data the
    # micro average of precision/recall/F1 is always identical to accuracy, so
    # it adds no information. zero_division=0 keeps the score at 0 (without a
    # warning) for a class that is never predicted.
    recall = recall_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Load the merged emotion dataset; the "content" and "label" columns are used below.
data = pd.read_csv("/content/gdrive/MyDrive/Individual Project/Merge_CantoneseEmotion_ds.csv")

# ----- 1. Preprocess data -----#
# Pull the texts and their labels out as plain Python lists.
X = list(data["content"])
y = list(data["label"])
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# Target proportions of the three splits (train / validation / test).
train_ratio = 0.75
validation_ratio = 0.10
test_ratio = 0.15

# First split: train keeps 75% of the entire data set. The remaining 25% is held
# temporarily in X_test / y_test and is divided again in the next step.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_ratio, random_state=0)

# Second split of the held-out 25% with test_size = 0.15 / (0.15 + 0.10) = 0.6:
# test is now 15% of the initial data set, validation is now 10% of it.
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=0) 

# Tokenize each split separately, with padding enabled and truncation to at
# most 512 tokens.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values; it must contain "input_ids", which defines the length.
        labels: Optional per-example labels (e.g. a list or a NumPy array).
            When None or empty, items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one entry per feature."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of bare truthiness: `if labels:`
        # raises ValueError for a NumPy array with more than one element.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" feature."""
        return len(self.encodings["input_ids"])

# Wrap each tokenized split together with its labels.
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)
test_dataset=Dataset(X_test_tokenized, y_test)

# Trainer that fits `model` on the training split and evaluates on the validation
# split with compute_metrics. Early stopping is currently disabled (the callback
# lines below are commented out).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    # callbacks=checkpoint_callback,
)

# Fine-tune the pre-trained model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.65236,0.810185,0.810185,0.810185,0.810185
2,No log,0.691501,0.833333,0.833333,0.833333,0.833333
3,0.376200,0.845347,0.828704,0.828704,0.828704,0.828704
4,0.376200,0.958143,0.833333,0.833333,0.833333,0.833333
5,0.376200,0.954791,0.833333,0.833333,0.833333,0.833333
6,0.174400,1.00623,0.828704,0.828704,0.828704,0.828704
7,0.174400,1.112194,0.824074,0.824074,0.824074,0.824074
8,0.174400,1.098494,0.833333,0.833333,0.833333,0.833333
9,0.141400,1.068518,0.828704,0.828704,0.828704,0.828704
10,0.141400,1.06515,0.833333,0.833333,0.833333,0.833333


TrainOutput(global_step=1740, training_loss=0.21252758201511426, metrics={'train_runtime': 298.8497, 'train_samples_per_second': 57.956, 'train_steps_per_second': 5.822, 'total_flos': 414396398034240.0, 'train_loss': 0.21252758201511426, 'epoch': 10.0})

In [None]:
# Display the tokenizer currently bound in the kernel.
# NOTE(review): the saved output of this cell reports
# name_or_path='hfl/chinese-roberta-wwm-ext', not the "hfl/chinese-xlnet-base"
# checkpoint loaded earlier in this notebook. Cells may have been run out of
# order — confirm the tokenizer matches the XLNet model before trusting results.
tokenizer

PreTrainedTokenizerFast(name_or_path='hfl/chinese-roberta-wwm-ext', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
# Persist the trainer's model to Drive; the prediction cell reloads it from this path.
trainer.save_model("/content/gdrive/MyDrive/Individual Project/EmotionClassification/XLNet/XLNetMerge2")

In [None]:
from transformers import XLNetForSequenceClassification

# ----- 3. Predict -----#

# Reload the fine-tuned model from the directory it was saved to on Drive.
model_path = "/content/gdrive/MyDrive/Individual Project/EmotionClassification/XLNet/XLNetMerge2"
model = XLNetForSequenceClassification.from_pretrained(model_path, num_labels=4)

# Wrap the model in a Trainer with default arguments, used here only for inference.
test_trainer = Trainer(model)

# Run the model over the test split. predict() returns three values; only the
# first (the raw per-class scores) is kept.
raw_pred, _, _ = test_trainer.predict(test_dataset)

# raw_pred, _ = model.predictz(test_dataset)
# Convert the raw scores to predicted class ids (arg-max over the class axis).
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 217
  Batch size = 8


In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 and support on the test split, to 4 decimals.
print(metrics.classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8846    0.7797    0.8288        59
           1     0.8679    0.8070    0.8364        57
           2     0.7692    0.8333    0.8000        48
           3     0.8500    0.9623    0.9027        53

    accuracy                         0.8433       217
   macro avg     0.8429    0.8456    0.8420       217
weighted avg     0.8463    0.8433    0.8425       217



In [None]:
# Additionally store only the weights (state_dict) of `model` as a .pt file on Drive.
torch.save(model.state_dict(), '/content/gdrive/MyDrive/Individual Project/EmotionClassification/XLNet/BestXLNetEmotion/BestXLNet.pt')

In [None]:
import torch

# ----- 3. Predict a single sentence -----#
# Emotion label for each class id; the list index is the class id.
EMOTION_LABELS = ["唔開心", "嬲", "擔心", "開心"]

text = "唔錯"

# Use a dedicated name for this encoding so the test-set encodings held in
# X_test_tokenized are not overwritten by a single sentence.
text_tokenized = tokenizer(text, padding=True, truncation=True, max_length=512)

# Put the model in inference mode (no dropout) and find out where it lives:
# a Trainer typically moves the model to the GPU when one is available, so the
# inputs must be sent to the same device.
model.eval()
device = next(model.parameters()).device

# Add a batch dimension of size 1 to each input.
b_input_ids = torch.tensor(text_tokenized['input_ids']).unsqueeze(0).to(device)
b_attention_mask = torch.tensor(text_tokenized['attention_mask']).unsqueeze(0).to(device)

with torch.no_grad():
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)

# The first output element holds the logits; bring them back to the CPU before
# converting to NumPy (a CUDA tensor cannot be converted directly).
logits = outputs[0].cpu()
print(logits)
logits = logits.detach().numpy()
predict_label = np.argmax(logits, axis=1).flatten()
print(predict_label)

# One sentence was submitted, so the first entry is the predicted class id.
label = EMOTION_LABELS[int(predict_label[0])]
print(label)

tensor([[-0.2236, -0.4877, -1.6008,  2.4872]])
[3]
開心
