In [1]:
!pip install transformers sentencepiece datasets

from google.colab import drive
drive.mount('/content/gdrive/')

import os
os.chdir('/content/gdrive/MyDrive/Individual Project')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 13.4 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 52.0 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 60.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 60.4 MB/s 


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import AutoTokenizer, AutoModelWithLMHead
from tokenizers import BertWordPieceTokenizer

In [None]:
#create a dictionary which associates each string label to an integer value
labels = [ "no", "weak", "strong"]
label2int = dict(zip(labels, list(range(len(labels)))))

# Start building classifier

In [3]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/263k [00:00<?, ?B/s]

In [None]:
encoded_str = tokenizer("我而家好嬲")
encoded_str

{'input_ids': [101, 2769, 5445, 2157, 1962, 100, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [4]:
from transformers import AutoModelForSequenceClassification

num_labels = 4
model = AutoModelForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/Individual Project/EmotionClassification/BERT/BERTLarge", num_labels=num_labels)

In [5]:
import torch

from transformers import TrainingArguments


args = TrainingArguments(
    output_dir="/content/gdrive/MyDrive/Individual Project/Model",
    num_train_epochs=10,
    learning_rate =1e-5,
    adam_epsilon=1e-06,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    disable_tqdm=False,
    eval_steps=500,
    logging_steps=500,
    log_level='error',
    save_total_limit = 2,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    overwrite_output_dir=False,
)


def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='micro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

data = pd.read_csv("/content/gdrive/MyDrive/Individual Project/Merge_CantoneseEmotion_ds.csv")

# ----- 1. Preprocess data -----#
# Preprocess data
X = list(data["content"])
y = list(data["label"])
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_ratio, random_state=0)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=0) 

X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)
test_dataset=Dataset(X_test_tokenized, y_test)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    # callbacks=checkpoint_callback,
)

# Train pre-trained model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.509389,0.828704,0.828704,0.828704,0.828704
2,No log,0.53974,0.810185,0.810185,0.810185,0.810185
3,0.424100,0.679999,0.842593,0.842593,0.842593,0.842593
4,0.424100,0.774725,0.824074,0.824074,0.824074,0.824074
5,0.424100,0.774996,0.833333,0.833333,0.833333,0.833333
6,0.152500,0.80978,0.842593,0.842593,0.842593,0.842593
7,0.152500,0.880337,0.828704,0.828704,0.828704,0.828704
8,0.152500,0.885449,0.833333,0.833333,0.833333,0.833333
9,0.052300,0.889652,0.851852,0.851852,0.851852,0.851852
10,0.052300,0.916457,0.828704,0.828704,0.828704,0.828704


TrainOutput(global_step=1740, training_loss=0.18597575604230507, metrics={'train_runtime': 297.5726, 'train_samples_per_second': 58.204, 'train_steps_per_second': 5.847, 'total_flos': 498439955919360.0, 'train_loss': 0.18597575604230507, 'epoch': 10.0})

In [None]:
tokenizer

PreTrainedTokenizerFast(name_or_path='hfl/chinese-roberta-wwm-ext', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [6]:
trainer.save_model("/content/gdrive/MyDrive/Individual Project/EmotionClassification/BERT/BERTMerge2")

In [7]:
# ----- 3. Predict -----#
# Load test data
# test_data = pd.read_csv("test.csv")
# X_test = list(test_data["review"])
# X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# Create torch dataset
# test_dataset = Dataset(X_test_tokenized)

# Load trained model
model_path = "/content/gdrive/MyDrive/Individual Project/EmotionClassification/BERT/BERTMerge2"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=4)

# Define test trainer
test_trainer = Trainer(model)

# Make prediction 
raw_pred, _, _ = test_trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 217
  Batch size = 8


In [8]:
from sklearn import metrics

print(metrics.classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7910    0.8983    0.8413        59
           1     0.8750    0.7368    0.8000        57
           2     0.8125    0.8125    0.8125        48
           3     0.9259    0.9434    0.9346        53

    accuracy                         0.8479       217
   macro avg     0.8511    0.8478    0.8471       217
weighted avg     0.8508    0.8479    0.8469       217



In [10]:
   # ----- 3. Predict -----#
model_path = "/content/gdrive/MyDrive/Individual Project/EmotionClassification/BERT/BERTMerge2"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=4)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

text = "ok la"
# tokenizer2 = BertTokenizer.from_pretrained('bert-base-chinese')
X_test_tokenized = tokenizer(text, padding=True, truncation=True, max_length=512)

b_input_ids= torch.tensor(X_test_tokenized['input_ids']).unsqueeze(0)
b_attention_mask = torch.tensor(X_test_tokenized['attention_mask']).unsqueeze(0)

with torch.no_grad():
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)

logits = outputs[0]
print(logits)
logits = logits.detach().numpy()
predict_label = np.argmax(logits, axis=1).flatten()
print(predict_label)

label = " "

for i in predict_label:
    if i == 0:
        label = "唔開心"
    elif i == 1:
        label = "嬲"
    elif i == 2:
        label = "擔心"
    else:
        label = "開心"
print(label)

loading configuration file /content/gdrive/MyDrive/Individual Project/EmotionClassification/BERT/BERTMerge2/config.json
Model config BertConfig {
  "_name_or_path": "/content/gdrive/MyDrive/Individual Project/EmotionClassification/BERT/BERTLarge",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_

tensor([[ 0.0951, -2.6707, -0.5865,  3.3913]])
[3]
開心


In [11]:
torch.save(model.state_dict(), '/content/gdrive/MyDrive/Individual Project/EmotionClassification/BERT/BestBERTEmotion/BestBERTEmotion.pt')

# Roy's

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.models.bert.modeling_bert import BertPreTrainedModel

class EmotionClassifier(nn.Module):
    """
        Classifier Head for the EmotionClassifierModule
        Takes input of (batch_size, hidden_size) and outputs to num_labels
    """
    def __init__(self, args):
        super(EmotionClassifier, self).__init__()
        self.layers = nn.Sequential(
            nn.Dropout(args.dropout),
            nn.Linear(args.hidden_size, args.hidden_size),
            nn.ReLU(),
            nn.Dropout(args.dropout),
            nn.Linear(args.hidden_size, args.num_labels),
        )

    def forward(self, x):
        return self.layers(x)


class EmotionClassifierModule(BertPreTrainedModel):
    """
      Model wrapper used to classify a given utterance into a known emotion class
    """
    def __init__(self, config, args, model_type):
        super(EmotionClassifierModule, self).__init__(config)
        self.args = args
        # Can either load a saved pytorch model or a pre-trained HuggingFace model
        if not args.load_model:
            self.roberta = model_type.from_pretrained(args.model_type)
        elif args.load_pretrained == 'hf':
            self.roberta = model_type.from_pretrained(args.load_model)
        # Create Classifier Head
        self.classifier = EmotionClassifier(args)

    def forward(self, input_ids, attention_mask, emotion_labels=None, alignment_set=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Select [CLS] embedding and perform classification
        pooled = outputs['pooler_output']
        emotion_logits = self.classifier(pooled)

        tar_pooled = None
        # Generate [CLS] embedding for alignment set if performing cross-lingual alignment
        if alignment_set is not None:
            tar_ids = alignment_set[:, 0]
            tar_mask = alignment_set[:, 1]
            tar_pooled = self.roberta(input_ids=tar_ids, attention_mask=tar_mask)['pooler_output']

        loss = None
        if self.training and emotion_labels is not None:
            # Task loss
            loss_fn = F.cross_entropy
            loss = loss_fn(emotion_logits, emotion_labels)

            # Contrastive loss
            if alignment_set is not None:
                sim_logits = pooled / torch.norm(pooled, dim=1).reshape((-1, 1))
                tar_logits = tar_pooled / torch.norm(tar_pooled, dim=1).reshape((-1, 1))
                sim = torch.matmul(sim_logits, tar_logits.T)
                mask = (emotion_labels != emotion_labels.reshape((-1, 1))).float().to(self.args.device)
                mask += torch.diag(torch.ones(input_ids.size(0))).to(self.args.device)
                sim = sim * mask

                loss_ctr = loss_fn(sim, torch.arange(input_ids.size(0)).to(self.args.device))
                loss += loss_ctr

        return emotion_logits, loss


In [None]:
model = torch.load("")