In [None]:
!pip install transformers sentencepiece datasets

from google.colab import drive
drive.mount('/content/gdrive/')

import os
os.chdir('/content/gdrive/MyDrive/Individual Project')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import AutoTokenizer, AutoModelWithLMHead
from tokenizers import BertWordPieceTokenizer

# Start building classifier

In [None]:
from transformers import AutoTokenizer

# Identifier of the checkpoint whose tokenizer files are loaded below.
model_checkpoint = "hfl/chinese-roberta-wwm-ext"

tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
)

Downloading tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Sanity Test: encode one short sentence and inspect what the tokenizer returns.

sample_text = "我而家好嬲"
encoded_str = tokenizer(sample_text)

# Last expression of the cell, so the encoding is displayed as the cell output.
encoded_str

{'input_ids': [101, 2769, 5445, 2157, 1962, 100, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [None]:
from transformers import AutoModelForSequenceClassification

# Number of target classes passed to the classification model.
num_labels = 4

# Local (Drive) directory holding the checkpoint that fine-tuning starts from.
pretrained_dir = "/content/gdrive/MyDrive/Individual Project/EmotionClassification/Roberta/RobertaLarge"

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_dir,
    num_labels=num_labels,
)

In [None]:
from transformers import EarlyStoppingCallback

# Early Stopping Module
# Configured with a patience of 20 and an improvement threshold of 0.001;
# the callback is handed to the Trainer further down.
trainer_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.001,
    early_stopping_patience=20,
)

In [None]:
import torch

from transformers import TrainingArguments


# Hyper-parameters and bookkeeping options for the fine-tuning run.
args = TrainingArguments(
    output_dir="/content/gdrive/MyDrive/Individual Project/Model",
    num_train_epochs=50,
    learning_rate=1e-5,
    adam_epsilon=1e-06,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    # Evaluate, checkpoint and log on the same per-epoch cadence.
    # Previously the training loss was logged every 500 optimiser steps while
    # an epoch is only ~95 steps (950 steps over 10 epochs in the recorded
    # run), so the results table showed "No log" for the first epochs and a
    # stale value afterwards. The old `eval_steps=500` is dropped because it is
    # ignored when the evaluation strategy is "epoch".
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    disable_tqdm=False,
    log_level='error',
    save_total_limit=2,
    # Reload the checkpoint with the best validation accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    overwrite_output_dir=False,
    warmup_steps=100,  # Number of warmup steps
)


def compute_metrics(p):
    """Compute classification metrics for one evaluation pass.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 1, so it is expected to hold one score per
            class for every example; ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 are macro-averaged over the classes.
    """
    scores, labels = p
    pred = np.argmax(scores, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)

    # Macro averaging is used on purpose: for single-label multiclass data,
    # micro-averaged precision/recall/F1 are all identical to accuracy, so the
    # three extra columns carried no additional information.
    # zero_division=0 scores a class that is never predicted as 0 without
    # emitting a warning on every evaluation.
    recall = recall_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# ----- 1. Preprocess data -----#
# Read the labelled corpus from Drive and pull the two columns used for
# training into plain Python lists: the text ("content") and its class ("label").
data = pd.read_csv("/content/gdrive/MyDrive/Individual Project/Modify_empatheticPersonasTW - Copy.csv")

X = data["content"].tolist()
y = data["label"].tolist()
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# Target share of the full dataset for each split.
train_ratio = 0.75
validation_ratio = 0.10
test_ratio = 0.15

# Stage 1: keep 75% for training; the remaining 25% is held out for the
# validation and test splits.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=1 - train_ratio,
    random_state=0,
)

# Stage 2: divide the hold-out portion so that, relative to the full dataset,
# validation is about 10% and test about 15%.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=test_ratio/(test_ratio + validation_ratio),
    random_state=0,
)

# Tokenizer settings shared by all three splits: padding and truncation are
# enabled, with sequences capped at 512 tokens.
tokenize_options = dict(padding=True, truncation=True, max_length=512)

X_train_tokenized = tokenizer(X_train, **tokenize_options)
X_val_tokenized = tokenizer(X_val, **tokenize_options)
X_test_tokenized = tokenizer(X_test, **tokenize_options)

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values. It must contain the key ``"input_ids"``, which defines the
            dataset length.
        labels: Optional indexable collection with one label per example
            (list, NumPy array, tensor, ...). ``None`` or an empty collection
            yields items without a ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        """Return True when a non-empty label collection was supplied."""
        # An explicit None/length check is used instead of `if self.labels:`
        # because truth-testing a NumPy array or a multi-element tensor raises
        # "truth value ... is ambiguous". Empty collections are still treated
        # as "no labels", exactly as before.
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        """Return the features (and label, if available) of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

# Wrap each tokenised split together with its labels, in train/val/test order.
train_dataset, val_dataset, test_dataset = (
    Dataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (X_train_tokenized, y_train),
        (X_val_tokenized, y_val),
        (X_test_tokenized, y_test),
    )
)

# Collect everything the Trainer needs: the model, the run configuration, the
# train/validation datasets, the metric function and the early-stopping callback.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[trainer_callback],  # EarlyStoppingCallback module
)
trainer = Trainer(**trainer_kwargs)

# Train pre-trained model. This is the cell's last expression, so the value
# returned by train() is shown as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.53172,0.864407,0.864407,0.864407,0.864407
2,No log,0.602242,0.881356,0.881356,0.881356,0.881356
3,No log,0.628367,0.881356,0.881356,0.881356,0.881356
4,No log,0.68992,0.872881,0.872881,0.872881,0.872881
5,No log,0.770842,0.872881,0.872881,0.872881,0.872881
6,0.234600,0.782979,0.881356,0.881356,0.881356,0.881356
7,0.234600,0.797043,0.872881,0.872881,0.872881,0.872881
8,0.234600,0.806094,0.881356,0.881356,0.881356,0.881356
9,0.234600,0.820355,0.881356,0.881356,0.881356,0.881356
10,0.234600,0.826258,0.872881,0.872881,0.872881,0.872881


TrainOutput(global_step=950, training_loss=0.13753564633821186, metrics={'train_runtime': 182.8695, 'train_samples_per_second': 51.622, 'train_steps_per_second': 5.195, 'total_flos': 271667042949120.0, 'train_loss': 0.13753564633821186, 'epoch': 10.0})

In [None]:
# Bare expression: evaluates the tokenizer object so the notebook displays its repr.
tokenizer

In [None]:
# Persist the trainer's model to Drive; the prediction step further down
# loads the model back from this same directory.
trainer.save_model(
    output_dir="/content/gdrive/MyDrive/Individual Project/EmotionClassification/Roberta/RobertaLarge1.5"
)

In [None]:
# ----- 3. Predict -----#

# Reload the fine-tuned model from the directory it was saved to.
model_path = "/content/gdrive/MyDrive/Individual Project/EmotionClassification/Roberta/RobertaLarge1.5"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=4)

# A Trainer built from the model alone serves as the inference wrapper.
test_trainer = Trainer(model)

# predict() returns a 3-tuple; only its first element (the raw model outputs)
# is needed here.
prediction_output = test_trainer.predict(test_dataset)
raw_pred = prediction_output[0]

# Turn the per-class scores into a predicted class index for each example.
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 119
  Batch size = 8


In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 for the test split, printed to 4 decimals.
test_report = metrics.classification_report(y_test, y_pred, digits=4)
print(test_report)

              precision    recall  f1-score   support

           0     0.7917    0.7037    0.7451        27
           1     0.9211    0.9211    0.9211        38
           2     0.8438    0.9310    0.8852        29
           3     0.9200    0.9200    0.9200        25

    accuracy                         0.8739       119
   macro avg     0.8691    0.8689    0.8678       119
weighted avg     0.8726    0.8739    0.8722       119

