In [None]:
!pip install transformers sentencepiece datasets

from google.colab import drive
drive.mount('/content/gdrive/')

import os
os.chdir('/content/gdrive/MyDrive/Individual Project')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.9 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 51.8 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 50.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 2.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 3.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import AutoTokenizer, AutoModelWithLMHead
from tokenizers import BertWordPieceTokenizer

In [None]:
# Class names in index order; a label's position in this list is its integer id.
labels = ["no", "weak", "strong"]
# Lookup table from label string to integer id.
label2int = {name: index for index, name in enumerate(labels)}

# Start building classifier

In [None]:
from transformers import AutoTokenizer

# Hub identifier of the checkpoint whose tokenizer is used throughout the notebook.
model_checkpoint = "uer/roberta-base-word-chinese-cluecorpussmall"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/305 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/485 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [None]:
# Smoke test: tokenize one sample sentence and display the resulting encoding
# (input_ids, token_type_ids, attention_mask).
encoded_str = tokenizer("我而家好嬲")
encoded_str

{'input_ids': [2, 373, 40848, 53, 1, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
from transformers import AutoModelForSequenceClassification

# Number of output classes; presumably one per entry of `labels` (no / weak / strong).
num_labels = 3
# NOTE(review): the weights are read from the RobertaEmpathy2 directory on Drive, which is
# also the directory the later `trainer.save_model(...)` cell writes to. On a setup where
# that directory does not exist yet this cell cannot run — confirm whether resuming from a
# previous fine-tuning run (rather than starting from `model_checkpoint`) is intended.
model = AutoModelForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/Individual Project/EmpathyResponse/Roberta/RobertaEmpathy2", num_labels=num_labels)

In [None]:
from transformers import EarlyStoppingCallback

# Early stopping: this callback is handed to the Trainer in the training cell.
# Per the transformers documentation, training stops once the monitored metric has
# failed to improve by at least `early_stopping_threshold` for
# `early_stopping_patience` consecutive evaluations.

trainer_callback = EarlyStoppingCallback(early_stopping_patience = 20, early_stopping_threshold = 0.001)

In [None]:
import torch

from transformers import TrainingArguments

# Short name of the checkpoint (the part after the organisation prefix).
model_name = model_checkpoint.split("/")[-1]

# Fine-tuning configuration.
args = TrainingArguments(
    output_dir="/content/gdrive/MyDrive/Individual Project/Model",
    # Optimisation
    num_train_epochs=50,
    learning_rate=1e-5,
    adam_epsilon=1e-06,
    warmup_steps=100,  # Number of warmup steps
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    # Evaluate, checkpoint and log once per epoch. Evaluation and saving must use
    # the same strategy for load_best_model_at_end to work. Logging per epoch
    # (instead of every 500 steps) makes the training loss appear in every row of
    # the progress table rather than "No log" for the first several epochs.
    # `eval_steps` is intentionally not set: it only applies to the "steps" strategy.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Keep at most two checkpoints on Drive and restore the most accurate one
    # when training ends.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    overwrite_output_dir=False,
    # Console output
    disable_tqdm=False,
    log_level="error",
)


def compute_metrics(p):
    """Compute evaluation metrics for the Trainer.

    Args:
        p: a pair ``(predictions, label_ids)``; ``predictions`` holds one row of
            class scores per example and ``label_ids`` the true class ids.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, labels = p
    # The predicted class is the one with the highest score in each row.
    pred = np.argmax(scores, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # Macro averaging gives every class equal weight. With micro averaging on a
    # single-label multi-class problem, precision, recall and F1 are all equal to
    # accuracy, so they carried no extra information.
    # zero_division=0 scores a class that is never predicted as 0 without
    # emitting a warning at every evaluation.
    recall = recall_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# ----- 1. Preprocess data -----#
data = pd.read_csv("/content/gdrive/MyDrive/Individual Project/EmpathyResponse/Lisa_EmpathyResponse.csv")

# Inputs are the response texts; targets are their empathy scores.
X = list(data["response"])
y = list(data["empathy_score"])

# Target share of the full data set for each split.
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# First split: 75% for training, the remaining 25% is held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=1 - train_ratio, random_state=0
)

# Second split: divide the held-out 25% so that validation ends up with 15% and
# test with 10% of the full data set.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=0,
)

# Tokenize each split with identical settings: pad within the split and
# truncate at 512 tokens.
X_train_tokenized, X_val_tokenized, X_test_tokenized = (
    tokenizer(texts, padding=True, truncation=True, max_length=512)
    for texts in (X_train, X_val, X_test)
)

class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer output with optional labels.

    Args:
        encodings: mapping from field name (it must include ``"input_ids"``) to a
            per-example sequence of values, as returned by the tokenizer.
        labels: optional sequence with one label per example. Leave as ``None``
            for unlabeled data (e.g. prediction only).
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with ``"labels"`` if available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None instead of testing truthiness: the truth value of
        # a NumPy array or tensor with several elements is ambiguous and raises.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])

# Pair every tokenized split with its labels.
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)
test_dataset = Dataset(X_test_tokenized, y_test)

# Collect the Trainer configuration in one place: train on the training split,
# evaluate on the validation split, and let the early-stopping callback end the
# run when it triggers.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[trainer_callback],  # EarlyStoppingCallback module
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune the model; the returned TrainOutput is shown as the cell result.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.658822,0.69697,0.69697,0.69697,0.69697
2,No log,0.629828,0.739394,0.739394,0.739394,0.739394
3,No log,0.755851,0.69697,0.69697,0.69697,0.69697
4,No log,0.839667,0.709091,0.709091,0.709091,0.709091


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.658822,0.69697,0.69697,0.69697,0.69697
2,No log,0.629828,0.739394,0.739394,0.739394,0.739394
3,No log,0.755851,0.69697,0.69697,0.69697,0.69697
4,No log,0.839667,0.709091,0.709091,0.709091,0.709091
5,No log,1.220118,0.690909,0.690909,0.690909,0.690909
6,No log,1.348824,0.709091,0.709091,0.709091,0.709091
7,0.435600,1.452118,0.709091,0.709091,0.709091,0.709091
8,0.435600,1.665697,0.69697,0.69697,0.69697,0.69697
9,0.435600,1.789197,0.684848,0.684848,0.684848,0.684848
10,0.435600,1.776273,0.690909,0.690909,0.690909,0.690909


TrainOutput(global_step=830, training_loss=0.29688779761992307, metrics={'train_runtime': 755.8244, 'train_samples_per_second': 10.915, 'train_steps_per_second': 1.098, 'total_flos': 1366503583945500.0, 'train_loss': 0.29688779761992307, 'epoch': 10.0})

In [None]:
# Display the tokenizer's configuration.
# NOTE(review): the output saved with this cell reports a different name_or_path
# than `model_checkpoint`, so it looks stale — re-run the notebook to confirm.
tokenizer

PreTrainedTokenizerFast(name_or_path='hfl/chinese-roberta-wwm-ext-large', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
# Save the trained model to Drive. This is the same RobertaEmpathy2 directory that
# the model-loading cell and the prediction cell read from.
trainer.save_model("/content/gdrive/MyDrive/Individual Project/EmpathyResponse/Roberta/RobertaEmpathy2")

In [None]:
# ----- 3. Predict -----#
# `test_dataset` is the held-out split built in the training cell, so no data is
# re-read or re-tokenized here.

# Reload the fine-tuned model from the directory it was saved to.
model_path = "/content/gdrive/MyDrive/Individual Project/EmpathyResponse/Roberta/RobertaEmpathy2"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3)

# A Trainer with default arguments is only used here to run inference.
test_trainer = Trainer(model)

# Run the model over the test split and keep the raw class scores.
prediction_output = test_trainer.predict(test_dataset)
raw_pred = prediction_output.predictions

# Predicted class = index of the highest score in each row.
y_pred = np.argmax(raw_pred, axis=1)

loading configuration file /content/gdrive/MyDrive/Individual Project/EmpathyResponse/Roberta/RobertaEmpathy2/config.json
Model config BertConfig {
  "_name_or_path": "/content/gdrive/MyDrive/Individual Project/EmpathyResponse/Roberta/RobertaEmpathy2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "tokenizer_class": "AlbertTokenizer",
 

tensor([[    2,    17,    53, 96399, 52711, 11178,  7576,  3097, 10306,     3]])
tensor([[ 1.3822, -0.6351, -1.0665]])
[0]


In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 on the held-out test split, to 4 decimals.
report = metrics.classification_report(y_test, y_pred, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.7714    0.8710    0.8182        31
           1     0.8571    0.6522    0.7407        46
           2     0.7250    0.8788    0.7945        33

    accuracy                         0.7818       110
   macro avg     0.7845    0.8006    0.7845       110
weighted avg     0.7933    0.7818    0.7787       110



In [None]:

# Additionally store only the model weights (state dict) as a single .pt file.
# Assumes the BestRobertaEmpathy directory already exists on Drive — TODO confirm.
torch.save(model.state_dict(), '/content/gdrive/MyDrive/Individual Project/EmpathyResponse/Roberta/BestRobertaEmpathy/RobertaEmpathy2.pt')

In [None]:
import torch

# Path of the weights file written by the previous cell.
# NOTE(review): this rebinds `model` from the network object to a path string, so
# any cell that expects the network (prediction, torch.save) fails if re-run after
# this one. The name is kept for compatibility; consider renaming it.
model = '/content/gdrive/MyDrive/Individual Project/EmpathyResponse/Roberta/BestRobertaEmpathy/RobertaEmpathy2.pt'

# map_location="cpu" lets the file be opened on a runtime without a GPU even if
# the weights were saved from CUDA tensors. torch.load unpickles the file, so only
# open checkpoints from a trusted source.
state_dic = torch.load(model, map_location="cpu")
# List the parameter names stored in the checkpoint.
print(state_dic.keys())

odict_keys(['bert.embeddings.position_ids', 'bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weigh