In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, EvalPrediction
from datasets import load_dataset, DatasetDict
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch

In [2]:
# Fixed seed so both splits (and every metric computed on them) are
# reproducible after Restart Kernel -> Run All.
SPLIT_SEED = 42

dataset = load_dataset('csv', data_files='data/risk-dataset.csv')
# Every CSV column except the input text is used as a regression target.
labels = [label for label in dataset['train'].features.keys() if label not in ['description']]
# Hold out 20% of the rows; the remaining 80% is the training set.
train_testvalid = dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Split the held-out 20% in half: 10% test, 10% validation.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Gather the three splits into a single DatasetDict.
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [3]:
# Load the tokenizer for the "bert-base-uncased" checkpoint; preprocess_data
# below reads it as a global.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def compute_metrics_for_regression(eval_pred):
    """Compute regression error metrics for the Trainer's evaluation step.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, targets)``.

    Returns:
        dict: ``"mse"``, ``"mae"`` and ``"rmse"`` (the square root of the MSE).
    """
    predictions, targets = eval_pred

    mean_sq_err = mean_squared_error(targets, predictions)
    return {
        "mse": mean_sq_err,
        "mae": mean_absolute_error(targets, predictions),
        "rmse": np.sqrt(mean_sq_err),
    }
def preprocess_data(examples):
    """Tokenize a batch of descriptions and attach its label matrix.

    Reads the notebook-level ``tokenizer`` and ``labels`` globals.

    Args:
        examples: A batch mapping column names to lists of values; must
            contain ``"description"`` and every column named in ``labels``.

    Returns:
        The tokenizer output (truncated/padded to 200 tokens) with an extra
        ``"labels"`` entry: a NumPy array with one row per example and one
        column per name in ``labels``.
    """
    descriptions = examples["description"]
    encoding = tokenizer(descriptions, truncation=True, padding="max_length", max_length=200)

    # np.zeros defaults to float64, so the targets are stored as floats.
    labels_matrix = np.zeros((len(descriptions), len(labels)))
    target_columns = {name: examples[name] for name in examples.keys() if name in labels}
    for column, name in enumerate(labels):
        labels_matrix[:, column] = target_columns[name]

    # Keep the targets as a NumPy array rather than converting to lists.
    encoding["labels"] = labels_matrix
    return encoding



In [4]:
# Tokenize every split batch-wise. remove_columns=[] drops nothing, so the
# original CSV columns stay alongside the new tokenizer fields.
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=[],
)

Map:   0%|          | 0/599 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [5]:
# Switch the output format of every split to "torch". The call's result is
# not assigned, i.e. encoded_dataset itself is reconfigured.
encoded_dataset.set_format("torch")

In [6]:
# Pretrained BERT encoder with a sequence-level head that has one output per
# target column, configured for regression instead of classification.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="regression",
    num_labels=len(labels),
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [7]:
batch_size = 8
metric_name = "rmse"
EPOCHS = 40

# Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
args = TrainingArguments(
    "risk-model/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    # Left off as in the original run: the model in memory after training is
    # the last epoch's, not the best one.
    load_best_model_at_end=False,
    metric_for_best_model=metric_name,
    # RMSE is an error, so lower is better. Without this the library defaults
    # greater_is_better to True for any metric not named "loss", which would
    # track the *worst* checkpoint as the best one.
    greater_is_better=False,
)

In [8]:
# Train on the "train" split and score on "valid" after every epoch; the
# "test" split is not handed to the Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_for_regression,
)

In [9]:
# Fine-tune with the settings in `args`, then write the model as it stands
# after training to "risk-model/".
trainer.train()
trainer.save_model("risk-model/")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [10]:
# Called without arguments, so the Trainer scores the eval_dataset it was
# built with (the "valid" split); the "test" split is not used here.
trainer.evaluate()

{'eval_loss': 0.018667146563529968,
 'eval_mse': 0.018667146563529968,
 'eval_mae': 0.07483573257923126,
 'eval_rmse': 0.13662776350975037,
 'eval_runtime': 0.1366,
 'eval_samples_per_second': 548.962,
 'eval_steps_per_second': 73.195,
 'epoch': 40.0}

In [13]:
#trying without sigmoid
def output_to_values(output):
    """Return element 0 of ``output.logits``, with no activation applied.

    Args:
        output: Any object exposing an indexable ``logits`` attribute.

    Returns:
        ``output.logits[0]``, unchanged.
    """
    raw_logits = output.logits
    first_entry = raw_logits[0]
    return first_entry

In [24]:
# Build one prompt in the layout used for this example: attribute lines
# followed by the <DM> and <Player> turns.
attributes = "health: 0.7\nstrength: 0.1\ndexterity: 0.1\nperception: 0.55\nintelligence: 0.9\ncharisma: 0.0\nstamina: 0.01"
dm_text = "\n<DM>There's a giant monster in front of you.</DM>"
player_text = "\n<Player>I try to outrun it</Player>"
# Named input_text rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
input_text = attributes + dm_text + player_text

# Truncate to the same 200-token limit used in preprocess_data so long prompts
# are cut the same way the training examples were.
encoding = tokenizer(input_text, truncation=True, max_length=200, return_tensors="pt")
# Move the tensors to whichever device the model lives on.
encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}

# Inference only: eval() disables dropout, no_grad() skips building the
# autograd graph (the original output carried a grad_fn).
trainer.model.eval()
with torch.no_grad():
    prediction = trainer.model(**encoding)
print(output_to_values(prediction))

tensor([0.2970], device='cuda:0', grad_fn=<SelectBackward0>)


In [None]:
# Reload a previously fine-tuned model instead of training from scratch.
# Requires the data cells above to have run: this cell reads encoded_dataset
# and compute_metrics_for_regression.

# The training cell saves with trainer.save_model("risk-model/"), so load from
# there. Point this elsewhere if the saved model was moved.
model_directory = "risk-model/"
batch_size = 8
metric_name = "rmse"

model = AutoModelForSequenceClassification.from_pretrained(model_directory)
tokenizer = AutoTokenizer.from_pretrained(model_directory)
args = TrainingArguments(
    # Same output directory as the training cell (was a leftover "color_model/").
    "risk-model/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=40,
    weight_decay=0.01,
    load_best_model_at_end=False,
    metric_for_best_model=metric_name,
    # RMSE is an error: lower is better. The library default would treat a
    # higher value as better for any metric not named "loss".
    greater_is_better=False,
)
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_for_regression
)