In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, EvalPrediction
from datasets import load_dataset, DatasetDict
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
import random

In [2]:
dataset = load_dataset('csv', data_files='data/attribute-dataset.csv')
labels = [label for label in dataset['train'].features.keys() if label not in ['description']]
train_testvalid = dataset['train'].train_test_split(test_size=0.2)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [3]:
#build the model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred

    predictions = np.tanh(logits)

    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    rmse = np.sqrt(mse)
    return {"mse": mse, "mae": mae, "rmse":rmse}

def preprocess_data(examples):
    text = examples["description"]
    encoding = tokenizer(text, truncation=True, padding="max_length", max_length=1000)
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}

    labels_matrix = np.zeros((len(text), len(labels)))
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    #ensure labels remain as NumPy array
    encoding["labels"] = labels_matrix
    return encoding



In [11]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=[] )

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

In [12]:
encoded_dataset.set_format("torch")

In [14]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="regression", 
                                                           num_labels=len(labels))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [15]:
batch_size = 16
metric_name = "rmse"
EPOCHS = 20
args = TrainingArguments(
    f"attribute-model/",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    save_total_limit=2,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=EPOCHS,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [16]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_for_regression
)

In [17]:
trainer.train()
trainer.save_model("attribute-model/")

Epoch,Training Loss,Validation Loss,Mse,Mae,Rmse
1,No log,0.003721,0.003721,0.04039,0.061


In [18]:
trainer.evaluate()

{'eval_loss': 0.0037208872381597757,
 'eval_mse': 0.0037209391593933105,
 'eval_mae': 0.040389951318502426,
 'eval_rmse': 0.06099950149655342,
 'eval_runtime': 0.4048,
 'eval_samples_per_second': 587.941,
 'eval_steps_per_second': 37.055,
 'epoch': 20.0}

In [21]:
def output_clamp(output, desired_range=1.0):
    logits = output.logits

    #clamped = desired_range + torch.tanh(logits)

    return logits

In [27]:
scenarios = pd.read_csv("data/attribute-scenario-dataset.csv")["scenarios"].to_list()
input = random.choice(scenarios) 
encoding = tokenizer(input, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
prediction = trainer.model(**encoding)
print(input)
print(output_clamp(prediction))

<DM>The wind howls as you climb the treacherous mountain path, the air thin and cold at higher elevations.</DM>
<Player>I push forward with my strength, focusing on each step to reach the summit despite the harsh weather conditions.</Player>
<DM>Your body aches, but your strength drives you onward. After a grueling climb, you finally reach the peak, and the sight before you takes your breath away.</DM>
tensor([[-0.0106,  0.0139,  0.0212, -0.0018,  0.0327,  0.0084,  0.0474]],
       device='cuda:0', grad_fn=<AddmmBackward0>)


In [None]:
#for if you have to load a model
model_directory = "models"
batch_size = 8
metric_name = "rmse"

model = AutoModelForSequenceClassification.from_pretrained(model_directory)
tokenizer =  AutoTokenizer.from_pretrained(model_directory)
args = TrainingArguments(
    f"color_model/",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=40,
    weight_decay=0.01,
    load_best_model_at_end=False,
    metric_for_best_model=metric_name,
)
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_for_regression
)