In [1]:
import pandas as pd
import mapply
mapply.init(progressbar=True, max_chunks_per_worker=100)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
clean = pd.read_feather('../data/lyrics_cleaned.feather')
clean = clean[clean['tag'] != 'misc']
# clean = clean[clean['tag'] != 'pop']

In [3]:
from sklearn.model_selection import train_test_split

SUBSAMPLE = 0.001

ds = clean.drop(['tokens', 'id'], axis=1)

TRAIN_P, VALID_P, TEST_P = 0.6, 0.2, 0.2
LABEL = 'tag'

ds = ds.rename(columns={LABEL: 'label'})
LABEL = 'label'

ds[LABEL], labels_map = pd.factorize(ds[LABEL])

assert TRAIN_P + VALID_P + TEST_P == 1

dev_ds, test_ds = train_test_split(ds, test_size=TEST_P, random_state=0, stratify=ds[LABEL])
train_ds, valid_ds = train_test_split(dev_ds, test_size=(VALID_P / (TRAIN_P + VALID_P)), random_state=0, stratify=dev_ds[LABEL])
  
if SUBSAMPLE:
  train_ds, _ = train_test_split(train_ds, train_size=SUBSAMPLE, random_state=0, stratify=train_ds[LABEL])
  valid_ds, _ = train_test_split(valid_ds, train_size=SUBSAMPLE, random_state=0, stratify=valid_ds[LABEL])
  test_ds, _ = train_test_split(test_ds, train_size=SUBSAMPLE, random_state=0, stratify=test_ds[LABEL])

In [4]:
from datasets import Dataset, DatasetDict

dataset_dict = {}
dataset_dict['train'] = Dataset.from_pandas(train_ds, split = 'Train').remove_columns(['__index_level_0__'])
dataset_dict['valid'] = Dataset.from_pandas(valid_ds, split = 'Validation').remove_columns(['__index_level_0__'])
dataset_dict['test'] = Dataset.from_pandas(test_ds, split = 'Test').remove_columns(['__index_level_0__'])

datasets = DatasetDict(dataset_dict)
datasets


DatasetDict({
    train: Dataset({
        features: ['title', 'artist', 'label', 'lyrics_clean'],
        num_rows: 42
    })
    valid: Dataset({
        features: ['title', 'artist', 'label', 'lyrics_clean'],
        num_rows: 14
    })
    test: Dataset({
        features: ['title', 'artist', 'label', 'lyrics_clean'],
        num_rows: 14
    })
})

In [5]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datetime import datetime

# MODEL = "FacebookAI/roberta-base"
# MODEL = "google-bert/bert-base-uncased"
# openai-community/gpt2
MODEL = "distilbert/distilroberta-base"
MODEL_DIR = '../NAS/' + str(datetime.now()) + '/' + MODEL.split('/')[-1]
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tokenize(row):
  return tokenizer(row['lyrics_clean'], truncation=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = datasets.map(tokenize, batched=True)

Map: 100%|██████████| 42/42 [00:00<00:00, 2004.49 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 1826.05 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 1879.23 examples/s]


In [6]:
from transformers import TrainingArguments, AutoModelForSequenceClassification

training_args = TrainingArguments(MODEL_DIR, save_strategy='no')

model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels = labels_map.shape[0])

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
TRAIN_ONLY_HEAD = False

if TRAIN_ONLY_HEAD:
  for name, param in model.named_parameters():
    if 'classifier' not in name:
      param.requires_grad = False
    else:
      param.requires_grad = True
      print(name, param.requires_grad)

In [8]:
def optuna_hp_space(trial):
  return {        
    "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
    "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 16]),  
  }

In [9]:
import numpy as np

class AlmostAccuracy:
  def __call__(self, eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    true = np.abs(preds - labels) == 0
    return {"accuracy": true.astype(np.float32).mean().item()}

from transformers import Trainer

def model_init(_):
    return AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels = labels_map.shape[0])

trainer = Trainer(
  args=training_args,
  train_dataset=tokenized_datasets['train'],
  eval_dataset=tokenized_datasets['valid'],
  data_collator=data_collator,
  tokenizer=tokenizer,
  model_init=model_init,
  compute_metrics=AlmostAccuracy()
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from typing import List, Dict
import transformers

# trainer.train()
transformers.logging.set_verbosity_error()
def compute_objective(metrics: Dict[str, float]) -> List[float]:
  return metrics["eval_loss"], metrics["eval_accuracy"]

from datetime import datetime

best_trials = trainer.hyperparameter_search(
  direction=["minimize", "maximize"],
  backend="optuna",
  hp_space=optuna_hp_space,
  n_trials=10,
  compute_objective=compute_objective,
  storage="sqlite:///db.sqlite3",  # Specify the storage URL here.
  study_name=f"GPT2",
  load_if_exists=True
)

[I 2024-05-30 15:11:46,546] A new study created in RDB with name: GPT2
[W 2024-05-30 15:11:47,071] Trial 0 failed with parameters: {'learning_rate': 1.2614670500207868e-05, 'per_device_train_batch_size': 16} because of the following error: ValueError("Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.").
Traceback (most recent call last):
  File "/home/hlt/HLT_PROJECT/.env/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/home/hlt/HLT_PROJECT/.env/lib/python3.11/site-packages/transformers/integrations/integration_utils.py", line 210, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/home/hlt/HLT_PROJECT/.env/lib/python3.11/site-packages/transformers/train

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [None]:
print(best_trials[0])
print(best_trials)

In [None]:
import numpy as np

predictions = trainer.predict(tokenized_datasets["valid"])
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from matplotlib import pyplot as plt

disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(predictions.label_ids, preds),
                              display_labels=labels_map)
disp.plot()


In [None]:
from sklearn.metrics import classification_report

print(classification_report(predictions.label_ids, preds, target_names=labels_map))

In [None]:
trainer.save_model(MODEL_DIR + '/end')