In [1]:
import pandas as pd
import mapply
# Initialise mapply once for the session, with a progress bar enabled and
# max_chunks_per_worker set to 100.
# NOTE(review): no mapply call is visible in the cells reviewed here — confirm this setup is still needed.
mapply.init(progressbar=True, max_chunks_per_worker=100)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the cleaned lyrics table and keep every row whose tag is not 'misc'.
clean = (
  pd.read_feather('../data/lyrics_cleaned.feather')
    .loc[lambda frame: frame['tag'] != 'misc']
)
# Optional extra filter, currently disabled:
# clean = clean[clean['tag'] != 'pop']

In [None]:
import math

from sklearn.model_selection import train_test_split

# --- configuration -----------------------------------------------------------
SUBSAMPLE = 0.1  # forwarded as `train_size` when shrinking each split; a falsy value keeps the full splits
SEED = 0         # single random_state shared by every split below, so reruns are reproducible
TRAIN_P, VALID_P, TEST_P = 0.6, 0.2, 0.2
LABEL = 'tag'

# Float fractions can miss their exact sum by a rounding error (the classic
# 0.1 + 0.2 != 0.3), so compare with a tolerance rather than `==`.
assert math.isclose(TRAIN_P + VALID_P + TEST_P, 1.0), "TRAIN_P + VALID_P + TEST_P must sum to 1"

# --- modelling frame -----------------------------------------------------------
# Drop the columns that are not used downstream and standardise the target column name.
ds = clean.drop(columns=['tokens', 'id'])
ds = ds.rename(columns={LABEL: 'label'})
LABEL = 'label'

# Integer-encode the target; `labels_map` holds the original values, indexed by code.
ds[LABEL], labels_map = pd.factorize(ds[LABEL])


def _stratified_subsample(frame, size):
  """Return a label-stratified subset of `frame`; `size` is forwarded as `train_size`."""
  kept, _ = train_test_split(frame, train_size=size, random_state=SEED, stratify=frame[LABEL])
  return kept


# --- splits --------------------------------------------------------------------
# First carve out the test set, then divide the remainder into train / validation.
# The second test_size is rescaled because it is relative to dev_ds, not to ds.
dev_ds, test_ds = train_test_split(ds, test_size=TEST_P, random_state=SEED, stratify=ds[LABEL])
train_ds, valid_ds = train_test_split(
  dev_ds,
  test_size=VALID_P / (TRAIN_P + VALID_P),
  random_state=SEED,
  stratify=dev_ds[LABEL],
)

if SUBSAMPLE:
  # Shrink every split by the same amount for faster experiments.
  train_ds = _stratified_subsample(train_ds, SUBSAMPLE)
  valid_ds = _stratified_subsample(valid_ds, SUBSAMPLE)
  test_ds = _stratified_subsample(test_ds, SUBSAMPLE)

In [None]:
from datasets import Dataset, DatasetDict

# Convert each pandas split into a Hugging Face Dataset.
# `preserve_index=False` keeps the pandas index out of the resulting table, so
# there is no `__index_level_0__` column to strip afterwards — and nothing
# fails when the index would not have been materialised as that column.
split_frames = {
  'train': (train_ds, 'Train'),
  'valid': (valid_ds, 'Validation'),
  'test': (test_ds, 'Test'),
}
dataset_dict = {
  key: Dataset.from_pandas(frame, split=split_name, preserve_index=False)
  for key, (frame, split_name) in split_frames.items()
}

datasets = DatasetDict(dataset_dict)
datasets


DatasetDict({
    train: Dataset({
        features: ['title', 'artist', 'label', 'lyrics_clean'],
        num_rows: 421
    })
    valid: Dataset({
        features: ['title', 'artist', 'label', 'lyrics_clean'],
        num_rows: 140
    })
    test: Dataset({
        features: ['title', 'artist', 'label', 'lyrics_clean'],
        num_rows: 140
    })
})

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datetime import datetime

# Checkpoint to fine-tune; the alternatives are kept for quick switching.
# MODEL = "FacebookAI/roberta-base"
# MODEL = "google-bert/bert-base-uncased"
MODEL = "distilbert/distilroberta-base"
# Per-run output directory: ../NAS/<timestamp>/<model name>.
# The timestamp is formatted explicitly because str(datetime.now()) contains a
# space and colons, which some filesystems (e.g. Windows or SMB shares) reject.
MODEL_DIR = '../NAS/' + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '/' + MODEL.split('/')[-1]
# Load the tokenizer for the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tokenize(row):
  """Run the module-level `tokenizer` over the `lyrics_clean` field with truncation enabled.

  `row` is whatever `datasets.map` passes in; because the map call in this
  notebook uses `batched=True`, it is presumably a batch of examples rather
  than a single row.
  """
  lyrics = row['lyrics_clean']
  encoded = tokenizer(lyrics, truncation=True)
  return encoded

# Collator later handed to the Trainer; built from the same tokenizer
# (presumably pads each batch to a common length, as the class name suggests).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Apply `tokenize` to every split of the DatasetDict, in batched mode.
tokenized_datasets = datasets.map(tokenize, batched=True)

Map: 100%|██████████| 421/421 [00:00<00:00, 2974.46 examples/s]
Map: 100%|██████████| 140/140 [00:00<00:00, 3078.39 examples/s]
Map: 100%|██████████| 140/140 [00:00<00:00, 3124.12 examples/s]


In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification

# Trainer settings: MODEL_DIR is passed as the first argument and saving is switched off
# via save_strategy='no'.
training_args = TrainingArguments(MODEL_DIR, save_strategy='no')

# Classifier with one output per distinct label found by pd.factorize.
# NOTE(review): the Trainer built later in this notebook receives `model_init` and no
# `model=` argument, so this instance does not appear to be the one that gets trained — confirm.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels = labels_map.shape[0])

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Set to True to leave only the classification head of `model` trainable.
TRAIN_ONLY_HEAD = False

if TRAIN_ONLY_HEAD:
  # Parameters whose name contains 'classifier' stay trainable (and are listed);
  # every other parameter is frozen.
  # NOTE(review): this loop only touches the module-level `model` object.
  for name, param in model.named_parameters():
    param.requires_grad = 'classifier' in name
    if param.requires_grad:
      print(name, param.requires_grad)

In [None]:
def optuna_hp_space(trial):
  """Sample one hyperparameter configuration from an Optuna `trial`.

  Returns a dict with two entries:
    - "learning_rate": float suggested between 1e-6 and 1e-4 with log=True
    - "per_device_train_batch_size": one of 1, 4 or 16
  """
  # Suggestion order is kept fixed: learning rate first, batch size second.
  lr = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
  batch_size = trial.suggest_categorical("per_device_train_batch_size", [1, 4, 16])
  return {
    "learning_rate": lr,
    "per_device_train_batch_size": batch_size,
  }

In [None]:
import numpy as np

class AlmostAccuracy:
  """Callable metric: share of examples whose argmax prediction equals the label."""

  def __call__(self, eval_pred):
    """Return {"accuracy": float} for a `(predictions, labels)` pair.

    The predicted class is the argmax of `predictions` along axis 1; the mean
    of the per-example hits is computed in float32 and returned as a Python float.
    """
    scores, targets = eval_pred
    predicted = np.argmax(scores, axis=1)
    hits = np.equal(predicted, targets)
    accuracy = hits.astype(np.float32).mean()
    return {"accuracy": accuracy.item()}

from transformers import Trainer

def model_init(_):
  """Build a fresh sequence classifier for the Trainer.

  The single positional argument (presumably the trial during hyperparameter
  search) is ignored. The model is loaded from MODEL with one output per entry
  in `labels_map`.

  When the notebook-level flag TRAIN_ONLY_HEAD is truthy, only parameters whose
  name contains 'classifier' are left trainable. The Trainer in this notebook is
  constructed with this factory and no `model=` argument, so the freeze has to be
  applied to the instances returned here for the flag to have any effect.
  """
  fresh_model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels = labels_map.shape[0])
  # globals().get keeps the factory usable even if the cell defining the flag was not run.
  if globals().get('TRAIN_ONLY_HEAD', False):
    for name, param in fresh_model.named_parameters():
      param.requires_grad = 'classifier' in name
  return fresh_model

# Trainer set up with a model factory (`model_init`) instead of a model instance;
# it trains on the 'train' split, evaluates on the 'valid' split and reports the
# metric computed by AlmostAccuracy.
trainer = Trainer(
  args=training_args,
  train_dataset=tokenized_datasets['train'],
  eval_dataset=tokenized_datasets['valid'],
  data_collator=data_collator,
  tokenizer=tokenizer,
  model_init=model_init,
  compute_metrics=AlmostAccuracy()
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from typing import List, Dict
import transformers

# A plain training run is left disabled here; this cell runs the hyperparameter search instead.
# trainer.train()
# Set the transformers logging verbosity to the error level from here on.
transformers.logging.set_verbosity_error()
def compute_objective(metrics: Dict[str, float]) -> List[float]:
  """Select the two search objectives from the evaluation metrics.

  Args:
    metrics: evaluation metrics; must contain "eval_loss" and "eval_accuracy".

  Returns:
    [eval_loss, eval_accuracy], in the same order as the
    ["minimize", "maximize"] direction list passed to the search.

  Raises:
    KeyError: if either metric is missing from `metrics`.
  """
  # Return an actual list so the value matches the declared List[float]
  # (the bare comma expression used previously produced a tuple).
  return [metrics["eval_loss"], metrics["eval_accuracy"]]

from datetime import datetime

# Multi-objective search with the Optuna backend: the two directions line up with
# the (eval_loss, eval_accuracy) pair produced by `compute_objective`.
# The study is kept in a local SQLite file under a fixed name, and
# load_if_exists=True lets a rerun pick up that existing study instead of failing.
best_trials = trainer.hyperparameter_search(
  hp_space=optuna_hp_space,
  compute_objective=compute_objective,
  direction=["minimize", "maximize"],
  n_trials=20,
  backend="optuna",
  # Study persistence settings.
  storage="sqlite:///db.sqlite3",
  study_name="hp_search",
  load_if_exists=True,
)

[I 2024-05-30 11:01:30,505] Using an existing study with name 'hp_search' instead of creating a new one.


Step,Training Loss


[I 2024-05-30 11:02:26,778] Trial 2 finished with values: [1.3980571031570435, 0.4571428596973419] and parameters: {'learning_rate': 3.951573285238236e-06, 'per_device_train_batch_size': 4}. 


Step,Training Loss


[I 2024-05-30 11:03:21,430] Trial 3 finished with values: [1.6044014692306519, 0.20714285969734192] and parameters: {'learning_rate': 2.1641460651709705e-06, 'per_device_train_batch_size': 16}. 


Step,Training Loss


[I 2024-05-30 11:04:16,250] Trial 4 finished with values: [1.126822590827942, 0.550000011920929] and parameters: {'learning_rate': 2.4158437741894973e-05, 'per_device_train_batch_size': 16}. 


Step,Training Loss


[I 2024-05-30 11:05:11,116] Trial 5 finished with values: [1.6056479215621948, 0.20714285969734192] and parameters: {'learning_rate': 1.9014668679999422e-06, 'per_device_train_batch_size': 16}. 


Step,Training Loss


In [None]:
# First entry on its own line, then the complete search result.
print(best_trials[0], best_trials, sep='\n')

In [None]:
import numpy as np

# Run inference on the validation split with whatever model `trainer` currently holds.
# NOTE(review): after the hyperparameter search, that is presumably the model from the
# last trial rather than the best one — confirm before relying on these numbers.
predictions = trainer.predict(tokenized_datasets["valid"])
# Predicted class = index of the largest score along the last axis.
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from matplotlib import pyplot as plt

# Pin the matrix to the full set of label codes so its shape always matches
# `labels_map`, even if some class is absent from both the true and predicted labels.
class_ids = list(range(len(labels_map)))
cm = confusion_matrix(predictions.label_ids, preds, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels_map)
disp.plot()


In [None]:
from sklearn.metrics import classification_report

# Pass the label codes explicitly so `target_names` always lines up one-to-one with
# the reported classes, even when a class does not occur in this split.
print(classification_report(
  predictions.label_ids,
  preds,
  labels=list(range(len(labels_map))),
  target_names=labels_map,
))

In [None]:
# Save the trainer's current model into the 'end' subfolder of this run's MODEL_DIR.
trainer.save_model(MODEL_DIR + '/end')