In [1]:
%run -i "../util/lang_utils.ipynb"

In [2]:
from datasets import load_dataset, Dataset, Features, Value, ClassLabel, Sequence, DatasetDict
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.model_selection import train_test_split
from evaluate import load

In [3]:
music_ner_df = pd.read_csv('../data/music_ner.csv')
def change_label(input_label):
    input_label = input_label.replace("_deduced", "")
    return input_label
music_ner_df["label"] = music_ner_df["label"].apply(change_label)
music_ner_df["text"] = music_ner_df["text"].apply(lambda x: x.replace("|", ","))
print(music_ner_df)

        id                                               text  start_offset  \
0    13434  i love radioheads kid a something similar , ki...             7   
1    13434  i love radioheads kid a something similar , ki...            61   
2    13435                anything similar to i fight dragons            20   
3    13436                music similar to ccrs travelin band            17   
4    13437                 songs similar to blackout by boris            17   
..     ...                                                ...           ...   
422  14028  songs like good news by mac miller , preferrab...            11   
423  14028  songs like good news by mac miller , preferrab...            24   
424  14030  something along the lines of either the chain ...            49   
425  14030  something along the lines of either the chain ...            29   
426  14032       heavy bass x gothic rap like oxygen by bones            29   

     end_offset          label  
0            17   

In [4]:
ids = list(set(music_ner_df["id"].values))
docs = {}
for id in ids:
    entity_rows = music_ner_df.loc[music_ner_df['id'] == id]
    text = entity_rows.head(1)["text"].values[0]
    doc = small_model(text)
    ents = []
    for index, row in entity_rows.iterrows():
        label = row["label"]
        start = row["start_offset"]
        end = row["end_offset"]
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        ents.append(span)
    doc.ents = ents
    docs[doc.text] = doc

In [5]:
data_file = "../data/music_ner_bio.bio"
tag_mapping = {"O": 0, "B-Artist": 1, "I-Artist": 2, "B-WoA": 3, "I-WoA": 4}
with open(data_file) as f:
    data = f.read()
tokens = []
ner_tags = []
spans = []
sentences = data.split("\n\n")
for sentence in sentences:
    words = []
    tags = []
    this_sentence_spans = []
    word_tag_pairs = sentence.split("\n")
    for pair in word_tag_pairs:
        (word, tag) = pair.split("\t")
        words.append(word)
        tags.append(tag_mapping[tag])
    sentence_text = " ".join(words)
    try:
        doc = docs[sentence_text]
    except:
        pass
    ent_dict = {}
    for ent in doc.ents:
        this_sentence_spans.append(f"{ent.label_}: {ent.text}")
    tokens.append(words)
    ner_tags.append(tags)
    spans.append(this_sentence_spans)


In [6]:
indices = range(0, len(spans))
train, test = train_test_split(indices, test_size=0.1)
train_tokens = []
test_tokens = []
train_ner_tags = []
test_ner_tags = []
train_spans = []
test_spans = []
for i, (token, ner_tag, span) in enumerate(zip(tokens, ner_tags, spans)):
    if i in train:
        train_tokens.append(token)
        train_ner_tags.append(ner_tag)
        train_spans.append(span)
    else:
        test_tokens.append(token)
        test_ner_tags.append(ner_tag)
        test_spans.append(span)        
        
print(len(train_spans))
print(len(test_spans))

539
60


In [7]:
training_df = pd.DataFrame({"tokens":train_tokens, "ner_tags": train_ner_tags, "spans": train_spans})
test_df = pd.DataFrame({"tokens": test_tokens, "ner_tags": test_ner_tags, "spans": test_spans})
training_df["text"] = training_df["tokens"].apply(lambda x: " ".join(x))
test_df["text"] = test_df["tokens"].apply(lambda x: " ".join(x))
training_df.dropna()
test_df.dropna()
print(test_df)

                                               tokens  \
0   [i, love, radioheads, kid, a, something, simil...   
1   [bluesy, songs, kinda, like, evil, woman, by, ...   
2                       [music, for, a, 1920s, party]   
3   [similar, to, 6, underground, by, sneaker, pimps]   
4   [gut, wrenching, screaming, over, acoustic, in...   
5                      [explosive, tracks, for, hiit]   
6   [i, recently, watched, this, film, and, really...   
7            [any, songs, that, make, you, emotional]   
8   [like, daydreaming, from, a, moon, shaped, poo...   
9   [looking, for, songs, similar, to, shes, gone,...   
10  [electronic, featuring, low, hymn, chant, simi...   
11  [song, similar, to, the, first, minute, of, ba...   
12                   [something, new, from, hip, hop]   
13  [songs, similar, to, the, brothers, brights, a...   
14           [looking, for, some, funky, inspiration]   
15    [songs, sounds, like, drive, by, by, lil, peep]   
16                        [sugg

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
#model = AutoModel.from_pretrained("bert-base-cased")
features = Features({'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 
            'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-Artist', 'I-Artist', 'B-WoA', 'I-WoA'], id=None), length=-1, id=None), 
            'spans': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
            'text': Value(dtype='string', id=None)
                    })
training_dataset = Dataset.from_pandas(training_df, features=features)
test_dataset = Dataset.from_pandas(test_df, features=features)
dataset = DatasetDict({"train":training_dataset, "test":test_dataset}) 
print(dataset["train"].features)
label_names = dataset["train"].features["ner_tags"].feature.names
print(dataset)

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-Artist', 'I-Artist', 'B-WoA', 'I-WoA'], id=None), length=-1, id=None), 'spans': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'text': Value(dtype='string', id=None)}
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'spans', 'text'],
        num_rows: 539
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'spans', 'text'],
        num_rows: 60
    })
})


In [12]:
def tokenize_adjust_labels(all_samples_per_split):
    tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["text"])
    total_adjusted_labels = []
    for k in range(0, len(tokenized_samples["input_ids"])):
        prev_wid = -1
        word_ids_list = tokenized_samples.word_ids(batch_index=k)
        existing_label_ids = all_samples_per_split["ner_tags"][k]
        i = -1
        adjusted_label_ids = []
        for wid in word_ids_list:
            if (wid is None):
                adjusted_label_ids.append(-100)
            elif (wid != prev_wid):
                i = i + 1
                adjusted_label_ids.append(existing_label_ids[i])
                prev_wid = wid
            else:
                label_name = label_names[existing_label_ids[i]]
                adjusted_label_ids.append(existing_label_ids[i])
        total_adjusted_labels.append(adjusted_label_ids)
    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

In [13]:
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True)

Map:   0%|          | 0/539 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [14]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [15]:
metric = load("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
      if(k not in flattened_results.keys()):
        flattened_results[k+"_f1"]=results[k]["f1"]

    return flattened_results

In [18]:
# Train model
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_names))
training_args = TrainingArguments(
    output_dir="./fine_tune_bert_output",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    weight_decay=0.01,
    logging_steps = 1000,
    run_name = "ep_10_tokenized_11",
    save_strategy='no'
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=238, training_loss=0.24636973853872604, metrics={'train_runtime': 29.2275, 'train_samples_per_second': 129.091, 'train_steps_per_second': 8.143, 'total_flos': 51149717072610.0, 'train_loss': 0.24636973853872604, 'epoch': 7.0})

In [19]:
# Evaluate model
trainer.evaluate()

{'eval_loss': 0.2255576103925705,
 'eval_overall_precision': 0.6911764705882353,
 'eval_overall_recall': 0.7014925373134329,
 'eval_overall_f1': 0.6962962962962963,
 'eval_overall_accuracy': 0.9460869565217391,
 'eval_Artist_f1': 0.7526881720430108,
 'eval_WoA_f1': 0.5714285714285713,
 'eval_runtime': 0.4747,
 'eval_samples_per_second': 126.388,
 'eval_steps_per_second': 8.426,
 'epoch': 7.0}

In [21]:
# Save model
trainer.save_model("../models/bert_fine_tuned")

In [46]:
# Use model
model = AutoModelForTokenClassification.from_pretrained("../models/bert_fine_tuned")
tokenizer = AutoTokenizer.from_pretrained("../models/bert_fine_tuned")

In [48]:
text = "music similar to morphine robocobra quartet | featuring elements like saxophone prominent bass"
from transformers import pipeline
pipe = pipeline(task="token-classification", model=model.to("cpu"), tokenizer=tokenizer, aggregation_strategy="simple")
pipe(text)
# tag_mapping = {"O": 0, "B-Artist": 1, "I-Artist": 2, "B-WoA": 3, "I-WoA": 4}

[{'entity_group': 'LABEL_0',
  'score': 0.99922186,
  'word': 'music similar to',
  'start': 0,
  'end': 16},
 {'entity_group': 'LABEL_1',
  'score': 0.96955717,
  'word': 'morphine robocobra',
  'start': 17,
  'end': 35},
 {'entity_group': 'LABEL_2',
  'score': 0.5487502,
  'word': 'quartet',
  'start': 36,
  'end': 43},
 {'entity_group': 'LABEL_0',
  'score': 0.99884653,
  'word': '| featuring elements like saxophone prominent bass',
  'start': 44,
  'end': 94}]