In [51]:
%run -i "../util/lang_utils.ipynb"

In [52]:
import pandas as pd
from spacy.cli.train import train
from spacy.cli.evaluate import evaluate
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

In [53]:
# Music NER
# https://github.com/deezer/music-ner-eacl2023

In [56]:
music_ner_df = pd.read_csv('../data/music_ner.csv')
print(music_ner_df)

        id                                               text  start_offset  \
0    13434  i love radioheads kid a something similar | ki...             7   
1    13434  i love radioheads kid a something similar | ki...            61   
2    13435                anything similar to i fight dragons            20   
3    13436                music similar to ccrs travelin band            17   
4    13437                 songs similar to blackout by boris            17   
..     ...                                                ...           ...   
423  14028  songs like good news by mac miller | preferrab...            11   
424  14028  songs like good news by mac miller | preferrab...            24   
425  14030  something along the lines of either the chain ...            49   
426  14030  something along the lines of either the chain ...            29   
427  14032       heavy bass x gothic rap like oxygen by bones            29   

     end_offset                  label  
0         

In [57]:
# Change labels to Artist, Artist_or_WoA or WoA
def change_label(input_label):
    label_parts = input_label.split("_")
    relevant_parts = label_parts[0:-1]
    return "_".join(relevant_parts)

music_ner_df["label"] = music_ner_df["label"].apply(change_label)
print(music_ner_df)

        id                                               text  start_offset  \
0    13434  i love radioheads kid a something similar | ki...             7   
1    13434  i love radioheads kid a something similar | ki...            61   
2    13435                anything similar to i fight dragons            20   
3    13436                music similar to ccrs travelin band            17   
4    13437                 songs similar to blackout by boris            17   
..     ...                                                ...           ...   
423  14028  songs like good news by mac miller | preferrab...            11   
424  14028  songs like good news by mac miller | preferrab...            24   
425  14030  something along the lines of either the chain ...            49   
426  14030  something along the lines of either the chain ...            29   
427  14032       heavy bass x gothic rap like oxygen by bones            29   

     end_offset          label  
0            17   

In [None]:
train_db = DocBin()
test_db = DocBin()

In [58]:
# Get a unique list of unique ids
ids = list(set(music_ner_df["id"].values))
print(len(ids))
# Split ids into training and test
train_ids, test_ids = train_test_split(ids)
print(len(train_ids))
print(len(test_ids))

227
170
57


In [60]:
# Go through the list of ids and get all the rows associated with each id
for id in ids:
    entity_rows = music_ner_df.loc[music_ner_df['id'] == id]
    text = entity_rows.head(1)["text"].values[0]
    doc = small_model(text)
    ents = []
    for index, row in entity_rows.iterrows():
        label = row["label"]
        start = row["start_offset"]
        end = row["end_offset"]
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        ents.append(span)
    doc.ents = ents
    if id in train_ids:
        train_db.add(doc)
    else:
        test_db.add(doc)
train_db.to_disk('../data/music_ner_train.spacy')
test_db.to_disk('../data/music_ner_test.spacy')

nghtmre street 0 14 Artist_or_WoA
anar 17 21 Artist_or_WoA
nujabes atlas 25 38 Artist_or_WoA
gsh 21 24 Artist_or_WoA
gaye 25 29 Artist_or_WoA
save yourself stabbing westward 28 59 Artist_or_WoA
zimmers 54 61 Artist
blade runner 2049 62 79 WoA
the llamas with hats 21 41 Artist
outro 42 47 WoA
bon iver 17 25 Artist
iron & wine 30 41 Artist
tally hall 23 33 Artist
miracle musical 37 52 Artist
system of a down 21 37 Artist
amon tobin & kid koala 30 52 Artist
untitled 53 61 WoA
code orange 0 11 Artist
dreams in inertia 12 29 WoA
code orange 37 48 Artist
the sounds of eden 29 47 Artist_or_WoA
blackbear and gnash 48 67 Artist_or_WoA
the muffs tilt 57 71 Artist
be your own pet 72 87 Artist
the soviettes tweens 88 108 Artist
dog party 109 118 Artist
ach so gern 11 22 WoA
kid rocks 29 38 Artist
greatest show on earth 39 61 WoA
airport bar 16 27 WoA
noah 31 35 Artist
sweet 17 22 Artist
smino sudan 13 24 Artist
archives 25 33 Artist
fjk 34 37 Artist
jessie reyez 38 50 Artist
tash sultana 54 66 Art

In [61]:
# Train the model
train("../data/spacy_config_ner.cfg", output_path="../models/spacy_music_ner")

[38;5;4mℹ Saving to output directory: ../models/spacy_music_ner[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'parser', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS PARSER  LOSS NER  TAG_ACC  DEP_UAS  DEP_LAS  SENTS_F  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  -----------  -----------  --------  -------  -------  -------  -------  ------  ------  ------  ------
  0       0          0.00        85.39       265.35     63.00    35.26    22.92    14.84     3.66    0.00    0.00    0.00    0.18
  5     200        747.41      2873.85     10642.29   3960.21    77.27    69.25    59.52    55.17   37.07   36.19   38.00    0.60
 12     400        700.66       254.46      2183.84    370.70    77.92    72.38    63.15    82.26   44.55   44.12   45.00    0.63
 21     600        914.24       163.11      1288.12    407.55    79.24    69.08    61.67    71.21   39.80

In [62]:
# Use the trained model for prediction
nlp = spacy.load("../models/spacy_music_ner/model-last")
first_test_id = test_ids[0]
test_rows = music_ner_df.loc[music_ner_df['id'] == first_test_id]
input_text = entity_rows.head(1)["text"].values[0]
print(input_text)
print("Gold entities:")
for index, row in entity_rows.iterrows():
    label = row["label"]
    start = row["start_offset"]
    end = row["end_offset"]
    span = doc.char_span(start, end, label=label, alignment_mode="contract")
    print(span)
doc = nlp(input_text)
print("Predicted entities: ")
for entity in doc.ents:
    print(entity)

songs with themes of being unable to settle | ex hoziers someone new elle kings exes and ohs
Gold entities:
hoziers
someone new
elle kings
exes and ohs
Predicted entities: 
hoziers
someone new
elle kings
exes and


In [63]:
# Evaluate the model
evaluate('../models/spacy_music_ner/model-last', '../data/music_ner_test.spacy')

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'tag_acc': 0.800658978583196,
 'sents_p': 0.7352941176470589,
 'sents_r': 0.847457627118644,
 'sents_f': 0.7874015748031497,
 'dep_uas': 0.7089859851607585,
 'dep_las': 0.6364385820280297,
 'dep_las_per_type': {'root': {'p': 0.6176470588235294,
   'r': 0.711864406779661,
   'f': 0.6614173228346457},
  'prep': {'p': 0.819047619047619, 'r': 0.86, 'f': 0.8390243902439023},
  'det': {'p': 0.8372093023255814, 'r': 0.9, 'f': 0.8674698795180723},
  'amod': {'p': 0.7678571428571429,
   'r': 0.7166666666666667,
   'f': 0.7413793103448276},
  'pobj': {'p': 0.7333333333333333,
   'r': 0.7857142857142857,
   'f': 0.7586206896551724},
  'nsubj': {'p': 0.5217391304347826,
   'r': 0.5454545454545454,
   'f': 0.5333333333333332},
  'relcl': {'p': 0.3333333333333333,
   'r': 0.2222222222222222,
   'f': 0.26666666666666666},
  'dobj': {'p': 0.75, 'r': 0.5294117647058824, 'f': 0.6206896551724139},
  'advmod': {'p': 0.38461538461538464