### Import Libraries

In [87]:
import spacy 
from spacy import displacy
import pandas as pd

# Bare expression: it is not the last statement of the cell, so Jupyter does not display it.
spacy.__version__

import numpy
# Seed NumPy's global RNG (this does not seed Python's built-in `random` module).
numpy.random.seed(0)

### Download and Load spaCy Language Model

In [88]:
#Download spacy small model
!python -m spacy download en_core_web_sm
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


### Updating NER

In [89]:
# Fetch the existing "ner" component from the loaded pipeline;
# the new entity labels are added to this object further down.
ner = nlp.get_pipe("ner")

## Load processed (and cleaned) Reddit data

* For every sentence, go through all of its word-tag pairs (except those tagged "NONE") and calculate the start and end index of the word in the text.
* After getting the (start, end) pair, check whether it was already calculated (i.e., the start index, the end index, or both match a pair already in the list). If so, discard the pair and search again from just after the discarded occurrence.

In [90]:
import pandas as pd
import re
from numpy.core.defchararray import find

col_names = ['text', 'entities']

data = pd.read_csv('./processed_data.csv', names=col_names)
entity_list = data.entities.to_list()


def find_free_span(text, word, used_spans):
  """Locate the first occurrence of `word` in `text` that does not clash with `used_spans`.

  A candidate clashes when its start equals the start of an already used span
  or its end equals the end of one.

  Args:
    text: string to search in.
    word: substring to look for.
    used_spans: iterable of (start, end) pairs that are already taken.

  Returns:
    A (start, end) tuple of plain ints, or None when every occurrence of
    `word` clashes with a used span (or `word` does not occur at all).
  """
  start = text.find(word)
  while start != -1:
    end = start + len(word)
    if all(start != used_start and end != used_end for used_start, used_end in used_spans):
      return start, end
    # Same step as before: resume the search one character past the rejected occurrence
    start = text.find(word, end + 1)
  return None


DATA = []

for raw_text, ent in zip(data['text'], entity_list):
  # Skip the row whose entities cell is the literal column name (presumably the CSV header row)
  if ent == "split_sentences":
    continue

  # Lowercase once; all offsets below refer to this lowercased text
  text = raw_text.lower()

  # Break the stringified list of (word, tag) tuples into one chunk per tuple
  ent = ent.split("), (")
  ent[0] = re.sub("[([]", "", ent[0])
  ent[-1] = re.sub("[)]]", "", ent[-1])

  # (start, end) pairs already assigned in this sentence.
  # No (-1, -1) sentinel here: a sentinel would make a failed search (-1) look
  # like an "already used" start and restart the search forever.
  indices_list = []
  annot_list = []

  # Analyze all word-tag pairs of the current sentence
  for word_pair in ent:
    # The quoted pieces are the word and its tag
    word_pair_list = word_pair.split("'")[1::2]
    if word_pair_list[1] == "NONE":
      continue

    # Remove any leading or trailing blank space
    word = word_pair_list[0].strip()
    label = word_pair_list[1]

    # Not expected to happen, kept as a sanity check:
    # the tagged word does not occur in the text at all
    if word not in text:
      print("-1 error")
      print(raw_text)
      break

    span = find_free_span(text, word, indices_list)
    if span is None:
      # The word is tagged more often than it occurs; drop the surplus tag
      print("no free occurrence left for:", word)
      print(raw_text)
      continue

    indices_list.append(span)
    annot_list.append((span[0], span[1], label))

  DATA.append((text, {"entities": annot_list}))
  # print(indices_list)



Randomly pull out 5 segments for test data

In [91]:
import random

# Number of examples held out for testing
TEST_SIZE = 5

# Seed Python's own RNG: random.shuffle does not use NumPy's seed,
# so without this the train/test split changes on every run.
random.seed(0)
random.shuffle(DATA)

# First TEST_SIZE elements form the test data after shuffling
TEST_DATA = DATA[:TEST_SIZE]

for text, annotations in TEST_DATA:
  print(text)
  print(annotations)

# Everything else is used for training (a new list, independent of DATA)
TRAIN_DATA = DATA[TEST_SIZE:]
print("\n")

print("\nLength of test data: ", len(TEST_DATA))
print("Length of train data: ", len(TRAIN_DATA))


sometimes it 2019s just easier in given situations to hit a note on a different string like if you 2019re doing a pattern that you would have to break if you wanted to go to the next string maybe i don 2019t know not a guitar player other than basic chords there 2019s just options and in given situations some are easier and some are more difficult
{'entities': [(219, 225, 'INSTR')]}
electric guitar neck pick up bass set to 11
{'entities': [(0, 15, 'INSTR'), (29, 33, 'INSTR')]}
example the composition that i am trying to write would really benifit from a thick bass
{'entities': [(78, 83, 'QLTY'), (84, 88, 'INSTR')]}
dubstep drops like the ones au5 and virtual riot create are far more heavily focused on rhythm and sound design than traditional music theory they are absolutely writing in a different way than you re used to it s the groove that s most important dubstep producers almost always start with a drum pattern and a bass line anything else is just icing on the cake clean sounding d

## Adding labels to the `ner`


In [92]:
# Register every entity label that occurs in the training annotations with the NER component
training_labels = (
  span[2]
  for _, annots in TRAIN_DATA
  for span in annots.get("entities")
)
for label in training_labels:
  ner.add_label(label)

### Disable pipeline components that are not changed

In [93]:
# Pipes listed here are left out of `unaffected_pipes`; every other pipe name is collected
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = []
for pipe_name in nlp.pipe_names:
  if pipe_name not in pipe_exceptions:
    unaffected_pipes.append(pipe_name)

### Train NER

In [94]:
# Import requirements
from spacy.util import minibatch, compounding
from pathlib import Path

ITERATIONS = 64
DROPOUT = 0.1

# Update the model while the pipes listed in `unaffected_pipes` are disabled
with nlp.disable_pipes(*unaffected_pipes):
  for _epoch in range(ITERATIONS):
    # Present the examples in a new order on every pass
    random.shuffle(TRAIN_DATA)
    # Loss accumulator for this pass only
    losses = {}
    # Batch-size schedule restarts each pass: compounds from 4.0 towards 32.0 by a factor of 1.001
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for chunk in minibatch(TRAIN_DATA, size=batch_sizes):
      # Split the (text, annotation) pairs into two parallel sequences
      texts, annotations = zip(*chunk)
      # drop: dropout - make it harder to memorise data
      nlp.update(texts, annotations, drop=DROPOUT, losses=losses)

In [95]:
# Run the model on each held-out text and show the entities it predicts
for sample_text, _gold in TEST_DATA:
  print(sample_text)
  doc = nlp(sample_text)
  predicted = [(ent.text, ent.label_) for ent in doc.ents]
  print("Entities", predicted)

sometimes it 2019s just easier in given situations to hit a note on a different string like if you 2019re doing a pattern that you would have to break if you wanted to go to the next string maybe i don 2019t know not a guitar player other than basic chords there 2019s just options and in given situations some are easier and some are more difficult
Entities [('guitar', 'INSTR')]
electric guitar neck pick up bass set to 11
Entities [('electric guitar', 'INSTR'), ('bass', 'INSTR')]
example the composition that i am trying to write would really benifit from a thick bass
Entities [('thick bass', 'INSTR')]
dubstep drops like the ones au5 and virtual riot create are far more heavily focused on rhythm and sound design than traditional music theory they are absolutely writing in a different way than you re used to it s the groove that s most important dubstep producers almost always start with a drum pattern and a bass line anything else is just icing on the cake clean sounding drops are all ab

In [96]:
# Show each test text followed by its stored annotations, for comparison with the predictions
for text, annotations in TEST_DATA:
  print(text, annotations, sep="\n")

sometimes it 2019s just easier in given situations to hit a note on a different string like if you 2019re doing a pattern that you would have to break if you wanted to go to the next string maybe i don 2019t know not a guitar player other than basic chords there 2019s just options and in given situations some are easier and some are more difficult
{'entities': [(219, 225, 'INSTR')]}
electric guitar neck pick up bass set to 11
{'entities': [(0, 15, 'INSTR'), (29, 33, 'INSTR')]}
example the composition that i am trying to write would really benifit from a thick bass
{'entities': [(78, 83, 'QLTY'), (84, 88, 'INSTR')]}
dubstep drops like the ones au5 and virtual riot create are far more heavily focused on rhythm and sound design than traditional music theory they are absolutely writing in a different way than you re used to it s the groove that s most important dubstep producers almost always start with a drum pattern and a bass line anything else is just icing on the cake clean sounding d

## Test on custom unseen data

In [99]:
# Hand-written sentences that are not part of the dataset
custom_sentences = (
  "Play me a guitar, and it shouldn't be distorted.",
  "Give me a sharp cello.",
  "I used to play guitar, now I play violin and it has some kind of distortion.",
)

for sentence in custom_sentences:
  doc = nlp(sentence)
  print("Entities", [(ent.text, ent.label_) for ent in doc.ents])


Entities [('guitar', 'INSTR'), ('distorted', 'QLTY')]
Entities [('sharp cello', 'INSTR')]
Entities [('guitar', 'INSTR'), ('violin', 'INSTR'), ('distortion', 'QLTY')]
