### Import Libraries

In [5]:
import spacy 
from spacy import displacy
import pandas as pd

# Record the spaCy version in the cell output. A bare expression that is not the
# last line of a cell is evaluated and silently discarded, so print it explicitly.
print("spaCy version:", spacy.__version__)

import numpy
# Seed NumPy's global random number generator. This affects only NumPy; the
# standard-library `random` module is a separate generator.
numpy.random.seed(0)

### Download and Load spaCy Language Model

In [6]:
#Download spacy small model
!python -m spacy download en_core_web_sm
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


### Updating NER

In [7]:
# Fetch the pipeline component registered under the name "ner" from the loaded
# model; the custom entity labels are added to this object further down via
# `ner.add_label`.
ner = nlp.get_pipe("ner")

## Load processed (and cleaned) Reddit data

* For each sentence, go through all of its word-tag pairs (except those tagged "NONE") and calculate the start and end index of each word.
* After getting a (start, end) pair, check whether it was already calculated (i.e., the start_index, the end_index, or both match a pair already in the list). If so, discard the pair and search again, starting after the discarded occurrence.

In [8]:
import pandas as pd
import re
from numpy.core.defchararray import find

# Build DATA: a list of (lower-cased text, {"entities": [(start, end, tag), ...]})
# tuples, one per CSV row, where start/end are character offsets of each tagged
# word inside the lower-cased text.
col_names = ['text', 'entities']

data = pd.read_csv('./processed_data.csv', names=col_names)
entity_list = data.entities.to_list()

DATA = []

for index, ent in enumerate(entity_list):
  # Rows whose entities cell is the literal "split_sentences" carry no
  # annotations (presumably the CSV header row, read as data because
  # `names=` is passed) -- skip them.
  if(ent=="split_sentences"):
    continue

  text_lower = data['text'][index].lower()

  # Cut the stringified list of pairs at the "), (" separators, then strip
  # the leading "[(" from the first piece and the trailing ")]" from the last.
  # ("[)]]" is the character class "[)]" followed by a literal "]".)
  ent = ent.split("), (")
  ent[0] = re.sub("[([]", "", ent[0])
  ent[-1] = re.sub("[)]]", "", ent[-1])

  # (start, end) pairs already assigned in this text. The (-1, -1) placeholders
  # are inert: a "not found" result (-1) is handled before any membership test.
  indices_list = [(-1, -1), (-1, -1)]

  annot_list = []

  # Analyze every word-tag pair of the current row
  for word_pair in ent:
    # The quoted substrings of the pair are the word and its tag
    word_pair_list = word_pair.split("'")[1::2]
    if word_pair_list[1] == "NONE":
      continue

    # Remove any leading or trailing blank space
    word = word_pair_list[0].strip()
    tag = word_pair_list[1]

    # str.find returns a plain int (-1 when the word is absent)
    start_index = text_lower.find(word)
    end_index = start_index + len(word)

    # Word does not occur in the text at all: report it and stop processing
    # the remaining pairs of this row.
    if start_index == -1:
      print("-1 error")
      print(data['text'][index])
      break

    # If this start and/or end offset is already taken by an earlier
    # annotation, move on to the next occurrence of the word. The loop also
    # stops when no further occurrence exists; without that check the search
    # would restart from the beginning of the text and never terminate.
    while start_index != -1 and any(
        start_index == used_start or end_index == used_end
        for used_start, used_end in indices_list):
      start_index = text_lower.find(word, end_index + 1)
      end_index = start_index + len(word)

    if start_index == -1:
      # More tags for this word than occurrences in the text: skip this tag.
      print("Skipping tag: no unused occurrence left for", repr(word))
      continue

    indices_list.append((start_index, end_index))
    annot_list.append((start_index, end_index, tag))

  DATA.append((text_lower, {"entities": annot_list}))
  # print(indices_list)



Randomly pull out 5 segments for test data

In [9]:
import random
# Number of shuffled examples held out for testing
TEST_SIZE = 5

# Seed the standard-library generator before shuffling: `random.shuffle` does
# not use NumPy's generator, so without this the train/test split would change
# on every run.
# Note: DATA is shuffled in place, so re-running only this cell reshuffles the
# already shuffled list; re-run the data-loading cell first for the same split.
random.seed(0)
random.shuffle(DATA)

# The first TEST_SIZE elements form the test data after shuffling
TEST_DATA = DATA[:TEST_SIZE]

for text, annotations in TEST_DATA:
  print(text)
  print(annotations)

# Everything after the held-out examples is used for training
TRAIN_DATA = DATA[TEST_SIZE:]
print("\n")

print("\nLength of test data: ", len(TEST_DATA))
print("Length of train data: ", len(TRAIN_DATA))


sometimes it 2019s just easier in given situations to hit a note on a different string like if you 2019re doing a pattern that you would have to break if you wanted to go to the next string maybe i don 2019t know not a guitar player other than basic chords there 2019s just options and in given situations some are easier and some are more difficult
{'entities': [(219, 225, 'INSTR')]}
use a low or medium tension classical guitar string set the guitar d is wound for your low g strings d g b e tune to g c e a the same tuning and tension as capo 5 on a guitar i did a bunch of research on various strings and their compatibility on the ukulele some are not suitable as the tension is too high hard i use augustine black classical guitar strings on my tenor they will work on concert and soprano as well d addario actually repackage their classical strings as uke strings
{'entities': [(38, 44, 'INSTR'), (60, 66, 'INSTR'), (251, 258, 'INSTR')]}
more about his tone rather than the actual guitars but

## Adding labels to the `ner`


In [10]:
# Register the tag of every training entity with the NER component.
# The third item of each entity tuple is its label; the generator is lazy, so
# labels are added in the same order as they appear in TRAIN_DATA.
training_labels = (
  entity[2]
  for _text, annotation in TRAIN_DATA
  for entity in annotation.get("entities")
)
for label in training_labels:
  ner.add_label(label)

### Disable pipeline components that are not being changed

In [11]:
# Component names that must stay enabled during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
# Every other component currently in the pipeline is collected for disabling
unaffected_pipes = list(
  filter(lambda name: name not in pipe_exceptions, nlp.pipe_names)
)

### Train NER

In [12]:
# Import requirements
from spacy.util import minibatch, compounding
from pathlib import Path

DROPOUT = 0.1     # dropout rate passed to nlp.update - makes it harder to memorise data
ITERATIONS = 64   # number of passes over TRAIN_DATA

# Update the model while the components listed in `unaffected_pipes` are disabled
with nlp.disable_pipes(*unaffected_pipes):
  for iteration in range(ITERATIONS):
    # Re-order the examples at the start of every pass
    random.shuffle(TRAIN_DATA)
    losses = {}
    # Fresh batch-size schedule per pass: starts at 4.0, compounds by a factor
    # of 1.001 and is capped at 32.0
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for batch in minibatch(TRAIN_DATA, size=batch_sizes):
      # Split the batch of (text, annotation) pairs into two parallel tuples
      texts, annotations = zip(*batch)
      nlp.update(texts, annotations, drop=DROPOUT, losses=losses)

### Test on TEST_DATA

In [13]:
# Run the updated pipeline over each held-out text and list what it tagged
for sample_text, _gold in TEST_DATA:
  print(sample_text)
  doc = nlp(sample_text)
  predicted = [(span.text, span.label_) for span in doc.ents]
  print("Entities", predicted)

sometimes it 2019s just easier in given situations to hit a note on a different string like if you 2019re doing a pattern that you would have to break if you wanted to go to the next string maybe i don 2019t know not a guitar player other than basic chords there 2019s just options and in given situations some are easier and some are more difficult
Entities []
use a low or medium tension classical guitar string set the guitar d is wound for your low g strings d g b e tune to g c e a the same tuning and tension as capo 5 on a guitar i did a bunch of research on various strings and their compatibility on the ukulele some are not suitable as the tension is too high hard i use augustine black classical guitar strings on my tenor they will work on concert and soprano as well d addario actually repackage their classical strings as uke strings
Entities [('guitar', 'INSTR'), ('guitar', 'INSTR'), ('soprano', 'INSTR')]
more about his tone rather than the actual guitars but you might like this as 

### Print original TEST_DATA

In [14]:
# Show each held-out text followed by its reference annotations
for held_out in TEST_DATA:
  print(held_out[0], held_out[1], sep="\n")

sometimes it 2019s just easier in given situations to hit a note on a different string like if you 2019re doing a pattern that you would have to break if you wanted to go to the next string maybe i don 2019t know not a guitar player other than basic chords there 2019s just options and in given situations some are easier and some are more difficult
{'entities': [(219, 225, 'INSTR')]}
use a low or medium tension classical guitar string set the guitar d is wound for your low g strings d g b e tune to g c e a the same tuning and tension as capo 5 on a guitar i did a bunch of research on various strings and their compatibility on the ukulele some are not suitable as the tension is too high hard i use augustine black classical guitar strings on my tenor they will work on concert and soprano as well d addario actually repackage their classical strings as uke strings
{'entities': [(38, 44, 'INSTR'), (60, 66, 'INSTR'), (251, 258, 'INSTR')]}
more about his tone rather than the actual guitars but

### Extracting Entities
(Not used)

In [26]:
# Print the values of each test example's annotation dict as a list
for _text, gold in TEST_DATA:
  print([value for value in gold.values()])

dict_values([[(219, 225, 'INSTR')]])
dict_values([[(38, 44, 'INSTR'), (60, 66, 'INSTR'), (251, 258, 'INSTR')]])
dict_values([[(43, 50, 'INSTR')]])
dict_values([[(394, 400, 'INSTR'), (405, 410, 'INSTR')]])
dict_values([[(105, 111, 'INSTR')]])


In [44]:
# Print the entity tuples of each test example; the annotation dict's values
# are unpacked as the argument(s) to list()
for example in TEST_DATA:
  annotation_values = example[1].values()
  print(list(*annotation_values))

[(219, 225, 'INSTR')]
[(38, 44, 'INSTR'), (60, 66, 'INSTR'), (251, 258, 'INSTR')]
[(43, 50, 'INSTR')]
[(394, 400, 'INSTR'), (405, 410, 'INSTR')]
[(105, 111, 'INSTR')]


In [29]:
# Display the annotation dict of the first held-out example
_first_text, first_annotations = TEST_DATA[0]
first_annotations

{'entities': [(219, 225, 'INSTR')]}

### Evaluate scores on TEST_DATA

In [54]:
from spacy.gold import GoldParse
from spacy.scorer import Scorer

scorer = Scorer()

for text, annot in TEST_DATA:
  # Gold standard: an unprocessed Doc of the text paired with the original
  # (correct) entity offsets
  gold = GoldParse(nlp.make_doc(text), entities=annot['entities'])

  # Score the pipeline's prediction for the same text against the gold standard
  scorer.score(nlp(text), gold)

# Read the accumulated scores once and report from that snapshot
all_scores = scorer.scores
print("All scores: ", all_scores)

print("\nents_p (aka Precision): ", all_scores['ents_p'])
print("ents_r (aka Recall): ", all_scores['ents_r'])
print("ents_f (aka fscore): ", all_scores['ents_f'])

All scores:  {'uas': 0.0, 'las': 0.0, 'las_per_type': {'advmod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'nsubj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'appos': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'root': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'prep': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'amod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'pobj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'aux': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'acl': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'det': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'dobj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'mark': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'advcl': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'relcl': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'xcomp': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'nummod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'neg': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'compound': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'npadvmod': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'cc': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'pcomp': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'acomp': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'conj': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'nmod': {'p'

## Test on custom unseen data

In [16]:
# Hand-written sentences that are not part of the dataset
custom_sentences = [
  "Play me a guitar, and it shouldn't be distorted.",
  "Give me a sharp cello.",
  "I used to play guitar, now I play violin and it has some kind of distortion.",
]

# Tag each sentence and print the (text, label) pair of every entity found
for sentence in custom_sentences:
  doc = nlp(sentence)
  print("Entities", [(span.text, span.label_) for span in doc.ents])


Entities [('guitar', 'INSTR')]
Entities [('sharp', 'QLTY')]
Entities [('guitar', 'INSTR'), ('violin', 'INSTR'), ('distortion', 'QLTY')]
