### Import Libraries

In [48]:
import spacy

print("spaCy version: ", spacy.__version__)

import numpy
numpy.random.seed(0)

spaCy version:  3.0.6


### Download and Load Spacy Language Model

In [49]:
#Download spacy small model
!python -m spacy download en_core_web_sm
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

2021-05-18 18:52:51.063488: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-05-18 18:52:51.064232: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
You should consider upgrading via the 'C:\Users\archi\PycharmProjects\TSOAI\venv\Scripts\python.exe -m pip install --upgrade pip' command.


Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


###Updating NER###

In [50]:
# Getting the pipeline component
ner = nlp.get_pipe("ner")

## Load processed (and cleaned) Reddit data

* Go through every sentence's all word-tag pair (except "NONE") and calculate the start and end index.
* After getting the (start, end) pair, check if this pair was already calcualted (i.e., either the start_index, OR end_index, OR both are matching with the ones in list), and if so, discard the pair and continue calculuting again, skipping over the one discarded.

In [51]:
import pandas as pd
import re
from numpy.core.defchararray import find

col_names = ['text', 'entities']

data = pd.read_csv('./processed_data.csv', names=col_names)
entity_list = data.entities.to_list()

DATA = []

for index, ent in enumerate(entity_list):
  if(ent=="split_sentences"):
    continue
  
  ent = ent.split("), (")
  ent[0] = re.sub("[([]", "", ent[0])
  ent[-1] = re.sub("[)]]", "", ent[-1])

  # Initilize index list, to store pairs of (start, end) indices
  indices_list = [(-1, -1), (-1, -1)]

  annot_list = []
  start_index = 0
  end_index = 0

  # Analyze current "split_sentences"'s all word-pairs
  for index_ent, word_pair in enumerate(ent):
    # Split the word and its pair
    word_pair_list = word_pair.split("'")[1::2]
    if word_pair_list[1]!="NONE":

      # Remove any leading or beginning blank space
      word_pair_list[0] = word_pair_list[0].strip()

      start_index = find(data['text'][index].lower(), word_pair_list[0]).astype(numpy.int64)
      start_index = start_index + 0
      end_index = start_index + len(word_pair_list[0])

      # Doesn't happen, just for a check  
      if start_index == -1:
        print("-1 error")
        print(data['text'][index])
        break

      # Check if this start_index and/or end_index is already in the list:
      # (To prevent overlapping with already tagged words)
      while True:
        if ((start_index, end_index) in indices_list) or (end_index in [i[1] for i in indices_list]) or (start_index in [i[0] for i in indices_list]):
          start_index = find(data['text'][index].lower(), word_pair_list[0], start=end_index+1).astype(numpy.int64)
          start_index = start_index + 0
          end_index = start_index + len(word_pair_list[0])

        else:
          indices_list.append((start_index, end_index))
          break

      annot_list.append((start_index, end_index, word_pair_list[1]))

  DATA.append((data['text'][index].lower(), {"entities": annot_list}))
  # print(indices_list)



Randomly pull out 5 segments for test data

In [52]:
import random
random.shuffle(DATA)

# First 5 elements form test data after shuffling
TEST_DATA = DATA[:5]

for text, annotations in TEST_DATA:
  print(text)
  print(annotations)

TRAIN_DATA = DATA[5:len(DATA)]
print("\n")

# for text, annotations in TRAIN_DATA:
#   print(text)
#   print(annotations)

print("\nLength of test data: ", len(TEST_DATA))
print("Length of train data: ", len(TRAIN_DATA))


guitar amp sim in addition to my clean bass guitar i compress it really good and in a separate track i add a guitar amp sim on it with some light distortion find a balance that sounds good in the mix and it sounds good enough to take a bite out of
{'entities': [(0, 6, 'INSTR'), (33, 38, 'QLTY'), (140, 156, 'QLTY')]}
i learned to play folk on a classical about 30 years ago they re different instruments but they do both have the long sticky bit with 6 strings attached a classical neck is wider and flatter so get a classical as soon as you can but no i don t think you re screwed
{'entities': [(28, 37, 'INSTR')]}
post rock used to be a lot broader described bands with quite different sounds like slint tortoise or talk talk feels like now it is more narrow bands tend to sound like eits clones with that chiming guitar sound and big crescendos
{'entities': [(192, 199, 'QLTY'), (200, 206, 'INSTR'), (217, 231, 'QLTY')]}
the amount to which different pitches are present is different for instance

## Adding labels to the `ner`


In [53]:
for _, annotations in TRAIN_DATA:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])

### Disable pipeline components that is not changed

In [54]:
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

### Train NER

In [55]:
# Import requirements
from spacy.util import minibatch, compounding
from spacy.training import Example
# from pathlib import Path

ITERATIONS = 50
DROPOUT = 0.1

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):
    # Create a list of Examples objects
    examples = []
    for text, annots in TRAIN_DATA:
        examples.append(Example.from_dict(nlp.make_doc(text), annots))

    for iteration in range(ITERATIONS):
        # print("Iteration: ", iteration)
        # shuffling examples  before every iteration
        random.shuffle(examples)
        losses = {}

        # batch up the examples using spaCy's minibatch
        batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            
            nlp.update(
                        batch,
                        drop = DROPOUT,  # dropout - make it harder to memorise data
                        losses=losses
                    )
            # print("Losses", losses)

  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,


### Test on TEST_DATA

In [56]:
for example in TEST_DATA:
  print(example[0])
  doc = nlp(example[0])
  print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

guitar amp sim in addition to my clean bass guitar i compress it really good and in a separate track i add a guitar amp sim on it with some light distortion find a balance that sounds good in the mix and it sounds good enough to take a bite out of
Entities [('guitar', 'INSTR'), ('clean', 'QLTY'), ('bass guitar', 'INSTR'), ('distortion', 'QLTY')]
i learned to play folk on a classical about 30 years ago they re different instruments but they do both have the long sticky bit with 6 strings attached a classical neck is wider and flatter so get a classical as soon as you can but no i don t think you re screwed
Entities []
post rock used to be a lot broader described bands with quite different sounds like slint tortoise or talk talk feels like now it is more narrow bands tend to sound like eits clones with that chiming guitar sound and big crescendos
Entities [('guitar', 'INSTR')]
the amount to which different pitches are present is different for instance i think clarinets only have two freq

### Print original TEST_DATA

In [57]:
for text, annotations in TEST_DATA:
  print(text)
  print(annotations)

guitar amp sim in addition to my clean bass guitar i compress it really good and in a separate track i add a guitar amp sim on it with some light distortion find a balance that sounds good in the mix and it sounds good enough to take a bite out of
{'entities': [(0, 6, 'INSTR'), (33, 38, 'QLTY'), (140, 156, 'QLTY')]}
i learned to play folk on a classical about 30 years ago they re different instruments but they do both have the long sticky bit with 6 strings attached a classical neck is wider and flatter so get a classical as soon as you can but no i don t think you re screwed
{'entities': [(28, 37, 'INSTR')]}
post rock used to be a lot broader described bands with quite different sounds like slint tortoise or talk talk feels like now it is more narrow bands tend to sound like eits clones with that chiming guitar sound and big crescendos
{'entities': [(192, 199, 'QLTY'), (200, 206, 'INSTR'), (217, 231, 'QLTY')]}
the amount to which different pitches are present is different for instance

### Extracting Entities
(Not used)

In [58]:
for text, annotations in TEST_DATA:
  print(list(annotations.values()))

[[(0, 6, 'INSTR'), (33, 38, 'QLTY'), (140, 156, 'QLTY')]]
[[(28, 37, 'INSTR')]]
[[(192, 199, 'QLTY'), (200, 206, 'INSTR'), (217, 231, 'QLTY')]]
[[(84, 93, 'INSTR'), (168, 177, 'INSTR')]]
[[(207, 213, 'QLTY'), (218, 223, 'QLTY'), (286, 290, 'QLTY'), (291, 295, 'QLTY'), (296, 301, 'QLTY'), (681, 697, 'INSTR')]]


In [59]:
for ent_iterator in range(len(TEST_DATA)):
  print(list(*TEST_DATA[ent_iterator][1].values()))

[(0, 6, 'INSTR'), (33, 38, 'QLTY'), (140, 156, 'QLTY')]
[(28, 37, 'INSTR')]
[(192, 199, 'QLTY'), (200, 206, 'INSTR'), (217, 231, 'QLTY')]
[(84, 93, 'INSTR'), (168, 177, 'INSTR')]
[(207, 213, 'QLTY'), (218, 223, 'QLTY'), (286, 290, 'QLTY'), (291, 295, 'QLTY'), (296, 301, 'QLTY'), (681, 697, 'INSTR')]


In [60]:
TEST_DATA[0][1]

{'entities': [(0, 6, 'INSTR'), (33, 38, 'QLTY'), (140, 156, 'QLTY')]}

### Evaluate scores on TEST_DATA

In [70]:
# from spacy.gold import GoldParse
from spacy.scorer import Scorer

scorer = Scorer()
example_list = []

for text, annot in TEST_DATA:
  # Create a Doc of our text
  doc_gold_text = nlp.make_doc(text)

  # Create gold-standard using the Doc of text
  # and original (correct) entities

  # v2.x style:
  # gold = GoldParse(doc_gold_text, entities=annot['entities'])

  # v3.x style:
  # example = Example.from_dict(doc_gold_text, {"entities": annot["entities"]})

  pred_value = nlp(text)
  reference = (Example.from_dict(doc_gold_text, annot))
  gold_standard = {"text": doc_gold_text, "entities": annot["entities"]}

  example_list.append(Example.from_dict(pred_value, gold_standard))

# Generate per-entity scores by comparing predicted with gold-standard values
scores = scorer.score(examples=example_list)

print("All scores: ", scores)

print("\nents_p (aka Precision): ", scores['ents_p'])
print("ents_r (aka Recall): ", scores['ents_r'])
print("ents_f (aka fscore): ", scores['ents_f'])

print("\nINSTR: ", scores['ents_per_type']['INSTR'])
print("QLTY: ", scores['ents_per_type']['QLTY'])

All scores:  {'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': 0.45454545454545453, 'ents_r': 0.3333333333333333, 'ents_f': 0.3846153846153846, 'ents_per_type': {'INSTR': {'p': 0.42857142857142855, 'r': 0.5, 'f': 0.4615384615384615}, 'QLTY': {'p': 0.5, 'r': 0.2222222222222222, 'f': 0.30769230769230765}}, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}

ents_p (aka Precision):  0.45454545454545453
ents_r (aka Recall):  0.3333333333333333
ents_f (aka fscore):  0.3846153846153846

INSTR:  {'p': 0.42857142857142855, 'r': 0.5, 'f': 0.4615384615384615}
QLTY:  {'p': 0.5, 'r': 0.222222222

### Calculate ROC-AUC

`scorer.score_cats()` requires spaCy v3 :(

TODO: Fix ROC-AUC outputs

In [77]:
labels = ["QLTY", "INSTR"]

# for example in TEST_DATA:
cat_scores = scorer.score_cats(example_list, attr="cats", labels=labels)
# print(cat_scores)
for key, cat in cat_scores.items():
    print(key)
    if isinstance(cat, float) or isinstance(cat, str):
        print("\t", cat)
    else:
        for attribute, value in cat.items():
            print('\t{} : {}'.format(attribute, value))

cats_score
	 0.0
cats_score_desc
	 macro AUC
cats_micro_p
	 0.0
cats_micro_r
	 0.0
cats_micro_f
	 0.0
cats_macro_p
	 0.0
cats_macro_r
	 0.0
cats_macro_f
	 0.0
cats_macro_auc
	 0.0
cats_f_per_type
	QLTY : {'p': 0.0, 'r': 0.0, 'f': 0.0}
	INSTR : {'p': 0.0, 'r': 0.0, 'f': 0.0}
cats_auc_per_type
	QLTY : None
	INSTR : None


## Test on custom unseen data

In [63]:
doc = nlp("Play me a guitar, and it shouldn't be distorted.")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

doc = nlp("Give me a sharp cello.")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

doc = nlp("I used to play guitar, now I play violin and it has some kind of distortion.")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])


Entities [('distorted', 'QLTY')]
Entities []
Entities [('violin', 'INSTR'), ('distortion', 'QLTY')]
