# NER Evaluation of Augmented data

* This evaluation is done in Google Colab because of the enormous dataset size

* Link to the dataset: https://drive.google.com/file/d/1qdjtMgGyafPNN_x2EANkcGLz5S-SB8mG/view?usp=sharing

* The dataset is of size 2.6 GB, and contains more than 2M sentences.


## Mount Google Drive to get access to the CSV file

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# The augmented CSV (see BIG_DATA_PATH) is read from a path under this mount.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Install spaCy and download English model file

In [None]:
# !pip install cupy-cuda112
!pip install spacy==3.0.6

In [12]:
# Download spacy small model
!python -m spacy download en_core_web_sm

## Utilities

In [7]:
import math
import pickle
import re
import numpy
from numpy.core.defchararray import find
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np

# Number of training epochs (passed to NerModel as n_iter in the main script)
ITERATIONS = 5
# Dropout rate (passed to NerModel as dropout in the main script)
DROPOUT = 0.1
# Optimizer learning rate (passed to NerModel as lr in the main script)
LEARN_RATE = 0.001

# Path to a CSV in the working directory
# NOTE(review): not referenced anywhere in the visible code - confirm it is still needed
DATA_PATH = "./data_filter_csv.csv"

# Path to the 2.6 GB augmented data
BIG_DATA_PATH = "/content/gdrive/MyDrive/spacy_ner_data/augmented_dataset.csv"


def load_cleaned_data(data_path=BIG_DATA_PATH):
    """
    Convert the word/tag CSV at ``data_path`` into spaCy-style NER tuples.

    The CSV is read with two columns, ``text`` and ``entities``. Every
    ``entities`` cell is the string form of a list of ``('word', 'TAG')``
    pairs. For each pair whose tag is not "NONE", the word is searched for in
    the lower-cased sentence and its (start, end) character span is recorded.

    A candidate span is rejected, and the search resumes further to the right,
    when
      * its start offset or its end offset is already used by an earlier span
        of the same sentence (prevents overlapping with tagged words), or
      * it is not delimited by a blank space, or by the beginning / end of the
        sentence, on both sides (prevents matching inside a longer word).

    If a word cannot be placed at all, the remaining pairs of that sentence
    are skipped; the sentence is still returned with the spans found so far.

    :param data_path: path of the CSV file to read
    :return: DATA, a list of
             ``(lowercased_text, {"entities": [(start, end, tag), ...]})``
             where start/end are plain ``int`` character offsets
    """
    col_names = ['text', 'entities']

    data = pd.read_csv(data_path, names=col_names)
    text_list = data.text.to_list()
    entity_list = data.entities.to_list()

    DATA = []

    for index, ent in enumerate(entity_list):
        # Presumably the CSV header row, which is read as data because
        # explicit column names are passed to read_csv.
        if ent == "tokens":
            continue

        # Break "[('w1', 'T1'), ('w2', 'T2')]" into one string per pair and
        # strip the outer brackets from the first and last piece.
        ent = ent.split("), (")
        ent[0] = re.sub("[([]", "", ent[0])
        ent[-1] = re.sub("[)]]", "", ent[-1])

        # All offsets are computed on (and checked against) the lower-cased
        # text, because that is the text stored in DATA.
        text = text_list[index].lower()

        # Offsets already claimed by accepted spans of this sentence
        used_starts = set()
        used_ends = set()

        annot_list = []

        for word_pair in ent:
            # The quoted fields of "'word', 'TAG'" are the odd-indexed pieces
            # obtained by splitting on the apostrophe.
            word_pair_list = word_pair.split("'")[1::2]
            tag = word_pair_list[1]
            if tag == "NONE":
                continue

            # Remove any leading or trailing blank space
            word = word_pair_list[0].strip()

            start_index = text.find(word)

            # In case the word is not found in the sentence at all
            if start_index == -1:
                print("-1 error")
                print(text_list[index])
                break

            placed = False
            while start_index != -1:
                end_index = start_index + len(word)

                # A span at offset 0 has no left neighbour; testing
                # text[start_index - 1] there would wrap around to the last
                # character of the sentence.
                left_ok = start_index == 0 or text[start_index - 1] == " "
                right_ok = end_index == len(text) or text[end_index] == " "

                if (start_index in used_starts or end_index in used_ends
                        or not left_ok or not right_ok):
                    # Try the next occurrence to the right of this candidate
                    start_index = text.find(word, end_index + 1)
                else:
                    used_starts.add(start_index)
                    used_ends.add(end_index)
                    placed = True
                    break

            if not placed:
                # Don't bother checking rest of the current sentence
                break

            annot_list.append((start_index, end_index, tag))

        DATA.append((text, {"entities": annot_list}))

    return DATA


def save_list_to_txt(data, keyword):
    """
    Dump ``data`` to the text file ``<keyword>.txt``, one element per line.

    :param data: iterable whose elements are written via ``str()``
    :param keyword: output path without the ".txt" extension
    """
    out_path = keyword + ".txt"
    with open(out_path, 'w') as handle:
        handle.writelines(str(entry) + "\n" for entry in data)


def save_list_to_pickle(list, name):
    """
    Pickle ``list`` into ``data/<name>.pkl`` (relative to the working directory).

    :param list: object to serialize
    :param name: file name without directory or ".pkl" extension
    """
    target_dir = "data"

    # Make sure the output directory is there before opening the file
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    pickle_path = 'data/' + name + '.pkl'
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(list, out_file)


def load_list_from_pickle(filename):
    """
    Read back the object stored in ``data/<filename>.pkl``.

    :param filename: file name without directory or ".pkl" extension
    :return: the unpickled object
    """
    pickle_path = "data/" + filename + '.pkl'
    with open(pickle_path, 'rb') as in_file:
        return pickle.load(in_file)


def split_data(DATA):
    """
    Shuffle ``DATA`` in place and split it 90 / 5 / 5 into train, evaluation
    and test lists.

    :param DATA: list of samples; it is shuffled in place
    :return: (TRAIN_DATA, EVAL_DATA, TEST_DATA)
    """
    random.shuffle(DATA)

    # The first 10 % of the shuffled data is held out for eval + test,
    # everything after it is used for training.
    holdout_size = math.floor((10 / 100) * len(DATA))
    holdout = DATA[:holdout_size]
    TRAIN_DATA = DATA[holdout_size:]

    # Shuffle the held-out part again and cut it in half
    random.shuffle(holdout)
    half = math.floor((50 / 100) * len(holdout))
    EVAL_DATA, TEST_DATA = holdout[:half], holdout[half:]

    print("\n")

    print("\nTotal sentences: ", len(DATA))
    print("Length of train data: ", len(TRAIN_DATA))
    print("Length of evaluation data: ", len(EVAL_DATA))
    print("Length of test data: ", len(TEST_DATA))

    # The intermediate list is no longer needed
    holdout.clear()

    return TRAIN_DATA, EVAL_DATA, TEST_DATA


def plot_graph(title, keyword, precision=None, recall=None, fscore=None):
    """
    Plot precision / recall / F-score curves against the epoch number and
    save the figure as ``img/plot_<title>_<keyword>.png``.

    The x axis of every curve is derived from the length of that series
    (epochs ``1..len(series)``). A series may therefore be shorter than the
    number of training epochs, or empty (e.g. an entity type that never
    received a score), without making ``plt.plot`` fail on mismatched
    x / y lengths.

    :param title: first part of the plot title and of the output file name
    :param keyword: second part of the plot title and of the output file name
    :param precision: per-epoch precision values, or None to skip the curve
    :param recall: per-epoch recall values, or None to skip the curve
    :param fscore: per-epoch F-score values, or None to skip the curve
    """
    my_dpi = 200
    plt.rcParams['figure.figsize'] = 10, 5
    plt.figure(figsize=(1280 / my_dpi, 720 / my_dpi), dpi=my_dpi)

    curves = (
        ("precision", precision, 'red'),
        ("recall", recall, 'blue'),
        ("fscore", fscore, 'green'),
    )

    legend_to_show = ()
    for label, values, colour in curves:
        if values is None:
            continue
        x = list(range(1, len(values) + 1))
        plt.plot(x, values, color=colour, linestyle='solid', linewidth=1,
                 marker='o', markerfacecolor=colour, markersize=2)
        legend_to_show += (label,)

    plt.gca().legend(legend_to_show, loc='best')
    plt.xlabel('Epoch')
    plt.ylabel('Score')

    plt.title(title + " PRF Scores [" + keyword + "]")

    # If the directory does not exist, create it
    if not os.path.exists("img"):
        os.makedirs("img")

    plt.savefig("img/plot_" + title + "_" + keyword + ".png", format="png", dpi=my_dpi)
    # plt.show()


def draw_prf_graph(train_scores, keyword="", overall=True, instr=True, qlty=True, edge=True):
    """
    Collect per-epoch P / R / F values from a list of score dictionaries and
    hand them to ``plot_graph``.

    The overall values are read from the "ents_p", "ents_r" and "ents_f"
    keys; the per-type values are read from the "p", "r" and "f" keys of the
    "QLTY", "INSTR" and "EDGE" entries under "ents_per_type".

    :param train_scores: list with one score dictionary per epoch
    :param keyword: passed to ``plot_graph`` as the plot title
    :param overall: plot the overall curves only when this is exactly True
    :param instr: plot the INSTR curves only when this is exactly True
    :param qlty: plot the QLTY curves only when this is exactly True
    :param edge: plot the EDGE curves only when this is exactly True
    """
    # Overall series, addressed by the key they are stored under
    overall_series = {"ents_p": [], "ents_r": [], "ents_f": []}

    # Per-entity-type series, addressed by type and then by metric
    per_type_series = {
        "QLTY": {"p": [], "r": [], "f": []},
        "INSTR": {"p": [], "r": [], "f": []},
        "EDGE": {"p": [], "r": [], "f": []},
    }

    # Extract P, R, F from every epoch's score dictionary
    for epoch_score in train_scores:
        for key, cat in epoch_score.items():
            if key in overall_series:
                overall_series[key].append(cat)
            elif key == "ents_per_type":
                for attribute, value in cat.items():
                    buckets = per_type_series.get(attribute)
                    if buckets is None:
                        continue
                    for metric, sc in value.items():
                        if metric in buckets:
                            buckets[metric].append(sc)

    def _plot_type(name, label):
        series = per_type_series[label]
        plot_graph(title=keyword, keyword=name, precision=series["p"],
                   recall=series["r"], fscore=series["f"])

    if overall is True:
        plot_graph(title=keyword, keyword="overall",
                   precision=overall_series["ents_p"],
                   recall=overall_series["ents_r"],
                   fscore=overall_series["ents_f"])
    if qlty is True:
        _plot_type("qlty", "QLTY")
    if instr is True:
        _plot_type("instr", "INSTR")
    if edge is True:
        _plot_type("edge", "EDGE")


def draw_train_eval_compare_graph(train_scores, eval_scores):
    """
    Plot the per-epoch overall F-score ("ents_f") of the training and the
    evaluation scores in one figure, each with a dashed polynomial trend
    line, then save it as ``img/plot_fscore_train_vs_eval.png`` and show it.

    The x axis of each curve is derived from the number of F-scores found
    for it, and the ``img`` directory is created when missing.

    :param train_scores: list with one score dictionary per epoch (train data)
    :param eval_scores: list with one score dictionary per epoch (eval data)
    """
    train_fscore = [score["ents_f"] for score in train_scores if "ents_f" in score]
    eval_fscore = [score["ents_f"] for score in eval_scores if "ents_f" in score]

    my_dpi = 200
    plt.rcParams['figure.figsize'] = 10, 5
    plt.figure(figsize=(1280 / my_dpi, 720 / my_dpi), dpi=my_dpi)

    poly_order = 4

    for label, fscores, colour in (("train", train_fscore, 'red'),
                                   ("eval", eval_fscore, 'blue')):
        x = list(range(1, len(fscores) + 1))

        # The legend entry is attached to the measured curve itself, so it
        # cannot end up on one of the trend lines.
        plt.plot(x, fscores, color=colour, linestyle='solid', linewidth=1,
                 marker='o', markerfacecolor=colour, markersize=2, label=label)

        # A polynomial of degree d needs at least d + 1 points; cap the
        # degree for short series and skip the trend line when there are
        # fewer than two points.
        if len(fscores) >= 2:
            order = min(poly_order, len(fscores) - 1)
            trend = np.poly1d(np.polyfit(np.array(x), np.array(fscores), order))
            plt.plot(x, trend(x), color=colour, linestyle='--', linewidth=0.6,
                     marker='o', markerfacecolor=colour, markersize=1,
                     label='_nolegend_')

    plt.gca().legend(loc='best')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.title("F-Score vs Epochs")
    plt.ylim(0.00, 1.00)

    # If the directory does not exist, create it
    if not os.path.exists("img"):
        os.makedirs("img")

    plt.savefig("img/plot_fscore_train_vs_eval.png", format="png", dpi=my_dpi)
    plt.show()


def plot_training_loss_graph(losses, title):
    """
    Plot the training loss per epoch, save the figure as
    ``img/plot_loss_training.png``, show it, and dump the raw loss values to
    ``img/losses_list.txt``.

    :param losses: one loss value per epoch (ITERATIONS values are expected)
    :param title: title of the plot
    """
    my_dpi = 200
    width_in, height_in = 1280 / my_dpi, 720 / my_dpi
    plt.rcParams['figure.figsize'] = 10, 5
    plt.figure(figsize=(width_in, height_in), dpi=my_dpi)

    epochs = [epoch for epoch in range(1, ITERATIONS + 1)]
    line_style = dict(color='blue', linestyle='solid', linewidth=1,
                      marker='o', markerfacecolor='green', markersize=2)
    plt.plot(epochs, losses, **line_style)

    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title(title)

    # The output directory has to exist before the figure can be saved
    image_dir = "img"
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)

    image_path = "img/plot_loss_training" + ".png"
    plt.savefig(image_path, format="png", dpi=my_dpi)
    plt.show()

    # Keep the raw numbers next to the image
    save_list_to_txt(losses, "img/losses_list")


## Train + Evaluate

In [10]:
import numpy
import random
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding
from spacy.training import Example
from spacy.scorer import Scorer
from sklearn.base import BaseEstimator
import pickle

# Seed both random number generators for reproducibility: NumPy's, and the
# standard-library one that every random.shuffle call in this notebook uses.
numpy.random.seed(0)
random.seed(0)

def load_partially_trained_model():
    """
    Load the pipeline stored in ``./saved_model``.

    :return: (ner, nlp) - the pipeline's "ner" component and the pipeline
    """
    pipeline = spacy.load("./saved_model")
    return pipeline.get_pipe("ner"), pipeline

def load_spacy():
    """
    Load the ``en_core_web_sm`` pipeline.

    :return: (ner, nlp) - the pipeline's "ner" component and the pipeline
    """
    pipeline = spacy.load("en_core_web_sm")
    # Fetch the NER component of the pipeline
    return pipeline.get_pipe("ner"), pipeline


class NerModel(BaseEstimator):
    """
    Wrapper around a spaCy pipeline that fine-tunes and scores its "ner"
    component.

    Data passed to the methods is a list of
    ``(text, {"entities": [(start, end, label), ...]})`` tuples.
    """

    def __init__(self, ner, nlp, n_iter=64, dropout=0.1, lr=0.001, **model_hyper_parameters):
        """
        :param ner: the "ner" component of ``nlp``
        :param nlp: the spaCy pipeline to train
        :param n_iter: number of epochs run by every ``fit`` call
        :param dropout: dropout rate passed to ``nlp.update``
        :param lr: learning rate set on the optimizer
        :param model_hyper_parameters: accepted for compatibility; not used
        """
        super().__init__()
        self.ner = ner
        self.nlp = nlp
        self.n_iter = n_iter
        self.dropout = dropout
        self.lr = lr

    def clear_model(self):
        """Replace the current pipeline with a freshly loaded en_core_web_sm."""
        self.nlp = spacy.load("en_core_web_sm")
        self.ner = self.nlp.get_pipe("ner")

    def fit(self, train_data, eval_data):
        """ train the Named Entity Recognition model

        Runs ``self.n_iter`` epochs with dropout ``self.dropout`` and learning
        rate ``self.lr``. After every epoch the model is scored on the training
        data and on ``eval_data``; the evaluation scores are printed.

        :param eval_data: evaluation data for testing after every epoch
        :param train_data: processed training data
        :return: list with the evaluation "ents_f" score of every epoch
        """
        # Adding labels to the NER
        for _, annotations in train_data:
            for ent in annotations.get("entities"):
                self.ner.add_label(ent[2])

        # Disable pipeline components that are not changed
        pipe_exceptions = ["ner"]
        unaffected_pipes = [pipe for pipe in self.nlp.pipe_names if pipe not in pipe_exceptions]

        scorer = Scorer()

        # Store the PRF scores for every iteration
        train_scores = []
        eval_scores = []

        # Store losses after every iteration
        # Each loss is itself an average of losses within a single iteration
        loss_list = []

        # Train the NER model
        with self.nlp.select_pipes(enable=pipe_exceptions, disable=unaffected_pipes):
            # Create a list of Example objects
            examples = [Example.from_dict(self.nlp.make_doc(text), annots)
                        for text, annots in train_data]

            # Create an optimizer for the pipeline component, and set lr
            # NOTE: Cannot use nlp.initialize (v3) (aka nlp.begin_training for v2) on pretrained models.
            # Use nlp.create_optimizer for training on an existing model (we use pretrained en_core_web_sm).
            # ref: https://stackoverflow.com/a/66369163/6475377
            optimizer = self.nlp.create_optimizer()
            optimizer.learn_rate = self.lr

            for _ in range(self.n_iter):
                # shuffling examples before every iteration
                random.shuffle(examples)
                losses = {}

                # batch up the examples using spaCy's minibatch
                batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
                n_batches = 0
                for batch in batches:
                    self.nlp.update(
                        batch,
                        drop=self.dropout,  # dropout - make it harder to memorise data
                        losses=losses,
                        sgd=optimizer
                    )
                    n_batches += 1

                # Average loss per batch; with no training data there is no
                # batch and no "ner" entry, so fall back to 0.0 instead of
                # failing.
                loss = losses.get("ner", 0.0) / max(n_batches, 1)
                loss_list.append(loss)

                # After training every iteration, calculate scores on the
                # training data: pair each prediction with its gold-standard
                # entities (to be used by Scorer.score)
                example_list = [
                    Example.from_dict(self.nlp(text), {"entities": annot["entities"]})
                    for text, annot in train_data
                ]

                # Generate per-entity scores by comparing predicted with gold-standard values
                train_scores.append(scorer.score(examples=example_list))

                # Evaluate on eval_data
                eval_scores.append(self.evaluate(test_data=eval_data))
                print(eval_scores[-1])

        # Optional reporting hooks (train_scores / loss_list feed these):
        # draw_prf_graph(train_scores, keyword="train")
        # draw_prf_graph(eval_scores, keyword="eval")
        # draw_train_eval_compare_graph(train_scores, eval_scores)
        # plot_training_loss_graph(loss_list, "Losses with epochs")

        # self.nlp.to_disk("./saved_model")
        return [score["ents_f"] for score in eval_scores if "ents_f" in score]

    def evaluate(self, test_data):
        """ evaluate the trained NER model

        ``test_data`` is not modified.

        :param test_data: processed test data
        :return: the score dictionary produced by ``Scorer.score``
        """
        scorer = Scorer(self.nlp)
        example_list = []

        # Get the PRF scores for test_data
        for text, annot in test_data:
            # Create a Doc of our text
            doc_gold_text = self.nlp.make_doc(text)

            # Create gold-standard using the Doc of text
            # and original (correct) entities
            gold_standard = {"text": doc_gold_text, "entities": annot["entities"]}

            # Get the predictions of current test data sentence
            pred_value = self.nlp(text)

            # Create and append to the example list (of type Example) the prediction
            # as well as the gold standard (reference)
            example_list.append(Example.from_dict(pred_value, gold_standard))

        # Generate per-entity scores by comparing predicted with gold-standard values
        return scorer.score(examples=example_list)

    def test(self, test_data):
        """
        Perform final testing on unseen test_data and print the scores
        :param test_data: the unseen test data
        :return: None
        """
        print(self.evaluate(test_data))

    def predict(self, X):
        """ make inferences on unseen data

        The pipeline is reloaded from ``./saved_model`` on every call and
        replaces the one held by this object; the recognised entities are
        printed.

        :param X: sentence to make inferences on
        :return: None
        """
        self.nlp = spacy.load("./saved_model")
        # Keep the component reference in sync with the reloaded pipeline
        self.ner = self.nlp.get_pipe("ner")
        doc = self.nlp(X)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

    def k_cross_validation(self, data, k=10):
        """
        Run k-fold cross validation and print the per-fold F-scores.

        ``data`` is shuffled in place and cut into folds of ``len(data) // k``
        samples. Each fold is used once as evaluation set while the samples
        that do not occur in it are used for training. The pipeline is reset
        to en_core_web_sm after every fold.

        :param data: list of samples; it is shuffled in place
        :param k: number of folds
        :return: None
        """
        print(f"{k}-fold Cross Validation")
        random.shuffle(data)
        num_groups = int(len(data) / k)
        print(f"Size of each eval set: {num_groups}\n")
        batches = minibatch(data, size=num_groups)

        for count, batch in enumerate(batches):
            # Discard the last batch if it has very few example sentences
            if len(batch) > num_groups / 2:
                print(f"Fold no.: {count + 1}")
                train_data = [x for x in data if x not in batch]
                test_data = batch
                print(f"Train, Test :: {len(train_data)}, {len(test_data)}")
                fscore = self.fit(train_data=train_data, eval_data=test_data)
                print(f"fscore: {fscore}\n")

            self.clear_model()

    def train_by_parts(self, TRAIN, EVAL, n_parts=1000, checkpoint_every=10):
        """
        Train the NER model in parts by training (1/n_parts)th part at a time.

        TRAIN and EVAL are cut into ``n_parts`` consecutive chunks of
        ``len // n_parts`` samples (samples beyond the last full chunk are not
        used) and ``fit`` is called once per chunk pair. The evaluation
        F-scores of every part are appended to ``eval_mega_f_scores.txt``.
        The model is saved to ``./saved_model`` whenever the part index is a
        multiple of ``checkpoint_every`` (including part 0).

        :param TRAIN: full training data
        :param EVAL: full evaluation data
        :param n_parts: number of parts ("mega-iterations")
        :param checkpoint_every: save the model every this many parts
        :raises ValueError: if TRAIN has fewer than ``n_parts`` samples
        """
        train_size_mega_iter = int(len(TRAIN) / n_parts)
        eval_size_mega_iter = int(len(EVAL) / n_parts)

        if train_size_mega_iter == 0:
            raise ValueError(
                f"TRAIN has {len(TRAIN)} samples, too few to split into {n_parts} parts"
            )

        for mega_iteration in range(n_parts):
            print(f"Mega-iteration: {mega_iteration}")
            train_start = mega_iteration * train_size_mega_iter
            eval_start = mega_iteration * eval_size_mega_iter
            TRAIN_DATA = TRAIN[train_start:train_start + train_size_mega_iter]
            EVAL_DATA = EVAL[eval_start:eval_start + eval_size_mega_iter]
            eval_fscore = self.fit(train_data=TRAIN_DATA, eval_data=EVAL_DATA)

            with open("eval_mega_f_scores.txt", 'a') as f:
                f.writelines(f"{score}\n" for score in eval_fscore)
                f.write("\n")

            print(f"Mega Eval fscore: {eval_fscore[-1]}\n")

            # Create periodic checkpoints by saving the model after every
            # checkpoint_every-th mega-iteration
            if mega_iteration % checkpoint_every == 0:
                self.nlp.to_disk("./saved_model")
                print(f"Model saved @ mega-iteration: {mega_iteration}\n")



# Main Script

In [11]:
if __name__ == '__main__':
    print("spaCy version: ", spacy.__version__)

    # One-off data preparation: build the data from the CSV, split it and
    # pickle the three parts (already done, hence commented out).
    # ner, nlp = load_spacy()
    # DATA = load_cleaned_data()
    # TRAIN_DATA, EVAL_DATA, TEST_DATA = split_data(DATA)
    # save_list_to_pickle(TRAIN_DATA, "TRAIN_DATA")
    # save_list_to_pickle(EVAL_DATA, "EVAL_DATA")
    # save_list_to_pickle(TEST_DATA, "TEST_DATA")

    # Load back the partially trained model from last time
    ner, nlp = load_partially_trained_model()

    # Load pickled data list from data folder
    TRAIN_DATA = load_list_from_pickle("TRAIN_DATA")
    EVAL_DATA = load_list_from_pickle("EVAL_DATA")
    TEST_DATA = load_list_from_pickle("TEST_DATA")

    print("\nTrain + Evaluation")

    # Create the NER model class consisting of fit and evaluate methods.
    ner_model = NerModel(ner, nlp, n_iter=ITERATIONS, dropout=DROPOUT, lr=LEARN_RATE)

    # We're gonna use TEST (5% + 5% = 10%) for evaluation
    # TEST = EVAL_DATA + TEST_DATA
    print(f"Size of total TRAIN data: {len(TRAIN_DATA)}")
    print(f"Size of TEST data: {len(TEST_DATA)}")
    print(f"Size of EVAL data: {len(EVAL_DATA)}\n")

    ner_model.train_by_parts(TRAIN_DATA, EVAL_DATA)

    # ner_model.fit(TRAIN_DATA, EVAL_DATA)
    # Perform k-fold Cross Validation
    # ner_model.k_cross_validation(data=TRAIN_DATA + EVAL_DATA + TEST_DATA, k=10)

    print("\nPerforming final testing...")
    ner_model.test(TEST_DATA)

    # sentence = 'I really like the distortion in this guitar'
    # ner_model.predict(sentence)


spaCy version:  3.0.6

Train + Evaluation
Size of total TRAIN data: 2425500
Size of TEST data: 134750
Size of EVAL data: 134750

Mega-iteration: 0
{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.9227941176470589, 'ents_r': 0.9194139194139194, 'ents_f': 0.9211009174311927, 'ents_per_type': {'INSTR': {'p': 0.937888198757764, 'r': 0.9869281045751634, 'f': 0.9617834394904458}, 'QLTY': {'p': 0.9009009009009009, 'r': 0.8333333333333334, 'f': 0.8658008658008659}}}
{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.9581749049429658, 'ents_r': 0.9230769230769231, 'ents_f': 0.9402985074626866, 'ents_per_type': {'INSTR': {'p': 0.9867549668874173, 'r': 0.9738562091503268, 'f': 0.9802631578947368}, 'QLTY': {'p': 0.9196428571428571, 'r': 0.8583333333333333, 'f': 0.8879310344827586}}}
{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.9628252788104089, 'ents_r': 0.9487179487179487, 'ents_f': 0.955719557195572, 'ent

KeyboardInterrupt: ignored

## Archive the generated model/data/images

In [13]:
# !unzip /content/data.zip
!zip -r /content/data.zip /content/data
# !zip -r /content/saved_model.zip /content/saved_model

  adding: content/data/ (stored 0%)
  adding: content/data/TEST_DATA.pkl (deflated 66%)
  adding: content/data/TRAIN_DATA.pkl (deflated 66%)
  adding: content/data/EVAL_DATA.pkl (deflated 66%)
