# NER Evaluation of Augmented data

* This evaluation is done in Google Colab because of:
    * Enormous dataset size
    * Transformer-based architecture that requires a GPU


## Install spaCy and download English model file

In [None]:
# Optional CuPy wheel for CUDA 11.2; left disabled.
# !pip install cupy-cuda112
# Install spaCy pinned to version 3.0.6.
!pip install spacy==3.0.6

In [None]:
# Download two English spaCy pipelines: the small one (en_core_web_sm)
# and the transformer-based one (en_core_web_trf).
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

In [None]:
# Show the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

## Install torch

* Install the torch build specific to Google Colab's CUDA version
* CUDA version 11.1 works

In [None]:
# Install pinned torch / torchvision builds tagged for CUDA 11.1 (+cu111) plus torchaudio 0.9.0;
# -f adds the PyTorch wheel listing as an extra place for pip to look for these builds.
!pip3 install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

## Extract Project files

In [None]:
# Extract /content/project.zip into the current working directory.
# NOTE(review): presumably this provides the scripts/ and configs/ folders used by later cells - confirm.
!unzip /content/project.zip

## Pre-process and save to json

In [None]:
# Extract the augmented dataset archive into the current working directory.
# NOTE(review): the CSV paths used below expect an ./augmented_dataset_2021-06-21/ folder - confirm the archive layout.
!unzip /content/augmented_dataset_2021-06-21.zip

In [None]:
import pandas as pd
import os
import re
import numpy
from numpy.core.defchararray import find

# Folder holding the augmented dataset CSV splits.
DATASET_DIR = "./augmented_dataset_2021-06-21"

# One CSV per split; each is loaded with load_cleaned_data.
TRAIN_DATA_PATH = DATASET_DIR + "/train.csv"
TEST_CONTENT_DATA_PATH = DATASET_DIR + "/test_content.csv"
TEST_CONTEXT_DATA_PATH = DATASET_DIR + "/test_context.csv"
TEST_UNSEEN = DATASET_DIR + "/test_unseen.csv"

def _is_whole_word(text, start, end):
    """Return True when ``text[start:end]`` is delimited by spaces or by the string edges."""
    # A span starting at offset 0 has no left neighbour; indexing text[start - 1]
    # there would wrap around to the last character of the sentence.
    left_ok = start == 0 or text[start - 1] == " "
    right_ok = end == len(text) or text[end] == " "
    return left_ok and right_ok


def load_cleaned_data(data_path):
    """
    Build span annotations from a CSV of sentences and word/tag pairs.

    The first two CSV columns are read as ``text`` and ``entities``; a row whose
    ``entities`` cell equals "tokens" is treated as the header and skipped.
    Each ``entities`` cell is parsed as text shaped like
    ``[('word', 'TAG'), ('word', 'TAG'), ...]``.

    For every pair whose tag is not "NONE", the word is searched for in the
    lower-cased sentence. A match is accepted only if it is delimited by spaces
    (or the sentence edges) and neither its start nor its end offset has already
    been used by an earlier span of the same sentence; otherwise the search
    resumes after the rejected match. If a word cannot be placed, the remaining
    pairs of that sentence are skipped and the spans collected so far are kept.

    :param data_path: path of the CSV file to read
    :return: DATA, a list of
        ``{"text": <lower-cased sentence>, "entities": [(start, end, tag), ...]}``
    """
    col_names = ['text', 'entities']

    data = pd.read_csv(data_path, names=col_names, usecols=[0, 1])

    DATA = []

    for raw_text, ent in zip(data.text.to_list(), data.entities.to_list()):
        if ent == "tokens":
            continue

        # Offsets are reported against the lower-cased sentence, so every search
        # and boundary check below uses this same string.
        text = raw_text.lower()

        # Break the cell into one chunk per pair, then strip the list/tuple
        # brackets left on the first chunk and the closing ")]" left on the last.
        pairs = ent.split("), (")
        pairs[0] = re.sub("[([]", "", pairs[0])
        pairs[-1] = re.sub("[)]]", "", pairs[-1])

        # Start / end offsets already claimed by accepted spans of this sentence.
        used_starts = set()
        used_ends = set()
        annot_list = []

        for word_pair in pairs:
            # Every second piece of a split on "'" is the content of a quoted string.
            fields = word_pair.split("'")[1::2]
            tag = fields[1]
            if tag == "NONE":
                continue

            # Remove any leading or trailing blank space
            # NOTE(review): the word is matched as-is against the lower-cased
            # sentence, so tokens are presumably already lower-case - confirm.
            word = fields[0].strip()

            start_index = text.find(word)

            # In case the word is not found in the sentence at all
            if start_index == -1:
                print("-1 error")
                print(raw_text)
                break

            # Walk through successive occurrences until one is free and is a whole word.
            span = None
            while start_index != -1:
                end_index = start_index + len(word)
                if (start_index not in used_starts
                        and end_index not in used_ends
                        and _is_whole_word(text, start_index, end_index)):
                    span = (start_index, end_index)
                    break
                start_index = text.find(word, end_index + 1)

            if span is None:
                # Don't bother checking the rest of the current sentence
                break

            used_starts.add(span[0])
            used_ends.add(span[1])
            annot_list.append((span[0], span[1], tag))

        DATA.append({"text": text, "entities": annot_list})

    return DATA


# Convert each CSV split into a list of {"text", "entities"} records.
TRAIN_DATA = load_cleaned_data(TRAIN_DATA_PATH)
TEST_CONTENT = load_cleaned_data(TEST_CONTENT_DATA_PATH)
TEST_CONTEXT = load_cleaned_data(TEST_CONTEXT_DATA_PATH)
UNSEEN_DATA = load_cleaned_data(TEST_UNSEEN)



In [None]:
import json


def _write_jsonl(records, path):
    """Write ``records`` to ``path`` as JSON Lines: one JSON document per line."""
    with open(path, 'w') as f:
        for entry in records:
            json.dump(entry, f)
            f.write('\n')


# exist_ok avoids a separate existence check and makes the cell safe to re-run.
os.makedirs("assets", exist_ok=True)

# Export every split to its own file under assets/.
for jsonl_path, records in (
    ('assets/TRAIN_DATA.jsonl', TRAIN_DATA),
    ('assets/TEST_CONTENT.jsonl', TEST_CONTENT),
    ('assets/TEST_CONTEXT.jsonl', TEST_CONTEXT),
    ('assets/UNSEEN_DATA.jsonl', UNSEEN_DATA),
):
    _write_jsonl(records, jsonl_path)


In [None]:
!zip -r /content/assets.zip /content/assets

In [14]:
# Extract the exported JSONL files from /content/assets.zip into the current working directory.
# NOTE(review): presumably used to restore assets/ in a fresh runtime without re-running the preprocessing cells - confirm.
!unzip /content/assets.zip

Archive:  /content/assets.zip
   creating: assets/
  inflating: assets/TEST_CONTENT.jsonl  
  inflating: assets/TEST_CONTEXT.jsonl  
  inflating: assets/TRAIN_DATA.jsonl  
  inflating: assets/UNSEEN_DATA.jsonl  


## Convert the data to spaCy's binary format

In [15]:
# Run the project's "preprocess" command; per the log below it invokes
# scripts/preprocess.py to turn assets/*.jsonl into corpus/*.spacy files.
!python -m spacy project run preprocess

2021-06-25 15:24:54.441621: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[1m
Running command: /usr/bin/python3 scripts/preprocess.py assets/TRAIN_DATA.jsonl corpus/TRAIN_DATA.spacy
2021-06-25 15:24:58.646674: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Traceback (most recent call last):
  File "scripts/preprocess.py", line 31, in <module>
    typer.run(main)
  File "/usr/local/lib/python3.7/dist-packages/typer/main.py", line 859, in run
    app()
  File "/usr/local/lib/python3.7/dist-packages/typer/main.py", line 214, in __call__
    return get_command(self)(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/click/core.py", line 829, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/click/core.py", line 782, in main
    rv = self.invoke(ctx)
  File "/usr/local/lib/python3.7/dist-pa

## Check the config file

In [None]:
# Validate the training config and the corpus it points to before training.
!python -m spacy debug data configs/config.cfg

2021-06-25 11:17:13.327521: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[1m
[38;5;2m✔ Corpus is loadable[0m
Downloading: 100% 481/481 [00:00<00:00, 371kB/s]
Downloading: 100% 899k/899k [00:00<00:00, 3.52MB/s]
Downloading: 100% 456k/456k [00:00<00:00, 2.70MB/s]
Downloading: 100% 1.36M/1.36M [00:00<00:00, 4.07MB/s]
Downloading: 100% 501M/501M [00:13<00:00, 37.2MB/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint o

## Train

In [None]:
import spacy
# Ask spaCy to use the GPU inside this notebook kernel; the call returns True when a GPU was activated.
# NOTE(review): the `!python -m spacy ...` cells run as separate processes and select the GPU via --gpu-id instead.
spacy.prefer_gpu()

True

In [None]:
# Alternative entry point through the project file; left disabled.
# !python -m spacy project run train
# Train from configs/config.cfg on GPU 0 and write the resulting pipelines under training/.
# NOTE(review): the preprocess log shows corpus/TRAIN_DATA.spacy being produced, while this
# command reads corpus/fashion_brands_training.spacy and corpus/fashion_brands_eval.spacy -
# confirm these files exist or point --paths.train / --paths.dev at the generated corpus.
!python -m spacy train configs/config.cfg --output training/ --paths.train corpus/fashion_brands_training.spacy --paths.dev corpus/fashion_brands_eval.spacy --gpu-id 0

2021-06-25 11:29:50.403870: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;4mℹ Using GPU: 0[0m
[1m
[2021-06-25 11:30:06,473] [INFO] Set up nlp object from config
[2021-06-25 11:30:06,487] [INFO] Pipeline: ['transformer', 'ner']
[2021-06-25 11:30:06,492] [INFO] Created vocabulary
[2021-06-25 11:30:06,492] [INFO] Finished initializing nlp object
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you

## Evaluate

In [None]:
# Alternative entry point through the project file; left disabled.
# !python -m spacy project run evaluate
# Score training/model-best on the evaluation corpus using GPU 0 and save the metrics to training/metrics.json.
# NOTE(review): this reads corpus/fashion_brands_eval.spacy, not one of the TEST_* / UNSEEN_DATA
# splits exported earlier - confirm this is the intended evaluation set.
!python -m spacy evaluate training/model-best corpus/fashion_brands_eval.spacy --output training/metrics.json --gpu-id 0

2021-06-25 12:16:42.570823: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     99.99
NER P   75.41
NER R   77.31
NER F   76.35
SPEED   1619 

[1m

                    P       R       F
FASHION_BRAND   75.41   77.31   76.35

[38;5;2m✔ Saved results to training/metrics.json[0m


## Archive the generated model/data/images

In [None]:
# Archive / restore helpers for generated artifacts; only the image archive is currently enabled.
# !unzip /content/data.zip
# !unzip /content/saved_model.zip
# !zip -r /content/data.zip /content/data
# NOTE(review): zipping an absolute path stores members under content/img/ inside the archive - confirm that layout is wanted.
!zip -r /content/img.zip /content/img
# !zip -r /content/saved_model.zip /content/saved_model