# NER Evaluation of Augmented data

* This evaluation is done in Google Colab because of:
    * Enormous dataset size
    * Transformer based architecture involving GPU usage


## 1. Install spaCy and download English model file

In [None]:
# Install spaCy pinned to version 3.0.6.
# The CuPy wheel install below (CUDA 11.2 build, going by the package name) is left disabled.
# !pip install cupy-cuda112
!pip install spacy==3.0.6

In [None]:
# Download the spaCy English transformer pipeline (en_core_web_trf).
# The small-model alternative is kept here, disabled:
# !python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

In [None]:
# Print the GPU attached to this runtime along with driver/CUDA versions and memory usage
!nvidia-smi

Sun Jul  4 14:16:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 2. Install torch

* Install the torch build specific to Google Colab's CUDA version
* CUDA version 11.1 works
* Update: Not really required (on Google Colab at least), `en_core_web_trf` suffices.

In [None]:
# Optional step (see the "Update" note in this section's text).
# Installs pinned torch/torchvision wheels tagged +cu111, plus torchaudio, from the PyTorch wheel index.
!pip3 install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

## 3. Pre-process and save to jsonl

* Section 3.3 and 3.4 process the csv files all at once
* Section 3.5 processes the CSV files in batches

### 3.1 Extract the augmented dataset

In [None]:
# Extract the dataset archive into the current working directory;
# the loader paths defined in section 3.2 expect ./augmented_dataset_2021-06-30/
!unzip /content/augmented_dataset_2021-06-30.zip

### 3.2 Loader function

In [None]:
import pandas as pd
import os
import re
import numpy
from numpy.core.defchararray import find

# Folder produced by extracting the dataset archive; every CSV split lives directly inside it.
_DATASET_DIR = "./augmented_dataset_2021-06-30"

TRAIN_DATA_PATH = f"{_DATASET_DIR}/train.csv"
TEST_CONTENT_DATA_PATH = f"{_DATASET_DIR}/test_content.csv"
TEST_CONTEXT_DATA_PATH = f"{_DATASET_DIR}/test_context.csv"
TEST_UNSEEN = f"{_DATASET_DIR}/test_unseen.csv"

def _is_whole_word(sentence, start, end):
    """Return True when sentence[start:end] is bounded by a space or a sentence edge on both sides."""
    # A match at position 0 has no left neighbour. Indexing sentence[-1] here would
    # wrap around to the last character and wrongly reject the first word.
    left_ok = start == 0 or sentence[start - 1] == " "
    right_ok = end == len(sentence) or sentence[end] == " "
    return left_ok and right_ok


def load_cleaned_data(data_path, train_data_only=None, train_data_pd=None):
    """
    Convert (sentence, word-tag list) rows into records with character offsets.

    For every sentence, each word of its word-tag list is located in the
    lower-cased sentence and its (start, end) character offsets are recorded.
    A candidate occurrence is skipped, and the search resumes further right, when:
      * its start or its end offset was already assigned to an earlier word, or
      * it is not delimited by spaces / the sentence edges (i.e. it sits inside
        a longer word).
    If a word cannot be placed at all, the rest of that sentence's word-tag
    pairs are skipped; the record is still emitted with what was collected.

    Each entity string is parsed as a stringified list of single-quoted
    ('word', 'TAG') tuples. A row whose entity value is exactly "tokens" (the
    CSV header row) is skipped.

    :param data_path: CSV file to read; column 0 is used as 'text' and column 1
        as 'entities'. Only read when ``train_data_only`` is None.
    :param train_data_only: optional list of entity strings already in memory.
        When given, ``data_path`` is ignored.
    :param train_data_pd: DataFrame with a 'text' column whose index labels
        0..n-1 line up with the positions in ``train_data_only``.
    :return: DATA, a list of dicts with keys "text" (lower-cased sentence),
        "tokens" (every placed word with start/end), "spans" (placed words whose
        tag is not "NONE", with start/end/label) and "answer" (always "accept").
    """
    if train_data_only is None:
        col_names = ['text', 'entities']

        data = pd.read_csv(data_path, names=col_names, usecols=[0, 1])
        entity_list = data.entities.to_list()

    else:
        # The caller already holds the entity strings and the matching DataFrame,
        # so use them directly instead of reading the CSV.
        entity_list = train_data_only
        data = train_data_pd

    DATA = []

    for index, ent in enumerate(entity_list):
        if ent == "tokens":
            continue

        # Break "[('w1', 'T1'), ('w2', 'T2')]" into one chunk per pair and trim
        # the opening/closing brackets left on the first and last chunk.
        ent = ent.split("), (")
        ent[0] = re.sub("[([]", "", ent[0])
        ent[-1] = re.sub("[)]]", "", ent[-1])

        sentence = data['text'][index]
        # All offsets refer to the lower-cased sentence, which is also what gets saved.
        sentence_lower = sentence.lower()

        # Offsets already handed out in this sentence (prevents tagging the same spot twice)
        used_starts = set()
        used_ends = set()

        tokens_list = []
        spans_list = []

        for word_pair in ent:
            # Odd-numbered pieces of a split on "'" are the quoted values: [word, tag]
            word_pair_list = word_pair.split("'")[1::2]
            word = word_pair_list[0].strip()

            # NOTE(review): the word is searched as-is in the lower-cased sentence,
            # so a word containing upper-case letters is never found. Confirm the
            # dataset's entity words are already lower-case.
            start_index = sentence_lower.find(word)

            # In case the word is not found in the sentence at all
            if start_index == -1:
                print("\n-1 error")
                print("Couldn't find:")
                print(word)
                print("in:")
                print(sentence)
                break

            # Walk through the occurrences until one is free and stands alone as a word
            while start_index != -1:
                end_index = start_index + len(word)
                if (start_index not in used_starts
                        and end_index not in used_ends
                        and _is_whole_word(sentence_lower, start_index, end_index)):
                    break
                start_index = sentence_lower.find(word, end_index + 1)

            if start_index == -1:
                # No usable occurrence left: don't bother checking the rest of this sentence
                break

            used_starts.add(start_index)
            used_ends.add(end_index)

            # Add ALL the words and their positions to the "tokens" list
            tokens_list.append({"text": word, "start": start_index, "end": end_index})

            # Add only the specially tagged words to the "spans" list
            if word_pair_list[1] != "NONE":
                spans_list.append({"start": start_index, "end": end_index, "label": word_pair_list[1]})

        DATA.append({"text": sentence_lower, "tokens": tokens_list, "spans": spans_list, "answer": "accept"})

    return DATA


### 3.3 Convert the CSV files to Python list

In [None]:
# Parse one CSV split at a time with load_cleaned_data.
# Only the unseen test split is enabled; the other three calls are kept disabled.
# TRAIN_DATA = load_cleaned_data(TRAIN_DATA_PATH)
# TEST_CONTENT = load_cleaned_data(TEST_CONTENT_DATA_PATH)
# TEST_CONTEXT = load_cleaned_data(TEST_CONTEXT_DATA_PATH)
UNSEEN_DATA = load_cleaned_data(TEST_UNSEEN)

### 3.4 Save to JSONL

In [None]:
import json

def _write_jsonl(entries, path):
    """Write each entry of ``entries`` to ``path`` as one JSON document per line."""
    with open(path, 'w') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in entries)


# Output folder for the generated JSONL files
if not os.path.exists("assets"):
    os.makedirs("assets")

# Splits that are currently disabled (enable together with the matching line in section 3.3):
# _write_jsonl(TRAIN_DATA, 'assets/TRAIN_DATA.jsonl')
# _write_jsonl(TEST_CONTENT, 'assets/TEST_CONTENT.jsonl')
# _write_jsonl(TEST_CONTEXT, 'assets/TEST_CONTEXT.jsonl')

_write_jsonl(UNSEEN_DATA, 'assets/UNSEEN_DATA.jsonl')


### 3.5 Load, preprocess and save (to JSONL) the CSV data in batches

In [None]:
from pandas import DataFrame
from spacy.util import minibatch
import json

# Create assets directory if it doesn't already exist
os.makedirs("assets", exist_ok=True)

# Read the first two CSV columns into a pandas DataFrame
col_names = ['text', 'entities']
data = pd.read_csv(TEST_CONTEXT_DATA_PATH, names=col_names, usecols=[0, 1])

# Shuffle all rows. The seed is fixed so that re-running the cell produces the
# same batches (and therefore the same numbered output files).
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Calculate the batch size for (at most) `div` batches.
# Ceiling division keeps a remainder from spilling into an extra (div+1)-th batch,
# and max(1, ...) avoids a batch size of 0 when there are fewer rows than `div`.
tot_size = len(data)
div = 4
num_groups = max(1, -(-tot_size // div))
print(f"Size of each part: {num_groups}\n")

# Divide the data into batches; both iterators advance in lockstep
entity_list = data.entities.to_list()
entity_batches = minibatch(entity_list, size=num_groups)
data_batches = minibatch(data.values.tolist(), size=num_groups)

# Process each batch one by one, and save its result in a separate jsonl file
for count, (entity_batch, data_batch) in enumerate(zip(entity_batches, data_batches)):
    # if count < 10:
    #     # Continue from the desired last batch
    #     continue

    # Convert the batch rows back to a DataFrame (fresh 0..n-1 index, as the loader expects)
    data_df = DataFrame(data_batch, columns=col_names)

    # data_path is ignored by the loader when the in-memory arguments are given;
    # it is passed only to name the split this batch came from.
    batch_records = load_cleaned_data(data_path=TEST_CONTEXT_DATA_PATH,
                                      train_data_only=entity_batch,
                                      train_data_pd=data_df)

    with open(f"assets/TEST_CONTEXT{count}.jsonl", 'w') as f:
        for entry in batch_records:
            json.dump(entry, f)
            f.write('\n')

    print(f"Batch {count} procesed and saved.")

    # Drop the references before the next batch is built to keep peak memory down
    del batch_records
    del data_df


Size of each part: 38584

Batch 0 procesed and saved.
Batch 1 procesed and saved.
Batch 2 procesed and saved.
Batch 3 procesed and saved.


In [None]:
# !!! Forcefully reset RAM by injecting a list of size 10^10 !!!
# The expression below asks for a list of 10**10 elements and serves no other purpose.
# NOTE(review): presumably this exhausts memory and makes the runtime restart, so variables
# defined in earlier cells are lost and the setup cells must be re-run afterwards — confirm on Colab.
[1]*10**10

### 3.6 Zip/Unzip the JSONL files

In [None]:
# Archive the generated JSONL folder.
# The source is given as a relative path (same form as the corpus archive in section 5):
# zip stores the path it is given minus the leading "/", so an absolute /content/assets
# source would be stored as content/assets/... and extracting the archive would
# recreate a nested content/ folder instead of ./assets.
!zip -r /content/assets.zip ./assets

In [None]:
# Extract the archived JSONL files into the current working directory
!unzip /content/assets.zip

## 4. Mount Google Drive for access to files

* All the processed jsonl files are stored in a folder in Google Drive

```
MyDrive
    └───spacy_ner_data
        ├───augmented_dataset_2021-06-30
        │   ├───processed_jsonl_files
        │   └───processed_spacy_files
        └───models
            ├───model_both
            ├───model_content
            └───model_context
```

In [None]:
from google.colab import drive

# Mount Google Drive under /content/gdrive; later cells read from
# /content/gdrive/MyDrive/spacy_ner_data/...
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## 5. Convert the data to spaCy's binary format

A shell script runs the preprocessing Python script (`scripts/preprocess.py`) once per JSONL file, iterating over all the JSONL files of the dataset.


In [None]:
%%shell

mkdir -p corpus

# Path prefix of the numbered input files: <prefix><N>.jsonl
src_prefix="/content/gdrive/MyDrive/spacy_ner_data/augmented_dataset_2021-06-30/processed_jsonl_files/TEST_CONTENT"
# src_prefix="/content/assets/UNSEEN_DATA"

# Path prefix of the numbered output files: <prefix><N>.spacy
dst_prefix="/content/corpus/TEST_CONTENT"

# Convert parts 0 through 3, one preprocess run per part
for part in 0 1 2 3
do
    python scripts/preprocess.py "${src_prefix}${part}.jsonl" "${dst_prefix}${part}.spacy"
done

In [None]:
# Archive the converted .spacy files (relative source path, so the archive extracts as ./corpus)
!zip -r /content/corpus.zip ./corpus

### 5.1 Pre-process single jsonl file only

In [None]:
# Convert a single JSONL file to a .spacy file by calling the preprocess script directly;
# the `spacy project run preprocess` route is kept disabled.
# !python -m spacy project run preprocess
!python scripts/preprocess.py merged_file_quarter.jsonl corpus/TRAIN_ALL_QUARTER.spacy

## 6. Check the config file

* Cannot check properly with large dataset because of memory issues

In [None]:
# Run spaCy's data check against the training config
# (interrupted in the recorded output below)
!python -m spacy debug data configs/config.cfg

2021-07-01 20:28:05.745933: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[1m
^C


## 7. Train

In [None]:
%%shell

# Training and dev corpora (spaCy binary format) stored on Google Drive
train_path="/content/gdrive/MyDrive/spacy_ner_data/augmented_dataset_2021-06-30/processed_spacy_files/TRAIN_DATA0.spacy"
dev_path="/content/gdrive/MyDrive/spacy_ner_data/augmented_dataset_2021-06-30/processed_spacy_files/TEST_CONTENT_CONTEXT0.spacy"

# !python -m spacy project run train
# Variables are quoted so a path containing spaces is still passed as a single argument.
python -m spacy train configs/config.cfg \
    --output training/ \
    --paths.train "$train_path" \
    --paths.dev "$dev_path" \
    --gpu-id 0

2021-07-03 10:27:20.098515: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Created output directory: training[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2021-07-03 10:27:22,612] [INFO] Set up nlp object from config
[2021-07-03 10:27:22,622] [INFO] Pipeline: ['transformer', 'ner']
[2021-07-03 10:27:22,626] [INFO] Created vocabulary
[2021-07-03 10:27:22,626] [INFO] Finished initializing nlp object
Downloading: 100% 481/481 [00:00<00:00, 462kB/s]
Downloading: 100% 899k/899k [00:00<00:00, 3.43MB/s]
Downloading: 100% 456k/456k [00:00<00:00, 2.26MB/s]
Downloading: 100% 1.36M/1.36M [00:00<00:00, 5.09MB/s]
Downloading: 100% 501M/501M [00:08<00:00, 61.3MB/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if

CalledProcessError: ignored

## 8. Evaluate

There are 3 models stored in the `models` folder.

* model_both - Training on train data, and evaluation done using both content and context data

* model_content - Training on train data, and evaluation done using only the content data

* model_context - Training on train data, and evaluation done using only the context data

Each of the 3 model folders contains 2 checkpoints: the last one (`model-last`) and the best one (`model-best`). It's recommended to evaluate with `model-best`.

```
models
    ├───model_both
    │   └───training
    │       ├───model-best
    │       │   ├───ner
    │       │   ├───transformer
    │       │   │   └───model
    │       │   └───vocab
    │       └───model-last
    │           ├───ner
    │           ├───transformer
    │           │   └───model
    │           └───vocab
    ├───model_content
    │   └───training
    │       ├───model-best
    │       │   ├───ner
    │       │   ├───transformer
    │       │   │   └───model
    │       │   └───vocab
    │       └───model-last
    │           ├───ner
    │           ├───transformer
    │           │   └───model
    │           └───vocab
    └───model_context
        └───training
            ├───model-best
            │   ├───ner
            │   ├───transformer
            │   │   └───model
            │   └───vocab
            └───model-last
                ├───ner
                ├───transformer
                │   └───model
                └───vocab
```

In [None]:
%%shell

# For displacy's HTML rendering of annotated outputs
# unseen_data_size controls how many examples to annotate in the HTML file,
# so it can be less than the actual number of examples in the test data
unseen_data_size=1696
mkdir -p displacy

# Unseen data path (aka data to be tested/evaluated with stored model)
test_unseen_path="/content/gdrive/MyDrive/spacy_ner_data/augmented_dataset_2021-06-30/processed_spacy_files/UNSEEN_DATA.spacy"

# Path to the stored model
model_path="/content/gdrive/MyDrive/spacy_ner_data/models/model_both/training/model-best"

# Variables are quoted so a path containing spaces is still passed as a single argument.
python -m spacy evaluate "$model_path" "$test_unseen_path" \
    --output metrics.json \
    --displacy-path displacy \
    --displacy-limit "$unseen_data_size" \
    --gpu-id 0

2021-07-04 14:19:56.223983: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     100.00
NER P   86.77 
NER R   78.71 
NER F   82.54 
SPEED   6753  

[1m

            P       R       F
QLTY    79.91   62.94   70.42
INSTR   93.48   99.59   96.44

[38;5;2m✔ Generated 1696 parses as HTML[0m
displacy
[38;5;2m✔ Saved results to metrics.json[0m




## 9. Archive the generated model/data/images

In [None]:
# Archive / restore helpers: only the last line is active, enable whichever line is needed.
# NOTE(review): the disabled `zip -r ... /content/<dir>` lines use absolute source paths, which zip
# stores as content/<dir>/...; extracting such an archive recreates a content/ folder — confirm intended.
# !unzip /content/data.zip
# !unzip /content/saved_model.zip
# !zip -r /content/data.zip /content/data
# !zip -r /content/img.zip /content/img
# !zip -r /content/saved_model.zip /content/saved_model
# !zip -r /content/training.zip /content/training
!unzip /content/training.zip