# NER Evaluation of Augmented data

* This evaluation is done in Google Colab because of:
    * Enormous dataset size
    * Transformer based architecture involving GPU usage


## Install spaCy and download English model file

In [None]:
# CuPy build for CUDA 11.2; currently disabled.
# !pip install cupy-cuda112
# Install spaCy pinned to version 3.0.6.
!pip install spacy==3.0.6

In [None]:
# Download the small English spaCy pipeline (en_core_web_sm) ...
!python -m spacy download en_core_web_sm
# ... and the transformer-based English pipeline (en_core_web_trf).
!python -m spacy download en_core_web_trf

In [None]:
# Show the GPU attached to this runtime, together with the driver and CUDA versions.
!nvidia-smi

## Install torch

* Install the torch build specific to Google Colab's CUDA version
* CUDA version 11.1 works

In [None]:
# Install torch 1.9.0 and torchvision 0.10.0 built for CUDA 11.1 (+cu111), plus torchaudio 0.9.0,
# resolving the wheels from the PyTorch "torch_stable" index page.
!pip3 install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

## Extract Project files

In [None]:
# Extract /content/project.zip into the current working directory.
# NOTE(review): later cells call scripts/preprocess.py and configs/config.cfg by relative path,
# presumably provided by this archive - confirm.
!unzip /content/project.zip

## Pre-process and save to json

### Extract the augmented dataset

In [None]:
# Extract the augmented dataset archive into the current working directory
# (creates the augmented_dataset_2021-06-21/ folder with the train/test CSV files).
!unzip /content/augmented_dataset_2021-06-21.zip

Archive:  /content/augmented_dataset_2021-06-21.zip
   creating: augmented_dataset_2021-06-21/
  inflating: augmented_dataset_2021-06-21/keyword_ids.csv  
  inflating: augmented_dataset_2021-06-21/pattern_ids.csv  
  inflating: augmented_dataset_2021-06-21/test_content.csv  
  inflating: augmented_dataset_2021-06-21/test_context.csv  
  inflating: augmented_dataset_2021-06-21/test_unseen.csv  
  inflating: augmented_dataset_2021-06-21/train.csv  


### Loader function

In [None]:
import pandas as pd
import os
import re
import numpy
from numpy.core.defchararray import find

# Folder created by extracting the augmented dataset archive; every CSV split lives inside it.
DATASET_DIR = "./augmented_dataset_2021-06-21"

TRAIN_DATA_PATH = f"{DATASET_DIR}/train.csv"
TEST_CONTENT_DATA_PATH = f"{DATASET_DIR}/test_content.csv"
TEST_CONTEXT_DATA_PATH = f"{DATASET_DIR}/test_context.csv"
TEST_UNSEEN = f"{DATASET_DIR}/test_unseen.csv"

def _find_free_span(text, word, used_starts, used_ends):
    """
    Locate the first usable occurrence of ``word`` in ``text``.

    An occurrence is usable when it is delimited by a blank space (or by the
    beginning/end of the sentence) on both sides, and neither its start nor its
    end offset has already been assigned to a previous token.

    :param text: lowercased sentence to search in
    :param word: token text to look for
    :param used_starts: set of start offsets already assigned in this sentence
    :param used_ends: set of end offsets already assigned in this sentence
    :return: ``(start, end)`` character offsets, or ``None`` if nothing fits
    """
    start = text.find(word)
    while start != -1:
        end = start + len(word)

        # The sentence edges count as word boundaries. Checking `start == 0`
        # explicitly avoids `text[-1]`, which would wrap around to the last
        # character and wrongly reject a token at the very start.
        left_ok = start == 0 or text[start - 1] == " "
        right_ok = end == len(text) or text[end] == " "

        if left_ok and right_ok and start not in used_starts and end not in used_ends:
            return start, end

        # Resume the search just past the rejected occurrence
        start = text.find(word, end + 1)

    return None


def load_cleaned_data(data_path, train_data_only=None, train_data_pd=None):
    """
    Go through every sentence's all word-tag pair and calculate the
    (start, end) character offsets of each word inside the lowercased sentence.

    A candidate (start, end) pair is discarded, and the search continues with
    the next occurrence of the word, when its start or end offset was already
    assigned to an earlier word of the same sentence, or when the occurrence is
    not delimited by a blank space / sentence edge on both sides.
    If a word cannot be placed, the remaining pairs of that sentence are
    skipped; the sentence is still emitted with the tokens found so far.

    :param data_path: CSV file to read (columns 0 and 1 are used as text and
        entities); only read when ``train_data_only`` is None
    :param train_data_only: optional list of entity strings to process instead
        of reading ``data_path``
    :param train_data_pd: DataFrame with a 'text' column whose index labels
        match the positions in ``train_data_only``
    :return: DATA, a list of dicts with the keys "text" (lowercased sentence),
        "tokens" (every located word), "spans" (located words whose tag is not
        "NONE") and "answer" (always "accept")
    """
    if train_data_only is None:
        col_names = ['text', 'entities']

        data = pd.read_csv(data_path, names=col_names, usecols=[0, 1])
        entity_list = data.entities.to_list()

    else:
        # The caller already holds the entities and the matching DataFrame,
        # so just process them.
        entity_list = train_data_only
        data = train_data_pd

    DATA = []

    for index, ent in enumerate(entity_list):
        # Skip the CSV header row
        if ent == "tokens":
            continue

        # Break the entity string into one "'word', 'TAG'" chunk per pair
        # and strip the enclosing brackets from the first and last chunk
        pairs = ent.split("), (")
        pairs[0] = re.sub("[([]", "", pairs[0])
        pairs[-1] = re.sub("[)]]", "", pairs[-1])

        # All offsets are computed on (and checked against) the lowercased
        # sentence, which is also the text stored in the result
        text_lower = data['text'][index].lower()

        # Offsets already assigned in this sentence (prevents overlapping tags)
        used_starts = set()
        used_ends = set()

        tokens_list = []
        spans_list = []

        # Analyze all word-tag pairs of the current sentence
        for word_pair in pairs:
            # The quoted pieces are the word and its tag
            word_pair_list = word_pair.split("'")[1::2]

            # Remove any leading or trailing blank space
            word = word_pair_list[0].strip()

            # Incase word not found in the sentence at all
            if word not in text_lower:
                print("\n-1 error")
                print("Couldn't find:")
                print(word)
                print("in:")
                print(data['text'][index])
                break

            span = _find_free_span(text_lower, word, used_starts, used_ends)
            if span is None:
                # Every occurrence is taken or not a standalone word:
                # don't bother checking rest of the current sentence
                break

            start_index, end_index = span
            used_starts.add(start_index)
            used_ends.add(end_index)

            # Add ALL the words and their positions to a "tokens" list
            tokens_list.append({"text": word, "start": start_index, "end": end_index})

            # Add the specially tagged words to a "spans" list
            if word_pair_list[1] != "NONE":
                spans_list.append({"start": start_index, "end": end_index, "label": word_pair_list[1]})

        DATA.append({"text": text_lower, "tokens": tokens_list, "spans": spans_list, "answer": "accept"})

    return DATA


# TRAIN_DATA = load_cleaned_data(TRAIN_DATA_PATH)
# TEST_CONTENT = load_cleaned_data(TEST_CONTENT_DATA_PATH)
# TEST_CONTEXT = load_cleaned_data(TEST_CONTEXT_DATA_PATH)
# UNSEEN_DATA = load_cleaned_data(TEST_UNSEEN)


### Load and save `TRAIN_DATA` in batches

In [None]:
from pandas import DataFrame
from spacy.util import minibatch
import json

# Create assets directory if it doesn't already exist
os.makedirs("assets", exist_ok=True)

# Read the CSV file as Pandas df
col_names = ['text', 'entities']
data = pd.read_csv(TRAIN_DATA_PATH, names=col_names, usecols=[0, 1])

# Shuffle the whole train data. The seed is fixed so that re-running the
# notebook puts the same sentences into the same batch files.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Calculate the size of each batch (about `div` batches in total; one extra,
# smaller batch is produced when the data is not evenly divisible).
# Never let the size drop to 0, otherwise a small dataset would yield no batch at all.
tot_size = len(data)
div = 100
num_groups = max(1, tot_size // div)
print(f"Size of each part: {num_groups}\n")

# Process each batch one by one, and save its result in a separate jsonl file.
# Batches are sliced straight out of the DataFrame, so the full dataset is not
# copied into additional Python lists first.
for count, batch_start in enumerate(range(0, tot_size, num_groups)):
    # if count < 10:
    #     # Continue from the desired last batch
    #     continue

    # `load_cleaned_data` looks rows up by position, so the batch needs a 0-based index
    data_df = data.iloc[batch_start:batch_start + num_groups].reset_index(drop=True)
    entity_batch = data_df.entities.to_list()

    TRAIN_DATA = load_cleaned_data(data_path=TRAIN_DATA_PATH,
                                   train_data_only=entity_batch,
                                   train_data_pd=data_df)

    # One JSON object per line (JSONL)
    with open(f"assets/TRAIN_DATA{count}.jsonl", 'w') as f:
        for entry in TRAIN_DATA:
            json.dump(entry, f)
            f.write('\n')

    print(f"Batch {count} procesed and saved.")

    # Release the processed batch before building the next one
    del TRAIN_DATA
    del data_df


Size of each part: 19852

Batch 0 procesed and saved.
Batch 1 procesed and saved.
Batch 2 procesed and saved.
Batch 3 procesed and saved.
Batch 4 procesed and saved.
Batch 5 procesed and saved.
Batch 6 procesed and saved.
Batch 7 procesed and saved.
Batch 8 procesed and saved.
Batch 9 procesed and saved.
Batch 10 procesed and saved.
Batch 11 procesed and saved.
Batch 12 procesed and saved.
Batch 13 procesed and saved.
Batch 14 procesed and saved.
Batch 15 procesed and saved.
Batch 16 procesed and saved.
Batch 17 procesed and saved.
Batch 18 procesed and saved.
Batch 19 procesed and saved.
Batch 20 procesed and saved.
Batch 21 procesed and saved.
Batch 22 procesed and saved.
Batch 23 procesed and saved.
Batch 24 procesed and saved.
Batch 25 procesed and saved.
Batch 26 procesed and saved.
Batch 27 procesed and saved.
Batch 28 procesed and saved.
Batch 29 procesed and saved.
Batch 30 procesed and saved.
Batch 31 procesed and saved.
Batch 32 procesed and saved.
Batch 33 procesed and saved

In [None]:
# Clear the assets folder: recursively deletes assets/ and everything in it,
# including any TRAIN_DATA*.jsonl batch files saved there.
! rm -r assets/

In [None]:
# !!! Forcefully reset RAM by injecting a list of size 10^10 !!!
# Deliberately tries to build a list with 10**10 elements so that the runtime runs out of memory.
# NOTE(review): presumably relies on Colab restarting the crashed runtime, which would also
# discard every variable defined so far - confirm before running.
[1]*10**10

### Save to JSONL

In [None]:
import json

# Create the assets directory; `exist_ok=True` makes this a no-op when it is
# already there, without a separate (racy) existence check.
os.makedirs("assets", exist_ok=True)

# with open('assets/TRAIN_DATA.jsonl', 'w') as f:
#     for entry in TRAIN_DATA:
#         json.dump(entry, f)
#         f.write('\n')

# with open('assets/TEST_CONTENT.jsonl', 'w') as f:
#     for entry in TEST_CONTENT:
#         json.dump(entry, f)
#         f.write('\n')

# with open('assets/TEST_CONTEXT.jsonl', 'w') as f:
#     for entry in TEST_CONTEXT:
#         json.dump(entry, f)
#         f.write('\n')

# with open('assets/UNSEEN_DATA.jsonl', 'w') as f:
#     for entry in UNSEEN_DATA:
#         json.dump(entry, f)
#         f.write('\n')


### Zip the JSONL files

In [None]:
# Recursively archive the /content/assets folder into /content/assets.zip.
!zip -r /content/assets.zip /content/assets

## Extract assets

In [None]:
# Extract /content/assets.zip into the current working directory.
!unzip /content/assets.zip

## Get the pre-processed JSONL dataset from Google Drive

The below cell joins the jsonl files, but **does not format them properly**.  
It will probably not be used.

In [None]:
import json
from google.colab import drive
import glob

drive.mount("/content/gdrive")

# Upper bound on the number of JSONL records copied into the merged file.
# NOTE(review): presumably about one tenth of the training set, matching the
# "10th" in the output file name - confirm.
MAX_LINES = 198522

# Collected JSONL records, one string per record (without the trailing newline)
TRAIN = []

# Sorted so that the merge order does not depend on the file system's listing order
for file_iter, f in enumerate(sorted(glob.glob("/content/gdrive/MyDrive/spacy_ner_data/augmented_dataset_2021-06-21/train_jsonl_files/shuffled/*.jsonl"))):
    if len(TRAIN) >= MAX_LINES:
        break

    # Read as text and keep individual lines: appending each file as one list of
    # `bytes` would make the cap count files instead of records, and `bytes`
    # cannot be serialized by `json.dump`.
    with open(f, "r", encoding="utf-8") as infile:
        for line in infile:
            if len(TRAIN) >= MAX_LINES:
                break
            line = line.strip()
            if line:
                TRAIN.append(line)
    print(f"File {file_iter} appended.")

# Every record is already a serialized JSON object, so write it back verbatim,
# one per line, to keep the merged file in JSONL format.
with open("merged_file_10th.jsonl", "w", encoding="utf-8") as outfile:
    for line in TRAIN:
        outfile.write(line + "\n")


# read_files = glob.glob("/content/gdrive/MyDrive/spacy_ner_data/augmented_dataset_2021-06-21/train_jsonl_files/shuffled/*.jsonl")
# with open("merged_file.jsonl", "wb") as outfile:
#     outfile.write('[{}]'.format(
#         b','.join([open(f, "rb").read() for f in read_files])))

# for file_iter in range(100 + 1):
#     BIG_DATA_PATH = f"/content/gdrive/MyDrive/spacy_ner_data/augmented_dataset_2021-06-21/train_jsonl_files/shuffled/TRAIN_DATA{file_iter}.jsonl"
#     with open(BIG_DATA_PATH, 'r') as f:
#         TRAIN.append(json.load(f))
#     print(f"File {file_iter} appended.")

In [19]:
# Release the TRAIN list built by the merge cell to free memory.
del TRAIN

In [None]:
# Archive /content/merged_file_quarter.jsonl.
# NOTE(review): no visible cell writes merged_file_quarter.jsonl (the merge cell writes
# merged_file_10th.jsonl) - confirm the file exists before running.
# NOTE(review): the archive is written to the filesystem root (/merged_file_quarter.zip),
# not to /content - confirm this is intended.
!zip -r /merged_file_quarter.zip /content/merged_file_quarter.jsonl

## Convert the data to spaCy's binary format

A shell script runs the preprocessing Python script once per file, iterating over all the JSONL files of the training dataset (`TRAIN_DATA0.jsonl` to `TRAIN_DATA100.jsonl`).

* The error logs generated by spaCy point to 11 sentences, each having 100 duplicates in the original train.csv file.
* The actual cause of these spaCy errors is not the duplicates but the inability of the preprocessing function (`load_cleaned_data`) to identify the tagged INTR and/or QLTY words in a few specific sentences (to be fixed).

In [32]:
%%shell

# Convert each TRAIN_DATA<N>.jsonl file on Google Drive (N = 0..100) to spaCy's
# binary format, writing /content/corpus/TRAIN_DATA<N>.spacy for every file found.

drive_path="/content/gdrive/MyDrive/spacy_ner_data/augmented_dataset_2021-06-21/train_jsonl_files/shuffled/TRAIN_DATA"
jsonl_ext=".jsonl"

saved_dir="/content/corpus"
saved_path="$saved_dir/TRAIN_DATA"
spacy_ext=".spacy"

# Create the directory the .spacy files are actually written to, so the loop
# works regardless of the current working directory
mkdir -p "$saved_dir"

for file_iter in {0..100}
do
    jsonl_drive_path="$drive_path$file_iter$jsonl_ext"
    spacy_file_path="$saved_path$file_iter$spacy_ext"

    # Report and skip a missing file instead of handing a bad path to the preprocessor
    if [ ! -f "$jsonl_drive_path" ]; then
        echo "Skipping missing file: $jsonl_drive_path" >&2
        continue
    fi

    python scripts/preprocess.py "$jsonl_drive_path" "$spacy_file_path"
done

2021-06-30 15:04:49.834760: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Processed 19852 documents: TRAIN_DATA0.spacy
2021-06-30 15:05:10.528826: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Processed 19852 documents: TRAIN_DATA1.spacy
2021-06-30 15:05:28.217638: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Processed 19852 documents: TRAIN_DATA2.spacy
2021-06-30 15:05:44.900325: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Processed 19852 documents: TRAIN_DATA3.spacy
2021-06-30 15:06:01.431999: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Processed 19852 documents: TRAIN_DATA4.spacy
2021-06-30 15:06:18.185448: I tensorflow/stre



In [None]:
# Recursively archive the /content/corpus folder (the converted .spacy files).
# NOTE(review): the archive is written to the filesystem root (/corpus.zip), not to /content -
# confirm this is intended.
!zip -r /corpus.zip /content/corpus

In [None]:
# Alternative (disabled): run the "preprocess" step through the spaCy project workflow.
# !python -m spacy project run preprocess
# Convert the merged JSONL file to a single .spacy file with the project's preprocess script.
# NOTE(review): merged_file_quarter.jsonl is not written by any visible cell - confirm it exists.
!python scripts/preprocess.py merged_file_quarter.jsonl corpus/TRAIN_ALL_QUARTER.spacy

## Check the config file

* Cannot check properly with large dataset because of memory issues

In [None]:
# Run spaCy's `debug data` command against the training config.
!python -m spacy debug data configs/config.cfg

## Train

In [None]:
# Alternative (disabled): run the "train" step through the spaCy project workflow.
# !python -m spacy project run train
# Train on GPU 0 with configs/config.cfg and write the models to training/.
# NOTE(review): the training path points at TEST_CONTEXT.spacy and the dev path at
# TEST_CONTENT.spacy, not at the TRAIN_DATA*.spacy files converted earlier - confirm this is intended.
!python -m spacy train configs/config.cfg --output training/ --paths.train corpus/TEST_CONTEXT.spacy --paths.dev corpus/TEST_CONTENT.spacy --gpu-id 0

## Evaluate

In [None]:
# Alternative (disabled): run the "evaluate" step through the spaCy project workflow.
# !python -m spacy project run evaluate
# Evaluate training/model-best on GPU 0 and store the scores in training/metrics.json.
# NOTE(review): corpus/fashion_brands_eval.spacy is not produced by any visible cell -
# confirm it is the intended evaluation set for this dataset.
!python -m spacy evaluate training/model-best corpus/fashion_brands_eval.spacy --output training/metrics.json --gpu-id 0

## Archive the generated model/data/images

In [None]:
# !unzip /content/data.zip
# !unzip /content/saved_model.zip
# !zip -r /content/data.zip /content/data
# !zip -r /content/img.zip /content/img
# !zip -r /content/saved_model.zip /content/saved_model