# NBME Augmentations 🎨

This notebook uses `nlpaug` to augment `pn_history` and produce more data for NBME model training. For now, only rows that already have good labels are augmented.

In [None]:
# Install nlpaug quietly (-q). No version is pinned, so whichever release pip resolves is used.
# NOTE(review): consider pinning a version so the augmenter API stays stable across re-runs.
!pip install nlpaug -q

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 400)
#pd.set_option('max_colwidth', -1)
pd.set_option('expand_frame_repr', True)
from tqdm.auto import tqdm

In [None]:
import spacy
import numpy as np

def plot_annotation(df, pn_num, plot_pred=False):
    """Render one patient note with its annotated spans highlighted by displaCy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``pn_num``, ``pn_history``, ``location`` and
        ``feature_text``. Each ``location`` cell is a list of strings; every
        string holds one or more ``"start end"`` character spans separated by
        ``;``. An empty list means the feature has no annotation in the note.
    pn_num
        Value compared against ``df["pn_num"]`` to select the note to plot.
    plot_pred : bool, optional
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    None
        The note is rendered inline. Nothing is rendered when no row matches
        ``pn_num``.
    """
    note_rows = df[df["pn_num"] == pn_num].reset_index(drop=True)
    if note_rows.empty:
        # Nothing to plot; indexing row 0 below would raise otherwise.
        return

    text = note_rows["pn_history"][0]
    options = {"colors": {}}
    ents = []

    for location, feature_text in zip(note_rows["location"], note_rows["feature_text"]):
        # Walk every location entry (not just the first) so multi-entry
        # annotations are fully drawn and empty lists are simply skipped.
        for entry in location:
            for piece in entry.split(";"):
                bounds = piece.split()
                if len(bounds) > 1:
                    ents.append({"start": int(bounds[0]), "end": int(bounds[1]), "label": feature_text})

        # .tolist() yields plain Python ints so the f-string is a valid CSS
        # colour such as "rgb(120, 200, 131)" regardless of the NumPy version.
        rgb = tuple(np.random.randint(100, 255, size=3).tolist())
        options["colors"][feature_text] = f"rgb{rgb}"

    doc = {"text": text, "ents": sorted(ents, key=lambda ent: ent["start"])}

    spacy.displacy.render(doc, style="ent", options=options, manual=True, jupyter=True)
    
    
def plot_annotation_aug(df, pn_num, text_var="pn_history", location_var="location", plot_aug=False):
    """Print and render one note, taking text and spans from configurable columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``pn_num``, ``feature_text`` and the two columns named by
        ``text_var`` and ``location_var``.
    pn_num
        Value compared against ``df["pn_num"]`` to select the note to plot.
    text_var : str, optional
        Column holding the note text to display (e.g. ``"aug_history"``).
    location_var : str, optional
        Column holding, per row, a list of strings; every string holds one or
        more ``"start end"`` character spans separated by ``;``.
    plot_aug : bool, optional
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    None
        Nothing is printed or rendered when ``df`` is empty or no row matches
        ``pn_num``.
    """
    if len(df) < 1:
        return

    note_rows = df[df["pn_num"] == pn_num].reset_index(drop=True)
    if note_rows.empty:
        # The frame has rows, but none for this note; indexing row 0 would raise.
        return

    text = note_rows[text_var][0]
    print(text)

    options = {"colors": {}}
    ents = []

    for location, feature_text in zip(note_rows[location_var], note_rows["feature_text"]):
        for entry in location:
            for piece in entry.split(";"):
                bounds = piece.split()
                if len(bounds) > 1:
                    ents.append({"start": int(bounds[0]), "end": int(bounds[1]), "label": feature_text})

        # .tolist() yields plain Python ints so the f-string is a valid CSS
        # colour such as "rgb(120, 200, 131)" regardless of the NumPy version.
        rgb = tuple(np.random.randint(100, 255, size=3).tolist())
        options["colors"][feature_text] = f"rgb{rgb}"

    doc = {"text": text, "ents": sorted(ents, key=lambda ent: ent["start"])}

    spacy.displacy.render(doc, style="ent", options=options, manual=True, jupyter=True)

# Augmentations 🛎️

In [None]:
# Load the training-folds frame from a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
fold0 = pd.read_pickle("../input/helpers-for-the-ride-folds/train_folds_5.pickle")
# Expose the `location` values under a second column name, `span`.
fold0['span'] = fold0.location

In [None]:
# data augmentation
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
from sklearn.utils import shuffle
import nltk
import re 
# Fetch the NLTK resources used in this notebook.
# NOTE(review): presumably required by the WordNet-based augmenters below — confirm.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Build one augmenter per strategy. agument_text_partial() later overwrites
# `aug_p` and `stopwords` on whichever instance it picks, so the values set
# here act as initial defaults only.

# Contextual word substitution using the DeBERTa checkpoint at the given local path, on CPU.
aug = naw.ContextualWordEmbsAug(model_path="../input/deberta-v3-large/deberta-v3-large",
                                device="cpu", action="substitute")
# Character-level keyboard augmenter.
keyAug = nac.KeyboardAug(name='Keyboard_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3, aug_word_p=0.3, 
                      aug_word_min=1, aug_word_max=10, stopwords=None, tokenizer=None, reverse_tokenizer=None, 
                      include_special_char=True, include_numeric=True, include_upper_case=True, lang='en', verbose=1, 
                      stopwords_regex=None, model_path=None, min_char=4)
# Word-level antonym augmenter.
antonymAug = naw.AntonymAug(name='Antonym_Aug', aug_min=1, aug_max=10, aug_p=0.3, lang='eng', stopwords=None, tokenizer=None, 
                     reverse_tokenizer=None, stopwords_regex=None, verbose=1)
# Word-level spelling augmenter (dict_path=None).
spelLAug = naw.SpellingAug(dict_path=None, name='Spelling_Aug', aug_min=1, aug_max=10, aug_p=0.3, stopwords=None, 
                      tokenizer=None, reverse_tokenizer=None, include_reverse=True, stopwords_regex=None, verbose=1)
# Word-level synonym augmenter backed by WordNet (aug_src='wordnet').
synAug = naw.SynonymAug(aug_src='wordnet', model_path=None, name='Synonym_Aug', aug_min=1, aug_max=10, aug_p=0.3, lang='eng', 
                     stopwords=None, tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, force_reload=False, 
                     verbose=1)

# Three RandomWordAug instances that differ only in `action`: delete, swap, crop.
randomeDelaug =naw.RandomWordAug(action='delete', name='RandomWord_Aug', aug_min=1, aug_max=10, aug_p=0.3,
                                 stopwords=None, target_words=None, tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, verbose=1)

randomeSwapaug =naw.RandomWordAug(action='swap', name='RandomWord_Aug', aug_min=1, aug_max=10, aug_p=0.3,
                                 stopwords=None, target_words=None, tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, verbose=1)

randomeCropaug =naw.RandomWordAug(action='crop', name='RandomWord_Aug', aug_min=1, aug_max=10, aug_p=0.3,
                                 stopwords=None, target_words=None, tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, verbose=1)

# Back-translation through the English->German and German->English WMT19 models.
# NOTE(review): these are referenced by model name rather than a local path, so they
# presumably have to be downloaded at run time — confirm internet access is available.
backtranslation= naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)
# Registry mapping the short `flavor` names accepted by agument_text_partial()
# to the augmenter instances built above.
augment_dict = {
    "word":aug,
    "key":keyAug,
    "antonymAug": antonymAug,
    "spelLaug" : spelLAug,
    "synAug": synAug,
    "backtra":backtranslation,
    "randomeCropaug":randomeCropaug,
    "randomeDelaug":randomeDelaug,
    "randomeSwapaug":randomeSwapaug
}

import re
# The _tokenizer is not good enough as punctuation will be removed in return.
def _tokenizer(text, token_pattern=r"(?u)\b\w\s\w+\b"):
            token_pattern = re.compile(token_pattern)
            return token_pattern.findall(text)


def agument_text_partial(df, flavor="synAug", pr=0.4):
    """Augment ``pn_history`` and re-locate every annotated span in the new text.

    The annotated substrings of each note are handed to the augmenter as
    stopwords, the note is augmented, and each substring is then searched for
    in the augmented text to rebuild its character offsets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``pn_history`` (note text), ``location`` (per row, a list
        of strings holding ``"start end"`` spans separated by ``;``) and
        ``annotation_length`` (rows with a value ``<= 0`` are treated as
        having no annotation). The frame is not modified.
    flavor : str, optional
        Key into the module-level ``augment_dict`` selecting the augmenter.
    pr : float, optional
        Value written to the selected augmenter's ``aug_p`` attribute.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra columns ``labels``, ``aug_history``,
        ``aug_location`` and ``aug_location_len``, restricted to rows where
        ``aug_location_len == annotation_length``. An annotation is only
        counted when every one of its spans was found again, so rows with a
        partially lost annotation are dropped rather than kept misaligned.

    Notes
    -----
    * ``aug_p`` and ``stopwords`` are set on the shared augmenter instance
      stored in ``augment_dict``; the change outlives this call.
    * The stopwords of every row are printed.
    * Spans are re-located with ``str.find``, i.e. at the first occurrence of
      the label text, so a label repeated in the note maps to one position.
    """

    def _parse_spans(entry):
        # One location entry is "start end", or several of them joined by ";".
        pairs = []
        for piece in entry.split(";"):
            bounds = piece.split()
            if len(bounds) > 1:
                pairs.append((int(bounds[0]), int(bounds[1])))
        return pairs

    def _label_texts(note, location, n_annotations):
        # Annotated substrings of the original note, in location order.
        if not n_annotations > 0:
            return []
        return [note[start:end] for entry in location for start, end in _parse_spans(entry)]

    def _augment(note, protected):
        print(f"Got stopwords {protected}")
        augmenter = augment_dict[flavor]
        augmenter.aug_p = pr
        augmenter.stopwords = protected
        result = augmenter.augment(note)
        # augment() may hand back either a string or a list of strings;
        # normalise to a single string so the span search below works for both.
        if isinstance(result, (list, tuple)):
            return result[0] if result else ""
        return result

    def _relocate(note, aug_note, location, n_annotations):
        # Rebuild one "start end;start end" string per annotation, keeping the
        # annotation only when all of its spans are present in the new text.
        if not n_annotations > 0:
            return []
        relocated = []
        for entry in location:
            found = []
            for start, end in _parse_spans(entry):
                label = note[start:end]
                position = aug_note.find(label)
                if position < 0:
                    found = None
                    break
                found.append(f"{position} {position + len(label)}")
            if found:
                relocated.append(";".join(found))
        return relocated

    # Work on a copy so a slice passed by the caller is never written to.
    out = df.copy()
    notes = out["pn_history"].tolist()
    locations = out["location"].tolist()
    counts = out["annotation_length"].tolist()

    labels = [_label_texts(n, loc, c) for n, loc, c in zip(notes, locations, counts)]
    aug_notes = [_augment(n, lab) for n, lab in zip(notes, labels)]
    aug_locations = [
        _relocate(n, a, loc, c) for n, a, loc, c in zip(notes, aug_notes, locations, counts)
    ]

    # Explicit object Series keep each list as a single cell value.
    out["labels"] = pd.Series(labels, index=out.index, dtype=object)
    out["aug_history"] = pd.Series(aug_notes, index=out.index, dtype=object)
    out["aug_location"] = pd.Series(aug_locations, index=out.index, dtype=object)
    out["aug_location_len"] = pd.Series(
        [len(spans) for spans in aug_locations], index=out.index, dtype="int64"
    )

    # Only keep rows in which every annotation could be re-located.
    return out[out["aug_location_len"] == out["annotation_length"]]

print("\nSynonym Augmentation")
adf = agument_text_partial(fold0.iloc[1:3])
display(adf[["annotation","aug_location","location","aug_location_len","annotation_length","pn_num","labels","pn_history","aug_history"]])
# agument_text_partial drops rows whose labels could not be re-located, so pick
# a surviving row by position instead of assuming index label 2 is still there.
if len(adf) > 0:
    adf_pn_num = adf["pn_num"].iloc[-1]
    plot_annotation_aug(adf, adf_pn_num)
    print(f"\nAugmented {adf_pn_num}")
    plot_annotation_aug(adf, adf_pn_num, "aug_history", "aug_location", True)

In [None]:
#fold0.iloc[1:4]

In [None]:
print("\n Spelling Augmentation")
sadf = agument_text_partial(fold0.iloc[1:4], "spelLaug")
display(sadf)
# Rows may have been filtered out, so guard against an empty result and pick
# a surviving row by position rather than by the hard-coded index label 3.
if len(sadf) > 0:
    sadf_pn_num = sadf["pn_num"].iloc[-1]
    plot_annotation_aug(sadf, sadf_pn_num)
    plot_annotation_aug(sadf, sadf_pn_num, "aug_history", "aug_location", True)

In [None]:
print("randomeSwapaug Augmentation")
adf_par = agument_text_partial(fold0.iloc[1:3], "randomeSwapaug")
display(adf_par)
# Rows may have been filtered out, so guard against an empty result and pick
# a surviving row by position rather than by the hard-coded index label 2.
if len(adf_par) > 0:
    adf_par_pn_num = adf_par["pn_num"].iloc[-1]
    plot_annotation_aug(adf_par, adf_par_pn_num)
    plot_annotation_aug(adf_par, adf_par_pn_num, "aug_history", "aug_location", True)

In [None]:
print("\n key Augmentation")
keyadf = agument_text_partial(fold0.iloc[1:3], "key")
display(keyadf.head())
# Rows may have been filtered out, so guard against an empty result and pick
# a surviving row by position rather than by the hard-coded index label 2.
if len(keyadf) > 0:
    keyadf_pn_num = keyadf["pn_num"].iloc[-1]
    plot_annotation_aug(keyadf, keyadf_pn_num)
    plot_annotation_aug(keyadf, keyadf_pn_num, "aug_history", "aug_location", True)

In [None]:
print("RandomeCropaug Augmentation\n")
adfc = agument_text_partial(fold0.iloc[1:3], "randomeCropaug")
display(adfc.head())
# Rows may have been filtered out, so guard against an empty result and pick
# a surviving row by position rather than by the hard-coded index label 2.
if len(adfc) > 0:
    adfc_pn_num = adfc["pn_num"].iloc[-1]
    plot_annotation_aug(adfc, adfc_pn_num)
    plot_annotation_aug(adfc, adfc_pn_num, "aug_history", "aug_location", True)

# That's all, folks 😃

Hope it helps. I have not shared the code for including this in the training pipeline, though that should be pretty straightforward. There might be some mistakes in the stopword implementation in **nlpaug**, so for now I am just excluding the affected rows and not adding them to model training.

⚠️ Disclaimer: there might be mistakes here, as this is just reference code, so use your own judgement and testing when implementing it.