In [None]:
'''
Citation:

[1] “Pandas,” pandas. [Online]. Available: https://pandas.pydata.org/.
[2] "NumPy." [Online]. Available: https://numpy.org/.
[3] “NER Model Architectures · spacy API documentation,” Model Architectures. [Online]. Available: https://spacy.io/api/architectures#parser.
'''

In [None]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json
import random
import pandas as pd

In [None]:
# Load the three scraped comment tables; each one is read for its 'Comments' column below.
O_R_df = pd.read_csv('../Webscrapping/webdata/Opiate-Recovery.csv')
R_df = pd.read_csv('../Webscrapping/webdata/reddit_effects.csv')
T_df = pd.read_csv('../Webscrapping/webdata/twitter_effects.csv')
Comb_R_T_df = pd.concat([R_df, T_df])

# Empty CSV cells are read back as float NaN, and the downstream spaCy pipeline
# expects text input. Drop missing comments and coerce the remainder to str so
# every element of OR_ls is a real string. Order is unchanged: Opiate-Recovery
# comments first, then the Reddit + Twitter comments.
all_comments = pd.concat([O_R_df['Comments'], Comb_R_T_df['Comments']])
OR_ls = all_comments.dropna().astype(str).tolist()

In [None]:
# Build a rule-based pipeline from the pattern file and use it to auto-annotate
# every comment in OR_ls, producing spaCy-style training records.
# Forward slashes keep the path portable (a backslash only works on Windows and
# '\S' is an invalid escape sequence in a normal string literal).
with open('NERData/SUB_EFF_patterns.json', "r", encoding="utf-8") as f:
    data = json.load(f)

nlp = English()
ruler = EntityRuler(nlp)
ruler.add_patterns(data)
nlp.add_pipe(ruler)
nlp.to_disk("ner")

# Reload the pipeline from disk and collect one [text, {"entities": [...]}]
# record per comment that contains at least one matched entity.
nlp = spacy.load("ner")
train_ls = []
for o1 in OR_ls:
    doc = nlp(o1)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    # Comments without any match are skipped entirely: an empty placeholder
    # record cannot be unpacked as (text, annotations) by the training loop.
    if entities:
        train_ls.append([o1, {"entities": entities}])

with open('NERData/SUB_EFF_training.json', "w", encoding="utf-8") as f:
    json.dump(train_ls, f, indent=4)

In [None]:
def load_data(file):
    """Parse the UTF-8 encoded JSON document at ``file`` and return the result.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file's JSON content.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def save_data(file, data):
    """Serialize ``data`` as JSON (4-space indent) into ``file`` using UTF-8.

    Args:
        file: Destination path; it is opened in write mode, so existing
            content is replaced.
        data: Any JSON-serializable Python object.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

def train_spacy(data, iterations):
    """Train a blank English spaCy pipeline containing a single NER component.

    Uses the spaCy v2 training API (``create_pipe`` / ``begin_training`` /
    ``nlp.update(texts, annotations, ...)``).

    Args:
        data: Sequence of ``[text, {"entities": [(start, end, label), ...]}]``
            records. Records that are not two-element pairs (for example an
            empty ``[]`` placeholder) are ignored. The sequence itself is
            never reordered.
        iterations: Number of full passes over the training records.

    Returns:
        The trained ``nlp`` pipeline object.
    """
    # Work on a private copy: shuffling below must not reorder the caller's
    # list, and records that cannot be unpacked as (text, annotations) would
    # otherwise raise a ValueError in the loops below.
    TRAIN_DATA = [record for record in data if len(record) == 2]

    nlp = spacy.blank("en")
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even if the pipeline already ships an NER pipe.
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the annotations with the NER pipe.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # Disable all other pipeline components so they are not affected while the NER pipe is trained.
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print ("Starting iteration " + str(itn))

            # Shuffle so the model does not generalize from the order of the examples.
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                            [text],
                            [annotations],
                            drop=0.2,
                            sgd=optimizer,
                            losses=losses
                )
            print (losses)
    return (nlp)

In [None]:
# Read the saved training records, train for 30 passes, and write the
# resulting pipeline to disk.
TRAIN_DATA = load_data("NERData/SUB_EFF_training.json")
nlp = train_spacy(data=TRAIN_DATA, iterations=30)
nlp.to_disk("en_reddit_ner/en_reddit_ner")