<a href="https://colab.research.google.com/github/VintageGold/Text_Summarization/blob/main/notebooks/Spacy_BaseTransformer_NER_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Highly Recommend GPU Environment

# NER Entity Notebook

* Train NER
* Combine with main file Telehealth.csv
* Write to File

# NER Model

In [14]:
#Spacy org
#!pip install spacy
#!python3 -m spacy download en_core_web_sm
#!python3 -m spacy download en_core_web_md
#!python3 -m spacy download en_core_web_lg
#!pip install spacy-transformers
#!python -m spacy download en_core_web_trf

In [2]:
try:
    from google.colab import drive

    drive.mount('./drive/')

    %cd drive/My \ Drive/Text_Summarization

except:
    print("No Colab Environment")

Drive already mounted at ./drive/; to attempt to forcibly remount, call drive.mount("./drive/", force_remount=True).
/content/drive/My  Drive/Text_Summarization


# Train Model

In [3]:
import pandas as pd

df_overview = pd.read_csv("./Data/processed/Telehealth.csv")

In [4]:
#Load Transformer Model for NER
import spacy
from spacy.lang.en import English
from datetime import date

corpus = df_overview["Content"]

nlp = spacy.load("en_core_web_trf")

docs = list(nlp.pipe(corpus,disable=["parser","textcat"]))
# https://spacy.io/api/language#from_disk
nlp.to_disk(f"Models/standard_spacy_transformer_{date.today()}")

In [5]:
ner_dict = dict() #used throughout analysis

ner_label_dict = dict() #contains dictionary of the NER Terms and meanings

for index,doc in enumerate(docs):

    for ent in doc.ents:
        
        if str(ent) not in ner_dict.keys():
            
            ner_label_dict[ent.label_] = spacy.explain(ent.label_)

            ner_dict[ent.text] = {"NER_Label":str(ent.label_),
                                  "Unique_Documents":{str(index)},"Full_Documents":list(str(index))
                                      }
            
        else:
            ner_dict[ent.text]["Unique_Documents"].add(index)
            ner_dict[ent.text]["Full_Documents"].append(index)
            

In [7]:
import json
    
with open("references/ner_label_description.json","w") as f:
    json.dump(ner_label_dict, f, indent = 4)

# Join with Telehealth.csv for Classifications

In [8]:
df_ner = pd.DataFrame.from_dict(ner_dict,orient="index")
df_ner.head(5)

Unnamed: 0,NER_Label,Unique_Documents,Full_Documents
Institute of Medicine,ORG,"{0, 10, 42, 0}","[0, 0, 10, 10, 42]"
2014,DATE,"{0, 1, 2, 3, 5, 6, 7, 8, 11, 12, 0, 14, 15, 16...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
Tanielian,PERSON,"{0, 0}","[0, 0, 0, 0, 0, 0, 0, 0]"
2008,DATE,"{0, 1, 2, 3, 5, 6, 7, 8, 11, 0, 15, 16, 17, 18...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."
Substance Abuse and Mental Health Services Administration,ORG,"{21, 27, 0}","[0, 21, 27]"


In [9]:
df_ner[["NER_Label"]].to_json("./references/ner_dictionary.json",orient="index",indent=4)

In [10]:
#Load abrv dictionary to add long form.  This dictionary comes from the Wrangling notebook

with open("./references/abbreviation_table_processed.json", "r") as r:
    
    abrv_dict = json.load(r)


In [11]:
#Apply abbrv to entities found
df_ner = df_ner.reset_index().rename(columns={"index":"Term"})
df_ner["Long_Form"] = df_ner["Term"].apply(lambda x: abrv_dict[x]["LongForm"] if x in abrv_dict.keys() else x)

df_ner.head(5)

Unnamed: 0,Term,NER_Label,Unique_Documents,Full_Documents,Long_Form
0,Institute of Medicine,ORG,"{0, 10, 42, 0}","[0, 0, 10, 10, 42]",Institute of Medicine
1,2014,DATE,"{0, 1, 2, 3, 5, 6, 7, 8, 11, 12, 0, 14, 15, 16...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2014
2,Tanielian,PERSON,"{0, 0}","[0, 0, 0, 0, 0, 0, 0, 0]",Tanielian
3,2008,DATE,"{0, 1, 2, 3, 5, 6, 7, 8, 11, 0, 15, 16, 17, 18...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...",2008
4,Substance Abuse and Mental Health Services Adm...,ORG,"{21, 27, 0}","[0, 21, 27]",Substance Abuse and Mental Health Services Adm...


# Write to File

In [13]:
df_ner.to_json("Data/processed/ner_analysis.json",orient="records",indent=4)