In [1]:
from cdp_data import CDPInstances, datasets
import pandas as pd
import numpy as np
import spacy 

In [2]:
# load custom model
nlp_custom = spacy.load("local-interest-groups-ner-model-v1/")



In [3]:
truth_df = pd.read_csv('ground truth dataset.csv')
truth_df.head()

Unnamed: 0,text,ent_1,label_1,ent_2,label_2
0,"Good morning. As you said, I'm a downtown resi...",,,,
1,Down morning. I'm chair of tree pack. It's dis...,,,,
2,"So Doug and Andrew if you are out there, call ...",Richard Ellison. I'm a retired community colle...,PERSON,,
3,"Thank you. Hi, I just want to bring attention ...",Howard Gale with Seattle stop.org,PERSON-AFFLIATED-WITH-ORG,,
4,"Yes, I'm here. I'm unmuted, it appears. Yes. O...",,,,


## Apply the custom NER model

In [100]:
# use the custom model to recognize entities in the 28 examples

df = pd.read_json('local-interest-groups-irr-annotation-set.jsonl', lines=True)

ner_result = []
for _, row in df.iterrows():
    doc = nlp_custom(row['text'])
    entities = list(doc.ents)
    
    if len(entities) >= 2:
        ner_result.append([row['text'], entities[0].text, entities[0].label_, 
                           entities[1].text, entities[1].label_])
    elif len(entities) == 1:
        ner_result.append([row['text'], entities[0].text, entities[0].label_, np.nan, np.nan])
    else:
        ner_result.append([row['text'], np.nan, np.nan, np.nan, np.nan])
        

ner_df = pd.DataFrame(ner_result, columns=["text", "predicted_ent_1", "named_entity_type_1", 
                                           "predicted_ent_2", "named_entity_type_2"])
ner_df


Unnamed: 0,text,predicted_ent_1,named_entity_type_1,predicted_ent_2,named_entity_type_2
0,"Good morning. As you said, I'm a downtown resi...",,,,
1,Down morning. I'm chair of tree pack. It's dis...,,,,
2,"So Doug and Andrew if you are out there, call ...",Richard Ellison. I'm a retired community colle...,PERSON,,
3,"Thank you. Hi, I just want to bring attention ...",,,,
4,"Yes, I'm here. I'm unmuted, it appears. Yes. O...",,,,
5,"Good afternoon, Council. Thank you for the cha...",Christy Heffaker. And I have proudly worked fo...,PERSON-AFFLIATED-WITH-ORG,,
6,"Good morning, Pete her. Good morning. I'm in d...",,,,
7,"Jacob, good morning. Hi. I'm Jacob sheer, Orga...",,,,
8,"Good morning. I am Madison, resident of distri...",,,,
9,Good morning. I want to address your agenda it...,,,,


## Model Evaluation

In [83]:
# keep track of "perfect accuracy" by just comparing the text of each found entity

# calculate the accuracy of the model based on string comparison between the predicted 
# entities and the ground truth dataset (truth_df)

def any_label_at_all(truth_df, ner_result_df):

def model_evaluation_span_position(truth_df, ner_result_df):

    total_predictions = len(truth_df)
    correct_predictions = 0 

    for i, truth_row in truth_df.iterrows():
        ner_result_row = ner_result_df.loc[i]
    
    
        if (truth_row['ent_1'] == ner_result_row['predicted_ent_1']) and (truth_row['ent_2'] == ner_result_row['predicted_ent_2']):
            correct_predictions += 1

        # if truth_df doesn't have a value for either ent_1 or ent_2 and the model doesn't find anything
        if (pd.isna(truth_row['ent_1']) and pd.isna(truth_row['ent_2'])) and (pd.isna(ner_result_row['predicted_ent_1']) and pd.isna(ner_result_row['predicted_ent_2'])):
            correct_predictions += 1
    

    accuracy = (correct_predictions / total_predictions) * 100

    print(f"{accuracy:.2f}% of the ground truth dataset was predicted correctly.")

model_evaluation(truth_df, ner_df)


def model_evaluation_label_accuracy:
    

32.14% of the ground truth dataset was predicted correctly.


## writing a function that increases accuracy

In [102]:
# the custom model finds the general area or maybe one of the entities we want

# write a wrapper function for getting better results, combine with spacy's model

nlp_spacy = spacy.load("en_core_web_sm")

# The NERs we are interested in (for spacy)
ners = ['PERSON', 'ORG', 'NORP']

ner_result = []
for i, row in df.iterrows():
    
    doc_custom = nlp_custom(row['text'])
    
    entities = list(doc_custom.ents)
    print(entities)
    for j, e in enumerate(entities):
        doc_spacy = nlp_spacy(str(e))        
        entities_spacy = [ent for ent in doc_spacy.ents if ent.label_ in ners]
        
        if entities_spacy:
            person_ent = next((ent for ent in entities_spacy if ent.label_ == 'PERSON'), None)
            print(person_ent)
            if person_ent:
                start = person_ent.start_char - e.start_char
                end = person_ent.end_char - e.start_char
                entities[j] = doc_custom.char_span(start, end, label='PERSON')
        
    
    if entities:
        if len(entities) >= 2:
            ner_result.append([row['text'], entities[0].text, entities[0].label_, 
                           entities[1].text, entities[1].label_])
        elif len(entities) == 1:
            ner_result.append([row['text'], entities[0].text, entities[0].label_, np.nan, np.nan])
    else:
        ner_result.append([row['text'], np.nan, np.nan, np.nan, np.nan])
        

new_ner_df = pd.DataFrame(ner_result, columns=["text", "predicted_ent_1", "named_entity_type_1", 
                                           "predicted_ent_2", "named_entity_type_2"])
new_ner_df

[]
[]
[Richard Ellison. I'm a retired community college professor of biology and environmental science. And also a board member of a political action committee for trees.]
Richard Ellison


AttributeError: 'NoneType' object has no attribute 'text'

In [87]:
# model evaluation after the wrapper function

model_evaluation(truth_df, new_ner_df)

32.14% of the ground truth dataset was predicted correctly.
