In [2]:
from cdp_data import CDPInstances, datasets
import pandas as pd
import spacy 
import numpy as np

# Load Model and Data

In [3]:
nlp_customized = spacy.load("/Users/angelzhou/Desktop/CDP_research/local-interest-groups-ner-model-v1/")

# The NERs we are interested in
ners = ['PERSON', 'PERSON-AFFILIATED-WITH-ORG']



In [4]:
df = pd.read_json('local-interest-groups-irr-annotation-set.jsonl', lines=True)
df.head()

Unnamed: 0,text,meta
0,"Good morning. As you said, I'm a downtown resi...","{'muni': 'seattle', 'session_id': '6c40d8abf3c9'}"
1,Down morning. I'm chair of tree pack. It's dis...,"{'muni': 'seattle', 'session_id': '6c40d8abf3c9'}"
2,"So Doug and Andrew if you are out there, call ...","{'muni': 'seattle', 'session_id': '6c40d8abf3c9'}"
3,"Thank you. Hi, I just want to bring attention ...","{'muni': 'seattle', 'session_id': 'c6bbc7ceec24'}"
4,"Yes, I'm here. I'm unmuted, it appears. Yes. O...","{'muni': 'seattle', 'session_id': 'c6bbc7ceec24'}"


In [5]:
ner_result = []
for row in df.iterrows():
    doc = nlp_customized(row[1].text)
    entities = list(doc.ents)
    if len(entities) >= 2:
        ner_result.append([row[1].text, entities[0].text, entities[0].label_, 
                           entities[1].text, entities[1].label_])
    elif len(entities) == 1:
        ner_result.append([row[1].text, entities[0].text, entities[0].label_, np.nan, np.nan])
    else:
        ner_result.append([row[1].text, np.nan, np.nan, np.nan, np.nan])

ner_df = pd.DataFrame(ner_result, columns=["text", "predicted_entity_1", "label_1", "predicted_entity_2", "label_2"])
ner_df.head()

Unnamed: 0,text,predicted_entity_1,label_1,predicted_entity_2,label_2
0,"Good morning. As you said, I'm a downtown resi...",,,,
1,Down morning. I'm chair of tree pack. It's dis...,,,,
2,"So Doug and Andrew if you are out there, call ...",Richard Ellison. I'm a retired community colle...,PERSON,,
3,"Thank you. Hi, I just want to bring attention ...",,,,
4,"Yes, I'm here. I'm unmuted, it appears. Yes. O...",,,,


In [6]:
truth = pd.read_csv('ground truth dataset.csv')
truth.head()

Unnamed: 0,text,ent_1,label_1,ent_2,label_2
0,"Good morning. As you said, I'm a downtown resi...",,,,
1,Down morning. I'm chair of tree pack. It's dis...,,,,
2,"So Doug and Andrew if you are out there, call ...",Richard Ellison. I'm a retired community colle...,PERSON,,
3,"Thank you. Hi, I just want to bring attention ...",Howard Gale with Seattle stop.org,PERSON-AFFLIATED-WITH-ORG,,
4,"Yes, I'm here. I'm unmuted, it appears. Yes. O...",,,,


# Strict accuracy 

In [8]:
# check if entity and label is strictly correct
correct_label_count = 0
for i in range(len(ner_df)):
    # if all are NA, it's correct
    if (pd.isna(ner_df.iloc[i].predicted_entity_1) and pd.isna(truth.iloc[i].ent_1) and 
        pd.isna(ner_df.iloc[i].predicted_entity_2) and pd.isna(truth.iloc[i].ent_2)):
        correct_label_count += 1
    # elif both entities and labels are the same
    elif ((ner_df.iloc[i].label_1 == truth.iloc[i].label_1) and 
          (ner_df.iloc[i].predicted_entity_1 == truth.iloc[i].ent_1) and 
          (ner_df.iloc[i].label_2 == truth.iloc[i].label_2) and
        (ner_df.iloc[i].predicted_entity_2 == truth.iloc[i].ent_2)):
        correct_label_count += 1
    # elif first entities and labels are the same, and there's no second ent
    elif ((ner_df.iloc[i].label_1 == truth.iloc[i].label_1) and 
          (ner_df.iloc[i].predicted_entity_1 == truth.iloc[i].ent_1) and 
          (pd.isna(ner_df.iloc[i].predicted_entity_2) and pd.isna(truth.iloc[i].ent_2))):
        correct_label_count += 1
    else:
        print("ent_1 should be '", truth.iloc[i].ent_1, "'")
        print("predicted as '", ner_df.iloc[i].predicted_entity_1, "'")
        print("label_1 should be '", truth.iloc[i].label_1, "'")
        print("predicted as '", ner_df.iloc[i].label_1, "'")
        print("ent_2 should be '", truth.iloc[i].ent_2, "'")
        print("predicted as '", ner_df.iloc[i].predicted_entity_2, "'")
        print("label_2 should be '", truth.iloc[i].label_2, "'")
        print("predicted as '", ner_df.iloc[i].label_2, "'")

correct_label_count / len(truth) 

ent_1 should be ' Howard Gale with Seattle stop.org '
predicted as ' nan '
label_1 should be ' PERSON-AFFLIATED-WITH-ORG '
predicted as ' nan '
ent_2 should be ' nan '
predicted as ' nan '
label_2 should be ' nan '
predicted as ' nan '
ent_1 should be ' Christy Heffaker. And I have proudly worked for the city by serving Seattle City Light '
predicted as ' Christy Heffaker. And I have proudly worked for the city by serving Seattle City Light for the last 28 years. And I wanted to just bring to the attention of the council the fact that since I was hired in 1993, one of my proudest things about working for the city of Seattle was the city value, diversity, tolerance, and inclusivity. '
label_1 should be ' PERSON-AFFLIATED-WITH-ORG '
predicted as ' PERSON-AFFLIATED-WITH-ORG '
ent_2 should be ' nan '
predicted as ' nan '
label_2 should be ' nan '
predicted as ' nan '
ent_1 should be ' Jacob sheer, Organizer with real change. '
predicted as ' nan '
label_1 should be ' PERSON-AFFLIATED-WITH-

0.35714285714285715

There are the following cases:

1. entity and label were not predicted at all
2. entity is predicted but span is not accurate, label is correct
3. entity is predicted but span is not accurate, label is wrong
4. entity is predicted but span is wrong, label is wrong
5. missed one entity (ent1 is not predicted but ent2 is)

In [9]:
import difflib
# check if entity and label is strictly correct
refined_correct_label_count = 0
nlp_spacy = spacy.load("en_core_web_sm")
# The NERs we are interested in
ners = ['PERSON', 'ORG', 'NORP']

for i in range(len(ner_df)):
    # if all are NA, it's correct
    if (pd.isna(ner_df.iloc[i].predicted_entity_1) and pd.isna(truth.iloc[i].ent_1) and 
        pd.isna(ner_df.iloc[i].predicted_entity_2) and pd.isna(truth.iloc[i].ent_2)):
        refined_correct_label_count += 1
    # elif both entities and labels are the same
    elif ((ner_df.iloc[i].label_1 == truth.iloc[i].label_1) and 
          (ner_df.iloc[i].predicted_entity_1 == truth.iloc[i].ent_1) and 
          (ner_df.iloc[i].label_2 == truth.iloc[i].label_2) and
        (ner_df.iloc[i].predicted_entity_2 == truth.iloc[i].ent_2)):
        refined_correct_label_count += 1
    # elif first entities and labels are the same, and there's no second ent
    elif ((ner_df.iloc[i].label_1 == truth.iloc[i].label_1) and 
          (ner_df.iloc[i].predicted_entity_1 == truth.iloc[i].ent_1) and 
          (pd.isna(ner_df.iloc[i].predicted_entity_2) and pd.isna(truth.iloc[i].ent_2))):
        correct_label_count += 1
    # elif predicted NA but truth is not NA, apply spacy model to identify any person/organization
    elif (pd.isna(ner_df.iloc[i].predicted_entity_1) and pd.notna(truth.iloc[i].ent_1)):
        doc = nlp_spacy(ner_df.iloc[i].text)
        # find the lowest and highest index of the entities the spacy model recognize
        lowest_index = len(ner_df.iloc[i].text)
        highest_index = 0
        print(ner_df.iloc[i].text)
        for ent in doc.ents:
            if ent.label_ in ners:
                lowest_index = min(lowest_index, ent.start_char)
                highest_index = max(highest_index, ent.end_char)
                print(i, [ent.label_, ent.text, ent.start_char])
        print(lowest_index, highest_index)
        # new range is (lowest_index, highest_index) +-20
        new_text = ner_df.iloc[i].text[max(lowest_index-20, 0):min(highest_index+20, len(ner_df.iloc[i].text))]
        print("new range: ", new_text)
        # run custom model in that new range again
        doc = nlp_customized(new_text)
        entities = list(doc.ents)
        print(entities)
        if len(entities) >= 2:
            ner_df.iloc[i]["predicted_entity_1"] = entities[0].text
            ner_df.iloc[i]["predicted_entity_2"] = entities[1].text
            ner_df.iloc[i]["label_1"] = entities[0].label_
            ner_df.iloc[i]["label_1"] = entities[1].label_
        elif len(entities) == 1:
            ner_df.iloc[i]["predicted_entity_1"] = entities[0].text
            ner_df.iloc[i]["label_1"] = entities[0].label_
        else:
            ner_df.iloc[i]["predicted_entity_1"] = np.nan
            ner_df.iloc[i]["predicted_entity_2"] = np.nan
            ner_df.iloc[i]["label_1"] = np.nan
            ner_df.iloc[i]["label_1"] = np.nan
        # compute accuracy again
        if ((ner_df.iloc[i].label_1 == truth.iloc[i].label_1) and 
          (ner_df.iloc[i].predicted_entity_1 == truth.iloc[i].ent_1) and 
          (ner_df.iloc[i].label_2 == truth.iloc[i].label_2) and
        (ner_df.iloc[i].predicted_entity_2 == truth.iloc[i].ent_2)):
            refined_correct_label_count += 1
    # loosely correct (extracted partially correct entity and correct label)
    # elif ((difflib.SequenceMatcher(None,ner_df.iloc[i].predicted_entity_1 ,truth.iloc[i].ent_1).ratio() > .2) and
    #       (ner_df.iloc[i].label_1 == truth.iloc[i].label_1) and
    #       (difflib.SequenceMatcher(None,ner_df.iloc[i].predicted_entity_2 ,truth.iloc[i].ent_2).ratio() > .2) and
    #       (ner_df.iloc[i].label_2 == truth.iloc[i].label_2)):
    #     # run spacy model 
    #     refined_correct_label_count += 1
    else:
        refined_correct_label_count += 0
        # print("ent_1 should be '", truth.iloc[i].ent_1, "'")
        # print("predicted as '", ner_df.iloc[i].predicted_entity_1, "'")
        # print("label_1 should be '", truth.iloc[i].label_1, "'")
        # print("predicted as '", ner_df.iloc[i].label_1, "'")
refined_correct_label_count

Thank you. Hi, I just want to bring attention to the fact that the line quality in the last couple of weeks has been problematic and it's at the council end. Good afternoon, Howard Gale with Seattle stop.org commenting on our failed police accountability system. Last week, Carolyn Dick at the South Seattle Emerald published her 21st article over the last 14 months, investigating the failures, malfeasance and corruption in our police accountability system. Last week article reveals, once again, shocking mismanagement and misdeeds require the city to seek investigations by entities outside of the city.
3 ['PERSON', 'Howard Gale', 174]
3 ['PERSON', 'Carolyn Dick', 274]
174 286
new range:  nd. Good afternoon, Howard Gale with Seattle stop.org commenting on our failed police accountability system. Last week, Carolyn Dick at the South Seattl
[]
Jacob, good morning. Hi. I'm Jacob sheer, Organizer with real change. I'm calling on behalf of real change Interveners to adjust the Mayor's proposed

9

For some reasons, running the custom model after spacy can't detect any entity, don't know if my code is wrong or if it's the model's fault

In [26]:
ner_df

Unnamed: 0,text,predicted_entity_1,label_1,predicted_entity_2,label_2
0,"Good morning. As you said, I'm a downtown resi...",,,,
1,Down morning. I'm chair of tree pack. It's dis...,,,,
2,"So Doug and Andrew if you are out there, call ...",Richard Ellison. I'm a retired community colle...,PERSON,,
3,"Thank you. Hi, I just want to bring attention ...",,,,
4,"Yes, I'm here. I'm unmuted, it appears. Yes. O...",,,,
5,"Good afternoon, Council. Thank you for the cha...",Christy Heffaker. And I have proudly worked fo...,PERSON-AFFLIATED-WITH-ORG,,
6,"Good morning, Pete her. Good morning. I'm in d...",,,,
7,"Jacob, good morning. Hi. I'm Jacob sheer, Orga...",,,,
8,"Good morning. I am Madison, resident of distri...",,,,
9,Good morning. I want to address your agenda it...,,,,
