In [153]:
import pandas as pd
import spacy
from spacy.displacy.render import EntityRenderer
from IPython.core.display import display, HTML

## Utils and Prep

In [154]:
def custom_render(doc, df, column, options={}, page=False, minify=False, idx=0):
    """Show one row's entity spans as highlighted HTML in the notebook.

    The spans are assembled by ``parse_custom_ents`` and turned into markup
    by spaCy's ``EntityRenderer``; the stripped markup is wrapped in ``HTML``
    and handed to ``display``.

    Args:
        doc: Parsed document passed through to ``parse_custom_ents``.
        df: DataFrame passed through to ``parse_custom_ents``.
        column: Column name passed through to ``parse_custom_ents``.
        options: Options dict passed to ``EntityRenderer``.
        page: Passed to ``EntityRenderer.render``.
        minify: Passed to ``EntityRenderer.render``.
        idx: Row label passed through to ``parse_custom_ents``.

    Returns:
        The return value of ``display``.
    """
    entity_renderer = EntityRenderer(options=options)
    parsed_docs = [parse_custom_ents(doc, df=df, idx=idx, column=column)]
    markup = entity_renderer.render(parsed_docs, page=page, minify=minify)
    return display(HTML(markup.strip()))

def parse_custom_ents(doc, df, idx, column):
    """Build a ``{'text', 'ents', 'title'}`` dict describing entity spans for ``doc``.

    When ``column`` is one of ``df``'s columns, the spans are read from
    ``df[column][idx]``; each item there is indexed as
    ``(_, start, end, label)``. Otherwise the spans are taken from
    ``doc.ents``.

    Args:
        doc: Object exposing ``text`` and ``ents``.
        df: DataFrame that may hold pre-computed entity tuples.
        idx: Row label to read from ``df[column]``.
        column: Name of the column holding the entity tuples.

    Returns:
        Dict with ``text`` (``doc.text``), ``ents`` (list of
        ``{'start', 'end', 'label'}`` dicts) and ``title`` (always ``None``).
    """
    if column not in df.columns:
        spans = [
            {'start': span.start_char, 'end': span.end_char, 'label': span.label_}
            for span in doc.ents
        ]
    else:
        spans = []
        for entity in df[column][idx]:
            spans.append({'start': entity[1], 'end': entity[2], 'label': entity[3]})
    return {'text': doc.text, 'ents': spans, 'title': None}

def render_entities(idx, df, options={}, column='entities_v1'):
    """Parse the query stored at ``df['queries'][idx]`` and render its entities.

    Args:
        idx: Row label of the query to render.
        df: DataFrame with a ``queries`` column.
        options: Options dict forwarded to ``custom_render``.
        column: Column name forwarded to ``custom_render``.
    """
    parsed_query = nlp(df['queries'][idx])
    custom_render(parsed_query, df=df, column=column, options=options, idx=idx)

In [3]:
# Load the spaCy pipeline once; the extract_* helpers and render_entities call it as `nlp`.
nlp = spacy.load('en_core_web_lg')

pd.options.mode.chained_assignment = None # prevent warning about working on a copy of a df

In [155]:
# Load the raw queries; the CSV is read without a header row, so the single
# column is named explicitly.
entity_df = pd.read_csv('queries.csv', header=None)
entity_df.columns = ['queries']
pd.set_option('display.max_rows', 5)

# Work on an independent copy of the first five rows. Later cells add columns
# to this frame, and writing into a bare slice of entity_df is what raises
# pandas' SettingWithCopyWarning.
mini_entity_df = entity_df[:5].copy()

df = mini_entity_df

In [156]:
# Rendering options passed to render_entities: one highlight colour per span label.
options = {'colors': {'COMPOUND': '#FE6BFE', 'PROPN': '#18CFE6', 'NOUN': '#18CFE6', 'NP': '#1EECA6'}}

## Step 1: Inspect data

In [157]:
# Show the working frame before any entity columns are added.
display(df)

Unnamed: 0,queries
0,i need to reschedule my appointment which was ...
1,this is the third time i made an appointment w...
2,why if i make a appointment for 8 am do they t...
3,called pasadena store sunday am . asked how lo...
4,is it normal to have a 2 pm appointment for ti...


## Step 2: Extract named entities

In [158]:
def extract_named_entities(query):
    """Return ``(text, start_char, end_char, label)`` for every entity in ``nlp(query).ents``."""
    found = []
    for span in nlp(query).ents:
        found.append((span.text, span.start_char, span.end_char, span.label_))
    return found

def add_named_entities(df):
    """Add a ``named_entities`` column to ``df`` in place, computed per row from ``df['queries']``."""
    queries = df['queries']
    df['named_entities'] = queries.apply(extract_named_entities)

In [159]:
# Writes the 'named_entities' column onto df in place, then shows the frame.
add_named_entities(df)
display(df)

Unnamed: 0,queries,named_entities
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ..."
2,why if i make a appointment for 8 am do they t...,"[(8, 32, 33, DATE), (30, 159, 161, QUANTITY), ..."
3,called pasadena store sunday am . asked how lo...,"[(sunday, 22, 28, DATE), (am, 29, 31, TIME), (..."
4,is it normal to have a 2 pm appointment for ti...,"[(2 pm, 23, 27, TIME), (2 hours, 203, 210, TIM..."


In [160]:
# Highlight the 'named_entities' spans for the row labelled 3.
column = 'named_entities'
render_entities(3, df, options=options, column=column)

## Step 2b: Extract all nouns/propn

In [161]:
def extract_nouns(query):
    """Return ``(text, start, end, pos)`` for each token of ``query`` tagged PROPN or NOUN.

    ``start`` is the token's ``idx`` and ``end`` is ``start`` plus the length
    of the token text.
    """
    wanted_pos = ['PROPN', 'NOUN']
    nouns = []
    for token in nlp(query):
        if token.pos_ not in wanted_pos:
            continue
        nouns.append((token.text, token.idx, token.idx + len(token.text), token.pos_))
    return nouns

def add_nouns(df):
    """Add a ``nouns`` column to ``df`` in place, computed per row from ``df['queries']``."""
    queries = df['queries']
    df['nouns'] = queries.apply(extract_nouns)

In [162]:
# Writes the 'nouns' column onto df in place, then shows the frame.
add_nouns(df)
display(df)

Unnamed: 0,queries,named_entities,nouns
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9...","[(appointment, 24, 35, NOUN), (order, 68, 73, ..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ...","[(time, 18, 22, NOUN), (appointment, 33, 44, N..."
2,why if i make a appointment for 8 am do they t...,"[(8, 32, 33, DATE), (30, 159, 161, QUANTITY), ...","[(appointment, 16, 27, NOUN), (min, 86, 89, NO..."
3,called pasadena store sunday am . asked how lo...,"[(sunday, 22, 28, DATE), (am, 29, 31, TIME), (...","[(pasadena, 7, 15, NOUN), (store, 16, 21, NOUN..."
4,is it normal to have a 2 pm appointment for ti...,"[(2 pm, 23, 27, TIME), (2 hours, 203, 210, TIM...","[(pm, 25, 27, NOUN), (appointment, 28, 39, NOU..."


In [163]:
# Highlight the 'nouns' spans for the row labelled 3.
column = 'nouns'
render_entities(3, df, options=options, column=column)

## Step 3: Combine nouns/propn & non-numerical entities (v1)

In [164]:
def extract_entities_v1(row_series):
    """Merge one row's noun tuples with its named-entity tuples.

    For each tuple in ``row_series['nouns']``, every tuple in
    ``row_series['named_entities']`` whose start offset (element 1) equals the
    noun's start offset is kept in place of the noun. A noun with no such
    match is kept as is. Named entities that do not start where some noun
    starts are not included.

    Args:
        row_series: Mapping-like row with ``nouns`` and ``named_entities``
            entries, each an iterable of hashable tuples.

    Returns:
        De-duplicated list of the kept tuples, sorted by start offset.
    """
    merged = set()
    for noun in row_series['nouns']:
        noun_start = noun[1]
        same_start = [
            named for named in row_series['named_entities']
            if noun_start == named[1]
        ]
        if same_start:
            merged.update(same_start)
        else:
            merged.add(noun)

    ordered = list(merged)
    ordered.sort(key=lambda item: item[1])
    return ordered

def add_entities_v1(df):
    """Add an ``entities_v1`` column to ``df`` in place by applying ``extract_entities_v1`` to each row."""
    combined = df.apply(extract_entities_v1, axis=1)
    df['entities_v1'] = combined

In [165]:
# Writes the 'entities_v1' column onto df in place, then shows the frame.
add_entities_v1(df)
display(df)

Unnamed: 0,queries,named_entities,nouns,entities_v1
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(appointment, 24, 35, NOUN), (order, 68, 73, ..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(time, 18, 22, NOUN), (appointment, 33, 44, N..."
2,why if i make a appointment for 8 am do they t...,"[(8, 32, 33, DATE), (30, 159, 161, QUANTITY), ...","[(appointment, 16, 27, NOUN), (min, 86, 89, NO...","[(appointment, 16, 27, NOUN), (min, 86, 89, NO..."
3,called pasadena store sunday am . asked how lo...,"[(sunday, 22, 28, DATE), (am, 29, 31, TIME), (...","[(pasadena, 7, 15, NOUN), (store, 16, 21, NOUN...","[(pasadena, 7, 15, NOUN), (store, 16, 21, NOUN..."
4,is it normal to have a 2 pm appointment for ti...,"[(2 pm, 23, 27, TIME), (2 hours, 203, 210, TIM...","[(pm, 25, 27, NOUN), (appointment, 28, 39, NOU...","[(pm, 25, 27, NOUN), (appointment, 28, 39, NOU..."


In [177]:
# Highlight the 'entities_v1' spans for the row labelled 3.
column = 'entities_v1'
render_entities(3, df, options=options, column=column)

## Step 4: Extract noun phrases (v2)

In [167]:
def extract_entities_v2(query):
    """Return ``(text, start_char, end_char, label)`` for every chunk in ``nlp(query).noun_chunks``."""
    phrases = []
    for phrase in nlp(query).noun_chunks:
        phrases.append((phrase.text, phrase.start_char, phrase.end_char, phrase.label_))
    return phrases

def add_entities_v2(df):
    """Add an ``entities_v2`` column to ``df`` in place, computed per row from ``df['queries']``."""
    queries = df['queries']
    df['entities_v2'] = queries.apply(extract_entities_v2)

In [168]:
# Writes the 'entities_v2' column onto df in place, then shows the frame.
add_entities_v2(df)
display(df)

Unnamed: 0,queries,named_entities,nouns,entities_v1,entities_v2
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(i, 0, 1, NP), (my appointment, 21, 35, NP), ..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(the third time, 8, 22, NP), (i, 23, 24, NP),..."
2,why if i make a appointment for 8 am do they t...,"[(8, 32, 33, DATE), (30, 159, 161, QUANTITY), ...","[(appointment, 16, 27, NOUN), (min, 86, 89, NO...","[(appointment, 16, 27, NOUN), (min, 86, 89, NO...","[(i, 7, 8, NP), (a appointment, 14, 27, NP), (..."
3,called pasadena store sunday am . asked how lo...,"[(sunday, 22, 28, DATE), (am, 29, 31, TIME), (...","[(pasadena, 7, 15, NOUN), (store, 16, 21, NOUN...","[(pasadena, 7, 15, NOUN), (store, 16, 21, NOUN...","[(sunday, 22, 28, NP), (the wait, 49, 57, NP),..."
4,is it normal to have a 2 pm appointment for ti...,"[(2 pm, 23, 27, TIME), (2 hours, 203, 210, TIM...","[(pm, 25, 27, NOUN), (appointment, 28, 39, NOU...","[(pm, 25, 27, NOUN), (appointment, 28, 39, NOU...","[(it, 3, 5, NP), (a 2 pm appointment, 21, 39, ..."


In [176]:
# Highlight the 'entities_v2' spans for the row labelled 3.
column = 'entities_v2'
render_entities(3, df, options=options, column=column)

## Step 5: Extract compound noun phrases (v3)

In [170]:
def extract_entities_v3(query):
    """Extract compound noun phrases from ``query``.

    For every token whose dependency label is ``compound``, emits the stretch
    of the query running from that token to its syntactic head (inclusive).

    Args:
        query: Raw query string; parsed with the module-level ``nlp`` pipeline.

    Returns:
        List of ``(text, start_char, end_char, 'COMPOUND')`` tuples in token
        order, where ``text`` is exactly ``doc.text[start_char:end_char]``.
    """
    doc = nlp(query)
    compound_nps = []
    for tok in doc:
        if tok.dep_ != 'compound':
            continue
        head = tok.head
        # Take the span from the real character offsets of both tokens rather
        # than from a synthesised "tok head" string: the head is not always
        # the very next token, and joining the two texts with a single space
        # would then yield text that is not in the query and an end offset
        # that points at the wrong characters.
        start = min(tok.idx, head.idx)
        end = max(tok.idx + len(tok.text), head.idx + len(head.text))
        compound_nps.append((doc.text[start:end], start, end, tok.dep_.upper()))
    return compound_nps

def add_entities_v3(df):
    """Add an ``entities_v3`` column to ``df`` in place, computed per row from ``df['queries']``."""
    queries = df['queries']
    df['entities_v3'] = queries.apply(extract_entities_v3)

In [171]:
# Writes the 'entities_v3' column onto df in place, then shows the frame.
add_entities_v3(df)
display(df)

Unnamed: 0,queries,named_entities,nouns,entities_v1,entities_v2,entities_v3
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(i, 0, 1, NP), (my appointment, 21, 35, NP), ...","[(order number, 68, 80, COMPOUND), (appointmen..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(the third time, 8, 22, NP), (i, 23, 24, NP),...","[(acme corp, 50, 59, COMPOUND), (corp ny, 55, ..."
2,why if i make a appointment for 8 am do they t...,"[(8, 32, 33, DATE), (30, 159, 161, QUANTITY), ...","[(appointment, 16, 27, NOUN), (min, 86, 89, NO...","[(appointment, 16, 27, NOUN), (min, 86, 89, NO...","[(i, 7, 8, NP), (a appointment, 14, 27, NP), (...",[]
3,called pasadena store sunday am . asked how lo...,"[(sunday, 22, 28, DATE), (am, 29, 31, TIME), (...","[(pasadena, 7, 15, NOUN), (store, 16, 21, NOUN...","[(pasadena, 7, 15, NOUN), (store, 16, 21, NOUN...","[(sunday, 22, 28, NP), (the wait, 49, 57, NP),...","[(pasadena store, 7, 21, COMPOUND), (oil chang..."
4,is it normal to have a 2 pm appointment for ti...,"[(2 pm, 23, 27, TIME), (2 hours, 203, 210, TIM...","[(pm, 25, 27, NOUN), (appointment, 28, 39, NOU...","[(pm, 25, 27, NOUN), (appointment, 28, 39, NOU...","[(it, 3, 5, NP), (a 2 pm appointment, 21, 39, ...","[(pm appointment, 25, 39, COMPOUND), (tire bal..."


In [173]:
# Highlight the 'entities_v3' spans for the row labelled 3.
column = 'entities_v3'
render_entities(3, df, options=options, column=column)

## Step 6: Combine noun/propn + named entities, and compound noun phrases (v4)

In [175]:
# TODO: combine entities_v1 and entities_v3
# TODO: Extract time as well
# TODO: fix noun phrases to fit company entities models
# TODO: extract alternate values