In [212]:
import pandas as pd
import spacy
from spacy.displacy.render import EntityRenderer
from IPython.core.display import display, HTML

## Utils and Prep

In [213]:
def custom_render(doc, df, column, options={}, page=False, minify=False, idx=0):
    renderer, converter = EntityRenderer, parse_custom_ents
    renderer = renderer(options=options)
    parsed = [converter(doc, df=df, idx=idx, column=column)]
    html = renderer.render(parsed, page=page, minify=minify).strip()  
    return display(HTML(html))

def parse_custom_ents(doc, df, idx, column):
    if column in df.columns:
        entities = df[column][idx]
        ents = [{'start': ent[1], 'end': ent[2], 'label': ent[3]} 
                for ent in entities]
    else:
        ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
            for ent in doc.ents]
    return {'text': doc.text, 'ents': ents, 'title': None}

def render_entities(idx, df, options={}, column='entities_v1'):
    query = df['queries'][idx]
    custom_render(nlp(query), df=df, column=column, options=options, idx=idx)

In [214]:
nlp = spacy.load('en_core_web_lg')

pd.options.mode.chained_assignment = None # prevent warning about working on a copy of a df

In [215]:
entity_df = pd.read_csv('queries.csv', header=None)
entity_df.columns = ['queries']
pd.set_option('display.max_rows', 5)
mini_entity_df = entity_df[:5]

df = entity_df
# df = mini_entity_df

In [216]:
options = {'colors': {'COMPOUND': '#FE6BFE', 'PROPN': '#18CFE6', 'NOUN': '#18CFE6', 'NP': '#1EECA6'}}

## Step 1: Inspect data

In [217]:
display(df)

Unnamed: 0,queries
0,i need to reschedule my appointment which was ...
1,this is the third time i made an appointment w...
...,...
4620,hola me podrÌ£åÁn cotizar
4621,can you install the resonator for me ?


## Step 2: Extract named entities

In [218]:
def extract_named_entities(query):
    return [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in nlp(query).ents]

def add_named_entities(df):
    df['named_entities'] = df['queries'].apply(extract_named_entities)    

In [219]:
add_named_entities(df)
display(df)

Unnamed: 0,queries,named_entities
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ..."
...,...,...
4620,hola me podrÌ£åÁn cotizar,[]
4621,can you install the resonator for me ?,[]


In [220]:
column = 'named_entities'
render_entities(3, df, options=options, column=column)

## Step 2: Extract all nouns/propn

In [221]:
def extract_nouns(query):
    keep_pos = ['PROPN', 'NOUN']
    return [(tok.text, tok.idx, tok.idx+len(tok.text), tok.pos_) for tok in nlp(query) if tok.pos_ in keep_pos]

def add_nouns(df):
    df['nouns'] = df['queries'].apply(extract_nouns)

In [222]:
add_nouns(df)
display(df)

Unnamed: 0,queries,named_entities,nouns
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9...","[(appointment, 24, 35, NOUN), (order, 68, 73, ..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ...","[(time, 18, 22, NOUN), (appointment, 33, 44, N..."
...,...,...,...
4620,hola me podrÌ£åÁn cotizar,[],"[(cotizar, 18, 25, NOUN)]"
4621,can you install the resonator for me ?,[],"[(resonator, 20, 29, NOUN)]"


In [223]:
column = 'nouns'
render_entities(0, df, options=options, column=column)

## Step 3: Combine nouns/propn & non-numerical entities (v1)

In [224]:
def extract_entities_v1(row_series):
    entities = set()
    idxs = set()
    for noun_tuple in row_series['nouns']:
        for named_entity_tuple in row_series['named_entities']:
            if noun_tuple[1] == named_entity_tuple[1]: 
                idxs.add(noun_tuple[1])
                entities.add(named_entity_tuple)
        if noun_tuple[1] not in idxs:
            entities.add(noun_tuple)
    
    return sorted(list(entities), key=lambda x: x[1])

def add_entities_v1(df):
    df['entities_v1'] = df.apply(extract_entities_v1, axis=1)

In [225]:
add_entities_v1(df)
display(df)

Unnamed: 0,queries,named_entities,nouns,entities_v1
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(appointment, 24, 35, NOUN), (order, 68, 73, ..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(time, 18, 22, NOUN), (appointment, 33, 44, N..."
...,...,...,...,...
4620,hola me podrÌ£åÁn cotizar,[],"[(cotizar, 18, 25, NOUN)]","[(cotizar, 18, 25, NOUN)]"
4621,can you install the resonator for me ?,[],"[(resonator, 20, 29, NOUN)]","[(resonator, 20, 29, NOUN)]"


In [226]:
column = 'entities_v1'
render_entities(1, df, options=options, column=column)

## Step 4: Extract noun phrases (v2)

In [227]:
def extract_entities_v2(query):
    return [(chunk.text, chunk.start_char, chunk.end_char, chunk.label_) for chunk in nlp(query).noun_chunks]

def add_entities_v2(df):
    df['entities_v2'] = df['queries'].apply(extract_entities_v2)

In [228]:
add_entities_v2(df)
display(df)

Unnamed: 0,queries,named_entities,nouns,entities_v1,entities_v2
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(i, 0, 1, NP), (my appointment, 21, 35, NP), ..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(the third time, 8, 22, NP), (i, 23, 24, NP),..."
...,...,...,...,...,...
4620,hola me podrÌ£åÁn cotizar,[],"[(cotizar, 18, 25, NOUN)]","[(cotizar, 18, 25, NOUN)]","[(me, 5, 7, NP), (podrÌ£åÁn cotizar, 8, 25, NP)]"
4621,can you install the resonator for me ?,[],"[(resonator, 20, 29, NOUN)]","[(resonator, 20, 29, NOUN)]","[(you, 4, 7, NP), (the resonator, 16, 29, NP),..."


In [229]:
column = 'entities_v2'
render_entities(3, df, options=options, column=column)

## Step 5: Extract compound noun phrases (v4)

In [230]:
def extract_entities_v3(query):
    compound_nps = []
    for tok in nlp(query):
        if tok.dep_ == 'compound':
            compound = ' '.join([tok.text, tok.head.text])
            compound_nps.append((compound, tok.idx, tok.idx+len(compound), tok.dep_.upper()))
    return compound_nps 

def add_entities_v3(df):
    df['entities_v3'] = df['queries'].apply(extract_entities_v3)

In [231]:
add_entities_v3(df)
display(df)

Unnamed: 0,queries,named_entities,nouns,entities_v1,entities_v2,entities_v3
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(i, 0, 1, NP), (my appointment, 21, 35, NP), ...","[(order number, 68, 80, COMPOUND), (appointmen..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(the third time, 8, 22, NP), (i, 23, 24, NP),...","[(acme corp, 50, 59, COMPOUND), (corp ny, 55, ..."
...,...,...,...,...,...,...
4620,hola me podrÌ£åÁn cotizar,[],"[(cotizar, 18, 25, NOUN)]","[(cotizar, 18, 25, NOUN)]","[(me, 5, 7, NP), (podrÌ£åÁn cotizar, 8, 25, NP)]",[]
4621,can you install the resonator for me ?,[],"[(resonator, 20, 29, NOUN)]","[(resonator, 20, 29, NOUN)]","[(you, 4, 7, NP), (the resonator, 16, 29, NP),...",[]


In [232]:
column = 'entities_v3'
render_entities(0, df, options=options, column=column)

## Step 6: Combine noun/propn + named entities, and compound noun phrases (v4)

In [233]:
def extract_entities_v4(row_series, cols=[]):
    return {noun_tuple[0] for col in cols for noun_tuple in row_series[col]}

def add_entities_v4(df, cols=[]):
    df['entities_v4'] = df.apply(extract_entities_v4, axis=1, cols=cols)

In [234]:
cols = ['nouns', 'entities_v3']
add_entities_v4(df, cols=cols)
display(df)

Unnamed: 0,queries,named_entities,nouns,entities_v1,entities_v2,entities_v3,entities_v4
0,i need to reschedule my appointment which was ...,"[(11/24, 60, 65, CARDINAL), (1267154747, 82, 9...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(appointment, 24, 35, NOUN), (order, 68, 73, ...","[(i, 0, 1, NP), (my appointment, 21, 35, NP), ...","[(order number, 68, 80, COMPOUND), (appointmen...","{order number, location, time, follow, compens..."
1,this is the third time i made an appointment w...,"[(third, 12, 17, ORDINAL), (ny, 70, 72, GPE), ...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(time, 18, 22, NOUN), (appointment, 33, 44, N...","[(the third time, 8, 22, NP), (i, 23, 24, NP),...","[(acme corp, 50, 59, COMPOUND), (corp ny, 55, ...","{out, time, internet, email, home, patchogue, ..."
...,...,...,...,...,...,...,...
4620,hola me podrÌ£åÁn cotizar,[],"[(cotizar, 18, 25, NOUN)]","[(cotizar, 18, 25, NOUN)]","[(me, 5, 7, NP), (podrÌ£åÁn cotizar, 8, 25, NP)]",[],{cotizar}
4621,can you install the resonator for me ?,[],"[(resonator, 20, 29, NOUN)]","[(resonator, 20, 29, NOUN)]","[(you, 4, 7, NP), (the resonator, 16, 29, NP),...",[],{resonator}


## Extract all unique entities

In [235]:
def extract_all_entities(df, col):
    return {entity for group in df[col] for entity in group}

In [236]:
entities = extract_all_entities(df, 'entities_v4')
entities

{'answer',
 'ht51',
 'email address',
 'sunrise',
 'scrambler s',
 'city store',
 'spacers',
 'aparece muchos',
 'word',
 'gates cap',
 'complaint',
 'dash mount',
 'rouge',
 'corp customer',
 'bosch gauges',
 'ha;f',
 'rotation',
 'sacramento',
 'mike',
 'toyotya le',
 'performance mods',
 'tire sensors',
 'milford boy',
 'matrix driver',
 'arabia',
 'chevy tahoe',
 'gxp',
 'intake hose',
 'password',
 'bench',
 '4000 lb',
 'act',
 'aspen',
 'okay thanks',
 'disk',
 'axle',
 'dickson',
 'chevrolet',
 'sr7087',
 'hubcap',
 'position',
 'gmc pickup',
 'windshield',
 'igition',
 'style coils',
 'chevy tie',
 'tow service',
 'anaheim',
 'fram',
 'handling',
 'rs5144',
 'thing',
 'brz',
 'brembo calipers',
 'pickup 12/7/2017',
 'side mirror',
 'v2 tire',
 'rears',
 'information lance',
 'pad elizabeth',
 'nerd',
 's10 4.3l',
 'acme store',
 'pickup date',
 'trunk lifts',
 'cylinder head',
 'cv half',
 'w. hillsborough',
 'air cleaner',
 'shoes',
 'accord',
 'sri lanka',
 'cover audis',
 'p

In [238]:
entity_dump_df = pd.DataFrame(pd.Series(list(entities)), columns=['entities'])
entity_dump_df

Unnamed: 0,entities
0,answer
1,ht51
...,...
7597,s10 chevy
7598,macon mo


In [239]:
entity_dump_df.to_csv('extracted_entities.csv', index=False, header=True)

In [262]:
entities = pd.read_csv('extracted_entities.csv')
entities = set(entities['entities'])
entities

{'answer',
 'ht51',
 'email address',
 'sunrise',
 'scrambler s',
 'city store',
 'spacers',
 'aparece muchos',
 'word',
 'gates cap',
 'complaint',
 'dash mount',
 'rouge',
 'corp customer',
 'bosch gauges',
 'ha;f',
 'rotation',
 'sacramento',
 'mike',
 'toyotya le',
 'performance mods',
 'tire sensors',
 'milford boy',
 'matrix driver',
 'arabia',
 'chevy tahoe',
 'gxp',
 'intake hose',
 'password',
 'bench',
 '4000 lb',
 'act',
 'aspen',
 'okay thanks',
 'disk',
 'axle',
 'dickson',
 'chevrolet',
 'sr7087',
 'hubcap',
 'position',
 'gmc pickup',
 'windshield',
 'igition',
 'style coils',
 'chevy tie',
 'tow service',
 'anaheim',
 'fram',
 'handling',
 'rs5144',
 'thing',
 'brz',
 'brembo calipers',
 'pickup 12/7/2017',
 'nerd',
 'side mirror',
 'v2 tire',
 'rears',
 'information lance',
 'pad elizabeth',
 's10 4.3l',
 'acme store',
 'pickup date',
 'trunk lifts',
 'cylinder head',
 'cv half',
 'w. hillsborough',
 'air cleaner',
 'shoes',
 'accord',
 'sri lanka',
 'cover audis',
 'p

## Clean Entities

In [264]:
def drop_syms(entities):
    drop_char = list(';/$,.~`\\\{\}|\'[]<>?"!@#$%^&*()_+-=')
    drop_entities = {ent for ent in entities for char in drop_char if ent.find(char) != -1}
    print(f'Number of entities: {len(entities)} -> {len(entities - drop_entities)}')
    return entities - drop_entities

def drop_nums(entities):
    drop_char = list('0123456789')
    drop_entities = {ent for ent in entities for char in drop_char if ent.find(char) != -1}
    print(f'Number of entities: {len(entities)} -> {len(entities - drop_entities)}')
    return entities - drop_entities

In [265]:
entities = drop_syms(entities)
entities = drop_nums(entities)

Number of entities: 7599 -> 7254
Number of entities: 7254 -> 6626


In [280]:
string = "h     ello  world"
' '.join([ent for ent in string.split(' ') if not len(ent) == 1])

'    ello  world'

In [26]:
# TODO: combine alternate values into parent entity
# TODO: remove all ones with single letters
# TODO: remove all the stop words and small talk ones
# TODO: fix noun phrases to fit company entities models
# TODO: extract alternate values