In [137]:
import pandas as pd
import spacy
from spacy.displacy.render import EntityRenderer
from spacy.tokens import Doc
from IPython.core.display import display, HTML

## Step 1: Inspect data

In [122]:
entity_df = pd.read_csv('queries.csv', header=None)
entity_df.columns = ['queries']
pd.set_option('display.max_rows', 5)
display(entity_df)

Unnamed: 0,queries
0,i need to reschedule my appointment which was ...
1,this is the third time i made an appointment w...
...,...
4620,hola me podrÌ£åÁn cotizar
4621,can you install the resonator for me ?


In [123]:
nlp = spacy.load('en_core_web_lg')

In [143]:
def custom_render(doc, df, options={}, page=False, minify=False, idx=0):
    renderer, converter = EntityRenderer, parse_custom_ents
    renderer = renderer(options=options)
    parsed = [converter(doc, df, idx)]
    html = renderer.render(parsed, page=page, minify=minify).strip()  
    return display(HTML(html))

def parse_custom_ents(doc, df, idx):
    if 'entities' in df.columns:
        entities = df['entities'][idx]
        ents = [{'start': ent[1], 'end': ent[2], 'label': ent[3]} 
                for ent in entities]
    else:
        ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
            for ent in doc.ents]
    title = None
    return {'text': doc.text, 'ents': ents, 'title': title}

def extract_named_entities(query):
    return [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in nlp(query).ents]

def show_entities(idx, df, options={}):
    query = df['queries'][idx]
    custom_render(nlp(query), df, options=options, idx=idx)
    
pd.options.mode.chained_assignment = None # prevent warning about working on a copy of a df

In [178]:
mini_entity_df = entity_df[:5]
show_entities(0, mini_entity_df)

In [179]:
def add_named_entities(df=entity_df):
    df['named_entities'] = df['queries'].apply(extract_named_entities)
add_named_entities(mini_entity_df)

In [180]:
display(mini_entity_df)

Unnamed: 0,queries,named_entities
0,i need to reschedule my appointment which was ...,"{'11/24': [60, 65, 'CARDINAL'], '1267154747': ..."
1,this is the third time i made an appointment w...,"{'third': [12, 17, 'ORDINAL'], 'ny': [70, 72, ..."
2,why if i make a appointment for 8 am do they t...,"{'8': [32, 33, 'DATE'], '30': [159, 161, 'QUAN..."
3,called pasadena store sunday am . asked how lo...,"{'sunday': [22, 28, 'DATE'], 'am': [29, 31, 'T..."
4,is it normal to have a 2 pm appointment for ti...,"{'2 pm': [23, 27, 'TIME'], '2 hours': [203, 21..."


## Step 2: Extract all nouns

In [181]:
def extract_nouns(query):
    keep_pos = ['PRON', 'PROPN', 'NOUN']
    return [(tok.text, tok.idx, tok.idx+len(tok.text), tok.pos_) for tok in nlp(query) if tok.pos_ in keep_pos]

In [182]:
def add_nouns(df=entity_df):
    df['nouns'] = df['queries'].apply(extract_nouns)
add_nouns(mini_entity_df)

In [183]:
display(mini_entity_df)

Unnamed: 0,queries,named_entities,nouns
0,i need to reschedule my appointment which was ...,"{'11/24': [60, 65, 'CARDINAL'], '1267154747': ...","{'i': [271, 272, 'PRON'], 'appointment': [348,..."
1,this is the third time i made an appointment w...,"{'third': [12, 17, 'ORDINAL'], 'ny': [70, 72, ...","{'time': [18, 22, 'NOUN'], 'i': [340, 341, 'PR..."
2,why if i make a appointment for 8 am do they t...,"{'8': [32, 33, 'DATE'], '30': [159, 161, 'QUAN...","{'i': [243, 244, 'PRON'], 'appointment': [268,..."
3,called pasadena store sunday am . asked how lo...,"{'sunday': [22, 28, 'DATE'], 'am': [29, 31, 'T...","{'pasadena': [7, 15, 'NOUN'], 'store': [16, 21..."
4,is it normal to have a 2 pm appointment for ti...,"{'2 pm': [23, 27, 'TIME'], '2 hours': [203, 21...","{'it': [189, 191, 'PRON'], 'pm': [25, 27, 'NOU..."


In [184]:
def extract_entities(row_series):
    entities = {}
    
    for entity in row_series['nouns']:
        entities[entity[0]] if entity[]
        
## TODO: figure out how to combine named_entities and nouns to get proper tagging
        
    
    for entity, metadata in entities.items():
        try:
            if entity_series['named_entities'][entity][0] == metadata[0]:
                entities[entity] = entity_series['named_entities'][entity]
        except KeyError:
            continue
#     for entity, metadata in entity_series['named_entities'].items():
#         entities[entity] = metadata
    return entities

In [185]:
def add_entities(df=entity_df):
    df['entities'] = df.apply(extract_entities, axis=1)
add_entities(mini_entity_df)

In [186]:
display(mini_entity_df)

Unnamed: 0,queries,named_entities,nouns,entities
0,i need to reschedule my appointment which was ...,"{'11/24': [60, 65, 'CARDINAL'], '1267154747': ...","{'i': [271, 272, 'PRON'], 'appointment': [348,...","{'i': [271, 272, 'PRON'], 'appointment': [348,..."
1,this is the third time i made an appointment w...,"{'third': [12, 17, 'ORDINAL'], 'ny': [70, 72, ...","{'time': [18, 22, 'NOUN'], 'i': [340, 341, 'PR...","{'time': [18, 22, 'NOUN'], 'i': [340, 341, 'PR..."
2,why if i make a appointment for 8 am do they t...,"{'8': [32, 33, 'DATE'], '30': [159, 161, 'QUAN...","{'i': [243, 244, 'PRON'], 'appointment': [268,...","{'i': [243, 244, 'PRON'], 'appointment': [268,..."
3,called pasadena store sunday am . asked how lo...,"{'sunday': [22, 28, 'DATE'], 'am': [29, 31, 'T...","{'pasadena': [7, 15, 'NOUN'], 'store': [16, 21...","{'pasadena': [7, 15, 'NOUN'], 'store': [16, 21..."
4,is it normal to have a 2 pm appointment for ti...,"{'2 pm': [23, 27, 'TIME'], '2 hours': [203, 21...","{'it': [189, 191, 'PRON'], 'pm': [25, 27, 'NOU...","{'it': [189, 191, 'PRON'], 'pm': [25, 27, 'NOU..."


In [187]:
options = {'colors': {'PRON': '#2391D7', 'PROPN': '#2391D7', 'NOUN': '#2391D7'}}
show_entities(0, mini_entity_df, options=options)

In [211]:
q = mini_entity_df['queries'][0]
q

"i need to reschedule my appointment which was originally on 11/24 . order number  1267154747 . i was there for my appointment  my tires were not . the location was supposed to call me when the tires arrived which they never did . i 'd like to reschedule my appointment . i 'd also like some type of compensation for not only wasting my time on the appointment date but for the lack of follow up by the location ."

In [212]:
doc = nlp(q)
displacy.render(doc, style='ent', jupyter=True)

In [213]:
extract_nouns(q)

{'i': [271, 272, 'PRON'],
 'appointment': [348, 359, 'NOUN'],
 'order': [68, 73, 'NOUN'],
 'number': [74, 80, 'NOUN'],
 'tires': [193, 198, 'NOUN'],
 'location': [402, 410, 'NOUN'],
 'me': [181, 183, 'PRON'],
 'they': [213, 217, 'PRON'],
 'type': [291, 295, 'NOUN'],
 'compensation': [299, 311, 'NOUN'],
 'time': [336, 340, 'NOUN'],
 'date': [360, 364, 'NOUN'],
 'lack': [377, 381, 'NOUN'],
 'follow': [385, 391, 'NOUN']}

In [221]:
keep_pos = ['PRON', 'PROPN', 'NOUN']
[(tok.text, tok.idx, tok.idx+len(tok.text), tok.pos_) for tok in doc if tok.pos_ in keep_pos]

[('i', 0, 1, 'PRON'),
 ('appointment', 24, 35, 'NOUN'),
 ('order', 68, 73, 'NOUN'),
 ('number', 74, 80, 'NOUN'),
 ('i', 95, 96, 'PRON'),
 ('appointment', 114, 125, 'NOUN'),
 ('tires', 130, 135, 'NOUN'),
 ('location', 151, 159, 'NOUN'),
 ('me', 181, 183, 'PRON'),
 ('tires', 193, 198, 'NOUN'),
 ('they', 213, 217, 'PRON'),
 ('i', 230, 231, 'PRON'),
 ('appointment', 257, 268, 'NOUN'),
 ('i', 271, 272, 'PRON'),
 ('type', 291, 295, 'NOUN'),
 ('compensation', 299, 311, 'NOUN'),
 ('time', 336, 340, 'NOUN'),
 ('appointment', 348, 359, 'NOUN'),
 ('date', 360, 364, 'NOUN'),
 ('lack', 377, 381, 'NOUN'),
 ('follow', 385, 391, 'NOUN'),
 ('location', 402, 410, 'NOUN')]