In [15]:
import pandas as pd
import spacy
from spacy.displacy.render import EntityRenderer
from spacy.tokens import Doc

In [98]:
_html = {}

def render(docs, df, options={}, style='ent', page=False, minify=False, jupyter=True, manual=False, idx=0):
    if isinstance(docs, Doc) or isinstance(docs, dict):
        docs = [docs]
    renderer, converter = EntityRenderer, parse_custom_ents
    renderer = renderer(options=options)
    parsed = [converter(doc, df, idx) for doc in docs] if not manual else docs
    _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
    html = _html['parsed']
    if jupyter:  # return HTML rendered by IPython display()
        from IPython.core.display import display, HTML
        return display(HTML(html))
    return html

def parse_custom_ents(doc, df, idx):
    if df['entities'].bool:
        entities = df['entities'][idx]
        ents = [{'start': ent[0], 'end': ent[1], 'label': ent[2]} 
                for ent in entities.values()]
    else:
        ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
            for ent in doc.ents]
    title = None
    return {'text': doc.text, 'ents': ents, 'title': title}

## Step 1: Inspect data

In [47]:
entity_df = pd.read_csv('queries.csv', header=None)
entity_df.columns = ['queries']
pd.set_option('display.max_rows', 5)
display(entity_df)

Unnamed: 0,queries
0,i need to reschedule my appointment which was ...
1,this is the third time i made an appointment w...
...,...
4620,hola me podrÌ£åÁn cotizar
4621,can you install the resonator for me ?


In [6]:
nlp = spacy.load('en_core_web_lg')

In [95]:
def extract_named_entities(query):
    return {ent.text: [ent.start_char, ent.end_char, ent.label_] for ent in nlp(query).ents}
def show_entities(idx, style='ent', df=entity_df):
    options = {'color': {'PRON': '#2391D7', 'PROPN': '#2391D7', 'NOUN': '#2391D7'}}
    query = df['queries'][idx]
    render(nlp(query), df, style=style, options=options, jupyter=True, idx=idx)

In [49]:
mini_entity_df.get('entities')

0    {'i': [271, 272, 'PRON'], 'appointment': [348,...
1    {'time': [18, 22, 'NOUN'], 'i': [340, 341, 'PR...
2    {'i': [243, 244, 'PRON'], 'appointment': [268,...
3    {'pasadena': [7, 15, 'NOUN'], 'store': [16, 21...
4    {'it': [189, 191, 'PRON'], 'pm': [25, 27, 'NOU...
Name: entities, dtype: object

In [50]:
mini_entity_df = entity_df[:5]
query = mini_entity_df['queries'][0]
print(extract_named_entities(query))
show_entities(0, df=mini_entity_df)

{'11/24': [60, 65, 'CARDINAL'], '1267154747': [82, 92, 'DATE']}
{'text': "i need to reschedule my appointment which was originally on 11/24 . order number  1267154747 . i was there for my appointment  my tires were not . the location was supposed to call me when the tires arrived which they never did . i 'd like to reschedule my appointment . i 'd also like some type of compensation for not only wasting my time on the appointment date but for the lack of follow up by the location .", 'ents': [{'start': 60, 'end': 65, 'label': 'CARDINAL'}, {'start': 82, 'end': 92, 'label': 'DATE'}], 'title': None}


In [51]:
def add_named_entities(df=entity_df):
    df['named_entities'] = df['queries'].apply(extract_named_entities)
add_named_entities(mini_entity_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [52]:
display(mini_entity_df)

Unnamed: 0,queries,named_entities
0,i need to reschedule my appointment which was ...,"{'11/24': [60, 65, 'CARDINAL'], '1267154747': ..."
1,this is the third time i made an appointment w...,"{'third': [12, 17, 'ORDINAL'], 'ny': [70, 72, ..."
2,why if i make a appointment for 8 am do they t...,"{'8': [32, 33, 'DATE'], '30': [159, 161, 'QUAN..."
3,called pasadena store sunday am . asked how lo...,"{'sunday': [22, 28, 'DATE'], 'am': [29, 31, 'T..."
4,is it normal to have a 2 pm appointment for ti...,"{'2 pm': [23, 27, 'TIME'], '2 hours': [203, 21..."


## Step 2: Extract all nouns

In [20]:
??displacy

In [12]:
??EntityRenderer

In [53]:
def extract_nouns(query):
    keep_pos = ['PRON', 'PROPN', 'NOUN']
    return {tok.text: [tok.idx, tok.idx+len(tok.text), tok.pos_] for tok in nlp(query) if tok.pos_ in keep_pos}

In [54]:
def add_nouns(df=entity_df):
    df['nouns'] = df['queries'].apply(extract_nouns)
add_nouns(mini_entity_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [55]:
display(mini_entity_df)

Unnamed: 0,queries,named_entities,nouns
0,i need to reschedule my appointment which was ...,"{'11/24': [60, 65, 'CARDINAL'], '1267154747': ...","{'i': [271, 272, 'PRON'], 'appointment': [348,..."
1,this is the third time i made an appointment w...,"{'third': [12, 17, 'ORDINAL'], 'ny': [70, 72, ...","{'time': [18, 22, 'NOUN'], 'i': [340, 341, 'PR..."
2,why if i make a appointment for 8 am do they t...,"{'8': [32, 33, 'DATE'], '30': [159, 161, 'QUAN...","{'i': [243, 244, 'PRON'], 'appointment': [268,..."
3,called pasadena store sunday am . asked how lo...,"{'sunday': [22, 28, 'DATE'], 'am': [29, 31, 'T...","{'pasadena': [7, 15, 'NOUN'], 'store': [16, 21..."
4,is it normal to have a 2 pm appointment for ti...,"{'2 pm': [23, 27, 'TIME'], '2 hours': [203, 21...","{'it': [189, 191, 'PRON'], 'pm': [25, 27, 'NOU..."


In [56]:
def extract_entities(entity_series):
    entities = entity_series['nouns']
    for entity, metadata in entities.items():
        try:
            if entity_series['named_entities'][entity][0] == metadata[0]:
                entities[entity] = entity_series['named_entities'][entity]
        except KeyError:
            continue
    return entities

In [57]:
def add_entities(df=entity_df):
    df['entities'] = df.apply(extract_entities, axis=1)
add_entities(mini_entity_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [58]:
mini_entity_df

Unnamed: 0,queries,named_entities,nouns,entities
0,i need to reschedule my appointment which was ...,"{'11/24': [60, 65, 'CARDINAL'], '1267154747': ...","{'i': [271, 272, 'PRON'], 'appointment': [348,...","{'i': [271, 272, 'PRON'], 'appointment': [348,..."
1,this is the third time i made an appointment w...,"{'third': [12, 17, 'ORDINAL'], 'ny': [70, 72, ...","{'time': [18, 22, 'NOUN'], 'i': [340, 341, 'PR...","{'time': [18, 22, 'NOUN'], 'i': [340, 341, 'PR..."
2,why if i make a appointment for 8 am do they t...,"{'8': [32, 33, 'DATE'], '30': [159, 161, 'QUAN...","{'i': [243, 244, 'PRON'], 'appointment': [268,...","{'i': [243, 244, 'PRON'], 'appointment': [268,..."
3,called pasadena store sunday am . asked how lo...,"{'sunday': [22, 28, 'DATE'], 'am': [29, 31, 'T...","{'pasadena': [7, 15, 'NOUN'], 'store': [16, 21...","{'pasadena': [7, 15, 'NOUN'], 'store': [16, 21..."
4,is it normal to have a 2 pm appointment for ti...,"{'2 pm': [23, 27, 'TIME'], '2 hours': [203, 21...","{'it': [189, 191, 'PRON'], 'pm': [25, 27, 'NOU...","{'it': [189, 191, 'PRON'], 'pm': [25, 27, 'NOU..."


In [99]:
show_entities(1, df=mini_entity_df)

In [89]:
mini_entity_df['named_entities'][0]

{'11/24': [60, 65, 'CARDINAL'], '1267154747': [82, 92, 'DATE']}

In [91]:
mini_entity_df['entities'][0]

{'i': [271, 272, 'PRON'],
 'appointment': [348, 359, 'NOUN'],
 'order': [68, 73, 'NOUN'],
 'number': [74, 80, 'NOUN'],
 'tires': [193, 198, 'NOUN'],
 'location': [402, 410, 'NOUN'],
 'me': [181, 183, 'PRON'],
 'they': [213, 217, 'PRON'],
 'type': [291, 295, 'NOUN'],
 'compensation': [299, 311, 'NOUN'],
 'time': [336, 340, 'NOUN'],
 'date': [360, 364, 'NOUN'],
 'lack': [377, 381, 'NOUN'],
 'follow': [385, 391, 'NOUN']}