In [126]:
import pandas as pd
import spacy
from spacy import displacy
from tqdm import tqdm
tqdm.pandas()

## Step 1: Inspect data

In [236]:
entity_df = pd.read_csv('queries.csv', header=None)
entity_df.columns = ['queries']
pd.set_option('display.max_rows', 20)
display(entity_df)

Unnamed: 0,queries
0,i need to reschedule my appointment which was ...
1,this is the third time i made an appointment w...
2,why if i make a appointment for 8 am do they t...
3,called pasadena store sunday am . asked how lo...
4,is it normal to have a 2 pm appointment for ti...
5,i schedule a appointment on your website and w...
6,i think it is rude to send me an email saying ...
7,i had an appointment today for an oil change ....
8,had an appointment for 9 am brought my car in ...
9,made an appointment online yesterday morning f...


In [118]:
nlp = spacy.load('en_core_web_lg')

In [237]:
extract_spacy_entities = lambda query: [(ent.text, ent.label_) for ent in nlp(query).ents]
def show_spacy_entities(idx, style='ent', df=entity_df):
    query = df['queries'][idx]
    displacy.render(nlp(query), style=style, jupyter=True)

In [238]:
mini_entity_df = entity_df[:20]
query = mini_entity_df['queries'][0]
print(extract_entities(query))
show_spacy_entities(0)

[('11/24', 'CARDINAL'), ('1267154747', 'DATE')]


In [239]:
def add_spacy_entities(df=entity_df):
    df['spacy_entities'] = df['queries'].apply(extract_spacy_entities)
add_spacy_entities(mini_entity_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [242]:
display(mini_entity_df)

Unnamed: 0,queries,spacy_entities
0,i need to reschedule my appointment which was ...,"[(11/24, CARDINAL), (1267154747, DATE)]"
1,this is the third time i made an appointment w...,"[(third, ORDINAL), (ny, GPE), (an hour, TIME),..."
2,why if i make a appointment for 8 am do they t...,"[(8, DATE), (30, QUANTITY), (15 min, TIME), (e..."
3,called pasadena store sunday am . asked how lo...,"[(sunday, DATE), (am, TIME), (about 2 hours, T..."
4,is it normal to have a 2 pm appointment for ti...,"[(2 pm, TIME), (2 hours, TIME), (almost 2 hour..."
5,i schedule a appointment on your website and w...,"[(2 hours, TIME), (second, ORDINAL)]"
6,i think it is rude to send me an email saying ...,"[(30 min, TIME), (3 hours, TIME), (zero, CARDI..."
7,i had an appointment today for an oil change ....,"[(today, DATE), (two, CARDINAL)]"
8,had an appointment for 9 am brought my car in ...,"[(9 am, TIME), (over and hour, TIME), (a few m..."
9,made an appointment online yesterday morning f...,"[(yesterday, DATE), (morning, TIME), (7 pm tha..."


## Step 2: Extract all nouns

In [185]:
from spacy.displacy.render import DependencyRenderer, EntityRenderer
from spacy.displacy import parse_deps, parse_ents

In [166]:
??displacy

In [165]:
??EntityRenderer

In [162]:
??DependencyRenderer

In [202]:
??parse_deps

In [219]:
parse_ents(nlp(test_query))

{'text': 'Apple is looking at buying U.K. startup for $1 billion',
 'ents': [{'start': 0, 'end': 5, 'label': 'ORG'},
  {'start': 27, 'end': 31, 'label': 'GPE'},
  {'start': 44, 'end': 54, 'label': 'MONEY'}],
 'title': None}

In [216]:
def extract_custom_entities(query):
    keep_pos = ['PRON', 'PROPN', 'NOUN']
    return [(tok.text, tok.pos_) for tok in nlp(query) if tok.pos_ in keep_pos]  

In [243]:
def add_custom_entities(df=entity_df):
    df['custom_entities'] = df['queries'].apply(extract_custom_entities)
add_custom_entities(mini_entity_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [244]:
display(mini_entity_df)

Unnamed: 0,queries,spacy_entities,custom_entities
0,i need to reschedule my appointment which was ...,"[(11/24, CARDINAL), (1267154747, DATE)]","[(Apple, PROPN), (U.K., PROPN), (startup, NOUN)]"
1,this is the third time i made an appointment w...,"[(third, ORDINAL), (ny, GPE), (an hour, TIME),...","[(Apple, PROPN), (U.K., PROPN), (startup, NOUN)]"
2,why if i make a appointment for 8 am do they t...,"[(8, DATE), (30, QUANTITY), (15 min, TIME), (e...","[(Apple, PROPN), (U.K., PROPN), (startup, NOUN)]"
3,called pasadena store sunday am . asked how lo...,"[(sunday, DATE), (am, TIME), (about 2 hours, T...","[(Apple, PROPN), (U.K., PROPN), (startup, NOUN)]"
4,is it normal to have a 2 pm appointment for ti...,"[(2 pm, TIME), (2 hours, TIME), (almost 2 hour...","[(Apple, PROPN), (U.K., PROPN), (startup, NOUN)]"
5,i schedule a appointment on your website and w...,"[(2 hours, TIME), (second, ORDINAL)]","[(Apple, PROPN), (U.K., PROPN), (startup, NOUN)]"
6,i think it is rude to send me an email saying ...,"[(30 min, TIME), (3 hours, TIME), (zero, CARDI...","[(Apple, PROPN), (U.K., PROPN), (startup, NOUN)]"
7,i had an appointment today for an oil change ....,"[(today, DATE), (two, CARDINAL)]","[(Apple, PROPN), (U.K., PROPN), (startup, NOUN)]"
8,had an appointment for 9 am brought my car in ...,"[(9 am, TIME), (over and hour, TIME), (a few m...","[(Apple, PROPN), (U.K., PROPN), (startup, NOUN)]"
9,made an appointment online yesterday morning f...,"[(yesterday, DATE), (morning, TIME), (7 pm tha...","[(Apple, PROPN), (U.K., PROPN), (startup, NOUN)]"
