In [126]:
import pandas as pd
import spacy
from spacy import displacy
from tqdm import tqdm
tqdm.pandas()

## Step 1: Inspect data

In [302]:
entity_df = pd.read_csv('queries.csv', header=None)
entity_df.columns = ['queries']
pd.set_option('display.max_rows', 5)
display(entity_df)

Unnamed: 0,queries
0,i need to reschedule my appointment which was ...
1,this is the third time i made an appointment w...
...,...
4620,hola me podrÌ£åÁn cotizar
4621,can you install the resonator for me ?


In [118]:
nlp = spacy.load('en_core_web_lg')

In [312]:
def extract_named_entities(query):
    return {ent.text: [ent.start_char, ent.end_char, ent.label_] for ent in nlp(query).ents}
def show_named_entities(idx, style='ent', df=entity_df):
    query = df['queries'][idx]
    displacy.render(nlp(query), style=style, jupyter=True)

In [313]:
mini_entity_df = entity_df[:5]
query = mini_entity_df['queries'][0]
print(extract_entities(query))
show_spacy_entities(0)

[('11/24', 'CARDINAL'), ('1267154747', 'DATE')]


In [314]:
def add_named_entities(df=entity_df):
    df['named_entities'] = df['queries'].apply(extract_named_entities)
add_named_entities(mini_entity_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [315]:
display(mini_entity_df)

Unnamed: 0,queries,named_entities
0,i need to reschedule my appointment which was ...,"{'11/24': [60, 65, 'CARDINAL'], '1267154747': ..."
1,this is the third time i made an appointment w...,"{'third': [12, 17, 'ORDINAL'], 'ny': [70, 72, ..."
2,why if i make a appointment for 8 am do they t...,"{'8': [32, 33, 'DATE'], '30': [159, 161, 'QUAN..."
3,called pasadena store sunday am . asked how lo...,"{'sunday': [22, 28, 'DATE'], 'am': [29, 31, 'T..."
4,is it normal to have a 2 pm appointment for ti...,"{'2 pm': [23, 27, 'TIME'], '2 hours': [203, 21..."


## Step 2: Extract all nouns

In [286]:
from spacy.displacy.render import DependencyRenderer, EntityRenderer
from spacy.displacy import parse_deps, parse_ents

In [287]:
??displacy

In [288]:
??EntityRenderer

In [289]:
??DependencyRenderer

In [202]:
??parse_deps

In [290]:
parse_ents(nlp(test_query))

{'text': 'Apple is looking at buying U.K. startup for $1 billion',
 'ents': [{'start': 0, 'end': 5, 'label': 'ORG'},
  {'start': 27, 'end': 31, 'label': 'GPE'},
  {'start': 44, 'end': 54, 'label': 'MONEY'}],
 'title': None}

In [316]:
def extract_nouns(query):
    keep_pos = ['PRON', 'PROPN', 'NOUN']
    return {tok.text: [tok.idx, tok.idx+len(tok.text), tok.pos_] for tok in nlp(query) if tok.pos_ in keep_pos}

In [317]:
def add_nouns(df=entity_df):
    df['nouns'] = df['queries'].apply(extract_nouns)
add_nouns(mini_entity_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [318]:
display(mini_entity_df)

Unnamed: 0,queries,named_entities,nouns
0,i need to reschedule my appointment which was ...,"{'11/24': [60, 65, 'CARDINAL'], '1267154747': ...","{'i': [271, 272, 'PRON'], 'appointment': [348,..."
1,this is the third time i made an appointment w...,"{'third': [12, 17, 'ORDINAL'], 'ny': [70, 72, ...","{'time': [18, 22, 'NOUN'], 'i': [340, 341, 'PR..."
2,why if i make a appointment for 8 am do they t...,"{'8': [32, 33, 'DATE'], '30': [159, 161, 'QUAN...","{'i': [243, 244, 'PRON'], 'appointment': [268,..."
3,called pasadena store sunday am . asked how lo...,"{'sunday': [22, 28, 'DATE'], 'am': [29, 31, 'T...","{'pasadena': [7, 15, 'NOUN'], 'store': [16, 21..."
4,is it normal to have a 2 pm appointment for ti...,"{'2 pm': [23, 27, 'TIME'], '2 hours': [203, 21...","{'it': [189, 191, 'PRON'], 'pm': [25, 27, 'NOU..."


In [328]:
mini_entity_df['named_entities'][1]

{'third': [12, 17, 'ORDINAL'],
 'ny': [70, 72, 'GPE'],
 'an hour': [261, 268, 'TIME'],
 'three': [362, 367, 'CARDINAL'],
 '1286793129': [399, 409, 'MONEY'],
 '12/29/17 11:00am': [410, 426, 'ORG']}

In [331]:
mini_entity_df['nouns'][1]

{'time': [18, 22, 'NOUN'],
 'i': [340, 341, 'PRON'],
 'appointment': [427, 438, 'NOUN'],
 'acme': [50, 54, 'NOUN'],
 'corp': [55, 59, 'NOUN'],
 'patchogue': [60, 69, 'NOUN'],
 'ny': [70, 72, 'NOUN'],
 'internet': [82, 90, 'NOUN'],
 'confirmation': [210, 222, 'NOUN'],
 'print': [186, 191, 'NOUN'],
 'email': [321, 326, 'NOUN'],
 'it': [223, 225, 'PRON'],
 'm': [247, 248, 'NOUN'],
 'hour': [264, 268, 'NOUN'],
 'home': [330, 334, 'NOUN'],
 'service': [352, 359, 'NOUN'],
 'strikes': [368, 375, 'NOUN'],
 'out': [385, 388, 'NOUN']}

In [332]:
entities = mini_entity_df['nouns'][1]
for entity, metadata in entities.items():
    try:
        if mini_entity_df['named_entities'][1][entity][0] == metadata[0]:
            entities[entity] = mini_entity_df['named_entities'][1][entity]
    except KeyError:
        continue

In [333]:
entities

{'time': [18, 22, 'NOUN'],
 'i': [340, 341, 'PRON'],
 'appointment': [427, 438, 'NOUN'],
 'acme': [50, 54, 'NOUN'],
 'corp': [55, 59, 'NOUN'],
 'patchogue': [60, 69, 'NOUN'],
 'ny': [70, 72, 'GPE'],
 'internet': [82, 90, 'NOUN'],
 'confirmation': [210, 222, 'NOUN'],
 'print': [186, 191, 'NOUN'],
 'email': [321, 326, 'NOUN'],
 'it': [223, 225, 'PRON'],
 'm': [247, 248, 'NOUN'],
 'hour': [264, 268, 'NOUN'],
 'home': [330, 334, 'NOUN'],
 'service': [352, 359, 'NOUN'],
 'strikes': [368, 375, 'NOUN'],
 'out': [385, 388, 'NOUN']}

In [338]:
def extract_entities(entity_series):
    entities = entity_series['nouns']
    for entity, metadata in entities.items():
        try:
            if entity_series['named_entities'][entity][0] == metadata[0]:
                entities[entity] = entity_series['named_entities'][entity]
        except KeyError:
            continue
    return entities

In [339]:
def add_entities(df=entity_df):
    df['entities'] = df.apply(extract_entities, axis=1)
add_entities(mini_entity_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [340]:
mini_entity_df

Unnamed: 0,queries,named_entities,nouns,entities
0,i need to reschedule my appointment which was ...,"{'11/24': [60, 65, 'CARDINAL'], '1267154747': ...","{'i': [271, 272, 'PRON'], 'appointment': [348,...","{'i': [271, 272, 'PRON'], 'appointment': [348,..."
1,this is the third time i made an appointment w...,"{'third': [12, 17, 'ORDINAL'], 'ny': [70, 72, ...","{'time': [18, 22, 'NOUN'], 'i': [340, 341, 'PR...","{'time': [18, 22, 'NOUN'], 'i': [340, 341, 'PR..."
2,why if i make a appointment for 8 am do they t...,"{'8': [32, 33, 'DATE'], '30': [159, 161, 'QUAN...","{'i': [243, 244, 'PRON'], 'appointment': [268,...","{'i': [243, 244, 'PRON'], 'appointment': [268,..."
3,called pasadena store sunday am . asked how lo...,"{'sunday': [22, 28, 'DATE'], 'am': [29, 31, 'T...","{'pasadena': [7, 15, 'NOUN'], 'store': [16, 21...","{'pasadena': [7, 15, 'NOUN'], 'store': [16, 21..."
4,is it normal to have a 2 pm appointment for ti...,"{'2 pm': [23, 27, 'TIME'], '2 hours': [203, 21...","{'it': [189, 191, 'PRON'], 'pm': [25, 27, 'NOU...","{'it': [189, 191, 'PRON'], 'pm': [25, 27, 'NOU..."


In [342]:
#TODO: add visual support to see entities