In [2]:
import pandas as pd
import numpy as np
import spacy
import itertools
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = stopwords.words('english')
nlp = spacy.load('en_core_web_lg')
import matplotlib.pyplot as plt
%matplotlib inline

#### Spent a couple of train rides hand-labeling the relevant topics for a summary on 200 DailyDialog conversations with > 10 turns.

In [7]:
df = pd.read_csv('../data/first_200_conversations_labelled.csv', encoding='latin-1',index_col=0)

Sans time series filtering, these are the algorithms employed for the SummaryBot NER.

In [32]:
interesting_entity_types = ['EVENT','PERSON','PRODUCT',
                            'ORG','TIME',# v. relevant to a summary
                            'FAC','GPE','LOC', # locations
                            'LANGUAGE','WORK_OF_ART' #others
                            ]
nonlocation_entities = ['EVENT','PERSON','PRODUCT','ORG','TIME','LANGUAGE','WORK_OF_ART']

def extract_entities(conv_string):
    """
    Given a string as input, perform Named Entity Recognition.
    Extract interesting entities to report on in a summary.
    
    NOTE: Capitalization is important! 
          For example, "Amazon" will be recognized as an organization,
          but "amazon" will not be.
    """
    conv_nlp = nlp(conv_string)
    dct = {}
    # I feel like there's a way to get rid of/vectorize this for loop
    for ent in conv_nlp.ents:
        if ent.text.strip() == '':
            continue
        if ent.label_ in interesting_entity_types:
            try:
                dct[ent.label_].append(ent.text)
            except KeyError:
                dct[ent.label_] = [ent.text]
    dct_ = {k:set(dct[k]) for k in dct.keys()}
    return dct_    

Run the pipe! Extract interesting entities.

In [33]:
df['entity_dict'] = df['dialog'].apply(lambda x: extract_entities(x))

Create the "payload"; a string passed to Slack to be printed for the user.

In [36]:
def create_firstTwoFromDict_string(dct,key):
    """
    Extract first two entries (if there are more than one)
    from dictionary value dct[key] and print prettily.
    """
    kstring = None
    if key in dct:
        if len(dct[key]) == 1:
            kstring = list(dct[key])[0]
        else:
            # take first two
            # XXX this could be improved
            kstring = '{0} and {1}'.format(*list(dct[key])[:2])
    return kstring

def create_locationFromEntityDict_string(dct):
    """
    Generate a string for locations, according to relevance
    priorities: GPE > FAC > LOC (country/city/state > named places > lakes etc.)
    """
    locstring = None
    if 'GPE' in dct:
        locstring = create_firstTwoFromDict_string(dct,'GPE')
    elif 'FAC' in dct:
        locstring = create_firstTwoFromDict_string(dct,'FAC')
    elif 'LOC' in dct:
        locstring = create_firstTwoFromDict_string(dct,'LOC')
    return locstring

def construct_entity_payload(entities_dict,conversant_string=''):
    payload_dict = {}
    for k in nonlocation_entities:
        payload_dict['{0}_string'.format(k)] = create_firstTwoFromDict_string(entities_dict,k)
    payload_dict['LOC_string'] = create_locationFromEntityDict_string(entities_dict)
    
    # situation: no entities were extracted
    # XXX TODO: LOCATE SUBJECT NOUNS IN SPACY SPAN
    if all(v==None for v in payload_dict.values()):
        payload = conversant_string+'... something, but I cannot detect what.'
    else:
        payload = conversant_string+':'
        if payload_dict['EVENT_string']:
            payload+='EVENT: {0}; '.format(payload_dict['EVENT_string'])
        if payload_dict['LOC_string']:
            payload+='LOCATION: {0}; '.format(payload_dict['LOC_string'])
        if payload_dict['PERSON_string']:
            payload+='PEOPLE: {0}; '.format(payload_dict['PERSON_string'])
        if payload_dict['ORG_string']:
            payload+='ORGANIZATION: {0}; '.format(payload_dict['ORG_string'])
        if payload_dict['LANGUAGE_string']:
            payload+='LANGUAGE: {0}; '.format(payload_dict['LANGUAGE_string'])
        if payload_dict['WORK_OF_ART_string']:
            payload+='WORKS: {0}; '.format(payload_dict['WORK_OF_ART_string'])
        if payload_dict['TIME_string']:
            payload+='AT TIME: {0}'.format(payload_dict['TIME_string'])
    return payload

In [None]:
df['entity_payload'] = df['entity_dict'].apply(lambda x: construct_entity_payload(x))

Let's see how the NER matches up to the hand labels.

In [43]:
df[['summary_topics','entity_payload',]].iloc[:2,:]

Unnamed: 0_level_0,summary_topics,entity_payload
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"Opera, Carmen",:PEOPLE: Laura and Carmen;
1,"Cambridge University, room",:PEOPLE: Smith and Mrs; ORGANIZATION: Cambridg...


In [62]:
df['summary_topics_list'] = df['summary_topics'].apply(lambda x: [w.lower().strip() for w in x.split(',')])
df['payload_topics_list'] = df['entity_payload'].apply(lambda x: [w.lower() for w in word_tokenize(x) if not w in stop_words and len(w)>3])
df['summary_payload_intersection_test'] = df[['summary_topics_list','payload_topics_list']].apply(lambda x: len(set(x[0]).intersection(set(x[1])))>=1,axis=1)

In [56]:
sumtest = 'Opera, Carmen'
paytest = ':PEOPLE: Laura and Carmen;'
sumwords = [w.lower().strip() for w in sumtest.split(',')]
paywords = [w.lower() for w in word_tokenize(paytest) if not w in stop_words and len(w)>3]
sumset = set(sumwords)
payset = set(paywords)

In [65]:
sum(df['summary_payload_intersection_test'])/200.

0.29