# Potentially important entities to extract from stories:
<br>important time period
<br>education background
<br>career related phrases
<br>living area 
<br>years and defining events
<br>personal characteristics 
<br>lost someone and relationship with the person
<br>faith
<br>family members

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import spacy
from spacy.displacy.render import EntityRenderer
from IPython.core.display import display, HTML

In [4]:
# Read the story as a list with one element per line (newlines are kept).
# NOTE(review): UTF-8 is assumed because the text contains typographic quotes
# that a platform-default codec can mis-decode -- confirm the file's encoding.
with open("Jackie_Boss.txt", "r", encoding="utf-8") as ins:
    array = list(ins)

In [5]:
array

['Fortitude, faith and family. That is Jackie Boss.\n',
 'Through trials and tribulations most of us couldn’t fathom, Jackie not only persevered, but thrived. Overcoming the early death of her mother and the temporary displacement of living quarters when she was a teenager, she nurtured a marriage that is 63 years young and raised five daughters.\n',
 'Jackie Boss (nee Griffioen) was born in September of 1936 to Henrietta and Arie Griffioen in Grand Rapids, Michigan. Her mother passed when she was only six years old, and she entered her teenage years living in a “guardian home” (what we would now call a foster home) for three years.\n',
 'In her late teens she worked at a local supermarket-meat department, where her father was a butcher; however, Jackie learned how to butcher chickens and turkeys. There, she met the man with whom she would end up spending the rest of her life. Clarence “Clare” Boss also worked in the meat section. He said he was terrified of her wielding that cleaver a

In [6]:
len(array)

16

In [7]:
# One row per line of the story, in a single column named 'text'. Naming the
# column at construction time also works when `array` is empty, whereas
# renaming the columns afterwards raises a length-mismatch error in that case.
df = pd.DataFrame({'text': array})
df

Unnamed: 0,text
0,"Fortitude, faith and family. That is Jackie Bo..."
1,Through trials and tribulations most of us cou...
2,Jackie Boss (nee Griffioen) was born in Septem...
3,In her late teens she worked at a local superm...
4,Clare was in the lumber business before he ret...
5,"Laurie, Cindi, Kathi and Amy are Jackie’s biol..."
6,"The foster system helped her, so Jackie wanted..."
7,Never more than seven but always with several ...
8,Not all of the kids were well behaved. Many of...
9,But Clare and Jackie were patient. “I couldn’t...


Convert all of the text to lowercase

In [8]:
def lower(x):
    """Return ``x`` converted to lowercase.

    Keyword arguments:
    x -- a string (any object with a ``lower()`` method)

    """
    return x.lower()

In [9]:
# Lowercase every row of the 'text' column, then rebuild the one-column frame.
lowered = df['text'].apply(lower)
df = pd.DataFrame(lowered)
df

Unnamed: 0,text
0,"fortitude, faith and family. that is jackie bo..."
1,through trials and tribulations most of us cou...
2,jackie boss (nee griffioen) was born in septem...
3,in her late teens she worked at a local superm...
4,clare was in the lumber business before he ret...
5,"laurie, cindi, kathi and amy are jackie’s biol..."
6,"the foster system helped her, so jackie wanted..."
7,never more than seven but always with several ...
8,not all of the kids were well behaved. many of...
9,but clare and jackie were patient. “i couldn’t...


Remove the trailing newline (\n) from each line of text

In [10]:
# Strip trailing whitespace (including the newline kept when the file was read)
# from every row in one vectorised call. Assigning the whole column replaces
# the chained-indexing write ``df['text'][i] = ...``, which pandas does not
# guarantee will modify ``df``.
df['text'] = df['text'].str.rstrip()

In [11]:
def extract_named_ents(text):
    """Return the named entities that the ``nlp`` pipeline finds in ``text``.

    Keyword arguments:
    text -- text source from which to extract entities

    Each entity is reported as a tuple of
    (entity text, start character offset, end character offset, label).

    """
    doc = nlp(text)
    return [
        (span.text, span.start_char, span.end_char, span.label_)
        for span in doc.ents
    ]

def add_named_ents(df):
    """Add a 'named_ents' column holding the entity tuples for each row.

    Keyword arguments:
    df -- a dataframe object with a 'text' column; modified in place

    """
    texts = df['text']
    df['named_ents'] = texts.apply(extract_named_ents)

In [12]:
# Original author's note: "import spacy won't work..." -- the English model is
# imported as its own package and loaded from there.
# NOTE(review): the failure itself is not shown here; confirm it still applies.
import en_core_web_sm
# Module-level pipeline that the extract_* helpers call.
nlp = en_core_web_sm.load()
# Adds the 'named_ents' column to df in place.
add_named_ents(df)
df

Unnamed: 0,text,named_ents
0,"fortitude, faith and family. that is jackie boss.",[]
1,through trials and tribulations most of us cou...,"[(63 years, 249, 257, DATE), (five, 275, 279, ..."
2,jackie boss (nee griffioen) was born in septem...,"[(september, 40, 49, DATE), (1936, 53, 57, DAT..."
3,in her late teens she worked at a local superm...,[]
4,clare was in the lumber business before he ret...,"[(almost 24 years ago, 51, 70, DATE), (five, 1..."
5,"laurie, cindi, kathi and amy are jackie’s biol...","[(dozens, 225, 231, CARDINAL), (the next 25 ye..."
6,"the foster system helped her, so jackie wanted...",[]
7,never more than seven but always with several ...,"[(more than seven, 6, 21, CARDINAL), (more tha..."
8,not all of the kids were well behaved. many of...,[]
9,but clare and jackie were patient. “i couldn’t...,"[(63 years, 347, 355, DATE)]"


In [13]:
def extract_nouns(text):
    """Return the tokens of ``text`` that are tagged as nouns or proper nouns.

    Keyword arguments:
    text -- text source from which to extract entities

    Each match is reported as a tuple of
    (token text, start character offset, end character offset, POS tag),
    where the end offset is the start offset plus the token length.

    """
    wanted_tags = ['PROPN', 'NOUN']
    doc = nlp(text)
    return [
        (token.text, token.idx, token.idx + len(token.text), token.pos_)
        for token in doc
        if token.pos_ in wanted_tags
    ]

def add_nouns(df):
    """Add a 'nouns' column holding the noun tuples for each row.

    Keyword arguments:
    df -- a dataframe object with a 'text' column; modified in place

    """
    texts = df['text']
    df['nouns'] = texts.apply(extract_nouns)

In [14]:
add_nouns(df)
df

Unnamed: 0,text,named_ents,nouns
0,"fortitude, faith and family. that is jackie boss.",[],"[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN..."
1,through trials and tribulations most of us cou...,"[(63 years, 249, 257, DATE), (five, 275, 279, ...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,..."
2,jackie boss (nee griffioen) was born in septem...,"[(september, 40, 49, DATE), (1936, 53, 57, DAT...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n..."
3,in her late teens she worked at a local superm...,[],"[(teens, 12, 17, NOUN), (supermarket, 40, 51, ..."
4,clare was in the lumber business before he ret...,"[(almost 24 years ago, 51, 70, DATE), (five, 1...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO..."
5,"laurie, cindi, kathi and amy are jackie’s biol...","[(dozens, 225, 231, CARDINAL), (the next 25 ye...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ..."
6,"the foster system helped her, so jackie wanted...",[],"[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN..."
7,never more than seven but always with several ...,"[(more than seven, 6, 21, CARDINAL), (more tha...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),..."
8,not all of the kids were well behaved. many of...,[],"[(kids, 15, 19, NOUN), (children, 51, 59, NOUN..."
9,but clare and jackie were patient. “i couldn’t...,"[(63 years, 347, 355, DATE)]","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)..."


In [15]:
def extract_named_nouns(row_series):
    """Merge a row's nouns with the named entities that start where they do.

    Keyword arguments:
    row_series -- a Pandas Series object with 'nouns' and 'named_ents' entries

    A noun is replaced by every named entity that shares its start offset;
    a noun with no such entity is kept as is. Named entities that do not
    start at any noun are left out. The result is de-duplicated and sorted
    by start offset.

    """
    named_ents = row_series['named_ents']
    merged = set()
    for noun in row_series['nouns']:
        start = noun[1]
        same_start = [ent for ent in named_ents if ent[1] == start]
        if same_start:
            # the entity tuple stands in for the plain noun tuple
            merged.update(same_start)
        else:
            merged.add(noun)

    return sorted(merged, key=lambda item: item[1])

def add_named_nouns(df):
    """Add a 'named_nouns' column combining each row's nouns and named ents.

    Keyword arguments:
    df -- a dataframe object with 'nouns' and 'named_ents' columns;
          modified in place

    """
    merged = df.apply(extract_named_nouns, axis=1)
    df['named_nouns'] = merged


In [16]:
add_named_nouns(df)
df

Unnamed: 0,text,named_ents,nouns,named_nouns
0,"fortitude, faith and family. that is jackie boss.",[],"[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN...","[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN..."
1,through trials and tribulations most of us cou...,"[(63 years, 249, 257, DATE), (five, 275, 279, ...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,..."
2,jackie boss (nee griffioen) was born in septem...,"[(september, 40, 49, DATE), (1936, 53, 57, DAT...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n..."
3,in her late teens she worked at a local superm...,[],"[(teens, 12, 17, NOUN), (supermarket, 40, 51, ...","[(teens, 12, 17, NOUN), (supermarket, 40, 51, ..."
4,clare was in the lumber business before he ret...,"[(almost 24 years ago, 51, 70, DATE), (five, 1...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO..."
5,"laurie, cindi, kathi and amy are jackie’s biol...","[(dozens, 225, 231, CARDINAL), (the next 25 ye...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ..."
6,"the foster system helped her, so jackie wanted...",[],"[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN...","[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN..."
7,never more than seven but always with several ...,"[(more than seven, 6, 21, CARDINAL), (more tha...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),..."
8,not all of the kids were well behaved. many of...,[],"[(kids, 15, 19, NOUN), (children, 51, 59, NOUN...","[(kids, 15, 19, NOUN), (children, 51, 59, NOUN..."
9,but clare and jackie were patient. “i couldn’t...,"[(63 years, 347, 355, DATE)]","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)...","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)..."


In [17]:
def extract_noun_phrases(text):
    """Return the noun chunks that the ``nlp`` pipeline finds in ``text``.

    Keyword arguments:
    text -- the actual text source from which to extract entities

    Each chunk is reported as a tuple of
    (chunk text, start character offset, end character offset, label).

    """
    doc = nlp(text)
    return [
        (np.text, np.start_char, np.end_char, np.label_)
        for np in doc.noun_chunks
    ]

def add_noun_phrases(df):
    """Add a 'noun_phrases' column holding the noun chunk tuples for each row.

    Keyword arguments:
    df -- a dataframe object with a 'text' column; modified in place

    """
    texts = df['text']
    df['noun_phrases'] = texts.apply(extract_noun_phrases)

In [18]:
add_noun_phrases(df)
df

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases
0,"fortitude, faith and family. that is jackie boss.",[],"[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN...","[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN...","[(fortitude, 0, 9, NP), (faith, 11, 16, NP), (..."
1,through trials and tribulations most of us cou...,"[(63 years, 249, 257, DATE), (five, 275, 279, ...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,...","[(trials, 8, 14, NP), (tribulations, 19, 31, N..."
2,jackie boss (nee griffioen) was born in septem...,"[(september, 40, 49, DATE), (1936, 53, 57, DAT...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n...","[(jackie boss, 0, 11, NP), (nee griffioen, 13,..."
3,in her late teens she worked at a local superm...,[],"[(teens, 12, 17, NOUN), (supermarket, 40, 51, ...","[(teens, 12, 17, NOUN), (supermarket, 40, 51, ...","[(her late teens, 3, 17, NP), (she, 18, 21, NP..."
4,clare was in the lumber business before he ret...,"[(almost 24 years ago, 51, 70, DATE), (five, 1...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO...","[(the lumber business, 13, 32, NP), (he, 40, 4..."
5,"laurie, cindi, kathi and amy are jackie’s biol...","[(dozens, 225, 231, CARDINAL), (the next 25 ye...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ...","[(laurie, 0, 6, NP), (kathi, 15, 20, NP), (amy..."
6,"the foster system helped her, so jackie wanted...",[],"[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN...","[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN...","[(the foster system, 0, 17, NP), (her, 25, 28,..."
7,never more than seven but always with several ...,"[(more than seven, 6, 21, CARDINAL), (more tha...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),...","[(several kids, 38, 50, NP), (jackie, 59, 65, ..."
8,not all of the kids were well behaved. many of...,[],"[(kids, 15, 19, NOUN), (children, 51, 59, NOUN...","[(kids, 15, 19, NOUN), (children, 51, 59, NOUN...","[(the kids, 11, 19, NP), (the children, 47, 59..."
9,but clare and jackie were patient. “i couldn’t...,"[(63 years, 347, 355, DATE)]","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)...","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)...","[(jackie, 14, 20, NP), (i, 36, 37, NP), (it, 5..."


In [19]:
def extract_compounds(text):
    """Extract compound noun phrases with start and end character offsets.

    Keyword arguments:
    text -- the actual text source from which to extract entities

    Runs the module-level ``nlp`` pipeline over ``text`` and returns a list of
    (phrase, start, end, 'COMPOUND') tuples. ``phrase`` is the collected token
    texts joined with single spaces, and ``end`` is ``start + len(phrase)``.

    NOTE(review): because ``end`` is derived from the joined string, it may
    not match the span in ``text`` when the tokens are not separated by
    exactly one space (e.g. around hyphens) -- confirm before slicing ``text``.

    """
    comp_idx = 0       # tok.i of the most recent token with dep_ == 'compound'
    compound = []      # token texts collected for the phrase being built
    compound_nps = []  # finished (phrase, start, end, 'COMPOUND') tuples
    tok_idx = 0        # character offset recorded as the start of the phrase
    # idx from enumerate is not used below; the loop reads tok.i instead
    for idx, tok in enumerate(nlp(text)):
        if tok.dep_ == 'compound':

            # capture hyphenated compounds: when the concatenated text of the
            # token's children contains '-', that text is prefixed to the token
            children = ''.join([c.text for c in tok.children])
            if '-' in children:
                compound.append(''.join([children, tok.text]))
            else:
                compound.append(tok.text)

            # remember starting index of first child in compound or word;
            # with no children, the token's own offset is used, but only when
            # it is the first part collected, so a longer chain keeps its start
            try:
                tok_idx = [c for c in tok.children][0].idx
            except IndexError:
                if len(compound) == 1:
                    tok_idx = tok.idx
            comp_idx = tok.i

        # append the last word in a compound phrase: the token right after the
        # most recent 'compound' token. Since comp_idx starts at 0, the token at
        # index 1 can also enter this branch when no compound has been seen;
        # then only one part is collected, nothing is recorded, and state resets.
        if tok.i - comp_idx == 1:
            compound.append(tok.text)
            if len(compound) > 1:
                compound = ' '.join(compound)
                # end offset is the start plus the length of the joined phrase
                compound_nps.append((compound, tok_idx, tok_idx+len(compound), 'COMPOUND'))

            # reset parameters
            tok_idx = 0
            compound = []

    return compound_nps

def add_compounds(df):
    """Add a 'compounds' column holding the compound tuples for each row.

    Keyword arguments:
    df -- a dataframe object with a 'text' column; modified in place

    """
    texts = df['text']
    df['compounds'] = texts.apply(extract_compounds)

In [20]:
add_compounds(df)
df

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases,compounds
0,"fortitude, faith and family. that is jackie boss.",[],"[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN...","[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN...","[(fortitude, 0, 9, NP), (faith, 11, 16, NP), (...","[(jackie boss, 37, 48, COMPOUND)]"
1,through trials and tribulations most of us cou...,"[(63 years, 249, 257, DATE), (five, 275, 279, ...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,...","[(trials, 8, 14, NP), (tribulations, 19, 31, N...","[(living quarters, 176, 191, COMPOUND)]"
2,jackie boss (nee griffioen) was born in septem...,"[(september, 40, 49, DATE), (1936, 53, 57, DAT...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n...","[(jackie boss, 0, 11, NP), (nee griffioen, 13,...","[(jackie boss, 0, 11, COMPOUND), (nee griffioe..."
3,in her late teens she worked at a local superm...,[],"[(teens, 12, 17, NOUN), (supermarket, 40, 51, ...","[(teens, 12, 17, NOUN), (supermarket, 40, 51, ...","[(her late teens, 3, 17, NP), (she, 18, 21, NP...","[(supermarket -, 40, 53, COMPOUND), (supermark..."
4,clare was in the lumber business before he ret...,"[(almost 24 years ago, 51, 70, DATE), (five, 1...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO...","[(the lumber business, 13, 32, NP), (he, 40, 4...","[(lumber business, 17, 32, COMPOUND)]"
5,"laurie, cindi, kathi and amy are jackie’s biol...","[(dozens, 225, 231, CARDINAL), (the next 25 ye...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ...","[(laurie, 0, 6, NP), (kathi, 15, 20, NP), (amy...",[]
6,"the foster system helped her, so jackie wanted...",[],"[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN...","[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN...","[(the foster system, 0, 17, NP), (her, 25, 28,...",[]
7,never more than seven but always with several ...,"[(more than seven, 6, 21, CARDINAL), (more tha...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),...","[(several kids, 38, 50, NP), (jackie, 59, 65, ...",[]
8,not all of the kids were well behaved. many of...,[],"[(kids, 15, 19, NOUN), (children, 51, 59, NOUN...","[(kids, 15, 19, NOUN), (children, 51, 59, NOUN...","[(the kids, 11, 19, NP), (the children, 47, 59...",[]
9,but clare and jackie were patient. “i couldn’t...,"[(63 years, 347, 355, DATE)]","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)...","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)...","[(jackie, 14, 20, NP), (i, 36, 37, NP), (it, 5...",[]


In [21]:
def extract_comp_nouns(row_series, cols=()):
    """Collect the distinct entity strings from several tuple-list columns.

    Keyword arguments:
    row_series -- a Pandas Series object (or any mapping) whose entries named
                  in ``cols`` are iterables of tuples
    cols -- names of the entries to merge; defaults to none, which yields an
            empty set (an immutable default avoids sharing one list object
            across calls)

    Returns a set containing the first element of every tuple found.

    """
    return {noun_tuple[0] for col in cols for noun_tuple in row_series[col]}

def add_comp_nouns(df, cols=()):
    """Create new column in data frame with merged entities.

    Keyword arguments:
    df -- a dataframe object; modified in place by adding 'comp_nouns'
    cols -- column names that need to be merged; defaults to none, giving an
            empty set per row (an immutable default avoids sharing one list
            object across calls)

    """
    df['comp_nouns'] = df.apply(extract_comp_nouns, axis=1, cols=cols)

In [22]:
cols = ['nouns', 'compounds']
add_comp_nouns(df, cols=cols)
df

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns
0,"fortitude, faith and family. that is jackie boss.",[],"[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN...","[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN...","[(fortitude, 0, 9, NP), (faith, 11, 16, NP), (...","[(jackie boss, 37, 48, COMPOUND)]","{family, fortitude, boss, jackie boss, faith}"
1,through trials and tribulations most of us cou...,"[(63 years, 249, 257, DATE), (five, 275, 279, ...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,...","[(trials, 8, 14, NP), (tribulations, 19, 31, N...","[(living quarters, 176, 191, COMPOUND)]","{living, living quarters, teenager, mother, ye..."
2,jackie boss (nee griffioen) was born in septem...,"[(september, 40, 49, DATE), (1936, 53, 57, DAT...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n...","[(jackie boss, 0, 11, NP), (nee griffioen, 13,...","[(jackie boss, 0, 11, COMPOUND), (nee griffioe...","{griffioen, rapids, nee griffioen, guardian ho..."
3,in her late teens she worked at a local superm...,[],"[(teens, 12, 17, NOUN), (supermarket, 40, 51, ...","[(teens, 12, 17, NOUN), (supermarket, 40, 51, ...","[(her late teens, 3, 17, NP), (she, 18, 21, NP...","[(supermarket -, 40, 53, COMPOUND), (supermark...","{meat, life, supermarket -, section, man, supe..."
4,clare was in the lumber business before he ret...,"[(almost 24 years ago, 51, 70, DATE), (five, 1...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO...","[(the lumber business, 13, 32, NP), (he, 40, 4...","[(lumber business, 17, 32, COMPOUND)]","{lumber business, children, years, lumber, mon..."
5,"laurie, cindi, kathi and amy are jackie’s biol...","[(dozens, 225, 231, CARDINAL), (the next 25 ye...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ...","[(laurie, 0, 6, NP), (kathi, 15, 20, NP), (amy...",[],"{lives, amy, people, children, years, laurie, ..."
6,"the foster system helped her, so jackie wanted...",[],"[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN...","[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN...","[(the foster system, 0, 17, NP), (her, 25, 28,...",[],"{future, life, system, jackie, bridge, parent}"
7,never more than seven but always with several ...,"[(more than seven, 6, 21, CARDINAL), (more tha...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),...","[(several kids, 38, 50, NP), (jackie, 59, 65, ...",[],"{children, years, kids, jackie, bridge, clare}"
8,not all of the kids were well behaved. many of...,[],"[(kids, 15, 19, NOUN), (children, 51, 59, NOUN...","[(kids, 15, 19, NOUN), (children, 51, 59, NOUN...","[(the kids, 11, 19, NP), (the children, 47, 59...",[],"{children, kids, behavior, problems}"
9,but clare and jackie were patient. “i couldn’t...,"[(63 years, 347, 355, DATE)]","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)...","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)...","[(jackie, 14, 20, NP), (i, 36, 37, NP), (it, 5...",[],"{tolerance, love, decisions, reason, years, po..."


In [23]:
def drop_duplicate_np_splits(ents):
    """Remove single words that also appear inside a multi-word entity.

    Keyword arguments:
    ents -- a set of entities

    Returns a new set; multi-word entities are always kept.

    """
    redundant = set()
    for phrase in ents:
        words = phrase.split(' ')
        if len(words) > 1:
            # any word of the phrase that is also its own entity is redundant
            redundant.update(word for word in words if word in ents)
    return ents - redundant

def drop_single_char_nps(ents):
    """Within an entity, drop single characters.

    Keyword arguments:
    ents -- a set of entities

    Each entity is split on spaces and its one-character words are removed.
    Entities left with no text at all are dropped rather than returned as
    empty strings.

    """
    cleaned = set()
    for ent in ents:
        kept = ' '.join(word for word in ent.split(' ') if len(word) != 1)
        if kept:
            cleaned.add(kept)
    return cleaned

def drop_double_char(ents):
    """Remove entities shorter than three characters.

    Keyword arguments:
    ents -- a set of entities

    Returns a new set with the short entities left out.

    """
    return ents.difference(ent for ent in ents if len(ent) < 3)

def keep_alpha(ents):
    """Keep only entities made of ASCII letters, hyphens, and spaces.

    Keyword arguments:
    ents -- a set of entities

    Returns a new set; any entity containing another character (digits,
    punctuation, non-ASCII letters) is left out.

    """
    allowed = set('-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')
    return ents.difference(ent for ent in ents if not allowed.issuperset(ent))

In [24]:
def add_clean_ents(df, funcs=()):
    """Create new column in data frame with cleaned entities.

    Keyword arguments:
    df -- a dataframe object with a 'comp_nouns' column; modified in place
    funcs -- heuristic functions to be applied to each row's entities, in
             order; defaults to none, which copies 'comp_nouns' unchanged
             (an immutable default avoids sharing one list object across
             calls)

    """
    col = 'clean_ents'
    df[col] = df['comp_nouns']
    # each heuristic receives the output of the previous one
    for f in funcs:
        df[col] = df[col].apply(f)

In [25]:
funcs = [drop_duplicate_np_splits, drop_double_char, keep_alpha, drop_single_char_nps]
add_clean_ents(df, funcs)
df

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns,clean_ents
0,"fortitude, faith and family. that is jackie boss.",[],"[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN...","[(fortitude, 0, 9, NOUN), (faith, 11, 16, NOUN...","[(fortitude, 0, 9, NP), (faith, 11, 16, NP), (...","[(jackie boss, 37, 48, COMPOUND)]","{family, fortitude, boss, jackie boss, faith}","{family, fortitude, jackie boss, faith}"
1,through trials and tribulations most of us cou...,"[(63 years, 249, 257, DATE), (five, 275, 279, ...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,...","[(trials, 8, 14, NOUN), (tribulations, 19, 31,...","[(trials, 8, 14, NP), (tribulations, 19, 31, N...","[(living quarters, 176, 191, COMPOUND)]","{living, living quarters, teenager, mother, ye...","{living quarters, teenager, mother, years, tri..."
2,jackie boss (nee griffioen) was born in septem...,"[(september, 40, 49, DATE), (1936, 53, 57, DAT...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n...","[(jackie, 0, 6, NOUN), (boss, 7, 11, NOUN), (n...","[(jackie boss, 0, 11, NP), (nee griffioen, 13,...","[(jackie boss, 0, 11, COMPOUND), (nee griffioe...","{griffioen, rapids, nee griffioen, guardian ho...","{rapids, nee griffioen, guardian home, mother,..."
3,in her late teens she worked at a local superm...,[],"[(teens, 12, 17, NOUN), (supermarket, 40, 51, ...","[(teens, 12, 17, NOUN), (supermarket, 40, 51, ...","[(her late teens, 3, 17, NP), (she, 18, 21, NP...","[(supermarket -, 40, 53, COMPOUND), (supermark...","{meat, life, supermarket -, section, man, supe...","{cleaver, life, man, clarence, teens, chickens..."
4,clare was in the lumber business before he ret...,"[(almost 24 years ago, 51, 70, DATE), (five, 1...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO...","[(lumber, 17, 23, NOUN), (business, 24, 32, NO...","[(the lumber business, 13, 32, NP), (he, 40, 4...","[(lumber business, 17, 32, COMPOUND)]","{lumber business, children, years, lumber, mon...","{lumber business, children, years, money, daug..."
5,"laurie, cindi, kathi and amy are jackie’s biol...","[(dozens, 225, 231, CARDINAL), (the next 25 ye...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ...","[(laurie, 0, 6, NOUN), (kathi, 15, 20, NOUN), ...","[(laurie, 0, 6, NP), (kathi, 15, 20, NP), (amy...",[],"{lives, amy, people, children, years, laurie, ...","{lives, amy, people, children, years, laurie, ..."
6,"the foster system helped her, so jackie wanted...",[],"[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN...","[(system, 11, 17, NOUN), (jackie, 33, 39, NOUN...","[(the foster system, 0, 17, NP), (her, 25, 28,...",[],"{future, life, system, jackie, bridge, parent}","{future, life, system, bridge, jackie, parent}"
7,never more than seven but always with several ...,"[(more than seven, 6, 21, CARDINAL), (more tha...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),...","[(kids, 46, 50, NOUN), (jackie, 59, 65, NOUN),...","[(several kids, 38, 50, NP), (jackie, 59, 65, ...",[],"{children, years, kids, jackie, bridge, clare}","{children, years, kids, jackie, bridge, clare}"
8,not all of the kids were well behaved. many of...,[],"[(kids, 15, 19, NOUN), (children, 51, 59, NOUN...","[(kids, 15, 19, NOUN), (children, 51, 59, NOUN...","[(the kids, 11, 19, NP), (the children, 47, 59...",[],"{children, kids, behavior, problems}","{children, kids, behavior, problems}"
9,but clare and jackie were patient. “i couldn’t...,"[(63 years, 347, 355, DATE)]","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)...","[(jackie, 14, 20, NOUN), (clare, 68, 73, NOUN)...","[(jackie, 14, 20, NP), (i, 36, 37, NP), (it, 5...",[],"{tolerance, love, decisions, reason, years, po...","{tolerance, love, decisions, reason, years, po..."


In [26]:
# Print each row's cleaned entity set. Iterating the column directly visits
# the rows in order without looking them up by integer label, so it also works
# when the frame's index is not 0..n-1.
for ents in df['clean_ents']:
    print(ents)

{'family', 'fortitude', 'jackie boss', 'faith'}
{'living quarters', 'teenager', 'mother', 'years', 'trials', 'jackie', 'displacement', 'daughters', 'marriage', 'tribulations', 'death'}
{'rapids', 'nee griffioen', 'guardian home', 'mother', 'years', 'september', 'jackie boss', 'michigan', 'what'}
{'cleaver', 'life', 'man', 'clarence', 'teens', 'chickens', 'meat section', 'turkeys', 'whom', 'rest', 'supermarket', 'jackie', 'survival', 'father', 'supermarket-meat department', 'butcher', 'route'}
{'lumber business', 'children', 'years', 'money', 'daughters', 'jackie'}
{'lives', 'amy', 'people', 'children', 'years', 'laurie', 'jackie', 'kathi', 'mission', 'rena', 'course', 'dozens'}
{'future', 'life', 'system', 'bridge', 'jackie', 'parent'}
{'children', 'years', 'kids', 'jackie', 'bridge', 'clare'}
{'children', 'kids', 'behavior', 'problems'}
{'tolerance', 'love', 'decisions', 'reason', 'years', 'police', 'kids', 'jackie', 'husband', 'clare', 'wink'}
{'jenison', 'members', 'characteristics'