## Features
Non-destructive tokenization <br>
Named entity recognition <br>
Support for 49+ languages <br>
16 statistical models for 9 languages <br>
Pre-trained word vectors <br>
State-of-the-art speed <br>
Easy deep learning integration <br>
Part-of-speech tagging <br>
Labelled dependency parsing <br>
Syntax-driven sentence segmentation <br>
Built in visualizers for syntax and NER <br>
Convenient string-to-hash mapping <br>
Export to numpy data arrays <br>
Efficient binary serialization <br>
Easy model packaging and deployment <br>
Robust, rigorously evaluated accuracy <br>

In [1]:
%load_ext autoreload
%autoreload 2
#Reload all modules (except those excluded by %aimport) automatically now.

In [2]:
import pandas as pd
import spacy
from spacy.displacy.render import EntityRenderer
from IPython.core.display import display, HTML

In [3]:
def custom_render(doc, df, column, options={}, page=False, minify=False, idx=0):
    """Overload the spaCy built-in rendering to allow custom part-of-speech (POS) tags.
    
    Keyword arguments:
    doc -- a spaCy nlp doc object
    df -- a pandas dataframe object
    column -- the name of of a column of interest in the dataframe
    options -- various options to feed into the spaCy renderer, including colors
    page -- rendering markup as full HTML page (default False)
    minify -- for compact HTML (default False)
    idx -- index for specific query or doc in dataframe (default 0)
    
    """
    renderer, converter = EntityRenderer, parse_custom_ents
    renderer = renderer(options=options)
    parsed = [converter(doc, df=df, idx=idx, column=column)]
    html = renderer.render(parsed, page=page, minify=minify).strip()  
    return display(HTML(html))

def parse_custom_ents(doc, df, idx, column):
    """Parse custom entity types that aren't in the original spaCy module.
    
    Keyword arguments:
    doc -- a spaCy nlp doc object
    df -- a pandas dataframe object
    idx -- index for specific query or doc in dataframe
    column -- the name of of a column of interest in the dataframe
    
    """
    if column in df.columns:
        entities = df[column][idx]
        ents = [{'start': ent[1], 'end': ent[2], 'label': ent[3]} 
                for ent in entities]
    else:
        ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
            for ent in doc.ents]
    return {'text': doc.text, 'ents': ents, 'title': None}

def render_entities(idx, df, options={}, column='named_ents'):
    """A wrapper function to get text from a dataframe and render it visually in jupyter notebooks
    
    Keyword arguments:
    idx -- index for specific query or doc in dataframe (default 0)
    df -- a pandas dataframe object
    options -- various options to feed into the spaCy renderer, including colors
    column -- the name of of a column of interest in the dataframe (default 'named_ents')
    
    """
    text = df['event_summary'][idx]
    custom_render(nlp(text), df=df, column=column, options=options, idx=idx)

In [4]:
options = {
    'colors': {'COMPOUND': '#FE6BFE', 'PROPN': '#18CFE6', 'NOUN': '#18CFE6', 'NP': '#1EECA6', 'ENTITY': '#FF8800'}
}

In [5]:
pd.set_option('display.max_rows', 10) # edit how jupyter will render our pandas dataframes
pd.options.mode.chained_assignment = None # prevent warning about working on a copy of a dataframe

In [6]:
import csv
import sys, os, json, re, time
import subprocess

In [7]:
#file = './nips-2015-papers/Papers.csv'
#file = '/uci-news-aggregator.csv'
df = pd.read_csv('newsdataset1.csv')
df.event_summary=df.event_summary.astype(str)


In [8]:
df.describe()

Unnamed: 0,_id,date,category,event_title,event_summary,entities,person,external_link,Unnamed: 8,Unnamed: 9,...,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58
count,12817,12701,12636,12601,12817,12299,12042,11786,11181,9783,...,2,2,2,1,1,1,1,1,1,1
unique,12814,1869,853,3259,11220,10017,9611,8658,7958,6994,...,2,2,2,1,1,1,1,1,1,1
top,Jeff Weaver,"2016-04-22T00:00:00.000Z""""",Armed conflicts and attacks,FALSE,"_2016""]""",FALSE,FALSE,FALSE,FALSE,FALSE,...,/wiki/Holland,/wiki/Belgium,http://www.msn.com/en-us/news/politics/media-s...,/wiki/State_of_Palestine,/wiki/Turkey,"/wiki/Greece]""",False,[https://www.independent.co.uk/news/world/euro...,http://www.euronews.com/2016/03/20/at-least-14...,https://www.bloomberg.com/news/articles/2016-0...
freq,2,32,2309,5216,311,123,572,1452,1879,1798,...,1,1,1,1,1,1,1,1,1,1


In [9]:
display(df)

Unnamed: 0,_id,date,category,event_title,event_summary,entities,person,external_link,Unnamed: 8,Unnamed: 9,...,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58
0,"ObjectId(""5d056c5bb5f3a01b25e85fd2"")","2015-01-01T00:00:00.000Z""""",Armed conflicts and attacks,"[/wiki/Terrorism_in_Yemen""]""",The death toll of the suicide bombing in Ibb,Yemen,that occurred on December 31 rises to 49 with...,"[/wiki/2014_Ibb_bombing""",/wiki/Ibb,"/wiki/Yemen]""",...,,,,,,,,,,
1,"ObjectId(""5d056c5bb5f3a01b25e85fd3"")","2015-01-01T00:00:00.000Z""""",Armed conflicts and attacks,FALSE,A shooting kills one and injures six people in...,Calgary,Canada,with police making no arrests. (National Post),[Killarney,"Calgary""]""",...,,,,,,,,,,
2,"ObjectId(""5d056c5bb5f3a01b25e85fd4"")","2015-01-01T00:00:00.000Z""""",Business and economy,FALSE,Lithuania adopts the euro as its official curr...,becoming the 19th member of the Eurozone. (CNN),"[Lithuania and the euro""","Eurozone]""",FALSE,[http://money.cnn.com/2014/12/31/news/economy/...,...,,,,,,,,,,
3,"ObjectId(""5d056c5bb5f3a01b25e85fd5"")","2015-01-01T00:00:00.000Z""""",Business and economy,FALSE,U.S. fast-food restaurant chain Chick-fil-A wa...,000 customers in five states. (Daily Mail),"[Chick-fil-A""]""",FALSE,[http://www.dailymail.co.uk/news/article-28936...,,...,,,,,,,,,,
4,"ObjectId(""5d056c5bb5f3a01b25e85fd6"")","2015-01-01T00:00:00.000Z""""",International relations,FALSE,The Palestine Authority signs a treaty to join...,"[Palestine Authority""","International Criminal Court]""",FALSE,[https://www.wsj.com/articles/abbas-oks-palest...,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12812,"ObjectId(""5d056c7ab5f3a01b25e88dd1"")","2017-09-30T00:00:00.000Z""""",Disasters and accidents,"[/wiki/Human_stampede""]""",A crowd surge collapses a barrier during a soc...,France,injuring twenty-nine people,five of them seriously. (AFP via Yahoo! Sports),"[/wiki/Soccer""",/wiki/Stade_de_la_Licorne,...,,,,,,,,,,
12813,"ObjectId(""5d056c7ab5f3a01b25e88dd2"")","2017-09-30T00:00:00.000Z""""",International relations,"[/wiki/Foreign_relations_of_Thailand""",/wiki/Rohingya_persecution_in_Myanmar_(2016%E2...,Thailand's Ministry of Foreign Affairs says it...,"[/wiki/Thailand""",/wiki/Ministry_of_Foreign_Affairs_(Thailand),/wiki/Rakhine_State,/wiki/Myanmar,...,,,,,,,,,,
12814,"ObjectId(""5d056c7ab5f3a01b25e88dd3"")","2017-09-30T00:00:00.000Z""""",Politics and elections,"[/wiki/Abortion_in_the_Republic_of_Ireland""]""",Tens of thousands of protesters march through ...,and anti-abortion activists stage counter-dem...,"[/wiki/Dublin""",/wiki/Pro-choice,"/wiki/Irish_embassy_in_London]""",FALSE,...,,,,,,,,,,
12815,"ObjectId(""5d056c7ab5f3a01b25e88dd4"")","2017-09-30T00:00:00.000Z""""",Politics and elections,"[/wiki/Nazism_in_Sweden""]""",About 600 members of the Swedish neo-Nazi grou...,and 10,000 people hold a counter-demonstration. More ...,"[/wiki/Sweden""",/wiki/Neo-Nazi,/wiki/Nordic_Resistance_Movement,...,,,,,,,,,,


In [10]:
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

Unnamed: 0,_id,date,category,event_title,event_summary,entities,person,external_link,Unnamed: 8,Unnamed: 9,...,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58
6014,"ObjectId(""5d056c69b5f3a01b25e87549"")","2016-03-20T00:00:00.000Z""""",Disasters and accidents,"[/wiki/Erasmus_bus_crash""]""",Thirteen are killed and 34 others injured afte...,crashes near Freginals,Spain. The regional government of Catalonia s...,"""... according to the latest data",the ill-fated bus had students from Hungary,Germany,...,/wiki/Holland,/wiki/Belgium,/wiki/France,/wiki/State_of_Palestine,/wiki/Turkey,"/wiki/Greece]""",False,[https://www.independent.co.uk/news/world/euro...,http://www.euronews.com/2016/03/20/at-least-14...,https://www.bloomberg.com/news/articles/2016-0...


In [11]:
# mini_df = df[:10]
# mini_df.index = pd.RangeIndex(len(mini_df.index))

# # comment this out to run on full dataset
# df = mini_df

In [12]:
display(df)

Unnamed: 0,_id,date,category,event_title,event_summary,entities,person,external_link,Unnamed: 8,Unnamed: 9,...,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58
0,"ObjectId(""5d056c5bb5f3a01b25e85fd2"")","2015-01-01T00:00:00.000Z""""",Armed conflicts and attacks,"[/wiki/Terrorism_in_Yemen""]""",The death toll of the suicide bombing in Ibb,Yemen,that occurred on December 31 rises to 49 with...,"[/wiki/2014_Ibb_bombing""",/wiki/Ibb,"/wiki/Yemen]""",...,,,,,,,,,,
1,"ObjectId(""5d056c5bb5f3a01b25e85fd3"")","2015-01-01T00:00:00.000Z""""",Armed conflicts and attacks,FALSE,A shooting kills one and injures six people in...,Calgary,Canada,with police making no arrests. (National Post),[Killarney,"Calgary""]""",...,,,,,,,,,,
2,"ObjectId(""5d056c5bb5f3a01b25e85fd4"")","2015-01-01T00:00:00.000Z""""",Business and economy,FALSE,Lithuania adopts the euro as its official curr...,becoming the 19th member of the Eurozone. (CNN),"[Lithuania and the euro""","Eurozone]""",FALSE,[http://money.cnn.com/2014/12/31/news/economy/...,...,,,,,,,,,,
3,"ObjectId(""5d056c5bb5f3a01b25e85fd5"")","2015-01-01T00:00:00.000Z""""",Business and economy,FALSE,U.S. fast-food restaurant chain Chick-fil-A wa...,000 customers in five states. (Daily Mail),"[Chick-fil-A""]""",FALSE,[http://www.dailymail.co.uk/news/article-28936...,,...,,,,,,,,,,
4,"ObjectId(""5d056c5bb5f3a01b25e85fd6"")","2015-01-01T00:00:00.000Z""""",International relations,FALSE,The Palestine Authority signs a treaty to join...,"[Palestine Authority""","International Criminal Court]""",FALSE,[https://www.wsj.com/articles/abbas-oks-palest...,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12812,"ObjectId(""5d056c7ab5f3a01b25e88dd1"")","2017-09-30T00:00:00.000Z""""",Disasters and accidents,"[/wiki/Human_stampede""]""",A crowd surge collapses a barrier during a soc...,France,injuring twenty-nine people,five of them seriously. (AFP via Yahoo! Sports),"[/wiki/Soccer""",/wiki/Stade_de_la_Licorne,...,,,,,,,,,,
12813,"ObjectId(""5d056c7ab5f3a01b25e88dd2"")","2017-09-30T00:00:00.000Z""""",International relations,"[/wiki/Foreign_relations_of_Thailand""",/wiki/Rohingya_persecution_in_Myanmar_(2016%E2...,Thailand's Ministry of Foreign Affairs says it...,"[/wiki/Thailand""",/wiki/Ministry_of_Foreign_Affairs_(Thailand),/wiki/Rakhine_State,/wiki/Myanmar,...,,,,,,,,,,
12814,"ObjectId(""5d056c7ab5f3a01b25e88dd3"")","2017-09-30T00:00:00.000Z""""",Politics and elections,"[/wiki/Abortion_in_the_Republic_of_Ireland""]""",Tens of thousands of protesters march through ...,and anti-abortion activists stage counter-dem...,"[/wiki/Dublin""",/wiki/Pro-choice,"/wiki/Irish_embassy_in_London]""",FALSE,...,,,,,,,,,,
12815,"ObjectId(""5d056c7ab5f3a01b25e88dd4"")","2017-09-30T00:00:00.000Z""""",Politics and elections,"[/wiki/Nazism_in_Sweden""]""",About 600 members of the Swedish neo-Nazi grou...,and 10,000 people hold a counter-demonstration. More ...,"[/wiki/Sweden""",/wiki/Neo-Nazi,/wiki/Nordic_Resistance_Movement,...,,,,,,,,,,


In [13]:
lower = lambda x: x.lower() # make everything lowercase

In [14]:
df = pd.DataFrame(df['event_summary'].apply(lower))
df.columns = ['event_summary']
display(df)

Unnamed: 0,event_summary
0,the death toll of the suicide bombing in ibb
1,a shooting kills one and injures six people in...
2,lithuania adopts the euro as its official curr...
3,u.s. fast-food restaurant chain chick-fil-a wa...
4,the palestine authority signs a treaty to join...
...,...
12812,a crowd surge collapses a barrier during a soc...
12813,/wiki/rohingya_persecution_in_myanmar_(2016%e2...
12814,tens of thousands of protesters march through ...
12815,about 600 members of the swedish neo-nazi grou...


In [15]:
 import spacy

In [16]:
nlp = spacy.load("en_core_web_sm")

In [17]:
def extract_named_ents(text):
    """Extract named entities, and beginning, middle and end idx using spaCy's out-of-the-box model. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    return [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in nlp(text).ents]

def add_named_ents(df):
    """Create new column in data frame with named entity tuple extracted.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['named_ents'] = df['event_summary'].apply(extract_named_ents)   

In [18]:
add_named_ents(df)

In [19]:
df['event_summary'].count()

12817

In [20]:
df

Unnamed: 0,event_summary,named_ents
0,the death toll of the suicide bombing in ibb,[]
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN..."
2,lithuania adopts the euro as its official curr...,[]
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]"
4,the palestine authority signs a treaty to join...,[]
...,...,...
12812,a crowd surge collapses a barrier during a soc...,[]
12813,/wiki/rohingya_persecution_in_myanmar_(2016%e2...,[]
12814,tens of thousands of protesters march through ...,"[(tens of thousands, 0, 17, CARDINAL)]"
12815,about 600 members of the swedish neo-nazi grou...,"[(about 600, 0, 9, CARDINAL)]"


In [21]:
column = 'named_ents'
render_entities(9, df, options=options, column=column) # take a look at one of the abstracts

In [22]:
def extract_nouns(text):
    """Extract a few types of nouns, and beginning, middle and end idx using spaCy's POS (part of speech) tagger. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    keep_pos = ['PROPN', 'NOUN']
    return [(tok.text, tok.idx, tok.idx+len(tok.text), tok.pos_) for tok in nlp(text) if tok.pos_ in keep_pos]

def add_nouns(df):
    """Create new column in data frame with nouns extracted.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['nouns'] = df['event_summary'].apply(extract_nouns)

In [23]:
add_nouns(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU..."
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN..."
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,..."
...,...,...,...
12812,a crowd surge collapses a barrier during a soc...,[],"[(crowd, 2, 7, NOUN), (surge, 8, 13, NOUN), (b..."
12813,/wiki/rohingya_persecution_in_myanmar_(2016%e2...,[],"[(/wiki, 0, 5, PROPN)]"
12814,tens of thousands of protesters march through ...,"[(tens of thousands, 0, 17, CARDINAL)]","[(tens, 0, 4, NOUN), (thousands, 8, 17, NOUN),..."
12815,about 600 members of the swedish neo-nazi grou...,"[(about 600, 0, 9, CARDINAL)]","[(members, 10, 17, NOUN), (group, 42, 47, NOUN..."


In [24]:
column = 'nouns'
render_entities(0, df, options=options, column=column)

In [25]:
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities. 
    
    Keyword arguments:
    row_series -- a Pandas Series object
    
    """
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]: 
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            ents.add(noun_tuple)
    
    return sorted(list(ents), key=lambda x: x[1])

def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)

In [26]:
add_named_nouns(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,named_nouns
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU..."
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN..."
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,..."
...,...,...,...,...
12812,a crowd surge collapses a barrier during a soc...,[],"[(crowd, 2, 7, NOUN), (surge, 8, 13, NOUN), (b...","[(crowd, 2, 7, NOUN), (surge, 8, 13, NOUN), (b..."
12813,/wiki/rohingya_persecution_in_myanmar_(2016%e2...,[],"[(/wiki, 0, 5, PROPN)]","[(/wiki, 0, 5, PROPN)]"
12814,tens of thousands of protesters march through ...,"[(tens of thousands, 0, 17, CARDINAL)]","[(tens, 0, 4, NOUN), (thousands, 8, 17, NOUN),...","[(tens of thousands, 0, 17, CARDINAL), (thousa..."
12815,about 600 members of the swedish neo-nazi grou...,"[(about 600, 0, 9, CARDINAL)]","[(members, 10, 17, NOUN), (group, 42, 47, NOUN...","[(members, 10, 17, NOUN), (group, 42, 47, NOUN..."


In [27]:
column = 'named_nouns'
render_entities(1, df, options=options, column=column)

In [30]:
text = "Dr. Abraham is the primary author of this paper, and a physician in the specialty of internal medicine."

spacy.displacy.render(nlp(text), jupyter=True) # generating raw-markup using spacy's built-in renderer

In [31]:
def extract_noun_phrases(text):
    """Combine noun phrases. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    return [(chunk.text, chunk.start_char, chunk.end_char, chunk.label_) for chunk in nlp(text).noun_chunks]

def add_noun_phrases(df):
    """Create new column in data frame with noun phrases.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['noun_phrases'] = df['event_summary'].apply(extract_noun_phrases)

In [32]:
def visualize_noun_phrases(text):
    """Create a temporary dataframe to extract and visualize noun phrases. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    df = pd.DataFrame([text]) 
    df.columns = ['event_summary']
    add_noun_phrases(df)
    column = 'noun_phrases'
    render_entities(0, df, options=options, column=column)

In [33]:
visualize_noun_phrases(text)



In [34]:
add_noun_phrases(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,named_nouns,noun_phrases
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(the death toll, 0, 14, NP), (the suicide bom..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(six people, 33, 43, NP), (killarney, 47, 56,..."
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, NP), (the euro, 17, 25, NP)..."
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(fast-food restaurant chain, 5, 31, NP), (a s..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(the palestine authority, 0, 23, NP), (a trea..."
...,...,...,...,...,...
12812,a crowd surge collapses a barrier during a soc...,[],"[(crowd, 2, 7, NOUN), (surge, 8, 13, NOUN), (b...","[(crowd, 2, 7, NOUN), (surge, 8, 13, NOUN), (b...","[(a crowd surge, 0, 13, NP), (a barrier, 24, 3..."
12813,/wiki/rohingya_persecution_in_myanmar_(2016%e2...,[],"[(/wiki, 0, 5, PROPN)]","[(/wiki, 0, 5, PROPN)]","[(/wiki, 0, 5, NP)]"
12814,tens of thousands of protesters march through ...,"[(tens of thousands, 0, 17, CARDINAL)]","[(tens, 0, 4, NOUN), (thousands, 8, 17, NOUN),...","[(tens of thousands, 0, 17, CARDINAL), (thousa...","[(tens of thousands, 0, 17, NP), (protesters, ..."
12815,about 600 members of the swedish neo-nazi grou...,"[(about 600, 0, 9, CARDINAL)]","[(members, 10, 17, NOUN), (group, 42, 47, NOUN...","[(members, 10, 17, NOUN), (group, 42, 47, NOUN...","[(about 600 members, 0, 17, NP), (the swedish ..."


In [35]:
column = 'noun_phrases'
render_entities(0, df, options=options, column=column)

In [36]:
def extract_compounds(text):
    """Extract compound noun phrases with beginning and end idxs. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    comp_idx = 0
    compound = []
    compound_nps = []
    tok_idx = 0
    for idx, tok in enumerate(nlp(text)):
        if tok.dep_ == 'compound':

            # capture hyphenated compounds
            children = ''.join([c.text for c in tok.children])
            if '-' in children:
                compound.append(''.join([children, tok.text]))
            else:
                compound.append(tok.text)

            # remember starting index of first child in compound or word
            try:
                tok_idx = [c for c in tok.children][0].idx
            except IndexError:
                if len(compound) == 1:
                    tok_idx = tok.idx
            comp_idx = tok.i

        # append the last word in a compound phrase
        if tok.i - comp_idx == 1:
            compound.append(tok.text)
            if len(compound) > 1: 
                compound = ' '.join(compound)
                compound_nps.append((compound, tok_idx, tok_idx+len(compound), 'COMPOUND'))

            # reset parameters
            tok_idx = 0 
            compound = []

    return compound_nps

def add_compounds(df):
    """Create new column in data frame with compound noun phrases.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['compounds'] = df['event_summary'].apply(extract_compounds)

In [37]:
add_compounds(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,named_nouns,noun_phrases,compounds
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(the death toll, 0, 14, NP), (the suicide bom...","[(death toll, 4, 14, COMPOUND), (suicide bombi..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(six people, 33, 43, NP), (killarney, 47, 56,...","[(shooting kills, 2, 16, COMPOUND)]"
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, NP), (the euro, 17, 25, NP)...",[]
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(fast-food restaurant chain, 5, 31, NP), (a s...","[(restaurant chain, 15, 31, COMPOUND), (-chick..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(the palestine authority, 0, 23, NP), (a trea...","[(palestine authority, 4, 23, COMPOUND), (wall..."
...,...,...,...,...,...,...
12812,a crowd surge collapses a barrier during a soc...,[],"[(crowd, 2, 7, NOUN), (surge, 8, 13, NOUN), (b...","[(crowd, 2, 7, NOUN), (surge, 8, 13, NOUN), (b...","[(a crowd surge, 0, 13, NP), (a barrier, 24, 3...","[(crowd surge, 2, 13, COMPOUND), (soccer match..."
12813,/wiki/rohingya_persecution_in_myanmar_(2016%e2...,[],"[(/wiki, 0, 5, PROPN)]","[(/wiki, 0, 5, PROPN)]","[(/wiki, 0, 5, NP)]",[]
12814,tens of thousands of protesters march through ...,"[(tens of thousands, 0, 17, CARDINAL)]","[(tens, 0, 4, NOUN), (thousands, 8, 17, NOUN),...","[(tens of thousands, 0, 17, CARDINAL), (thousa...","[(tens of thousands, 0, 17, NP), (protesters, ...","[(abortion laws, 85, 98, COMPOUND)]"
12815,about 600 members of the swedish neo-nazi grou...,"[(about 600, 0, 9, CARDINAL)]","[(members, 10, 17, NOUN), (group, 42, 47, NOUN...","[(members, 10, 17, NOUN), (group, 42, 47, NOUN...","[(about 600 members, 0, 17, NP), (the swedish ...","[(resistance movement, 48, 67, COMPOUND)]"


In [38]:
column = 'compounds'
render_entities(0, df, options=options, column=column)

In [39]:
def extract_comp_nouns(row_series, cols=[]):
    """Combine compound noun phrases and entities. 
    
    Keyword arguments:
    row_series -- a Pandas Series object
    
    """
    return {noun_tuple[0] for col in cols for noun_tuple in row_series[col]}

def add_comp_nouns(df, cols=[]):
    """Create new column in data frame with merged entities.
    
    Keyword arguments:
    df -- a dataframe object
    cols -- a list of column names that need to be merged
    
    """
    df['comp_nouns'] = df.apply(extract_comp_nouns, axis=1, cols=cols)

In [40]:
cols = ['nouns', 'compounds']
add_comp_nouns(df, cols=cols)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(the death toll, 0, 14, NP), (the suicide bom...","[(death toll, 4, 14, COMPOUND), (suicide bombi...","{ibb, bombing, toll, suicide bombing, death to..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(six people, 33, 43, NP), (killarney, 47, 56,...","[(shooting kills, 2, 16, COMPOUND)]","{people, killarney, shooting kills, shooting}"
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, NP), (the euro, 17, 25, NP)...",[],"{euro, lithuania, currency}"
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(fast-food restaurant chain, 5, 31, NP), (a s...","[(restaurant chain, 15, 31, COMPOUND), (-chick...","{-chick -, breach, credit, card, details, secu..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(the palestine authority, 0, 23, NP), (a trea...","[(palestine authority, 4, 23, COMPOUND), (wall...","{authority, palestine, palestine authority, jo..."
...,...,...,...,...,...,...,...
12812,a crowd surge collapses a barrier during a soc...,[],"[(crowd, 2, 7, NOUN), (surge, 8, 13, NOUN), (b...","[(crowd, 2, 7, NOUN), (surge, 8, 13, NOUN), (b...","[(a crowd surge, 0, 13, NP), (a barrier, 24, 3...","[(crowd surge, 2, 13, COMPOUND), (soccer match...","{surge, barrier, amiens, licorne, match, crowd..."
12813,/wiki/rohingya_persecution_in_myanmar_(2016%e2...,[],"[(/wiki, 0, 5, PROPN)]","[(/wiki, 0, 5, PROPN)]","[(/wiki, 0, 5, NP)]",[],{/wiki}
12814,tens of thousands of protesters march through ...,"[(tens of thousands, 0, 17, CARDINAL)]","[(tens, 0, 4, NOUN), (thousands, 8, 17, NOUN),...","[(tens of thousands, 0, 17, CARDINAL), (thousa...","[(tens of thousands, 0, 17, NP), (protesters, ...","[(abortion laws, 85, 98, COMPOUND)]","{thousands, laws, tens, abortion, protesters, ..."
12815,about 600 members of the swedish neo-nazi grou...,"[(about 600, 0, 9, CARDINAL)]","[(members, 10, 17, NOUN), (group, 42, 47, NOUN...","[(members, 10, 17, NOUN), (group, 42, 47, NOUN...","[(about 600 members, 0, 17, NP), (the swedish ...","[(resistance movement, 48, 67, COMPOUND)]","{resistance movement, group, resistance, membe..."


In [41]:
# take a look at all the nouns again
column = 'named_nouns'
render_entities(0, df, options=options, column=column)

In [42]:
# take a look at all the compound noun phrases again
column = 'compounds'
render_entities(0, df, options=options, column=column)

In [43]:
# take a look at combined entities
df['comp_nouns'][0] 

{'bombing', 'death', 'death toll', 'ibb', 'suicide', 'suicide bombing', 'toll'}

In [44]:
df.head()


Unnamed: 0,event_summary,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(the death toll, 0, 14, NP), (the suicide bom...","[(death toll, 4, 14, COMPOUND), (suicide bombi...","{ibb, bombing, toll, suicide bombing, death to..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(six people, 33, 43, NP), (killarney, 47, 56,...","[(shooting kills, 2, 16, COMPOUND)]","{people, killarney, shooting kills, shooting}"
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, NP), (the euro, 17, 25, NP)...",[],"{euro, lithuania, currency}"
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(fast-food restaurant chain, 5, 31, NP), (a s...","[(restaurant chain, 15, 31, COMPOUND), (-chick...","{-chick -, breach, credit, card, details, secu..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(the palestine authority, 0, 23, NP), (a trea...","[(palestine authority, 4, 23, COMPOUND), (wall...","{authority, palestine, palestine authority, jo..."


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12817 entries, 0 to 12816
Data columns (total 7 columns):
event_summary    12817 non-null object
named_ents       12817 non-null object
nouns            12817 non-null object
named_nouns      12817 non-null object
noun_phrases     12817 non-null object
compounds        12817 non-null object
comp_nouns       12817 non-null object
dtypes: object(7)
memory usage: 701.0+ KB


In [46]:
df.comp_nouns.value_counts()

{}                                                                                          1027
{nan}                                                                                        234
{people}                                                                                      56
{/wiki}                                                                                       35
{football}                                                                                    31
                                                                                            ... 
{car, winds, pileup, people, 50-plus-car pileup, whiteout}                                     1
{provinces, forces, targets, houthi, houthi targets, coalition forces, yemen, coalition}       1
{lee jun -, jun, mv, lee, seok, mv sewol, sewol}                                               1
{others, neighborhoods, baghdad, people, attacks, series}                                      1
{arianna huffington, huffingto

In [47]:
df.compounds.value_counts()

[]                                                                                                                                               4766
[(death toll, 4, 14, COMPOUND)]                                                                                                                    36
[(pope francis, 0, 12, COMPOUND)]                                                                                                                  20
[(association football, 3, 23, COMPOUND)]                                                                                                          17
[(rugby union, 3, 14, COMPOUND)]                                                                                                                   14
                                                                                                                                                 ... 
[(border patrol agents, 5, 25, COMPOUND)]                                                           

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['event_summary'])

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, df['compounds'], test_size=0.3, random_state=1)

In [50]:
from sklearn.naive_bayes import MultinomialNB
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.