In [1]:
%load_ext autoreload
%autoreload 2
#Reload all modules (except those excluded by %aimport) automatically now.

In [2]:
import pandas as pd
import spacy
from spacy.displacy.render import EntityRenderer
from IPython.core.display import display, HTML
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [3]:
def custom_render(doc, df, column, options={}, page=False, minify=False, idx=0):
    """Overload the spaCy built-in rendering to allow custom part-of-speech (POS) tags.
    
    Keyword arguments:
    doc -- a spaCy nlp doc object
    df -- a pandas dataframe object
    column -- the name of of a column of interest in the dataframe
    options -- various options to feed into the spaCy renderer, including colors
    page -- rendering markup as full HTML page (default False)
    minify -- for compact HTML (default False)
    idx -- index for specific query or doc in dataframe (default 0)
    
    """
    renderer, converter = EntityRenderer, parse_custom_ents
    renderer = renderer(options=options)
    parsed = [converter(doc, df=df, idx=idx, column=column)]
    html = renderer.render(parsed, page=page, minify=minify).strip()  
    return display(HTML(html))

def parse_custom_ents(doc, df, idx, column):
    """Parse custom entity types that aren't in the original spaCy module.
    
    Keyword arguments:
    doc -- a spaCy nlp doc object
    df -- a pandas dataframe object
    idx -- index for specific query or doc in dataframe
    column -- the name of of a column of interest in the dataframe
    
    """
    if column in df.columns:
        entities = df[column][idx]
        ents = [{'start': ent[1], 'end': ent[2], 'label': ent[3]} 
                for ent in entities]
    else:
        ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
            for ent in doc.ents]
    return {'text': doc.text, 'ents': ents, 'title': None}

def render_entities(idx, df, options={}, column='named_ents'):
    """A wrapper function to get text from a dataframe and render it visually in jupyter notebooks
    
    Keyword arguments:
    idx -- index for specific query or doc in dataframe (default 0)
    df -- a pandas dataframe object
    options -- various options to feed into the spaCy renderer, including colors
    column -- the name of of a column of interest in the dataframe (default 'named_ents')
    
    """
    text = df['event_summary'][idx]
    custom_render(nlp(text), df=df, column=column, options=options, idx=idx)

In [4]:
options = {
    'colors': {'COMPOUND': '#FE6BFE', 'PROPN': '#18CFE6', 'NOUN': '#18CFE6', 'NP': '#1EECA6', 'ENTITY': '#FF8800'}
}

In [5]:
pd.set_option('display.max_rows', 10) # edit how jupyter will render our pandas dataframes
pd.options.mode.chained_assignment = None # prevent warning about working on a copy of a dataframe

In [6]:
import csv
import sys, os, json, re, time
import subprocess

In [214]:
#file = './nips-2015-papers/Papers.csv'
#file = '/uci-news-aggregator.csv'
df = pd.read_csv('newsdataset1.csv')
df.event_summary=df.event_summary.astype(str)


In [215]:
df['event_summary']=df['event_summary'].str.replace('(','').astype(str)
df['event_summary']=df['event_summary'].str.replace(')','').astype(str)
df['event_summary']=df['event_summary'].str.replace('[','').astype(str)
df['event_summary']=df['event_summary'].str.replace(']','').astype(str)
df['event_summary']=df['event_summary'].str.replace('\"','').astype(str)
df['event_summary']=df['event_summary'].str.replace('\'','').astype(str)
# df['event_summary']=df['event_summary'].str.replace('’','').astype(str)
# df['event_summary']=df['event_summary'].str.replace('.','').astype(str)
# df['event_summary']=df['event_summary'].str.replace('-','').astype(str)
df=df[~df.event_summary.str.contains("/wiki", na=False)]
indexes=df[df['event_summary'].map(len)  < 35].index
df=df.drop(indexes)
index2=df[df['event_summary']=='nan'].index
df=df.drop(index2)

In [216]:
df.describe()

Unnamed: 0.1,Unnamed: 0,index
count,9901.0,9901.0
mean,5188.58287,7291.413797
std,2996.15822,4155.409935
min,0.0,0.0
25%,2593.0,3717.0
50%,5171.0,7283.0
75%,7765.0,10884.0
max,10422.0,14427.0


In [217]:
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

Unnamed: 0.1,Unnamed: 0,index,date,category,event_title,event_summary,entities,person,external_link
0,0,0,2016-01-01,Armed conflicts and attacks,IsraeliE28093Palestinian conflict,A shooting takes place at a pub in Tel Aviv,Israel,leaving two dead and eight injured. The gunma...,an Arab taxi driver
1,1,2,2016-01-01,Disasters and Incidents,FALSE,About one thousand houses in Manilas Tondo dis...,"[Manila""","Tondo, Manila",Philippines
2,2,3,2016-01-01,International Relations,FALSE,The EU-Ukraine Free Trade deal officially come...,coinciding with a Russian food embargo on Ukr...,"[Deep and Comprehensive Free Trade Area""]""",FALSE
3,3,4,2016-01-01,Crime and Law,FALSE,The two-child policy takes effect in China,allowing couples in the country to have at mo...,replacing the controversial one-child policy....,five days prior to its effect. (AFP via Chann...
6,6,11,2016-01-03,Armed conflicts and attacks,Iraqi Civil War 2014E280932017,The Islamic State of Iraq and the Levant claim...,now known as Tikrit Air Academy),near the town of Tikrit,north of Baghdad
...,...,...,...,...,...,...,...,...,...
10418,10418,14421,2019-04-29,Disasters and Incidents,FALSE,A minibus carrying Turkish Süper Lig Alanyaspo...,Turkey,resulting in the death of Czech Republic nati...,while several other players are seriously inj...
10419,10419,14422,2019-04-29,Politics and Elections,FALSE,Indonesian President Joko Widodo has decided t...,on the northwest coast of Java island. The ne...,home to over 10 million people,is sinking at one of the fastest rates in the...
10420,10420,14423,2019-04-29,Politics and Elections,FALSE,U.S. Deputy Attorney General Rod Rosenstein a...,effective May 11. (The Washington Post),"[United States Deputy Attorney General""","Rod Rosenstein]"""
10421,10421,14424,2019-04-30,Arts and culture,2019 Japanese imperial transition,Emperor Akihito abdicates the Chrysanthemum Th...,Crown Prince Naruhito. He is the first Empero...,since Emperor Kōkaku in 1817. (The Japan Time...,"[/wiki/Emperor_of_Japan"""


In [218]:
mini_df = df[:10]
mini_df.index = pd.RangeIndex(len(mini_df.index))

# comment this out to run on full dataset
df = mini_df

In [219]:
lower = lambda x: x.lower() # make everything lowercase

In [220]:
df = pd.DataFrame(df['event_summary'].apply(lower))
df.columns = ['event_summary']
display(df)

Unnamed: 0,event_summary
0,a shooting takes place at a pub in tel aviv
1,about one thousand houses in manilas tondo dis...
2,the eu-ukraine free trade deal officially come...
3,the two-child policy takes effect in china
4,the islamic state of iraq and the levant claim...
5,fiji warns residents to prepare for the impact...
6,the united kingdom designates ascension island...
7,north korea may be preparing for an imminent t...
8,saudi arabia breaks off diplomatic relations w...
9,two suspects are killed in a clash with police


In [221]:
 import spacy

In [222]:
nlp = spacy.load("en_core_web_sm")

In [223]:
def extract_named_ents(text):
    """Extract named entities, and beginning, middle and end idx using spaCy's out-of-the-box model. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    return [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in nlp(text).ents]

def add_named_ents(df):
    """Create new column in data frame with named entity tuple extracted.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['named_ents'] = df['event_summary'].apply(extract_named_ents)   

In [224]:
add_named_ents(df)

In [225]:
df['event_summary'].count()

10

In [226]:
df

Unnamed: 0,event_summary,named_ents
0,a shooting takes place at a pub in tel aviv,[]
1,about one thousand houses in manilas tondo dis...,"[(about one thousand, 0, 18, CARDINAL), (years..."
2,the eu-ukraine free trade deal officially come...,[]
3,the two-child policy takes effect in china,"[(two, 4, 7, CARDINAL)]"
4,the islamic state of iraq and the levant claim...,[]
5,fiji warns residents to prepare for the impact...,"[(24, 94, 96, CARDINAL)]"
6,the united kingdom designates ascension island...,"[(just over half, 173, 187, CARDINAL)]"
7,north korea may be preparing for an imminent t...,[]
8,saudi arabia breaks off diplomatic relations w...,[]
9,two suspects are killed in a clash with police,"[(two, 0, 3, CARDINAL)]"


In [227]:
column = 'named_ents'
render_entities(9, df, options=options, column=column) # take a look at one of the abstracts

In [228]:
def extract_nouns(text):
    """Extract a few types of nouns, and beginning, middle and end idx using spaCy's POS (part of speech) tagger. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    keep_pos = ['PROPN', 'NOUN']
    return [(tok.text, tok.idx, tok.idx+len(tok.text), tok.pos_) for tok in nlp(text) if tok.pos_ in keep_pos]

def add_nouns(df):
    """Create new column in data frame with nouns extracted.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['nouns'] = df['event_summary'].apply(extract_nouns)

In [229]:
add_nouns(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns
0,a shooting takes place at a pub in tel aviv,[],"[(shooting, 2, 10, NOUN), (place, 17, 22, NOUN..."
1,about one thousand houses in manilas tondo dis...,"[(about one thousand, 0, 18, CARDINAL), (years...","[(houses, 19, 25, NOUN), (tondo, 37, 42, NOUN)..."
2,the eu-ukraine free trade deal officially come...,[],"[(eu, 4, 6, NOUN), (trade, 20, 25, NOUN), (dea..."
3,the two-child policy takes effect in china,"[(two, 4, 7, CARDINAL)]","[(child, 8, 13, NOUN), (policy, 14, 20, NOUN),..."
4,the islamic state of iraq and the levant claim...,[],"[(state, 12, 17, NOUN), (iraq, 21, 25, NOUN), ..."
5,fiji warns residents to prepare for the impact...,"[(24, 94, 96, CARDINAL)]","[(residents, 11, 20, NOUN), (impact, 40, 46, N..."
6,the united kingdom designates ascension island...,"[(just over half, 173, 187, CARDINAL)]","[(kingdom, 11, 18, NOUN), (ascension, 30, 39, ..."
7,north korea may be preparing for an imminent t...,[],"[(north, 0, 5, NOUN), (korea, 6, 11, NOUN), (t..."
8,saudi arabia breaks off diplomatic relations w...,[],"[(arabia, 6, 12, NOUN), (relations, 35, 44, NO..."
9,two suspects are killed in a clash with police,"[(two, 0, 3, CARDINAL)]","[(suspects, 4, 12, NOUN), (clash, 29, 34, NOUN..."


In [230]:
column = 'nouns'
render_entities(0, df, options=options, column=column)

In [231]:
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities. 
    
    Keyword arguments:
    row_series -- a Pandas Series object
    
    """
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]: 
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            ents.add(noun_tuple)
    
    return sorted(list(ents), key=lambda x: x[1])

def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)

In [232]:
add_named_nouns(df)
display(df)

TypeError: ("'spacy.tokens.span.Span' object is not callable", 'occurred at index 0')

In [None]:
column = 'named_nouns'
render_entities(1, df, options=options, column=column)

In [233]:
text = "Dr. Abraham is the primary author of this paper, and a physician in the specialty of internal medicine."

spacy.displacy.render(nlp(text), jupyter=True, options={'distance':120}) # generating raw-markup using spacy's built-in renderer

In [234]:
def extract_noun_phrases(text):
    """Combine noun phrases. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    return [(chunk.text, chunk.start_char, chunk.end_char, chunk.label_) for chunk in nlp(text).noun_chunks]

def add_noun_phrases(df):
    """Create new column in data frame with noun phrases.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['noun_phrases'] = df['event_summary'].apply(extract_noun_phrases)

In [235]:
def visualize_noun_phrases(text):
    """Create a temporary dataframe to extract and visualize noun phrases. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    df = pd.DataFrame([text]) 
    df.columns = ['event_summary']
    add_noun_phrases(df)
    column = 'noun_phrases'
    render_entities(0, df, options=options, column=column)

In [236]:
visualize_noun_phrases(text)



In [237]:
add_noun_phrases(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,noun_phrases
0,a shooting takes place at a pub in tel aviv,[],"[(shooting, 2, 10, NOUN), (place, 17, 22, NOUN...","[(a shooting, 0, 10, NP), (place, 17, 22, NP),..."
1,about one thousand houses in manilas tondo dis...,"[(about one thousand, 0, 18, CARDINAL), (years...","[(houses, 19, 25, NOUN), (tondo, 37, 42, NOUN)...","[(about one thousand houses, 0, 25, NP), (mani..."
2,the eu-ukraine free trade deal officially come...,[],"[(eu, 4, 6, NOUN), (trade, 20, 25, NOUN), (dea...","[(the eu-ukraine free trade deal, 0, 30, NP), ..."
3,the two-child policy takes effect in china,"[(two, 4, 7, CARDINAL)]","[(child, 8, 13, NOUN), (policy, 14, 20, NOUN),...","[(the two-child policy, 0, 20, NP), (effect, 2..."
4,the islamic state of iraq and the levant claim...,[],"[(state, 12, 17, NOUN), (iraq, 21, 25, NOUN), ...","[(the islamic state, 0, 17, NP), (iraq, 21, 25..."
5,fiji warns residents to prepare for the impact...,"[(24, 94, 96, CARDINAL)]","[(residents, 11, 20, NOUN), (impact, 40, 46, N...","[(residents, 11, 20, NP), (the impact, 36, 46,..."
6,the united kingdom designates ascension island...,"[(just over half, 173, 187, CARDINAL)]","[(kingdom, 11, 18, NOUN), (ascension, 30, 39, ...","[(the united kingdom, 0, 18, NP), (ascension i..."
7,north korea may be preparing for an imminent t...,[],"[(north, 0, 5, NOUN), (korea, 6, 11, NOUN), (t...","[(north korea, 0, 11, NP), (an imminent thermo..."
8,saudi arabia breaks off diplomatic relations w...,[],"[(arabia, 6, 12, NOUN), (relations, 35, 44, NO...","[(saudi arabia, 0, 12, NP), (diplomatic relati..."
9,two suspects are killed in a clash with police,"[(two, 0, 3, CARDINAL)]","[(suspects, 4, 12, NOUN), (clash, 29, 34, NOUN...","[(two suspects, 0, 12, NP), (a clash, 27, 34, ..."


In [238]:
column = 'noun_phrases'
render_entities(0, df, options=options, column=column)

In [239]:
def extract_compounds(text):
    """Extract compound noun phrases with beginning and end idxs. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    comp_idx = 0
    compound = []
    compound_nps = []
    tok_idx = 0
    for idx, tok in enumerate(nlp(text)):
        if tok.dep_ == 'compound':

            # capture hyphenated compounds
            children = ''.join([c.text for c in tok.children])
            if '-' in children:
                compound.append(''.join([children, tok.text]))
            else:
                compound.append(tok.text)

            # remember starting index of first child in compound or word
            try:
                tok_idx = [c for c in tok.children][0].idx
            except IndexError:
                if len(compound) == 1:
                    tok_idx = tok.idx
            comp_idx = tok.i

        # append the last word in a compound phrase
        if tok.i - comp_idx == 1:
            compound.append(tok.text)
            if len(compound) > 1: 
                compound = ' '.join(compound)
                compound_nps.append((compound, tok_idx, tok_idx+len(compound), 'COMPOUND'))

            # reset parameters
            tok_idx = 0 
            compound = []

    return compound_nps

def add_compounds(df):
    """Create new column in data frame with compound noun phrases.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['compounds'] = df['event_summary'].apply(extract_compounds)

In [240]:
add_compounds(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,noun_phrases,compounds
0,a shooting takes place at a pub in tel aviv,[],"[(shooting, 2, 10, NOUN), (place, 17, 22, NOUN...","[(a shooting, 0, 10, NP), (place, 17, 22, NP),...",[]
1,about one thousand houses in manilas tondo dis...,"[(about one thousand, 0, 18, CARDINAL), (years...","[(houses, 19, 25, NOUN), (tondo, 37, 42, NOUN)...","[(about one thousand houses, 0, 25, NP), (mani...","[(one thousand, 6, 18, COMPOUND), (manilas ton..."
2,the eu-ukraine free trade deal officially come...,[],"[(eu, 4, 6, NOUN), (trade, 20, 25, NOUN), (dea...","[(the eu-ukraine free trade deal, 0, 30, NP), ...","[(trade deal, 15, 25, COMPOUND)]"
3,the two-child policy takes effect in china,"[(two, 4, 7, CARDINAL)]","[(child, 8, 13, NOUN), (policy, 14, 20, NOUN),...","[(the two-child policy, 0, 20, NP), (effect, 2...","[(two-child policy, 4, 20, COMPOUND)]"
4,the islamic state of iraq and the levant claim...,[],"[(state, 12, 17, NOUN), (iraq, 21, 25, NOUN), ...","[(the islamic state, 0, 17, NP), (iraq, 21, 25...","[(claims responsibility, 41, 62, COMPOUND), (s..."
5,fiji warns residents to prepare for the impact...,"[(24, 94, 96, CARDINAL)]","[(residents, 11, 20, NOUN), (impact, 40, 46, N...","[(residents, 11, 20, NP), (the impact, 36, 46,...","[(cyclone ula, 57, 68, COMPOUND)]"
6,the united kingdom designates ascension island...,"[(just over half, 173, 187, CARDINAL)]","[(kingdom, 11, 18, NOUN), (ascension, 30, 39, ...","[(the united kingdom, 0, 18, NP), (ascension i...","[(ascension island, 30, 46, COMPOUND)]"
7,north korea may be preparing for an imminent t...,[],"[(north, 0, 5, NOUN), (korea, 6, 11, NOUN), (t...","[(north korea, 0, 11, NP), (an imminent thermo...","[(north korea, 0, 11, COMPOUND), (weapon test,..."
8,saudi arabia breaks off diplomatic relations w...,[],"[(arabia, 6, 12, NOUN), (relations, 35, 44, NO...","[(saudi arabia, 0, 12, NP), (diplomatic relati...","[(sheikh nimrs execution, 61, 83, COMPOUND)]"
9,two suspects are killed in a clash with police,"[(two, 0, 3, CARDINAL)]","[(suspects, 4, 12, NOUN), (clash, 29, 34, NOUN...","[(two suspects, 0, 12, NP), (a clash, 27, 34, ...",[]


In [241]:
column = 'compounds'
render_entities(0, df, options=options, column=column)

In [242]:
def extract_comp_nouns(row_series, cols=[]):
    """Combine compound noun phrases and entities. 
    
    Keyword arguments:
    row_series -- a Pandas Series object
    
    """
    return {noun_tuple[0] for col in cols for noun_tuple in row_series[col]}

def add_comp_nouns(df, cols=[]):
    """Create new column in data frame with merged entities.
    
    Keyword arguments:
    df -- a dataframe object
    cols -- a list of column names that need to be merged
    
    """
    df['comp_nouns'] = df.apply(extract_comp_nouns, axis=1, cols=cols)

In [243]:
cols = ['nouns', 'compounds']
add_comp_nouns(df, cols=cols)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,noun_phrases,compounds,comp_nouns
0,a shooting takes place at a pub in tel aviv,[],"[(shooting, 2, 10, NOUN), (place, 17, 22, NOUN...","[(a shooting, 0, 10, NP), (place, 17, 22, NP),...",[],"{pub, tel, place, shooting}"
1,about one thousand houses in manilas tondo dis...,"[(about one thousand, 0, 18, CARDINAL), (years...","[(houses, 19, 25, NOUN), (tondo, 37, 42, NOUN)...","[(about one thousand houses, 0, 25, NP), (mani...","[(one thousand, 6, 18, COMPOUND), (manilas ton...","{manilas tondo district, years, tondo, eve, fe..."
2,the eu-ukraine free trade deal officially come...,[],"[(eu, 4, 6, NOUN), (trade, 20, 25, NOUN), (dea...","[(the eu-ukraine free trade deal, 0, 30, NP), ...","[(trade deal, 15, 25, COMPOUND)]","{trade, force, trade deal, eu, deal}"
3,the two-child policy takes effect in china,"[(two, 4, 7, CARDINAL)]","[(child, 8, 13, NOUN), (policy, 14, 20, NOUN),...","[(the two-child policy, 0, 20, NP), (effect, 2...","[(two-child policy, 4, 20, COMPOUND)]","{child, effect, china, policy, two-child policy}"
4,the islamic state of iraq and the levant claim...,[],"[(state, 12, 17, NOUN), (iraq, 21, 25, NOUN), ...","[(the islamic state, 0, 17, NP), (iraq, 21, 25...","[(claims responsibility, 41, 62, COMPOUND), (s...","{base, speicher, iraq, state, levant, attack, ..."
5,fiji warns residents to prepare for the impact...,"[(24, 94, 96, CARDINAL)]","[(residents, 11, 20, NOUN), (impact, 40, 46, N...","[(residents, 11, 20, NP), (the impact, 36, 46,...","[(cyclone ula, 57, 68, COMPOUND)]","{ula, cyclone, residents, impact, cyclone ula,..."
6,the united kingdom designates ascension island...,"[(just over half, 173, 187, CARDINAL)]","[(kingdom, 11, 18, NOUN), (ascension, 30, 39, ...","[(the united kingdom, 0, 18, NP), (ascension i...","[(ascension island, 30, 46, COMPOUND)]","{kingdom, island, ascension, half, area, water..."
7,north korea may be preparing for an imminent t...,[],"[(north, 0, 5, NOUN), (korea, 6, 11, NOUN), (t...","[(north korea, 0, 11, NP), (an imminent thermo...","[(north korea, 0, 11, COMPOUND), (weapon test,...","{site, weapon test, test, weapon, north korea,..."
8,saudi arabia breaks off diplomatic relations w...,[],"[(arabia, 6, 12, NOUN), (relations, 35, 44, NO...","[(saudi arabia, 0, 12, NP), (diplomatic relati...","[(sheikh nimrs execution, 61, 83, COMPOUND)]","{sheikh nimrs execution, arabia, sheikh, attac..."
9,two suspects are killed in a clash with police,"[(two, 0, 3, CARDINAL)]","[(suspects, 4, 12, NOUN), (clash, 29, 34, NOUN...","[(two suspects, 0, 12, NP), (a clash, 27, 34, ...",[],"{police, suspects, clash}"


In [244]:
# take a look at all the nouns again
column = 'named_nouns'
render_entities(0, df, options=options, column=column)

In [245]:
# take a look at all the compound noun phrases again
column = 'compounds'
render_entities(0, df, options=options, column=column)

In [246]:
# take a look at combined entities
df['comp_nouns'][0] 

{'place', 'pub', 'shooting', 'tel'}

In [247]:
df.head()


Unnamed: 0,event_summary,named_ents,nouns,noun_phrases,compounds,comp_nouns
0,a shooting takes place at a pub in tel aviv,[],"[(shooting, 2, 10, NOUN), (place, 17, 22, NOUN...","[(a shooting, 0, 10, NP), (place, 17, 22, NP),...",[],"{pub, tel, place, shooting}"
1,about one thousand houses in manilas tondo dis...,"[(about one thousand, 0, 18, CARDINAL), (years...","[(houses, 19, 25, NOUN), (tondo, 37, 42, NOUN)...","[(about one thousand houses, 0, 25, NP), (mani...","[(one thousand, 6, 18, COMPOUND), (manilas ton...","{manilas tondo district, years, tondo, eve, fe..."
2,the eu-ukraine free trade deal officially come...,[],"[(eu, 4, 6, NOUN), (trade, 20, 25, NOUN), (dea...","[(the eu-ukraine free trade deal, 0, 30, NP), ...","[(trade deal, 15, 25, COMPOUND)]","{trade, force, trade deal, eu, deal}"
3,the two-child policy takes effect in china,"[(two, 4, 7, CARDINAL)]","[(child, 8, 13, NOUN), (policy, 14, 20, NOUN),...","[(the two-child policy, 0, 20, NP), (effect, 2...","[(two-child policy, 4, 20, COMPOUND)]","{child, effect, china, policy, two-child policy}"
4,the islamic state of iraq and the levant claim...,[],"[(state, 12, 17, NOUN), (iraq, 21, 25, NOUN), ...","[(the islamic state, 0, 17, NP), (iraq, 21, 25...","[(claims responsibility, 41, 62, COMPOUND), (s...","{base, speicher, iraq, state, levant, attack, ..."


In [248]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
event_summary    10 non-null object
named_ents       10 non-null object
nouns            10 non-null object
noun_phrases     10 non-null object
compounds        10 non-null object
comp_nouns       10 non-null object
dtypes: object(6)
memory usage: 608.0+ bytes


In [249]:
df.comp_nouns.value_counts()

{ula, cyclone, residents, impact, cyclone ula, france}                                                                                                                 1
{police, suspects, clash}                                                                                                                                              1
{manilas tondo district, years, tondo, eve, festivities, others, years eve firecracker festivities, houses, news, firecracker, one thousand, district, philippines}    1
{pub, tel, place, shooting}                                                                                                                                            1
{kingdom, island, ascension, half, area, waters, reserve, bbc, fishing, ocean, ascension island}                                                                       1
{sheikh nimrs execution, arabia, sheikh, attack, relations, iran, execution, embassy, tehran, nimrs, bbc}                                                  

In [250]:
df.compounds.value_counts()

[]                                                                                                                                       2
[(cyclone ula, 57, 68, COMPOUND)]                                                                                                        1
[(one thousand, 6, 18, COMPOUND), (manilas tondo district, 29, 51, COMPOUND), (years eve firecracker festivities, 96, 129, COMPOUND)]    1
[(ascension island, 30, 46, COMPOUND)]                                                                                                   1
[(sheikh nimrs execution, 61, 83, COMPOUND)]                                                                                             1
[(claims responsibility, 41, 62, COMPOUND), (suicide bomb attack, 69, 88, COMPOUND), (army base camp speicher, 94, 117, COMPOUND)]       1
[(two-child policy, 4, 20, COMPOUND)]                                                                                                    1
[(trade deal, 15, 25, COMPO

In [251]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['event_summary'])

In [252]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, df['compounds'], test_size=0.3, random_state=1)

In [253]:
# from __future__ import unicode_literals
# import spacy,en_core_web_sm
# import textacy
# nlp = en_core_web_sm.load()
# sentence = 'The author is writing a new book.'
# pattern = r'<VERB>?<ADV>*<VERB>+'
# doc = textacy.Doc(sentence, lang='en_core_web_sm')
# lists = textacy.extract.pos_regex_matches(doc, pattern)
# for list in lists:
#     print(list.text)

In [254]:

text=df['event_summary']
text1 = str(text)
doc = nlp(text1)

In [255]:
#df1=df.columns = ['event_summary']
for i, token in enumerate(doc):
    df.loc[i, 'text'] = token.text
    df.loc[i, 'pos'] = token.pos_
    df.loc[i, 'dep'] = token.dep_
#     df.loc[i, 'ner'] = token.label_

In [256]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [257]:
df

Unnamed: 0,event_summary,named_ents,nouns,noun_phrases,compounds,comp_nouns,text,pos,dep
0,a shooting takes place at a pub in tel aviv,[],"[(shooting, 2, 10, NOUN), (place, 17, 22, NOUN...","[(a shooting, 0, 10, NP), (place, 17, 22, NP),...",[],"{pub, tel, place, shooting}",0,PUNCT,meta
1,about one thousand houses in manilas tondo dis...,"[(about one thousand, 0, 18, CARDINAL), (years...","[(houses, 19, 25, NOUN), (tondo, 37, 42, NOUN)...","[(about one thousand houses, 0, 25, NP), (mani...","[(one thousand, 6, 18, COMPOUND), (manilas ton...","{manilas tondo district, years, tondo, eve, fe...",,SPACE,
2,the eu-ukraine free trade deal officially come...,[],"[(eu, 4, 6, NOUN), (trade, 20, 25, NOUN), (dea...","[(the eu-ukraine free trade deal, 0, 30, NP), ...","[(trade deal, 15, 25, COMPOUND)]","{trade, force, trade deal, eu, deal}",a,DET,det
3,the two-child policy takes effect in china,"[(two, 4, 7, CARDINAL)]","[(child, 8, 13, NOUN), (policy, 14, 20, NOUN),...","[(the two-child policy, 0, 20, NP), (effect, 2...","[(two-child policy, 4, 20, COMPOUND)]","{child, effect, china, policy, two-child policy}",shooting,NOUN,nsubj
4,the islamic state of iraq and the levant claim...,[],"[(state, 12, 17, NOUN), (iraq, 21, 25, NOUN), ...","[(the islamic state, 0, 17, NP), (iraq, 21, 25...","[(claims responsibility, 41, 62, COMPOUND), (s...","{base, speicher, iraq, state, levant, attack, ...",takes,VERB,ROOT
...,...,...,...,...,...,...,...,...,...
123,,,,,,,event_summary,ADJ,npadvmod
124,,,,,,,",",PUNCT,punct
125,,,,,,,dtype,NOUN,ROOT
126,,,,,,,:,PUNCT,punct


In [297]:
df1= pd.DataFrame()


In [300]:
import pycountry
# j=0
for i in range(0,10):
    text = df.loc[i,"event_summary"]
    df1.loc[i,'event']=text.title()
    print( df1.loc[i,'event'])
    
    

A Shooting Takes Place At A Pub In Tel Aviv
About One Thousand Houses In Manilas Tondo District In The Philippines Are Set Ablaze Following New Years Eve Firecracker Festivities That Left One Dead And 380 Others Injured. Ap Via Ctv News
The Eu-Ukraine Free Trade Deal Officially Comes Into Force
The Two-Child Policy Takes Effect In China
The Islamic State Of Iraq And The Levant Claims Responsibility For A Suicide Bomb Attack On A Iraqi Army Base Camp Speicher
Fiji Warns Residents To Prepare For The Impact Of Severe Tropical Cyclone Ula. Afp Via France 24
The United Kingdom Designates Ascension Island And Its Surrounding Waters In The Atlantic Ocean As A Marine Protected Area. The Reserve Will Be Almost As Big As The Uk With Just Over Half Of The Protected Area Completely Closed To Fishing. Bbc
North Korea May Be Preparing For An Imminent Thermonuclear Weapon Test At Its Punggye-Ri Nuclear Test Site
Saudi Arabia Breaks Off Diplomatic Relations With Iran After Sheikh Nimrs Execution And A

In [294]:
df1

Unnamed: 0,text
a,A
shooting,Shooting
takes,Takes
place,Place
at,At
...,...
two,Two
suspects,Suspects
killed,Killed
clash,Clash


Different types of Named Entities PERSON People, including fictional. NORP Nationalities or religious or political groups. FAC Buildings, airports, highways, bridges, etc. ORG Companies, agencies, institutions, etc. GPE Countries, cities, states. LOC Non-GPE locations, mountain ranges, bodies of water. PRODUCT Objects, vehicles, foods, etc. (Not services.) EVENT Named hurricanes, battles, wars, sports events, etc. WORK_OF_ART Titles of books, songs, etc. LAW Named documents made into laws. LANGUAGE Any named language. DATE Absolute or relative dates or periods. TIME Times smaller than a day. PERCENT Percentage, including "%". MONEY Monetary values, including unit. QUANTITY Measurements, as of weight or distance. ORDINAL "first", "second", etc. CARDINAL Numerals that do not fall under another type.

In [302]:
import spacy
df3 = pd.DataFrame()
nlp = spacy.load("en_core_web_sm")
for i in range(0,10):
    text=df.loc[i,"event_summary"].title()
    doc = nlp(text)
    spacy.displacy.render(doc, style='ent',jupyter=True)
    for ent in doc.ents:
#         print(ent.text, ent.label_)
        for i,token in enumerate(doc):
#             token=token.capitalize()
            df3.loc[i,'text'] = token.text
            df3.loc[i,'lemma_'] = token.lemma_
            df3.loc[i,'pos_'] = token.pos_
            df3.loc[i,'tag_'] = token.tag_
            df3.loc[i,'dep_'] = token.dep_
            df3.loc[i,'shape_'] = token.shape_
            df3.loc[i,'is_alpha'] = token.is_alpha
            df3.loc[i,'is_stop'] = token.is_stop
            

  "__main__", mod_spec)


In [259]:
df3

Unnamed: 0,text,lemma_,pos_,tag_,dep_,shape_,is_alpha,is_stop
0,two,two,NUM,CD,nummod,xxx,True,True
1,suspects,suspect,NOUN,NNS,nsubjpass,xxxx,True,False
2,are,be,VERB,VBP,auxpass,xxx,True,True
3,killed,kill,VERB,VBN,ROOT,xxxx,True,False
4,in,in,ADP,IN,prep,xx,True,True
...,...,...,...,...,...,...,...,...
39,closed,close,VERB,VBN,advcl,xxxx,True,False
40,to,to,ADP,IN,prep,xx,True,True
41,fishing,fishing,NOUN,NN,pobj,xxxx,True,False
42,.,.,PUNCT,.,punct,.,False,False


In [191]:
spacy.displacy.render(doc, style='dep',jupyter=True,options = {'compact':60})

https://www.kaggle.com/ganeshn88/faster-nlp-with-spacy-in-python/code

In [192]:
from __future__ import unicode_literals
import spacy,en_core_web_sm
import textacy
from textacy import io

nlp = en_core_web_sm.load()
sentence = 'The author is writing a new book.'
pattern = r'<VERB>?<ADV>*<VERB>+'
# doc = textacy.Doc.load(sentence, metadata=metadata, lang='en_core_web_sm')
doc = textacy.make_spacy_doc(sentence, lang='en_core_web_sm')

# doc = textacy.corpus.Corpus(sentence, lang='en_core_web_sm')
lists = textacy.extract.pos_regex_matches(doc, pattern)
for list in lists:
    print(list.text)

is writing


In [193]:
import spacy 
  
nlp = spacy.load('en_core_web_sm') 
  

In [194]:
import pandas as pd


In [195]:
df2= (pd.DataFrame())

In [196]:
alphabet=df['event_summary'][3]

In [197]:
data = alphabet.split()
i=0

In [198]:
for temp in data:
    df2.loc[i,'tokens']=temp
    i=i+1
    

In [199]:
df2

Unnamed: 0,tokens
0,the
1,twochild
2,policy
3,takes
4,effect
5,in
6,china


In [200]:
abc=df2.loc[0,'tokens']

In [201]:
abc.label_

AttributeError: 'str' object has no attribute 'label_'

In [261]:
# for i in range(0,10):
alphabet="Tondo District in the Philippines"
doc = nlp(alphabet)
print(doc)
for ent in doc.ents:
        print(ent.text, ent.label_)


Tondo District in the Philippines
Tondo District PERSON
Philippines GPE
