# Named Entities Extraction with spacy
*This notebook explores spaCy, in particular for extracting named entities from a Facebook dataset for the TimeSets paper.*

To install the library and the required trained models:

```
pip install spacy
python -m spacy download en_core_web_sm
```

In [1]:
# JSON file holding the list of Facebook posts that the notebook reads.
input_file = '../data/detailedFbData.json'
# JSON file that the final cell writes the themes and themed events to.
output_file = '../data/facebook-with-themes.json'

In [2]:
import json
import itertools
from collections import Counter

import spacy
# Load the spaCy English pipeline once; every nlp(...) call in this notebook reuses it.
nlp = spacy.load('en_core_web_sm')

## Load data

In [3]:
def extract_post(post):
    """Return a ``(created_time, text)`` pair for one post dict.

    The text is the post's 'message' when it is truthy, otherwise its
    'description' when that is truthy, otherwise an empty string. A post
    without 'created_time' yields ``None`` for the time.
    """
    created = post.get('created_time')
    for field in ('message', 'description'):
        candidate = post.get(field)
        if candidate:
            return created, candidate
    return created, ''

def load_data(filename):
    """Load posts from a JSON file and reduce each one to ``(time, text)``.

    Args:
        filename: Path to a JSON file whose top-level value is a list of
            post dicts.

    Returns:
        A list with one ``extract_post`` result per post, in file order.
    """
    # JSON text is UTF-8 by specification; naming the encoding keeps the
    # result independent of the platform's locale default.
    with open(filename, encoding='utf-8') as f:
        posts = json.load(f)  # list of posts
    return [extract_post(post) for post in posts]

In [4]:
# Read every post as a (time, text) pair and keep the texts separately
# for entity extraction.
posts = load_data(input_file)
docs = [text for _time, text in posts]
len(posts), posts[0]

(536,
 ('2016-09-19T03:00:51+0000',
  'WATCH: "JEB EXCLAMATION POINT!" - Jeb Bush, now driving around Selina Meyer in the opening intro to the #Emmys with Jimmy Kimmel.'))

## Extract entities

In [5]:
# Run the pipeline once over all post texts joined into a single string,
# then keep each entity as a (label, text) pair.
corpus = '. '.join(docs)
entities = [(ent.label_, ent.text) for ent in nlp(corpus).ents]
entities[:10]

[('PRODUCT', 'JEB EXCLAMATION'),
 ('PERSON', 'Jeb Bush'),
 ('PERSON', 'Selina Meyer'),
 ('MONEY', 'Emmys'),
 ('PERSON', 'Jimmy Kimmel'),
 ('NORP', 'Syrian'),
 ('DATE', 'today'),
 ('GPE', 'U.S'),
 ('PERSON', 'Rose Pak'),
 ('GPE', "San Francisco's")]

### Common entities per type

In [6]:
# groupby only merges adjacent items, so sort by label first. The sort is
# in place, which also fixes the order in which later cells see `entities`.
entities.sort(key=lambda pair: pair[0])
for label, group in itertools.groupby(entities, key=lambda pair: pair[0]):
    counts = Counter([text for _label, text in group])
    print(label)
    print(counts.most_common(5))
    print()

CARDINAL
[('two', 10), ('one', 5), ('three', 3), ('5', 2), ('12', 2)]

DATE
[('2016', 12), ('today', 8), ('November', 5), ('Election Day', 4), ('this week', 4)]

EVENT
[('Twitter', 1), ('the Iraq war', 1)]

FAC
[('7:00 ET', 1), ('Strait Talk LIVE', 1), ('Billionaires', 1), ('Islam', 1), ('Gramercy Park', 1)]

GPE
[('US', 21), ('New York', 12), ('U.S.', 9), ('Charlotte', 8), ('New Jersey', 7)]

LAW
[('Constitution', 2), ('Democratic National Convention', 1), ('the Constitution', 1)]

LOC
[('Mars', 2), ('Strait Talk', 1), ('NYC', 1), ('East Aleppo', 1), ('Gulf', 1)]

MONEY
[('Debates', 6), ('AxeFiles', 3), ('MyVote', 2), ('UNGA', 2), ('Emmys', 1)]

NORP
[('Republican', 27), ('Democrats', 16), ('Americans', 10), ('American', 9), ('Republicans', 9)]

ORDINAL
[('first', 39), ('First', 3), ('second', 3), ('eighth', 3), ('third', 3)]

ORG
[('CNN', 24), ('Senate', 14), ('ABC News', 13), ('GOP', 10), ("Trump's", 8)]

PERCENT
[('1%', 3), ('a mere 11%', 3), ('12%', 3), ('46%-41%', 2), ('6%', 2)]


### Most common entities

In [7]:
# Count entity texts across all labels.
c = Counter(text for _label, text in entities)
c.most_common(20)

[('Donald J. Trump', 123),
 ('Hillary Clinton', 88),
 ('Obama', 46),
 ('first', 39),
 ('Republican', 27),
 ("Donald J. Trump's", 27),
 ('CNN', 24),
 ('US', 21),
 ('Trump', 17),
 ('Democrats', 16),
 ('Senate', 14),
 ('ABC News', 13),
 ('tonight', 13),
 ('2016', 12),
 ('New York', 12),
 ('Donald Trump', 12),
 ('two', 10),
 ('Americans', 10),
 ('GOP', 10),
 ('Clinton', 10)]

In [8]:
# Keep only the text of the 20 most frequent entities, dropping the counts.
common_entities = [text for text, _count in c.most_common(20)]
common_entities

['Donald J. Trump',
 'Hillary Clinton',
 'Obama',
 'first',
 'Republican',
 "Donald J. Trump's",
 'CNN',
 'US',
 'Trump',
 'Democrats',
 'Senate',
 'ABC News',
 'tonight',
 '2016',
 'New York',
 'Donald Trump',
 'two',
 'Americans',
 'GOP',
 'Clinton']

### Themes for TimeSets
Some entities should be merged, such as the ones that refer to Trump.

In [9]:
# Show the entries of common_entities at the positions of interest.
for idx in (0, 5, 8, 15):
    print(common_entities[idx])

Donald J. Trump
Donald J. Trump's
Trump
Donald Trump


So, I will define a theme as an array of entities that should be merged.

In [10]:
s = common_entities  # NOTE(review): `s` is not referenced again in the visible cells
# Each inner list holds positions in `common_entities` that form one theme.
# The positions are hard-coded against the most_common(20) listing printed
# earlier, so they must be revisited if the data or the counts change.
theme_indices = [
    [0, 5, 8, 15],  # 'Donald J. Trump', "Donald J. Trump's", 'Trump', 'Donald Trump'
    [1, 19],  # 'Hillary Clinton', 'Clinton'
    [2],  # 'Obama'
    [4],  # 'Republican'
    [6],  # 'CNN'
    [7],  # 'US'
    [9],  # 'Democrats'
    [10]  # 'Senate'
]

In [11]:
# Resolve each group of positions into the entity strings it refers to.
themes = [
    [common_entities[position] for position in group]
    for group in theme_indices
]
themes

[['Donald J. Trump', "Donald J. Trump's", 'Trump', 'Donald Trump'],
 ['Hillary Clinton', 'Clinton'],
 ['Obama'],
 ['Republican'],
 ['CNN'],
 ['US'],
 ['Democrats'],
 ['Senate']]

In [12]:
# The first entity of every theme serves as that theme's label.
theme_labels = [members[0] for members in themes]
theme_labels

['Donald J. Trump',
 'Hillary Clinton',
 'Obama',
 'Republican',
 'CNN',
 'US',
 'Democrats',
 'Senate']

## Export data for TimeSets
What does the TimeSets data format look like?
```
{
    themes: ['theme1', 'theme2'], # array of themes, each as a text
    events: [ # array of events, each an object with these fields
        {
            title,
            time,
            content,
            themes: [] # a subset of the above array of themes
        }
    ]
}
```

In [13]:
def get_themes(doc, themes, theme_labels):
    """Return the labels of all themes mentioned in ``doc``.

    ``doc`` is run through the ``nlp`` pipeline and the text of each
    recognised entity is collected. A theme counts as mentioned when at
    least one of those entity texts is contained in it; its label is taken
    from the same position in ``theme_labels``. Labels are returned in
    theme order.
    """
    entity_texts = [ent.text for ent in nlp(doc).ents]

    return [
        theme_labels[position]
        for position, theme in enumerate(themes)
        if any(text in theme for text in entity_texts)
    ]

def get_first_sentence(doc):
    """Return the first sentence of ``doc`` as a string.

    The text is split into sentences by the ``nlp`` pipeline. If no
    sentence is produced, ``doc`` itself is returned unchanged.
    """
    sentences = list(nlp(doc).sents)
    if not sentences:
        return doc
    return str(sentences[0])

def format_data(post, themes, theme_labels):
    """Build an event dict from one post.

    ``post`` is indexed positionally: element 0 becomes 'time' and
    element 1 is the text. The text is stored as 'content', its first
    sentence as 'title', and the labels of the themes found in it as
    'themes'.
    """
    text = post[1]
    created = post[0]
    title = get_first_sentence(text)
    matched_labels = get_themes(text, themes, theme_labels)

    return {
        'time': created,
        'title': title,
        'content': text,
        'themes': matched_labels
    }

In [14]:
# Convert every post into an event dict.
events = [
    format_data(entry, themes, theme_labels)
    for entry in posts
]
events[0]

{'content': 'WATCH: "JEB EXCLAMATION POINT!" - Jeb Bush, now driving around Selina Meyer in the opening intro to the #Emmys with Jimmy Kimmel.',
 'themes': [],
 'time': '2016-09-19T03:00:51+0000',
 'title': 'WATCH: "JEB EXCLAMATION POINT!"'}

In [21]:
# Only events that matched at least one theme are exported.
themed_events = [event for event in events if event['themes']]
data = {'themes': theme_labels, 'events': themed_events}

with open(output_file, 'w') as out_file:
    json.dump(data, out_file)