# Named Entity Extraction with spaCy
*This notebook explores spaCy, in particular for extracting named entities from the Facebook dataset used in the TimeSets paper.*

To install the library and the required trained models:

```
pip install spacy
python -m spacy download en_core_web_sm
```

In [65]:
# JSON file of posts that load_data() reads.
input_file = '../data/fbv4.json'
# JSON file the final themes/events structure is written to.
output_file = '../data/facebook-with-themes.json'

In [66]:
import json
import itertools
from collections import Counter

import spacy
# Load the spaCy pipeline once; every nlp(...) call in this notebook reuses it.
nlp = spacy.load('en_core_web_sm')

## Load data

In [67]:
def extract_post(post):
    """Flatten one raw post dict into the tuple used by the rest of the notebook.

    Args:
        post: Dict describing a single post. The keys read are
            'created_time', 'Subject', 'description', 'Trust' and
            'Relevance'; each may be missing, and 'Subject'/'description'
            may also be null.

    Returns:
        A tuple ``(text, time, subject, content, trust, relevance)``.
        ``subject`` and ``content`` are whitespace-stripped strings and
        ``text`` is ``subject + '. ' + content``, stripped again so an empty
        content leaves no trailing space.
    """
    # `.get(key, '')` only covers a missing key; `or ''` also covers a key
    # that is present but null, which would otherwise break `.strip()`.
    subject = (post.get('Subject') or '').strip()
    content = (post.get('description') or '').strip()
    text = (subject + '. ' + content).strip()

    time = post.get('created_time')
    trust = post.get('Trust', '')
    relevance = post.get('Relevance', '')

    return text, time, subject, content, trust, relevance

def load_data(filename):
    """Load posts from a JSON file and keep those that have a subject.

    Args:
        filename: Path to a JSON file containing a list of post dicts.

    Returns:
        A list of tuples as produced by ``extract_post``, restricted to
        posts whose subject (tuple index 2) is non-empty.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        raw_posts = json.load(f)  # list of posts

    extracted = (extract_post(post) for post in raw_posts)
    return [p for p in extracted if p[2]]

In [68]:
# Read every usable post, then keep only the combined text (first tuple field)
# of each one as the document to analyse.
posts = load_data(input_file)
docs = [text for text, *_rest in posts]
len(posts), posts[0], docs[0]

(949,
 ('1 Woman Killed, Several Children Injured in Aleppo.',
  '2016-09-19T16:28:44+0000',
  '1 Woman Killed, Several Children Injured in Aleppo',
  '',
  'C3',
  4),
 '1 Woman Killed, Several Children Injured in Aleppo.')

## Extract entities

In [69]:
# Run NER once over all documents joined with '. ', collecting
# (label, text) pairs for every entity found.
all_text = '. '.join(docs)
entities = [(span.label_, span.text) for span in nlp(all_text).ents]
entities[:10]

[('CARDINAL', '1'),
 ('ORG', 'Aleppo..'),
 ('WORK_OF_ART', "Journal' Recovered"),
 ('NORP', 'Syrian'),
 ('GPE', 'US'),
 ('PERSON', 'Man Believed'),
 ('PERSON', 'Good Morning America'),
 ('PRODUCT', 'JEB EXCLAMATION'),
 ('PERSON', 'Jeb Bush'),
 ('PERSON', 'Selina Meyer')]

### Common entities per type

In [70]:
# groupby only merges adjacent items, so order the pairs by label first
# (in place: later cells see the sorted list).
entities.sort(key=lambda pair: pair[0])
for label, group in itertools.groupby(entities, key=lambda pair: pair[0]):
    counts = Counter(text for _label, text in group)
    print(label)
    print(counts.most_common(5))
    print()

CARDINAL
[('one', 14), ('two', 10), ('5', 6), ('One', 6), ('Two', 5)]

DATE
[('2016', 13), ('Election Day', 6), ('Monday', 5), ('today', 5), ('This Week', 4)]

EVENT
[('the 2024 Olympics', 2), ('the Iraq War', 2), ("NYC Bombing 'Act of Terror'", 1), ("Between Two Ferns' Interview", 1), ('Bombing Incidents NY', 1)]

FAC
[('Blue Point Brewery', 1), ('Latest National Poll', 1), ('Influential San Francisco Activist Rose Pak Dies', 1), ('Times Square', 1), ('…', 1)]

GPE
[('Charlotte', 31), ('US', 25), ('America', 15), ('North Carolina', 10), ('New York', 9)]

LAW
[('Thug Videos Cop', 1), ('the Constitution', 1), ('SCREENSHOTS', 1), ('the Midst of a COUP', 1)]

LOC
[('Strait Talk', 1), ('Mediterranean', 1), ('Nile Delta', 1), ('Northern California', 1), ('Dear America', 1)]

MONEY
[('Debates', 3), ('Trump', 3), ('$5 million', 3), ('MyVote', 2), ('$1 million', 2)]

NORP
[('Trump', 16), ('Republicans', 14), ('Democrats', 14), ('Americans', 13), ('Muslim', 8)]

ORDINAL
[('first', 26), ('First'

### Most common entities

In [71]:
# Frequency of each entity text, ignoring its label.
c = Counter(text for _label, text in entities)
c.most_common(20)

[('Clinton', 116),
 ('Hillary', 76),
 ('Timeline Photos', 63),
 ('Hillary Clinton', 60),
 ('Trump', 53),
 ('Donald Trump', 49),
 ('Obama', 44),
 ('Charlotte', 31),
 ('first', 26),
 ('US', 25),
 ('GOP', 20),
 ("Trump's", 18),
 ('CNN', 16),
 ('America', 15),
 ('one', 14),
 ('Republicans', 14),
 ('Democrats', 14),
 ('FBI', 14),
 ('2016', 13),
 ('Americans', 13)]

In [72]:
# Keep just the entity text of the 20 most frequent entries and list them
# with their rank.
common_entities = [name for name, _count in c.most_common(20)]
for rank, name in enumerate(common_entities):
    print(rank, name)

0 Clinton
1 Hillary
2 Timeline Photos
3 Hillary Clinton
4 Trump
5 Donald Trump
6 Obama
7 Charlotte
8 first
9 US
10 GOP
11 Trump's
12 CNN
13 America
14 one
15 Republicans
16 Democrats
17 FBI
18 2016
19 Americans


### Themes for TimeSets
Some entities should be merged such as the ones about Trump.

In [73]:
# The three Trump-related entries, one per line.
print(common_entities[4], common_entities[5], common_entities[11], sep='\n')

Trump
Donald Trump
Trump's


and Clinton.

In [74]:
# The three Clinton-related entries, one per line.
print(common_entities[0], common_entities[1], common_entities[3], sep='\n')

Clinton
Hillary
Hillary Clinton


So, I will define a theme as an array of entities that should be merged.

In [75]:
s = common_entities

# Each inner list names the entity texts that belong to one theme; the first
# name of a group is the one later used as the theme's label.
# Themes are declared by text rather than by position in `common_entities`,
# because that list is ordered by frequency and positions shift whenever the
# counts change.
theme_entities = [
    ['Clinton', 'Hillary', 'Hillary Clinton'],
    ['Trump', 'Donald Trump', "Trump's"],
    ['Obama'],
    ['Charlotte'],
    ['GOP'],
    ['Republicans'],
    ['Democrats'],
    ['FBI'],
]

# Position of every common entity, used to translate names back to indices.
_entity_rank = {name: i for i, name in enumerate(common_entities)}

# Names that are not among the common entities are skipped, and a group with
# no remaining name is dropped so that no empty theme is produced.
theme_indices = [
    [_entity_rank[name] for name in group if name in _entity_rank]
    for group in theme_entities
]
theme_indices = [indices for indices in theme_indices if indices]

In [76]:
# Translate each group of indices back into the entity texts it refers to.
themes = []
for index_group in theme_indices:
    themes.append([common_entities[idx] for idx in index_group])
themes

[['Clinton', 'Hillary', 'Hillary Clinton'],
 ['Trump', 'Donald Trump', "Trump's"],
 ['Obama'],
 ['Charlotte'],
 ['GOP'],
 ['Republicans'],
 ['Democrats'],
 ['FBI']]

In [77]:
# A theme is labelled with the first entity of its group.
theme_labels = []
for group in themes:
    theme_labels.append(group[0])
theme_labels

['Clinton',
 'Trump',
 'Obama',
 'Charlotte',
 'GOP',
 'Republicans',
 'Democrats',
 'FBI']

## Export data for TimeSets
What does the data format in TimeSets look like?
```
{
    themes: ['theme1', 'theme2'], # array of themes, each as a text
    events: [
        title,
        time,
        content,
        trust,
        relevance,
        themes: [] # a subset of the above array of themes
    ]
}
```

In [78]:
def get_themes(doc, themes, theme_labels):
    """Return the labels of all themes mentioned in a document.

    The document is run through the module-level ``nlp`` pipeline; a theme
    counts as mentioned when the text of at least one recognised entity is
    a member of that theme.

    Args:
        doc: Text of a single document.
        themes: List of themes, each a collection of entity texts.
        theme_labels: Labels aligned with ``themes`` by position.

    Returns:
        List of labels, in the order of ``themes``, for the matching themes.
    """
    found_texts = [ent.text for ent in nlp(doc).ents]

    return [
        theme_labels[idx]
        for idx, members in enumerate(themes)
        if any(text in members for text in found_texts)
    ]

# def get_first_sentence(doc):
#     sents = list(nlp(doc).sents)
#     return str(sents[0]) if sents else doc

def format_data(post, themes, theme_labels):
    """Convert one extracted post tuple into an event dict.

    Args:
        post: Tuple indexed as (text, time, title, content, trust,
            relevance).
        themes: List of themes, passed through to ``get_themes``.
        theme_labels: Labels aligned with ``themes``, passed through to
            ``get_themes``.

    Returns:
        Dict with keys 'time', 'title', 'content', 'trust', 'relevance' and
        'themes'; 'themes' holds the labels found in the post's text.
    """
    text = post[0]

    event = {}
    event['time'] = post[1]
    event['title'] = post[2]
    event['content'] = post[3]
    event['trust'] = post[4]
    event['relevance'] = post[5]
    event['themes'] = get_themes(text, themes, theme_labels)
    return event

In [79]:
# Build one event per post and keep only those that mention some theme.
events = []
for post in posts:
    event = format_data(post, themes, theme_labels)
    if event['themes']:
        events.append(event)
len(events), events[0]

(334,
 {'content': '',
  'relevance': 4,
  'themes': ['Republicans'],
  'time': '2016-09-20T17:08:03+0000',
  'title': 'Republicans See Turnaround in Indiana Senate Race',
  'trust': 'C3'})

In [80]:
# Assemble the structure described above and write it out as JSON.
data = {}
data['themes'] = theme_labels
data['events'] = events

with open(output_file, 'w') as out:
    json.dump(data, out)