##### Import packages

In [2]:
import pandas as pd
import spacy 
from spacy import displacy

In [3]:
# !pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

##### Load spacy model

In [4]:
nlp = spacy.load("en_core_web_sm")

##### Test NER on random data

Link to Spacy Named Entity documentation: https://spacy.io/usage/linguistic-features#named-entities

In [5]:
text = "Apple acquired Zoom in China on Wednesday 6th May 2020.\
This news has made Apple and Google stock jump by 5% on Dow Jones Index in the \
United States of America"

In [6]:
def NER(text):
    """Takes in a string of text and returns a dataframe of all named entities
    
        Args:
            :param text: string, a string containing entire corpus
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    entities = []
    labels = []
    position_start = []
    position_end = []

    for ent in doc.ents:
        entities.append(ent)
        labels.append(ent.label_)
        position_start.append(ent.start_char)
        position_end.append(ent.end_char)

    df = pd.DataFrame({'Entities':entities,'Labels':labels,'Position_Start':position_start, 'Position_End':position_end})

    return df

NER(text)

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,(Apple),ORG,0,5
1,(Zoom),ORG,15,19
2,(China),GPE,23,28
3,"(Wednesday, 6th)",DATE,32,45
4,(May),DATE,46,49
5,(Apple),ORG,74,79
6,(Google),NORP,84,90
7,"(5, %)",PERCENT,105,107
8,"(Dow, Jones)",ORG,111,120
9,"(the, United, States, of, America)",GPE,130,158


In [7]:
# Can use explain method for any label you don't understand
spacy.explain("NORP")

'Nationalities or religious or political groups'

##### Import dataset

In [9]:
benin_bronze = pd.read_json('../data/benin-bronze.json', lines=True)
benin_bronzes = pd.read_json('../data/benin-bronzes.json', lines=True)

##### Concat the 2 datasets

In [10]:
bb_df = pd.concat([benin_bronze, benin_bronzes])
bb_df.shape

(16374, 28)

In [11]:
bb_df.content.sample(10)

1121    University of Aberdeen to return Benin bronze ...
8920    Hold up!😳 So British curators want to "loan" N...
4282    Stupid debate! The Benin Bronze BELONGS TO THE...
5417    @MorticiaAddie Even the Benin bronzes which th...
5910    Cambridge college's Stolen Benin bronze cocker...
6686    So they still haven’t returned the benin bronz...
9546    Western museums try to forge deal with west Af...
2294    African Benin Bronze cats/Royal leopards https...
7098    Philadelphia Museum Director Offers Apologies,...
5198    The British Museum will return bronze sculptur...
Name: content, dtype: object

##### Join content column into a single string corpus

In [15]:
# prepaaring data for NER function
corpus = ''.join(list(bb_df.content))

In [18]:
len(corpus)

2657106

Note: I didn't clean the content column because I was worried about removing some of the mentioned entities.

##### Run the NER (named entity recognition) function

Currently investigating how to solve the spacy limitation that len(text) <= 1 000 000, if greater than 1 000 000 we run into memory error.

In [19]:
NER(corpus[:1000000])

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,"(Benin, Kingdom)",GPE,15,28
1,(Nigeria),GPE,32,39
2,(today),DATE,177,182
3,(Museum),ORG,194,200
4,(Germany),GPE,223,230
...,...,...,...,...
19609,(https://t.co/EpyLNQdyT5German),ORG,999392,999421
19610,(https://t.co/005sEOv1Bq),GPE,999629,999652
19611,(GermanyInAfrica),GPE,999654,999669
19612,(#),CARDINAL,999670,999671


In [20]:
from collections import Counter

In [22]:
items = [x.text for x in doc.ents]
Counter(items).most_common(100)

[('Nigeria', 3),
 ('Benin', 2),
 ('Germany', 2),
 ('Berlin', 1),
 ('UK', 1),
 ('https://t.co/R6I85PCgNwWas', 1),
 ('Ep7', 1),
 ('@DavidOlusoga', 1),
 ('Mexico', 1),
 ('German', 1),
 ('Benin City', 1),
 ('Monika Grütters', 1),
 ('Benin Bronzes', 1),
 ('16th century', 1),
 ('Benin kingdom', 1),
 ('today', 1),
 ('Ife Pays N24', 1),
 ('Bride Price', 1),
 ('UK museum', 1),
 ('Benin https://t.co/sdcLPVJjUu', 1),
 ('the Benin Bronzes', 1),
 ('150 years', 1),
 ('Nelson Mandela', 1),
 ('work?The British Museum', 1),
 ('European', 1),
 ('1897', 1),
 ('the kingdom of Benin', 1)]