In [None]:
!pip3 install spacy==2.1.0
!python3 -m spacy download en_core_web_lg

Import libraries:

In [None]:
from collections import Counter
from bs4 import BeautifulSoup
import requests
import spacy
import numpy
import nltk
import string

from nltk.corpus import stopwords

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

nltk.download('stopwords')
eng_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


CDFs:

In [40]:
def detokenizer(list_of_tokens):
    text_str="".join([" "+w if not w.startswith("'") and not w.startswith("’") and w!='' and w not in string.punctuation else w for w in list_of_tokens]).strip()
    return(text_str)

class Score:
    def __init__(self, f1, precision, recall, wiki,model):
        self.f1 = f1
        self.precision = precision
        self.recall = recall
        self.wiki = wiki
        self.model = model

def has_upper(word):
  for c in word:
    if c.isupper():
      return True
  return False

def evaluate(n, raw_corpus, labels):

  sentences = raw_corpus.split('\n')
  sentences=[sentence for sentence in sentences if len(sentence) > 0]

  doc_len=len(sentences)//n

  sentences = raw_corpus.split('\n')
  sentences=[sentence for sentence in sentences if len(sentence)>0]
  doc_len=144000//n
  documents=numpy.array_split(sentences,doc_len)
  documents=[document.tolist() for document in documents]
  documents=[' '.join(document) for document in documents]
  
  #get tokens for each document
  tokenized_documents=[document.split(' ') for document in documents]


  #get labels for each token
  fully_labeled_documents=[ [token.split('|') for token in tokenized_document] for tokenized_document in tokenized_documents]

  scores=[]
  i=0
  nlp = spacy.load('en_core_web_lg', disable=["tagger","parser"])
  for document in fully_labeled_documents:

    doc=nlp(detokenizer([token[0] for token in document]))
    
    spacy_entities=[token.text for token in doc if token.ent_type_ in labels]
    corpus_entities=[token[0] for token in document if token[2] in ['I-PER','I-ORG']]

    E1=set(corpus_entities)
    E2=set(spacy_entities)
    intersection=[ner for ner in E1 if ner in E2]
    try:
      Precision = len(intersection)/len(E2)
      Recall = len(intersection)/len(E1)
      F1 = 2*Precision*Recall/(Precision + Recall)
      scores.append(Score(F1,Precision,Recall,E1,E2))
    except:
      scores.append(Score(0,0,0,E1,E2)) 

  
  print('F1:')
  print('median = ', round(np.median([score.f1 for score in scores]),2))
  print('standard = ', round(np.std([score.f1 for score in scores]),2))

  print('\nPrecision:')
  print('median = ', round(np.median([score.precision for score in scores]),2))
  print('standard = ', round(np.std([score.recall for score in scores]),2))
  
  print('\nRecall:')
  print('median = ', round(np.median([score.recall for score in scores]),2))
  print('standard = ', round(np.std([score.recall for score in scores]),2))  

score_and_len_pairs=[]

Create model:

In [None]:
spacy_nlp = spacy.load('en_core_web_lg', disable=["tagger","parser"])

Get document:

In [None]:
htmlString = requests.get('https://en.wikipedia.org/wiki/Higgs_boson')
soup = BeautifulSoup(htmlString.content, 'html.parser')
paragraphs = soup.find_all(['p', 'article', 'section', 'title', 'h1', 'h2', 'h3'])
rawtext = ' '.join([i.text for i in paragraphs])

Get entities:

In [None]:
%%time
doc=spacy_nlp(rawtext)

i=0
ner_list = []

for ent in doc.ents:
  if ent.label_ in ['ORG', 'PERSON']:
    ner_list.append(ent.text)

CPU times: user 1.52 s, sys: 154 ms, total: 1.68 s
Wall time: 1.68 s


Print top entities:

In [None]:
ners = dict(Counter(ner_list))
ners = sorted(ners.items(), key=lambda x: x[1], reverse=True)
ners = [i[0] for i in ners][:10]

for i in ners:
  print('+', i)

+ Higgs
+ CERN
+ LHC
+ CMS
+ ATLAS
+ GHK
+ Goldstone
+ Lederman
+ SU(2
+ Peter Higgs


Get wikipedia marked corpus:

In [None]:
!wget https://github.com/dice-group/FOX/raw/master/input/Wikiner/aij-wikiner-en-wp2.bz2

--2020-11-03 15:03:12--  https://github.com/dice-group/FOX/raw/master/input/Wikiner/aij-wikiner-en-wp2.bz2
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/aij-wikiner-en-wp2.bz2 [following]
--2020-11-03 15:03:13--  https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/aij-wikiner-en-wp2.bz2
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6251145 (6.0M) [application/octet-stream]
Saving to: ‘aij-wikiner-en-wp2.bz2’


2020-11-03 15:03:13 (25.4 MB/s) - ‘aij-wikiner-en-wp2.bz2’ saved [6251145/6251145]



Unzip file here:

In [None]:
!bzip2 -d /content/aij-wikiner-en-wp2.bz2

Read file and define tokenizer:

In [None]:
f = open('/content/aij-wikiner-en-wp2', 'r')
raw_corpus = f.read()

spacy_nlp = spacy.load('en_core_web_lg')
spacy_nlp.tokenizer = spacy_nlp.tokenizer.tokens_from_list
spacy_nlp.max_length = 99999999999

Get model metrics:

In [42]:
%%time
score_and_len_pairs = evaluate(300, raw_corpus, ['PERSON', 'ORG'])

F1:
median =  0.79
standard =  0.06

Precision:
median =  0.73
standard =  0.04

Recall:
median =  0.87
standard =  0.04
CPU times: user 3min 10s, sys: 4.45 s, total: 3min 14s
Wall time: 3min 14s
