In [43]:
""" Named Entity Recognition """

import read_data
from datetime import datetime, timedelta
import spacy  # the spacy nets have been trained on OntoNotes 5.0
from spacy import displacy
from collections import Counter
import pandas as pd
from itertools import chain
import visualization
from pathlib import Path

In [1]:
def most_common_entities(df, nlp_doc_colname:str, n_common:int):
    """ 
    get most common entities for a series of articles on a certain date 
    @param df: data frame containing our articles
    @param nlp_doc_colname: name of column containing nlp-processed documents
    @param n_common: number of most frequent to keep
    """
    most_common = []
    for d in df[nlp_doc_colname]:
        items = [x.text for x in d.ents]
        most_common += Counter(items).most_common(n_common)
    colnames = []
    for i in range(1, n_common+1):
        colnames.append(f"most_common_{i}")
        colnames.append(f"most_common_{i}_num")
    df_most_common = pd.DataFrame.from_records(most_common, columns=colnames)
    return df_most_common


# TODO include functionality for counting most frequent Person, Location, etc.
def count_most_frequent(group):
    # current approach: most common of the most common - should be ok, I think
    items = [[i[0]] * i[1] for i in group if not (i is None)]
    items = list(chain.from_iterable(items))
    most_common = Counter(items).most_common(1)
    return most_common[0][0]  # get only the phrase for now

In [3]:
nlp = spacy.load("en_core_web_sm")  # "eng_core_web_lg" for better but slower results

In [4]:
df = read_data.get_body_df(
    start_date=datetime.strptime("2020-03-29", "%Y-%m-%d"),
    end_date=datetime.strptime("2020-04-06", "%Y-%m-%d"),
    articles_per_period=10,
)

Loading the data


In [5]:
df["nlp"] = [nlp(doc) for doc in df.body]  # might be a lot faster if we merge all articles of a day into one document?

In [39]:
for i in range(df.shape[0]):
    
    for token in df["nlp"][i]:
        # print(token)
        if(str(token) == 'coronavirus'):
            print(token.text in nlp.vocab)
            print(token.orth in nlp.vocab)

        if(token.text not in nlp.vocab):  
            print(token.text in nlp.vocab)
            print(token.orth in nlp.vocab)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [41]:
df["body"][0]

'On Sunday, British Prime Minister Boris Johnson was hospitalized "for tests" because of "persistent" COVID-19 symptoms\xa010 days\xa0after he tested positive, CNN reports.\xa0\nJohnson reportedly went to the unspecified London hospital after his doctor advised him to do so. A press release from his office called the\xa0move\xa0"precautionary."\xa0\nOn March 26, Johnson revealed he had tested positive and that he had been dealing with symptoms since that date. Britain had gone into lockdown two days earlier.\nSince the 26th, Johnson has been quarantined at his Downing Street residence. He is the first known world leader to have contracted the virus.\xa0\nRoughly a month ago, right around the time the U.K. started dealing with an outbreak, Johnson garnered media coverage for saying he\'d shook hands with coronavirus patients during a hospital visit. \xa0\n"I shook hands with everybody, you will be pleased to know, and I continue to shake hands," Johnson said during a press conference th

In [44]:
doc = nlp('On Sunday, British Prime Minister Boris Johnson was hospitalized "for tests" because of "persistent" COVID-19 symptoms\xa010 days\xa0after he tested positive, CNN reports.')

# displacy.render(doc, style="dep")

svg = displacy.render(doc, style="dep", jupyter=True)
output_path = Path("figures/dependency_plot.svg") # you can keep there only "dependency_plot.svg" if you want to save it in the same folder where you run the script 
output_path.open("w", encoding="utf-8").write(svg)

TypeError: write() argument must be str, not None

In [35]:
doc = nlp('I am asdfsadfsadfsa.')

displacy.render(doc, style="dep")

for token in doc:
    print(token)
    # print(token.text in nlp.vocab)
    # print(token.orth in nlp.vocab)
    print(token.pos_)
    print(token.tag_)
    print()


I
PRON
PRP

am
AUX
VBP

asdfsadfsadfsa
ADJ
JJ

.
PUNCT
.

