# <font color="#49699E" size=40>Named Entity Recognition, Transfer Learning, and Transformer Models</font>
# LEARNING OBJECTIVES
# LEARNING MATERIALS
# INTRODUCTION


# NAMED ENTITY RECOGNITION (NER)

### Named Entity Recognition


## NER, Out of the Box
### Imports

In [ ]:
# Course helper packages. The star imports place their helpers directly in the
# notebook namespace; later cells call names such as create_examples and
# entity_sentiment unqualified (presumably they come from here — verify).
from dcss.text import *
from dcss.networks import *
from dcss.utils import list_files
from dcss.plotting import draw_ner_blockmodel_sfdp
import spacy
from graph_tool.all import *
import math
import pandas as pd
# Display DataFrames with their plain-text repr instead of HTML tables.
pd.set_option("display.notebook_repr_html", False)
from collections import Counter
# Pre-trained English pipeline used by the out-of-the-box NER cells.
nlp = spacy.load('en_core_web_sm')

In [ ]:
# Read the article and flatten it into a single space-separated string.
# The encoding is stated explicitly: the article text quoted later in this
# notebook contains curly quotes and en dashes, which a platform-default codec
# (e.g. cp1252 on Windows) would mis-decode. Assumes the file is saved as UTF-8.
with open('../data/txt_files/ca_story.txt', 'r', encoding='utf-8') as f:
    # Strip each line's surrounding whitespace (including the newline) before joining.
    full_text = " ".join(line.strip() for line in f)

In [ ]:
# Run the loaded spaCy pipeline over the whole article; the following cells read `doc.ents`.
doc = nlp(full_text)

In [ ]:
# Collect the type label of every entity found in the article, report how many
# entities there are, then show the label frequencies (most frequent first).
ent_types = [entity.label_ for entity in doc.ents]
print(f'Found {len(ent_types)} named entities')
Counter(ent_types).most_common()

In [ ]:
# Frequency of each distinct entity string labelled GPE.
Counter(str(span) for span in doc.ents if span.label_ == "GPE")

In [ ]:
# Ten most frequent entity strings labelled PERSON.
Counter(str(span) for span in doc.ents if span.label_ == 'PERSON').most_common(10)

In [ ]:
# Ten most frequent entity strings labelled ORG.
Counter(str(span) for span in doc.ents if span.label_ == 'ORG').most_common(10)

## Customizing spaCy's Pre-trained Named Entity Recognition

In [ ]:
import random
# Fixed seed so the random.shuffle calls in the training loops below are repeatable.
random.seed(7)
from spacy.training import Example

In [ ]:
# Hand-labelled training sentences as (text, annotations) pairs, the form passed
# to Example.from_dict in the training loop below. Each entity is
# (start_char, end_char, label): character offsets into the text, end exclusive.
update_list = [
    ('It was a report that drew on hours of testimony from Cambridge Analytica directors, Facebook executives and dozens of expert witnesses',
     {
         # "Cambridge Analytica", "Facebook"
         'entities': [(53, 72, 'ORG'), (84, 92, 'ORG')]
     }),
    ('Cambridge Analytica rode it out, initially, but finally called in the administrators in May',
     {
         # "Cambridge Analytica"
         'entities': [(0, 19, 'ORG')]
     }),
    ('In April Facebook admitted it wasn’t 50 million users who had had their profiles mined',
     {
         # "Facebook"
         'entities': [(9, 17, 'ORG')]
     }),
    ('Facebook published a statement saying that it had banned both Cambridge Analytica and Wylie from its platform.',
     {
         # "Facebook", "Cambridge Analytica", "Wylie"
         'entities': [(0, 8, 'ORG'), (62, 81, 'ORG'), (86, 91, 'PERSON')]
     })
]

In [ ]:
# Load a fresh copy of the pre-trained pipeline to update.
nlp = spacy.load('en_core_web_sm')

# Ten passes over the hand-labelled sentences, reshuffled in place before each pass.
for i in range(10):
    random.shuffle(update_list)
    # Build the Example objects in a comprehension so the temporary, un-annotated
    # Doc objects stay local. Binding them to `doc` would overwrite the parsed
    # article held in the notebook-level `doc`, and re-running the earlier
    # entity-count cells would then silently return empty results.
    examples = [
        Example.from_dict(nlp.make_doc(text), label_spans)
        for text, label_spans in update_list
    ]
    # drop is the dropout rate applied during this update.
    nlp.update(examples, drop=0.6)

# Re-parse the article with the updated pipeline for comparison with `doc`.
trained_doc = nlp(full_text)

In [ ]:
# Frequency of each distinct GPE entity string after the update.
Counter(str(span) for span in trained_doc.ents if span.label_ == "GPE")

In [ ]:
# Ten most frequent PERSON entity strings after the update.
Counter(str(span) for span in trained_doc.ents if span.label_ == 'PERSON').most_common(10)

In [ ]:
# Ten most frequent ORG entity strings after the update.
Counter(str(span) for span in trained_doc.ents if span.label_ == 'ORG').most_common(10)

In [ ]:
# Reload a fresh version of the pre-trained model, replacing the pipeline that
# was updated in the previous training loop.
nlp = spacy.load('en_core_web_sm')  

In [ ]:
# create_examples is not defined in this notebook (presumably supplied by one of
# the dcss star imports). The calls below require its result to be a mutable
# sequence that nlp.update accepts — TODO confirm it is a list of spaCy Examples.
examples = create_examples(full_text)

# Ten update passes over the same examples, reshuffled in place before each pass.
for i in range(10):
    random.shuffle(examples)
    nlp.update(examples, drop = 0.6)
    
# Re-parse the article with the updated pipeline.
trained_doc = nlp(full_text)

In [ ]:
# Ten most frequent PERSON entity strings after the second round of updates.
Counter(str(span) for span in trained_doc.ents if span.label_ == 'PERSON').most_common(10)

In [ ]:
# Ten most frequent ORG entity strings after the second round of updates.
Counter(str(span) for span in trained_doc.ents if span.label_ == 'ORG').most_common(10)

In [ ]:
# Exact-text patterns that should always be labelled ORG.
ent_labels = [
    {'label': 'ORG', 'pattern': org_name}
    for org_name in ('Facebook', 'Cambridge Analytica')
]

# Add an entity ruler to the current pipeline; overwrite_ents lets its matches
# replace overlapping entities that earlier components already assigned.
ent_ruler = nlp.add_pipe('entity_ruler', config={'overwrite_ents': True})
ent_ruler.add_patterns(ent_labels)

In [ ]:
# Re-parse the article with the entity ruler in the pipeline and list the ten
# most frequent ORG entity strings.
ruled_doc = nlp(full_text)
Counter(str(span) for span in ruled_doc.ents if span.label_ == 'ORG').most_common(10)

## NER with Transfer Learning


# TRANSFORMER MODELS

### Hugging Face + spaCy


In [ ]:
# Load the transformer-based English pipeline without its tagger and lemmatizer
# components, then parse the whole article with it.
nlpt = spacy.load('en_core_web_trf', exclude=['tagger','lemmatizer'])
doct = nlpt(full_text)

In [ ]:
# Ten most frequent ORG entity strings found by the transformer pipeline.
Counter(str(span) for span in doct.ents if span.label_ == 'ORG').most_common(10)

In [ ]:
# Ten most frequent PERSON entity strings found by the transformer pipeline.
Counter(str(span) for span in doct.ents if span.label_ == 'PERSON').most_common(10)

In [ ]:
# For every entity whose text is exactly "Brexit": print its label, the sentence
# it occurs in, and each entity in that sentence with its label.
for ent in doct.ents:
    if ent.text != "Brexit":
        continue
    print(ent.label_)
    print(ent.sent.text)
    for ent2 in ent.sent.ents:
        print(f'{ent2.text}: {ent2.label_}')

In [ ]:
# Print the sentence around every token whose text is exactly "Brexit",
# whether or not that token was recognised as an entity.
for token in doct:
    if token.text != "Brexit":
        continue
    print(token.sent.text)

In [ ]:
# Parse one sentence on its own with the transformer pipeline. The two adjacent
# string literals are joined by Python into a single input string.
sentence = nlpt("The account of a whistleblower from inside the data analytics firm that had worked in different capacities "
               "– the details are still disputed – on the two pivotal campaigns of 2016 that gave us Brexit and Trump.")

In [ ]:
# List each entity in the parsed sentence with its label.
for ent in sentence.ents:
    print(f"{ent.text}: {ent.label_}")

In [ ]:
# Parse a longer passage: the preceding sentence is included and the ending is
# reworded to "the Trump administration". The adjacent string literals are joined
# by Python into a single input string.
sentence = nlpt("It was a year ago this weekend that the Observer published the first in a series of stories, known as the Cambridge Analytica Files, "
                "that led to parliament grappling with these questions. The account of a whistleblower from inside the data analytics firm that had "
                "worked in different capacities – the details are still disputed – on the two pivotal campaigns of 2016 that gave us Brexit and the Trump administration.")

In [ ]:
# List each entity in the parsed passage with its label.
for ent in sentence.ents:
    print(f"{ent.text}: {ent.label_}")

### Named Entities in Context


### Sentiment Analysis with Transformers


In [ ]:
from transformers import pipeline

# Pin the checkpoint instead of relying on the task default: an unpinned
# pipeline emits a warning and its results can change if the library's default
# changes. This is the checkpoint the 'sentiment-analysis' task has defaulted
# to; it emits the POSITIVE / NEGATIVE labels that later cells filter on.
# NOTE(review): confirm this matches the model used for any previously saved results.
sentiment = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

In [ ]:
# The two adjacent string literals are joined by Python into ONE string (with no
# space between them), so the pipeline receives both sentences as a single input.
sentiment('“When you look at how, for example, the NCA [National Crime Agency] has just sat on blatant evidence of Russian interference in Brexit,” Wylie says.'
          '"The Brexit angle of the Cambridge Analytica Files, the explosive revelations of a second whistleblower, Shahmir Sanni, fell inexplicably flat.')

In [ ]:
# Same text as the previous cell, but passed as a list of two separate strings
# (note the comma after the first sentence), so each sentence is its own input.
scores = sentiment(['“When you look at how, for example, the NCA [National Crime Agency] has '
                   'just sat on blatant evidence of Russian interference in Brexit,” Wylie says.',
                   '"The Brexit angle of the Cambridge Analytica Files, the explosive revelations '
                   'of a second whistleblower, Shahmir Sanni, fell inexplicably flat.'])
print(scores)

In [ ]:
# Text of every sentence in the transformer-parsed article, scored in one
# call to the sentiment pipeline.
sentences = [span.text for span in doct.sents]
scores = sentiment(sentences)

In [ ]:
# Split the pipeline output (one dict per sentence with 'label' and 'score'
# keys) into two parallel lists.
label, score = [x['label'] for x in scores], [x['score'] for x in scores]

# One row per sentence, in the same order as `sentences`.
df = pd.DataFrame({'sentence': sentences, 'label': label, 'score': score})

# Index labels of the highest- and lowest-scoring sentence within each class.
top_pos = df[df['label'] == 'POSITIVE']['score'].idxmax()
bot_pos = df[df['label'] == 'POSITIVE']['score'].idxmin()
top_neg = df[df['label'] == 'NEGATIVE']['score'].idxmax()
bot_neg = df[df['label'] == 'NEGATIVE']['score'].idxmin()

# idxmax/idxmin return index LABELS, so look the rows up with .loc. Positional
# .iloc gives the same rows only while the index is the default 0..n-1 range.
for pos in [top_pos, bot_pos, top_neg, bot_neg]:
    print('Value: ' + str(df.loc[pos, 'score']) + '\nSentence: ' + df.loc[pos, 'sentence'], '\n')

In [ ]:
# entity_sentiment is defined outside this notebook; the code below relies on
# its result having 'sentiment' and 'sentiment_score' columns.
sent_df = entity_sentiment(doct, sentiment, ['Cambridge Analytica'])

# Signed score: the raw score, negated on rows whose sentiment is 'NEGATIVE'.
sent_df['sent_signed'] = sent_df['sentiment_score'].mask(
    sent_df['sentiment'] == 'NEGATIVE',
    -sent_df['sentiment_score'],
)

In [ ]:
# Mean signed sentiment score across all rows of sent_df.
sent_df['sent_signed'].mean()

## Translating Transformer Insight into Human Insight

In [ ]:
# Find the CSV files in the lipad directory, read each into its own DataFrame,
# and stack them into one (the original per-file row indices are kept).
datasets = list_files("../data/canadian_hansards/lipad/", 'csv')
dfs = [pd.read_csv(csv_path, low_memory=False) for csv_path in datasets]
df = pd.concat(dfs)

In [ ]:
# Keep only the rows spoken by the three party leaders, then show how many
# rows each of them has.
leaders = ['Stephen Harper', 'Jack Layton', 'Jean Chrétien']
df_filt = df.loc[df['speakername'].isin(leaders)]
df_filt['speakername'].value_counts()

In [ ]:
# Transformer pipeline without the tagger, parser and lemmatizer components.
# A sentencizer is added after loading — presumably because the parser, which
# would otherwise set sentence boundaries, is excluded (TODO confirm).
nlp = spacy.load('en_core_web_trf', exclude=['tagger', 'parser', 'lemmatizer'])
nlp.add_pipe('sentencizer')

In [ ]:
# process_speeches_sentiment is defined outside this notebook. Later cells rely
# on its result having 'speaker', 'sentiment' and 'sentiment_score' columns.
sentiment_df = process_speeches_sentiment(df_filt, nlp, sentiment)

In [ ]:
# Save the processed result to disk so it can be reloaded without re-running the step above.
sentiment_df.to_pickle('../data/pickles/can_hansard_sentiment.pkl')

In [ ]:
# Run this cell to load the data with everything above already analysed.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
sentiment_df = pd.read_pickle('../data/pickles/can_hansard_sentiment.pkl')

In [ ]:
# Signed score: the raw score, negated on rows whose sentiment is 'NEGATIVE'.
sentiment_df['sent_signed'] = sentiment_df['sentiment_score'].mask(
    sentiment_df['sentiment'] == 'NEGATIVE',
    -sentiment_df['sentiment_score'],
)

In [ ]:
# Number of rows for each (speaker, sentiment) combination; sort=False skips sorting by count.
sentiment_df.value_counts(subset=['speaker', 'sentiment'], sort=False)

In [ ]:
# Mean signed sentiment score per speaker.
sentiment_df.groupby('speaker')['sent_signed'].mean()

In [ ]:
# One edge DataFrame per leader. create_speaker_edge_df is defined outside this
# notebook; the cells below rely on its result having 'source', 'target' and
# 'weight' columns.
chretien_df = create_speaker_edge_df(sentiment_df, 'Jean Chrétien')
layton_df = create_speaker_edge_df(sentiment_df, 'Jack Layton')
harper_df = create_speaker_edge_df(sentiment_df, 'Stephen Harper')

In [ ]:
# Mean weight for each (source, target) pair, highest first.
(
    chretien_df
    .groupby(['source', 'target'])['weight']
    .mean()
    .reset_index()
    .sort_values(by='weight', ascending=False)
)

In [ ]:
# Mean weight for each (source, target) pair, highest first.
(
    layton_df
    .groupby(['source', 'target'])['weight']
    .mean()
    .reset_index()
    .sort_values(by='weight', ascending=False)
)

In [ ]:
# Mean weight for each (source, target) pair, highest first.
(
    harper_df
    .groupby(['source', 'target'])['weight']
    .mean()
    .reset_index()
    .sort_values(by='weight', ascending=False)
)

### Co-Occurring Named Entities


In [ ]:
# Rebuild the per-leader edge DataFrames with the same calls as the earlier cell,
# so this section can be run without it.
chretien_df = create_speaker_edge_df(sentiment_df, 'Jean Chrétien')
layton_df = create_speaker_edge_df(sentiment_df, 'Jack Layton')
harper_df = create_speaker_edge_df(sentiment_df, 'Stephen Harper')

In [ ]:
# NOTE(review): shrink_sent_df is defined outside this notebook; what it removes
# or aggregates is not visible here. Its outputs feed the block models below.
chretien_small_df = shrink_sent_df(chretien_df)
layton_small_df = shrink_sent_df(layton_df)
harper_small_df = shrink_sent_df(harper_df)

In [ ]:
# blockmodel_from_edge_df (defined outside this notebook) returns a pair that is
# unpacked into a graph and its blocks. n_edges = 200 presumably limits the model
# to the top 200 edges, as the figure file names suggest — TODO confirm.
chretien_small_G, chretien_small_blocks = blockmodel_from_edge_df(chretien_small_df, n_edges = 200)
layton_small_G, layton_small_blocks = blockmodel_from_edge_df(layton_small_df, n_edges = 200)
harper_small_G, harper_small_blocks = blockmodel_from_edge_df(harper_small_df, n_edges = 200)

In [ ]:
# Draw each leader's block model with the dcss plotting helper, passing a PDF path
# under ../figures/ as the output file name.
draw_ner_blockmodel_sfdp(chretien_small_G, chretien_small_blocks, filename = '../figures/chretien_blockmodel_top200_unweighted_sfdp.pdf')
draw_ner_blockmodel_sfdp(layton_small_G, layton_small_blocks, filename = '../figures/layton_blockmodel_top200_unweighted_sfdp.pdf')
draw_ner_blockmodel_sfdp(harper_small_G, harper_small_blocks, filename = '../figures/harper_blockmodel_top200_unweighted_sfdp.pdf')

In [ ]:
# get_sentiment_blocks_df is defined outside this notebook; it is given each
# leader's graph and blocks, and its result is inspected and passed on below.
chretien_results = get_sentiment_blocks_df(chretien_small_G, chretien_small_blocks)
layton_results = get_sentiment_blocks_df(layton_small_G, layton_small_blocks)
harper_results = get_sentiment_blocks_df(harper_small_G, harper_small_blocks)

In [ ]:
# Preview the first rows of the Chrétien block assignments.
chretien_results.head()

In [ ]:
# calculate_avg_block_sentiment is defined outside this notebook; it combines the
# block results with the full Chrétien edge DataFrame. The result is pickled so it
# can be reloaded later without recomputing.
chretien_block_sentiment_df = calculate_avg_block_sentiment(chretien_results, chretien_df)
chretien_block_sentiment_df.to_pickle('../data/pickles/chretien_blockmodel_sent_analysis.pkl')

In [ ]:
# Run to load the pickled dataframe instead of recomputing it.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
chretien_block_sentiment_df = pd.read_pickle('../data/pickles/chretien_blockmodel_sent_analysis.pkl')

In [ ]:
# Show the first 30 rows of the Chrétien block-sentiment table.
chretien_block_sentiment_df.head(30)

In [ ]:
# calculate_avg_block_sentiment is defined outside this notebook; it combines the
# block results with the full Layton edge DataFrame. The result is pickled so it
# can be reloaded later without recomputing.
layton_block_sentiment_df = calculate_avg_block_sentiment(layton_results, layton_df)
layton_block_sentiment_df.to_pickle('../data/pickles/layton_blockmodel_sent_analysis.pkl')

In [ ]:
# Run to load the pickled dataframe instead of recomputing it. This must READ the
# file (as the Chrétien cell does); calling to_pickle here would fail with a
# NameError in a fresh session and would never load anything.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
layton_block_sentiment_df = pd.read_pickle('../data/pickles/layton_blockmodel_sent_analysis.pkl')

In [ ]:
# Show the first 30 rows of the Layton block-sentiment table.
layton_block_sentiment_df.head(30)

In [ ]:
# calculate_avg_block_sentiment is defined outside this notebook; it combines the
# block results with the full Harper edge DataFrame. The result is pickled so it
# can be reloaded later without recomputing.
harper_block_sentiment_df = calculate_avg_block_sentiment(harper_results, harper_df)
harper_block_sentiment_df.to_pickle('../data/pickles/harper_blockmodel_sent_analysis.pkl')

In [ ]:
# Run to load the pickled dataframe instead of recomputing it. This must READ the
# file (as the Chrétien cell does); calling to_pickle here would fail with a
# NameError in a fresh session and would never load anything.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
harper_block_sentiment_df = pd.read_pickle('../data/pickles/harper_blockmodel_sent_analysis.pkl')

In [ ]:
# Show the first 30 rows of the Harper block-sentiment table.
harper_block_sentiment_df.head(30)

# CONCLUSION
## Key Points 
