# Install Dependencies

In [1]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shree.Charran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shree.Charran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [1]:
import pandas as pd

df = pd.read_csv('ner_dataset.csv.gz', compression='gzip', encoding='ISO-8859-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
Sentence #    47959 non-null object
Word          1048575 non-null object
POS           1048575 non-null object
Tag           1048575 non-null object
dtypes: object(4)
memory usage: 32.0+ MB


In [6]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1048565,1048566,1048567,1048568,1048569,1048570,1048571,1048572,1048573,1048574
sentence_id,1,1,1,1,1,1,1,1,1,1,...,47958,47958,47959,47959,47959,47959,47959,47959,47959,47959
words,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,...,impact,.,Indian,forces,said,they,responded,to,the,attack
pos,NNS,IN,NNS,VBP,VBN,IN,NNP,TO,VB,DT,...,NN,.,JJ,NNS,VBD,PRP,VBD,TO,DT,NN
labels,O,O,O,O,O,O,B-geo,O,O,O,...,O,O,B-gpe,O,O,O,O,O,O,O


## Basic Data Formatting

In [4]:
df = df.fillna(method='ffill')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
Sentence #    1048575 non-null object
Word          1048575 non-null object
POS           1048575 non-null object
Tag           1048575 non-null object
dtypes: object(4)
memory usage: 32.0+ MB


In [5]:
df['sentence_id'] = [item.split(':')[1].strip() for item in df['Sentence #'].values]
df['words'] = df['Word']
df['pos'] = df['POS']
df['labels'] = df['Tag']
df = df[['sentence_id', 'words', 'pos', 'labels']]

In [7]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1048565,1048566,1048567,1048568,1048569,1048570,1048571,1048572,1048573,1048574
sentence_id,1,1,1,1,1,1,1,1,1,1,...,47958,47958,47959,47959,47959,47959,47959,47959,47959,47959
words,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,...,impact,.,Indian,forces,said,they,responded,to,the,attack
pos,NNS,IN,NNS,VBP,VBN,IN,NNP,TO,VB,DT,...,NN,.,JJ,NNS,VBD,PRP,VBD,TO,DT,NN
labels,O,O,O,O,O,O,B-geo,O,O,O,...,O,O,B-gpe,O,O,O,O,O,O,O


In [8]:
df.sentence_id.nunique(), df.words.nunique(), df.pos.nunique(), df.labels.nunique()

(47959, 35178, 42, 17)

We have 47959 sentences that contain 35178 unique words.

These sentences have a total of 42 unique POS tags and 17 unique NER tags in total.


# Tags

In [9]:
df.labels.value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: labels, dtype: int64

# CRF

## Prepare Data

CRF trains upon sequence of input data to learn transitions from one state (label) to another. 

To enable such an algorithm, __we need to define features__ which take into account different transitions. 

In the function ```word2features()``` below, we transform each word into a feature dictionary depicting the following attributes or features:

+ __lower case of word__
+ __suffix containing last 3 characters__
+ __suffix containing last 2 characters__
+ __flags to determine upper-case, title-case, numeric data and POS tag__

We also attach __attributes related to previous and next words or tags to determine beginning of sentence (BOS) or end of sentence (EOS)__

https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system

In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used.

### Feature Engineering Utility

In [10]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

In [11]:
agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['words'].values.tolist(), 
                                                   s['pos'].values.tolist(), 
                                                   s['labels'].values.tolist())]

In [12]:
grouped_df = df.groupby('sentence_id').apply(agg_func)

In [13]:
print(grouped_df[grouped_df.index == '1'].values)

[list([('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')])]


In [14]:
grouped_df.shape

(47959,)

In [15]:
sentences = [s for s in grouped_df]
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [16]:
sent2features(sentences[0][5:7])

[{'+1:postag': 'NNP',
  '+1:postag[:2]': 'NN',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'london',
  'BOS': True,
  'bias': 1.0,
  'postag': 'IN',
  'postag[:2]': 'IN',
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': 'through',
  'word[-2:]': 'gh',
  'word[-3:]': 'ugh'},
 {'-1:postag': 'IN',
  '-1:postag[:2]': 'IN',
  '-1:word.istitle()': False,
  '-1:word.isupper()': False,
  '-1:word.lower()': 'through',
  'EOS': True,
  'bias': 1.0,
  'postag': 'NNP',
  'postag[:2]': 'NN',
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'london',
  'word[-2:]': 'on',
  'word[-3:]': 'don'}]

In [17]:
sent2labels(sentences[0][5:7])

['O', 'B-geo']

## Prepare Train and Test Datasets

In [18]:
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array([sent2features(s) for s in sentences])
y = np.array([sent2labels(s) for s in sentences])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train.shape, X_test.shape

((35969,), (11990,))

## Sample Featureset & Labels

In [19]:
pd.concat([pd.DataFrame(X_train[1]), pd.DataFrame({'label': y_train[1]})], axis=1)

Unnamed: 0,+1:postag,+1:postag[:2],+1:word.istitle(),+1:word.isupper(),+1:word.lower(),-1:postag,-1:postag[:2],-1:word.istitle(),-1:word.isupper(),-1:word.lower(),...,bias,postag,postag[:2],word.isdigit(),word.istitle(),word.isupper(),word.lower(),word[-2:],word[-3:],label
0,NN,NN,False,False,newspaper,,,,,,...,1.0,DT,DT,False,True,False,the,he,The,O
1,VBD,VB,False,False,alleged,DT,DT,True,False,the,...,1.0,NN,NN,False,False,False,newspaper,er,per,O
2,IN,IN,False,False,that,NN,NN,False,False,newspaper,...,1.0,VBD,VB,False,False,False,alleged,ed,ged,O
3,NNP,NN,True,False,jimenez,VBD,VB,False,False,alleged,...,1.0,IN,IN,False,False,False,that,at,hat,O
4,VBD,VB,False,False,tipped,IN,IN,False,False,that,...,1.0,NNP,NN,False,True,False,jimenez,ez,nez,B-per
5,RP,RP,False,False,off,NNP,NN,True,False,jimenez,...,1.0,VBD,VB,False,False,False,tipped,ed,ped,O
6,JJ,JJ,False,False,jailed,VBD,VB,False,False,tipped,...,1.0,RP,RP,False,False,False,off,ff,off,O
7,NN,NN,False,False,drug,RP,RP,False,False,off,...,1.0,JJ,JJ,False,False,False,jailed,ed,led,O
8,NN,NN,False,False,trafficker,JJ,JJ,False,False,jailed,...,1.0,NN,NN,False,False,False,drug,ug,rug,O
9,NNP,NN,True,False,fernando,NN,NN,False,False,drug,...,1.0,NN,NN,False,False,False,trafficker,er,ker,O


# Building Models with sklearn-crfsuite

__`sklearn-crfsuite`__ is a thin [CRFsuite (python-crfsuite)](https://github.com/scrapinghub/python-crfsuite) wrapper which provides scikit-learn-compatible sklearn_crfsuite.CRF estimator: you can use e.g. scikit-learn model selection utilities (cross-validation, hyperparameter optimization) with it, or save/load CRF models using joblib.

In [20]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting tabulate
  Using cached tabulate-0.8.7-py3-none-any.whl (24 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp35-cp35m-win_amd64.whl (153 kB)
Installing collected packages: tabulate, python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6 tabulate-0.8.7


DEPRECATION: Python 3.5 reached the end of its life on September 13th, 2020. Please upgrade your Python as Python 3.5 is no longer maintained. pip 21.0 will drop support for Python 3.5 in January 2021. pip 21.0 will remove support for this functionality.


## Your Turn: Train the model!

Train the model using the default configurations mentioned in the [sklearn-crfsuite API docs](https://sklearn-crfsuite.readthedocs.io/en/latest/api.html)

We have filled in some of these for your convenience as follows.

- __algorithm:__ the training algorithm. We use [L-BFGS](https://en.wikipedia.org/wiki/Limited-memory_BFGS) for gradient descent for optimization and getting model parameters
- __c1:__ Coefficient for Lasso (L1) regularization
- __c2:__ Coefficient for Ridge (L2) regularization
- __all_possible_transitions:__ Specify whether CRFsuite generates transition features that do not even occur in the training data


__Note:__ If the model is taking too long to train, you can load up the pre-trained model using the code after the training cells and use that for predictions.

In [21]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           c1=0.1,
                           c2=0.1,
                           max_iterations=100,
                           all_possible_transitions=True,
                           verbose=True)

In [None]:
crf.fit(X_train, y_train)

## Pickle

In [None]:
from sklearn.externals import joblib

joblib.dump(crf, 'ner_model.pkl')

In [None]:
crf = joblib.load('ner_model.pkl')

In [None]:
pd.DataFrame(X_test[0])

In [None]:
y_pred = crf.predict(X_test)
print(y_pred[0])

In [None]:
print(y_test[0])

In [None]:
from sklearn_crfsuite import metrics as crf_metrics

labels = list(crf.classes_)
labels.remove('O')

In [None]:
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels))

### Prepare Sample Document

In [None]:
import re

text = """Three more countries have joined an “international grand committee” of parliaments, adding to calls for 
Facebook’s boss, Mark Zuckerberg, to give evidence on misinformation to the coalition. Brazil, Latvia and Singapore 
bring the total to eight different parliaments across the world, with plans to send representatives to London on 27 
November with the intention of hearing from Zuckerberg. Since the Cambridge Analytica scandal broke, the Facebook chief 
has only appeared in front of two legislatures: the American Senate and House of Representatives, and the European parliament. 
Facebook has consistently rebuffed attempts from others, including the UK and Canadian parliaments, to hear from Zuckerberg. 
He added that an article in the New York Times on Thursday, in which the paper alleged a pattern of behaviour from Facebook 
to “delay, deny and deflect” negative news stories, “raises further questions about how recent data breaches were allegedly 
dealt with within Facebook.”
"""

text = re.sub(r'\n', '', text)
text

### NER Tagging with SpaCy

In [None]:
import spacy
from spacy import displacy

nlp = spacy.load('en')
text_nlp = nlp(text)
displacy.render(text_nlp, style='ent', jupyter=True)

### Pipeline Step 1

- Tokenize Text
- POS Tagging

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
text_tokens = nltk.word_tokenize(text)
text_pos = nltk.pos_tag(text_tokens)
text_pos[:10]

[('Three', 'CD'),
 ('more', 'JJR'),
 ('countries', 'NNS'),
 ('have', 'VBP'),
 ('joined', 'VBN'),
 ('an', 'DT'),
 ('“', 'NNP'),
 ('international', 'JJ'),
 ('grand', 'JJ'),
 ('committee', 'NN')]

### Pipeline Step 2
- Extract Features from the POS tagged text document
- Hint: Use `sent2features`

In [None]:
features = [sent2features(text_pos)]
features[0][0]

{'+1:postag': 'JJR',
 '+1:postag[:2]': 'JJ',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': 'more',
 'BOS': True,
 'bias': 1.0,
 'postag': 'CD',
 'postag[:2]': 'CD',
 'word.isdigit()': False,
 'word.istitle()': True,
 'word.isupper()': False,
 'word.lower()': 'three',
 'word[-2:]': 'ee',
 'word[-3:]': 'ree'}

### Pipeline Step 3
- Use the CRF Model `crf` to predict on the features

In [None]:
labels = crf.predict(features)
doc_labels = labels[0]
doc_labels[10:20]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-art', 'I-art']

### Pipeline Step 4
- Combine text tokens with NER Tags
- Retrieve relevant named entities from NER Tags

In [None]:
text_ner = [(token, tag) for token, tag in zip(text_tokens, doc_labels)]
print(text_ner)

[('Three', 'O'), ('more', 'O'), ('countries', 'O'), ('have', 'O'), ('joined', 'O'), ('an', 'O'), ('“', 'O'), ('international', 'O'), ('grand', 'O'), ('committee', 'O'), ('”', 'O'), ('of', 'O'), ('parliaments', 'O'), (',', 'O'), ('adding', 'O'), ('to', 'O'), ('calls', 'O'), ('for', 'O'), ('Facebook', 'B-art'), ('’', 'I-art'), ('s', 'O'), ('boss', 'O'), (',', 'O'), ('Mark', 'B-per'), ('Zuckerberg', 'I-per'), (',', 'O'), ('to', 'O'), ('give', 'O'), ('evidence', 'O'), ('on', 'O'), ('misinformation', 'O'), ('to', 'O'), ('the', 'O'), ('coalition', 'O'), ('.', 'O'), ('Brazil', 'B-geo'), (',', 'O'), ('Latvia', 'B-org'), ('and', 'I-org'), ('Singapore', 'I-org'), ('bring', 'O'), ('the', 'O'), ('total', 'O'), ('to', 'O'), ('eight', 'O'), ('different', 'O'), ('parliaments', 'O'), ('across', 'O'), ('the', 'O'), ('world', 'O'), (',', 'O'), ('with', 'O'), ('plans', 'O'), ('to', 'O'), ('send', 'O'), ('representatives', 'O'), ('to', 'O'), ('London', 'B-geo'), ('on', 'O'), ('27', 'B-tim'), ('November', 

In [None]:
named_entities = []
temp_entity_name = ''
temp_named_entity = None
for term, tag in text_ner:
    if tag != 'O':
        temp_entity_name = ' '.join([temp_entity_name, term]).strip()
        temp_named_entity = (temp_entity_name, tag)
    else:
        if temp_named_entity:
            named_entities.append(temp_named_entity)
            temp_entity_name = ''
            temp_named_entity = None

In [None]:
import pandas as pd

pd.DataFrame(named_entities, columns=['Entity', 'Tag'])

Unnamed: 0,Entity,Tag
0,Facebook ’,I-art
1,Mark Zuckerberg,I-per
2,Brazil,B-geo
3,Latvia and Singapore,I-org
4,London,B-geo
5,27 November,I-tim
6,Zuckerberg,B-geo
7,Cambridge Analytica,I-org
8,Facebook,B-org
9,American Senate and House of Representatives,I-org
