<a href="https://colab.research.google.com/github/VeereshShringari/NLP-DHS-SRK/blob/master/5_NamedEntityRecognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Named Entity Recognition

References:

1. https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2
2. https://www.analyticsvidhya.com/blog/2018/08/nlp-guide-conditional-random-fields-text-classification/
3. https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus

In [0]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
data_path = "/content/drive/My Drive/DHS2019_Workshop/GMB_NER/"
# this is the tagged or labeled data taken from the kaggle
df = pd.read_csv(data_path + "ner_dataset.csv", encoding = "ISO-8859-1")
df.shape

(1048575, 4)

In [9]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [0]:
df = df.fillna(method='ffill')
df = df[:50000]

In [5]:
df["Tag"].value_counts()

O        42547
B-geo     1490
B-gpe      968
B-org      959
I-per      931
B-tim      880
B-per      789
I-org      689
I-geo      303
I-tim      239
B-art       48
B-eve       39
I-eve       33
I-gpe       31
I-art       27
B-nat       18
I-nat        9
Name: Tag, dtype: int64

### Simple Model

In [6]:
X = df[["Word", "POS"]]
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
X.shape

(50000, 7504)

In [0]:
y = df.Tag.values
classes = np.unique(y).tolist()

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)
X_train.shape, y_train.shape

((33500, 7504), (33500,))

In [11]:
clf = SGDClassifier(max_iter=5, random_state=2019)
clf.partial_fit(X_train, y_train, classes)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=2019, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [12]:
new_classes = [c for c in classes if c != "O"]
print(classification_report(y_pred=clf.predict(X_test), y_true=y_test, labels=new_classes))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        18
       B-eve       0.01      0.07      0.02        14
       B-geo       0.74      0.48      0.58       476
       B-gpe       0.94      0.45      0.61       320
       B-nat       0.00      0.00      0.00         3
       B-org       0.75      0.34      0.47       301
       B-per       0.87      0.40      0.54       258
       B-tim       0.91      0.72      0.81       272
       I-art       0.00      0.00      0.00         5
       I-eve       0.00      0.00      0.00        11
       I-geo       0.80      0.30      0.44       115
       I-gpe       0.00      0.00      0.00        11
       I-nat       0.00      0.00      0.00         3
       I-org       0.76      0.28      0.41       200
       I-per       0.28      0.97      0.44       306
       I-tim       0.00      0.00      0.00        90

   micro avg       0.54      0.48      0.51      2403
   macro avg       0.38   

  'precision', 'predicted', average, warn_for)


### DIY - Model with more features similar to POS Tagging

### NER using modules

##### Flair

https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_2_TAGGING.md

In [0]:
!pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/16/22/8fc8e5978ec05b710216735ca47415700e83f304dec7e4281d61cefb6831/flair-0.4.4-py3-none-any.whl (193kB)
[K     |█▊                              | 10kB 16.8MB/s eta 0:00:01[K     |███▍                            | 20kB 1.7MB/s eta 0:00:01[K     |█████                           | 30kB 2.5MB/s eta 0:00:01[K     |██████▊                         | 40kB 1.7MB/s eta 0:00:01[K     |████████▌                       | 51kB 2.1MB/s eta 0:00:01[K     |██████████▏                     | 61kB 2.6MB/s eta 0:00:01[K     |███████████▉                    | 71kB 3.0MB/s eta 0:00:01[K     |█████████████▌                  | 81kB 3.4MB/s eta 0:00:01[K     |███████████████▏                | 92kB 3.8MB/s eta 0:00:01[K     |█████████████████               | 102kB 2.9MB/s eta 0:00:01[K     |██████████████████▋             | 112kB 2.9MB/s eta 0:00:01[K     |████████████████████▎           | 122kB 2.9MB/s eta 0:00:0

In [0]:
from flair.models import SequenceTagger

tagger = SequenceTagger.load('ner')

2019-11-15 03:08:15,895 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/NER-conll03-english/en-ner-conll03-v0.4.pt not found in cache, downloading to /tmp/tmpimc5lygg


100%|██████████| 432197603/432197603 [00:17<00:00, 25299558.78B/s]

2019-11-15 03:08:33,413 copying /tmp/tmpimc5lygg to cache at /root/.flair/models/en-ner-conll03-v0.4.pt





2019-11-15 03:08:35,066 removing temp file /tmp/tmpimc5lygg
2019-11-15 03:08:35,318 loading file /root/.flair/models/en-ner-conll03-v0.4.pt


In [0]:
from flair.data import Sentence
sentence = Sentence('George Washington went to Washington .')

# predict NER tags
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence.to_tagged_string())

George <B-PER> Washington <E-PER> went to Washington <S-LOC> .


##### Spacy

https://spacy.io/usage/linguistic-features#named-entities

In [0]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [0]:
doc = nlp("George Washington went to Washington .")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

George Washington 0 17 PERSON
Washington 26 36 GPE
