Proprietary content. ©Great Learning. All Rights Reserved. Unauthorized use or distribution prohibited

### Libraries along with their versions used at the time of making notebook-
google	2.0.3

nltk	3.2.5

numpy	1.18.1

pandas	0.25.3

tensorflow	2.1.0

Firstly, let's select TensorFlow version 2.x in colab

In [0]:
%tensorflow_version 2.x
import tensorflow
tensorflow.__version__

'2.1.0'

In [0]:
# Initialize the random number generator
import random
random.seed(0)

# Ignore the warnings
import warnings
warnings.filterwarnings("ignore")

### Load the dataset

As we are using google colab, we need to mount the google drive to load the data file

In [0]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
# IMPORT DATA
import pandas as pd
import numpy as np

data = pd.read_csv('/content/drive/My Drive/NLP/ner_dataset.csv', encoding='latin1')
data = data.fillna(method="ffill") # Deal with N/A

In [0]:
tags = list(set(data["POS"].values)) # Read POS values

In [0]:
tags # List of possible POS values

['RBR',
 'RRB',
 'VBZ',
 'WRB',
 'VBD',
 'PRP$',
 'VBN',
 'NN',
 '$',
 'FW',
 'RB',
 'JJS',
 'NNP',
 'NNPS',
 'NNS',
 'VBG',
 ':',
 'RBS',
 '``',
 'WP',
 'TO',
 ';',
 'UH',
 'VBP',
 'MD',
 'VB',
 'PDT',
 'LRB',
 'WDT',
 'CD',
 'IN',
 'JJR',
 'POS',
 'WP$',
 'JJ',
 'DT',
 'EX',
 'RP',
 ',',
 '.',
 'CC',
 'PRP']

In [0]:
words = list(set(data["Word"].values))
words.append("DUMMY") # Add a dummy word to pad sentences.

In [0]:
# Code to read sentences

class ReadSentences(object): 
    
    def __init__(self, data):
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [0]:
sentences = ReadSentences(data).sentences # Read all sentences

In [0]:
# Convert words and tags into numbers
word2id = {w: i for i, w in enumerate(words)}
tag2id = {t: i for i, t in enumerate(tags)}

In [0]:
# Prepare input and output data

from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 50
X = [[word2id[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=len(words)-1)
y = [[tag2id[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2id["."])

In [0]:
# Convert output to one-hot bit

from tensorflow.keras.utils import to_categorical
y = [to_categorical(i, num_classes=len(tags)) for i in y]

In [0]:
y[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]], dtype=float32)

In [0]:
# Training and test split by sentences

from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.20)

In [0]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Input

In [0]:
input = Input(shape=(max_len,)) # Input layer
model = Embedding(input_dim=len(words), output_dim=50, input_length=max_len)(input) # Word embedding layer
model = Dropout(0.1)(model) # Dropout
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model) # Bi-directional LSTM layer
out = TimeDistributed(Dense(len(tags), activation="softmax"))(model)  # softmax output layer

In [0]:
model = Model(input, out) # Complete model

In [0]:
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]) # Compile with an optimizer

In [0]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=3, validation_split=0.1, verbose=1) # Train

Train on 34530 samples, validate on 3837 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [0]:
# Demo test on one sample. See how it is mostly correct, but not 100%

i = 1213 # Some test sentence sample
p = model.predict(np.array([X_te[i]])) # Predict on it
p = np.argmax(p, axis=-1) # Map softmax back to a POS index
for w, pred in zip(X_te[i], p[0]): # for every word in the sentence
    print("{:20} -- {}".format(words[w], tags[pred])) # Print word and tag

The                  -- DT
Gilbert              -- NNP
Islands              -- NNP
were                 -- VBD
granted              -- VBN
self-rule            -- NN
by                   -- IN
the                  -- DT
UK                   -- NNP
in                   -- IN
1971                 -- CD
and                  -- CC
complete             -- JJ
independence         -- NN
in                   -- IN
1979                 -- CD
under                -- IN
the                  -- DT
new                  -- JJ
name                 -- NN
of                   -- IN
Kiribati             -- NNP
.                    -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY     

In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
from nltk import word_tokenize

sentence = nltk.word_tokenize('That was a nice jump')
X_Samp = pad_sequences(maxlen=max_len, sequences=[[word2id[word] for word in sentence]], padding="post", value=len(words)-1)

In [0]:
p = model.predict(np.array([X_Samp[0]])) # Predict on it
p = np.argmax(p, axis=-1) # Map softmax back to a POS index
for w, pred in zip(X_Samp[0], p[0]): # for every word in the sentence
    print("{:20} -- {}".format(words[w], tags[pred])) # Print word and tag

That                 -- DT
was                  -- VBD
a                    -- DT
nice                 -- JJ
jump                 -- NN
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY                -- .
DUMMY 