<a href="https://colab.research.google.com/github/SmartyPants042/NER-Deep-Learning/blob/master/NER_pure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Source

In [0]:
%%shell

# Stop at the first failing command instead of carrying on in the wrong directory.
set -e

# Clone only when the repository is not already present, so the cell can be re-run.
if [ ! -d NER-Deep-Learning ]; then
    git clone https://github.com/SmartyPants042/NER-Deep-Learning.git
fi
pwd
cd NER-Deep-Learning/Data/
# -o overwrites already-extracted files without prompting; on a re-run a plain
# `unzip` would stop and wait for interactive input.
unzip -o dataset.zip
unzip -o test_dataset.zip

In [0]:
# Locations of the training and test CSV files inside the cloned repository's Data/ folder.
# NOTE(review): the /content prefix assumes the clone cell above ran from /content
# (the usual Colab working directory) and that the archives extract to these
# file names — confirm when running elsewhere.
data_input = '/content/NER-Deep-Learning/Data/dataset.csv'
test_input = '/content/NER-Deep-Learning/Data/test_dataset.csv'

In [0]:
# Dataframe manipulations
import pandas as pd
# Array manipulations
import numpy as np

# Library used for deep learning
import tensorflow as tf
# Not all sentences are of same length, padding is required
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Converts the target labels to categories that the neural net can predict
from tensorflow.keras.utils import to_categorical
# Type of model used for DL
from tensorflow.keras.models import Sequential
# Layers present in the network. 
# Refer README.md for more details.
# Analysis of different combinations of layers in README.md
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

# `Data Manipulation`

In [0]:
# Load the training table and the test table from their CSV files.
# Later cells read the "Sentence ID", "Word", "POS Tag" and "NER Tag" columns.
df = pd.read_csv(data_input)
df_test = pd.read_csv(test_input)

In [0]:
def aggregate_function(s):
    """
    Description: Collapses the token rows of one sentence into a list of
    (word, POS tag, NER tag) tuples, kept in row order.

    Returns: List of 3-tuples, one per row of `s`.

    Input Params: `s`, a table exposing 'Word', 'POS Tag' and 'NER Tag'
    columns (for example one group of a DataFrame groupby).
    """
    word_col = s['Word'].values
    pos_col = s['POS Tag'].values
    ner_col = s['NER Tag'].values
    return list(zip(word_col, pos_col, ner_col))

In [0]:
# Group the token rows by "Sentence ID" and collapse each group into its list of
# (word, POS tag, NER tag) tuples via aggregate_function.
# NOTE(review): the result is expected to be a pandas Series of lists, one entry
# per sentence id — confirm against the installed pandas version.
sentences = df.groupby("Sentence ID").apply(aggregate_function)
sentences_test = df_test.groupby("Sentence ID").apply(aggregate_function)

In [0]:
# Token count of the longest training sentence; later cells use it as the
# fixed sequence length for truncation and padding.
max_length = max(map(len, sentences))

In [0]:
# Vocabulary of distinct training words, in order of first appearance.
# Series.unique() is used instead of set() because the iteration order of a set
# of strings changes between interpreter runs (hash randomisation), which would
# hand every word a different id after each kernel restart.
words = list(df["Word"].unique())
# Extra token used as the padding value and as the fallback for words that are
# not in the vocabulary.
words.append("__PAD__")
n_words = len(words)
print(n_words)

In [0]:
# Distinct NER tags seen in the training data, in order of first appearance.
# Series.unique() keeps the tag -> id assignment stable across kernel restarts;
# iterating a set of strings gives a different order from run to run.
tags = list(df["NER Tag"].unique())
n_tags = len(tags)
print(n_tags)

In [0]:
# Lookup tables from each word / NER tag to its position in `words` / `tags`.
word2id = dict(zip(words, range(len(words))))
tag2id = dict(zip(tags, range(len(tags))))

# `Training & Testing`

In [0]:
# Split every sentence into a list of words (tuple entry 0) and a parallel list
# of NER tags (tuple entry 2), keeping at most `max_length` tokens per sentence.
# The words are still raw strings here: they have not been converted to ids yet,
# and the sentences have not been padded to a common length yet.
def _field_per_sentence(sent_list, field):
    """Return, for each sentence, entry `field` of its first `max_length` token tuples."""
    return [[token[field] for token in sent[:max_length]] for sent in sent_list]

X_train_sent = _field_per_sentence(sentences, 0)
y_train_sent = _field_per_sentence(sentences, 2)
X_test_sent = _field_per_sentence(sentences_test, 0)
y_test_sent = _field_per_sentence(sentences_test, 2)

In [0]:
def generate_encodings(X_sent, y_sent, word_index=None, tag_index=None):
    """
    Description: Converts sentences of words, and their parallel tag sequences,
    into sequences of integer ids.
    A word found in the word lookup is assigned its id; any other word (for
    example one that only occurs in the test data) is assigned the id of
    '__PAD__'. A tag missing from the tag lookup is assigned the id of 'O'.

    Returns: Two tuple of encoded sentences, encoded target labels. Each is a
    2-D array when every sentence has the same length, otherwise a 1-D object
    array holding one integer array per sentence.

    Input Params: Two tuple of sentences and target labels, plus optional
    word -> id and tag -> id dicts. When a dict is omitted, the module-level
    `word2id` / `tag2id` is used.
    """
    if word_index is None:
        word_index = word2id
    if tag_index is None:
        tag_index = tag2id

    encoded_x = []
    encoded_y = []

    for x_s, y_s in zip(X_sent, y_sent):

        temp_x = []
        temp_y = []

        for x, y in zip(x_s, y_s):
            # Explicit membership tests replace the former bare `except:` blocks,
            # so only a missing key triggers the fallback id. The fallback entry
            # is still looked up lazily, exactly when it is needed.
            temp_x.append(word_index[x] if x in word_index else word_index['__PAD__'])
            temp_y.append(tag_index[y] if y in tag_index else tag_index['O'])

        encoded_x.append(np.array(temp_x))
        encoded_y.append(np.array(temp_y))

    return (_pack_sentences(encoded_x), _pack_sentences(encoded_y))


def _pack_sentences(rows):
    """Pack per-sentence arrays: a regular array when lengths match, else a 1-D object array."""
    if len({len(row) for row in rows}) <= 1:
        return np.array(rows)
    # Sentences differ in length. Calling np.array() on such a ragged list raises
    # ValueError on recent NumPy releases, so the object array is filled by hand.
    packed = np.empty(len(rows), dtype=object)
    for position, row in enumerate(rows):
        packed[position] = row
    return packed

In [0]:
# Encode the word and tag strings of both splits as integer ids.
# The same lookups are used for the test split, so test tokens that are missing
# from them receive the fallback ids chosen inside generate_encodings.
X_train, y_train = generate_encodings(X_train_sent, y_train_sent)
X_test, y_test = generate_encodings(X_test_sent, y_test_sent)

In [0]:
# Bring every sentence and every label sequence to the same length, `max_length`.
# Words are padded with the '__PAD__' id and labels with the 'O' id, appended
# after the real tokens.
def _pad_post(sequences, fill_value):
    """Pad `sequences` at the end with `fill_value` up to `max_length` entries."""
    return pad_sequences(sequences, maxlen=max_length, padding='post', value=fill_value)

X_train = _pad_post(X_train, word2id['__PAD__'])
y_train = _pad_post(y_train, tag2id['O'])
X_test = _pad_post(X_test, word2id['__PAD__'])
y_test = _pad_post(y_test, tag2id['O'])

In [0]:
# One-hot encode the labels: every sequence of tag ids becomes a matrix with one
# n_tags-wide one-hot row per token, the target format for the softmax output.
y_train = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y_train]
y_test = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y_test]

In [0]:
# Sequence tagger: word ids -> embeddings -> dropout -> BiLSTM -> per-token softmax.
model = Sequential()
# One trainable 64-dimensional vector per vocabulary entry.
model.add(Embedding(input_dim=n_words, output_dim=64))
model.add(Dropout(0.1))
# return_sequences=True keeps an output for every token, which is needed to
# predict one tag per token rather than one label per sentence.
model.add(Bidirectional(LSTM(
    128,
    activation='tanh',
    return_sequences=True,
    recurrent_activation='sigmoid',
    use_bias=True,
)))
# The same softmax layer is applied at every time step, producing n_tags
# probabilities for each token.
model.add(TimeDistributed(Dense(
    n_tags,
    activation='softmax',
)))

In [0]:
# Categorical cross-entropy pairs with the one-hot encoded tag targets;
# accuracy is reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Train for 3 epochs in batches of 64. The test split is passed as validation
# data, so the validation metrics printed each epoch are computed on it.
model.fit(
    X_train,
    np.array(y_train),
    batch_size=64,
    epochs=3,
    verbose=1,
    validation_data=(X_test, np.array(y_test)),
)