# Data Source

In [None]:
# Absolute paths to the two CSV files on the author's machine: data_input is
# loaded as the training dataframe and test_input as the test dataframe.
# Edit both to point at local copies before running the notebook.
data_input = '/home/tanishq/Desktop/DeepWork/AI/Data/dataset.csv'
test_input = '/home/tanishq/Desktop/DeepWork/AI/Data/test_dataset.csv'

# `Imports`

In [None]:
# Dataframe manipulations
import pandas as pd
# Array manipulations
import numpy as np
# Plotting and Graphs
import matplotlib.pyplot as plt

# Split the data into train and test data
from sklearn.model_selection import train_test_split, cross_val_predict, cross_validate
# Final report card
from sklearn.metrics import classification_report
# The base class of all estimators, used for Memory Tagging.
# Gets get_params and set_params
from sklearn.base import BaseEstimator

# Library used for deep learning
import tensorflow as tf
# Not all sentences are of same length, padding is required
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Converts the target labels to categories that the neural net can predict
from tensorflow.keras.utils import to_categorical
# Type of model used for DL
from tensorflow.keras.models import Model, Sequential
# Layers present in the network. 
# Refer README.md for more details.
# Analysis of different combinations of layers in README.md
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

# Apply matplotlib's "ggplot" style sheet to the figures drawn below
plt.style.use("ggplot")
# IPython magic (not plain Python): render matplotlib figures inline in the notebook
%matplotlib inline

# `Data Exploration`

Data: Twitter data for the FIFA 2018 World Cup.

(More) Testing Data: An extract from the GMB (Groningen Meaning Bank) corpus. Only the relevant NE tags were kept, and they were modified to parallel the Twitter data.

## Input

In [None]:
# Load the training CSV and the test CSV (in that order) into dataframes.
df, df_test = (pd.read_csv(csv_path) for csv_path in (data_input, test_input))

In [None]:
# Display the last rows of the training dataframe
df.tail()

In [None]:
# Display the last rows of the test dataframe
df_test.tail()

## Details For Training Data

In [None]:
# Display pandas' summary statistics for the training dataframe
df.describe()

In [None]:
# Training-data summary: first the number of distinct values per column ...
print(*(
    template.format(len(pd.unique(df[column].values)))
    for template, column in (
        ("Number of Sentences: {}", "Sentence ID"),
        ("Number of Words: {}", "Word"),
        ("Number of POS Tags: {}", "POS Tag"),
        ("Number of NE Tags: {}", "NER Tag"),
    )
), sep="\n")
print()
print()
# ... then the distinct POS tags and NER tags themselves.
print("List of Tags:", pd.unique(df["POS Tag"].values), sep="\n")
print()
print("List of Entity Tags:", pd.unique(df["NER Tag"].values), sep="\n")

In [None]:
# Test-data summary: first the number of distinct values per column ...
print(*(
    template.format(len(pd.unique(df_test[column].values)))
    for template, column in (
        ("Number of Sentences: {}", "Sentence ID"),
        ("Number of Words: {}", "Word"),
        ("Number of POS Tags: {}", "POS Tag"),
        ("Number of NE Tags: {}", "NER Tag"),
    )
), sep="\n")
print()
print()
# ... then the distinct POS tags and NER tags themselves.
print("List of Tags:", pd.unique(df_test["POS Tag"].values), sep="\n")
print()
print("List of Entity Tags:", pd.unique(df_test["NER Tag"].values), sep="\n")

In [None]:
# Display pandas' summary statistics for the test dataframe
df_test.describe()

## Generating Sentences

Forming sentences from word and Sentence ID data

In [None]:
def aggregate_function(s):
    """
    Description: Turns the rows of one sentence into a list of
    (word, POS tag, NER tag) tuples, keeping the row order.

    Returns: list of 3-tuples, one per row of s

    Input Params: s, a dataframe (one "Sentence ID" group) with the
    columns 'Word', 'POS Tag' and 'NER Tag'.
    """
    columns = (s['Word'].values, s['POS Tag'].values, s['NER Tag'].values)
    # zip(*columns) walks the three columns in lockstep, one tuple per row
    return list(zip(*columns))

In [None]:
# Group the rows by sentence and collapse each group into a list of
# (word, POS tag, NER tag) tuples: training data first, then test data.
sentences, sentences_test = (
    frame.groupby("Sentence ID").apply(aggregate_function)
    for frame in (df, df_test)
)

## Graphs

Shows the distributions of sentence lengths for the Twitter corpus and the GMB corpus. Interestingly, even though tweets were limited to 140 characters, the longest sentence in our tweet corpus has 62 words (tokens). The longest sentence in the GMB corpus extract has 104 words (tokens).  
Moreover, the sentence lengths in the GMB extract follow a roughly normal distribution.

In [None]:
# Length (in tokens) of the longest training sentence
max_length = max(map(len, sentences))

In [None]:
# Histogram of training sentence lengths (tokens per sentence)
plt.hist(list(map(len, sentences)), bins=50)
plt.show()
print("Maximum Length: ", max_length)

In [None]:
# Histogram of test sentence lengths (tokens per sentence)
plt.hist(list(map(len, sentences_test)), bins=50)
plt.show()

# Length (in tokens) of the longest test sentence
max_length_test = max(map(len, sentences_test))
print("Maximum Length: ", max_length_test)

## Preparing Words and Tags for Embeddings

Since not all sentences are of equal length, we add extra padding 'words' so that every sentence has the same input length.  
E.g.: "The sky is blue." becomes "The sky is blue. `__PAD__` `__PAD__` `__PAD__` ..."

The neural network does not understand words; it only understands numbers. So we assign a number to each word arbitrarily, and we do the same for the NE tags.

In [None]:
# Vocabulary of the training data: every distinct word once, plus the padding token.
# dict.fromkeys de-duplicates like set() but keeps first-appearance order, so the
# word -> id assignment is identical on every run (set order of strings changes
# between interpreter sessions because of hash randomization).
words = list(dict.fromkeys(df["Word"].values))
words.append("__PAD__")
n_words = len(words)
print(n_words)

In [None]:
# Vocabulary of the test data: every distinct word once, plus the padding token.
# dict.fromkeys de-duplicates like set() but keeps first-appearance order, so the
# list is identical on every run (set order of strings changes between
# interpreter sessions because of hash randomization).
words_test = list(dict.fromkeys(df_test["Word"].values))
words_test.append("__PAD__")
n_words_test = len(words_test)
print(n_words_test)

In [None]:
# Distinct NER tags of the training data, in order of first appearance.
# dict.fromkeys de-duplicates like set() but with a stable order, so each tag
# gets the same id (and the same one-hot column) on every run.
tags = list(dict.fromkeys(df["NER Tag"].values))
n_tags = len(tags)
print(n_tags)

In [None]:
# Lookup tables: each word / tag maps to its position in the words / tags list.
word2id = dict(zip(words, range(len(words))))
tag2id = dict(zip(tags, range(len(tags))))

# `Training & Testing Data`

## Getting Sentences

In [None]:
# Split every training sentence (a list of (word, POS tag, NER tag) tuples) into
# a list of its words and a parallel list of its NER tags, keeping at most
# max_length tokens per sentence.
# The words are not yet converted to their respective ids.
# The sentences are not yet padded to a common length.
X_train_sent = [[token[0] for token in sent[:max_length]] for sent in sentences]
y_train_sent = [[token[2] for token in sent[:max_length]] for sent in sentences]

In [None]:
# Same split for the test sentences: words and NER tags as parallel lists,
# cut to the training maximum of max_length tokens per sentence.
X_test_sent = [[token[0] for token in sent[:max_length]] for sent in sentences_test]
y_test_sent = [[token[2] for token in sent[:max_length]] for sent in sentences_test]

## Generating Encodings

In [None]:
def _encode_token(mapping, key, fallback_key):
    """Return mapping[key], or mapping[fallback_key] when key is not in mapping."""
    # The fallback is only looked up when needed, so a missing fallback key
    # raises KeyError only for inputs that actually require it.
    return mapping[key] if key in mapping else mapping[fallback_key]


def generate_encodings(X_sent, y_sent, word_index=None, tag_index=None):
    """
    Description: Converts the list of sentences containing words to a list of sentences containing just numbers,
    and does the same for the parallel lists of target labels.
    If the word is present in the vocabulary, it is assigned the corresponding id;
    if the word is NOT in the vocabulary, it is assigned the id of '__PAD__'.
    If the label is not a known tag, it is assigned the id of 'O'.

    Returns: Two tuple of encoded sentences, encoded target labels

    Input Params: Two tuple of sentences and target labels.
    word_index: optional word -> id dict; defaults to the global word2id.
    tag_index: optional tag -> id dict; defaults to the global tag2id.

    Raises: KeyError if a fallback is needed but '__PAD__' / 'O' is missing
    from the corresponding mapping.
    """
    # Resolve the defaults at call time so the notebook-level mappings are used
    # unless the caller supplies its own.
    if word_index is None:
        word_index = word2id
    if tag_index is None:
        tag_index = tag2id

    X_encoded = []
    y_encoded = []

    for x_s, y_s in zip(X_sent, y_sent):
        # zip stops at the shorter of the two lists, so words and labels stay aligned
        pairs = list(zip(x_s, y_s))
        X_encoded.append([_encode_token(word_index, x, '__PAD__') for x, _ in pairs])
        y_encoded.append([_encode_token(tag_index, y, 'O') for _, y in pairs])

    return (X_encoded, y_encoded)

In [None]:
# Encode the training sentences and their tag sequences as integer ids
X_train, y_train = generate_encodings(X_train_sent, y_train_sent)

In [None]:
# Encode the test sentences and their tag sequences with the same id mappings
X_test, y_test = generate_encodings(X_test_sent, y_test_sent)

## Padding

In [None]:
# Bring every training sentence and its label sequence to exactly max_length entries.
# padding='post' puts the filler after the real tokens: the '__PAD__' id for words
# and the 'O' id for labels.
X_train = pad_sequences(X_train, maxlen=max_length, padding='post', value=word2id['__PAD__'])
y_train = pad_sequences(y_train, maxlen=max_length, padding='post', value=tag2id['O'])

In [None]:
# Same for the test data: pad to the training max_length with the '__PAD__' id
# for words and the 'O' id for labels, filler placed after the real tokens.
X_test = pad_sequences(X_test, maxlen=max_length, padding='post', value=word2id['__PAD__'])
y_test = pad_sequences(y_test, maxlen=max_length, padding='post', value=tag2id['O'])

## Convert y to categories

Since the target labels in y are categories, we convert them to one-hot vectors. This is the format required by the loss function, categorical cross-entropy.

In [None]:
# One-hot encode the labels of every training sentence and stack the results into
# ONE array with a leading per-sentence axis. A plain Python list of per-sentence
# arrays would be read by Model.fit as a list of separate targets, not as a batch.
y_train = np.array([to_categorical(i, n_tags) for i in y_train])

In [None]:
# One-hot encode the labels of every test sentence and stack the results into
# ONE array with a leading per-sentence axis. A plain Python list of per-sentence
# arrays would be read by Keras as a list of separate targets, not as a batch.
y_test = np.array([to_categorical(i, n_tags) for i in y_test])

# `Memory Tagging`

In [None]:
class MemoryTagger(BaseEstimator):
    """
    Baseline tagger that memorizes, for every word, the tag it was seen with most often.

    Attributes set by fit():
    tags: list of the distinct tags seen, in order of first appearance
    memory: dict mapping each seen word to its most frequent tag
    """

    def fit(self, X, y):
        """
        Memorizes the most popular tag for each word

        Input Parameters:
        X: a list of words
        eg: ["The", "sky", "is", "blue", "."]

        y: a list of Entity Tags, aligned one-to-one with X
        eg: ["O", "B-nat", "O", "O", "O"]

        Return: self (the fitted tagger), following the scikit-learn
        estimator convention so calls can be chained.
        """
        # word -> {tag: number of times the word carried that tag}
        tag_counts = {}
        # dict used as an insertion-ordered set of the distinct tags
        seen_tags = {}
        for word, tag in zip(X, y):
            seen_tags[tag] = None
            per_word = tag_counts.setdefault(word, {})
            per_word[tag] = per_word.get(tag, 0) + 1

        self.tags = list(seen_tags)

        # Finding the most popular tag for each word; on a tie, the tag that
        # was seen first for that word wins (max keeps the first maximum).
        self.memory = {
            word: max(counts, key=counts.get)
            for word, counts in tag_counts.items()
        }
        return self

    def predict(self, X, y=None):
        """
        Predict the tag of each word in X from memory.
        If the word is unknown, defaults to prediction: 'O'.

        Input Parameters:
        X: a list of words
        y: ignored; accepted only for signature compatibility

        Return: a list of tags, one per word in X
        """
        return [self.memory.get(x, 'O') for x in X]

In [None]:
# Instantiate the word -> most-frequent-tag baseline
tagger = MemoryTagger()

In [None]:
# The memory tagger works on single words, not sentences: use the flat
# word and NER-tag columns of the training dataframe.
X_memory_train = list(df["Word"].values)
y_memory_train = list(df["NER Tag"].values)

In [None]:
# Flat word and NER-tag columns of the test dataframe, used to evaluate the memory tagger
X_memory_test = list(df_test["Word"].values)
y_memory_test = list(df_test["NER Tag"].values)

In [None]:
# Memorize the most frequent tag of every training word
tagger.fit(X_memory_train, y_memory_train)

In [None]:
# Tag every test word from memory; words never seen in training are tagged 'O'
predictions = tagger.predict(X_memory_test)

In [None]:
# Classification report comparing the memory tagger's predictions with the true test tags
report = classification_report(y_pred=predictions, y_true=y_memory_test)
print(report)

# Deep Learning

In [None]:
# Sequence-labelling network, assembled layer by layer:
# embedding -> dropout -> bidirectional LSTM -> per-token softmax over the tags.
model = Sequential()
# Each word id becomes a 64-dimensional vector; inputs are max_length ids long.
model.add(Embedding(input_dim=n_words, output_dim=64, input_length=max_length))
model.add(Dropout(0.1))
# return_sequences=True keeps one LSTM output per token instead of only the last one,
# which the per-token classifier below needs.
model.add(Bidirectional(LSTM(
    128,
    activation='tanh',
    recurrent_activation='sigmoid',
    use_bias=True,
    return_sequences=True,
)))
# The same Dense softmax layer is applied at every time step, producing n_tags scores per token.
model.add(TimeDistributed(Dense(n_tags, activation='softmax')))

In [None]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss
# (targets are one-hot encoded), accuracy reported as the metric.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the model
model.summary()

In [None]:
# Train for 3 epochs with batches of 64 sentences.
# NOTE: the test set is passed as validation data, so the validation metrics
# printed per epoch are measured on the test set.
history = model.fit(X_train, y_train, batch_size=64, epochs=3, verbose=1, validation_data=(X_test, y_test))