In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda
from sklearn.metrics import precision_recall_fscore_support, classification_report
import random
random.seed(10)
tf.__version__

Training Set

In [None]:
# Load the token-level training annotations and forward-fill blank cells from
# the row above. DataFrame.ffill() is used instead of fillna(method="ffill"),
# whose `method` argument is deprecated in recent pandas releases.
data = pd.read_csv("trainingSet.csv", encoding="latin1")
data = data.ffill()
data.tail(10)

In [None]:
# Distinct tag labels of the training set. They are sorted so that the
# tag -> id mapping built from this list is the same on every kernel restart
# (plain set iteration order of strings is not stable between runs).
# key=str keeps the sort safe if a non-string value slipped into the column.
tags = sorted(set(data["TAG"].values), key=str)
tagCount = len(tags)
tagCount

In [None]:
# Display the list of distinct tag labels.
tags

In [None]:
class getSentence(object):
    """Group a token-level DataFrame into sentences of (word, tag) pairs.

    The frame must provide the columns "sent_id", "Word" and "TAG".

    Attributes:
        n_sent: 1-based id of the sentence that get_next() returns next.
        data: the DataFrame passed to the constructor.
        grouped: Series indexed by sent_id; each value is the list of
            (word, tag) tuples of that sentence.
        sentences: the values of `grouped` as a plain list.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair every word of one sentence with its tag.
            return list(zip(s["Word"].values.tolist(), s["TAG"].values.tolist()))

        self.grouped = self.data.groupby("sent_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or None when that sentence id is absent.

        The id is looked up as a string first (the original behaviour) and
        then as a number, so frames whose sent_id column was parsed as
        numeric also work. Only a missing id yields None; other errors are
        no longer swallowed.
        """
        for key in ("{}".format(self.n_sent), self.n_sent):
            if key in self.grouped.index:
                s = self.grouped.loc[key]
                self.n_sent += 1
                return s
        return None

In [None]:
# Group the training tokens into sentences.
getSen = getSentence(data)

In [None]:
# One list of (word, tag) tuples per training sentence.
sentences = getSen.sentences

In [None]:
# Fixed number of tokens per sentence used for padding/truncation below.
max_len = 50
# Map each tag label to its position in `tags`.
tag2id = {t: i for i, t in enumerate(tags)}

In [None]:
# Keep only the word of every (word, tag) pair, sentence by sentence.
X_tr = [[word for word, _tag in sent] for sent in sentences]

In [None]:
# Bring every training sentence to exactly max_len tokens: longer sentences are
# truncated, shorter ones are right-padded with the "__PAD__" token.
# A slice plus explicit padding replaces the former bare `except`, which used
# exceptions as control flow and would have hidden unrelated errors.
X_tr = [seq[:max_len] + ["__PAD__"] * (max_len - len(seq)) for seq in X_tr]

In [None]:
# Translate the tag of every (word, tag) pair into its integer id.
y_tr = [[tag2id[tag] for _word, tag in sent] for sent in sentences]

In [None]:
# Pad/truncate the label sequences at the end to max_len; padded positions
# receive the id of the "O" tag.
y_tr = pad_sequences(maxlen=max_len, sequences=y_tr, padding="post", value=tag2id["O"], truncating = 'post')

Validation Set

In [None]:
# Load the token-level validation annotations and forward-fill blank cells
# from the row above. DataFrame.ffill() is used instead of
# fillna(method="ffill"), whose `method` argument is deprecated in recent
# pandas releases. Note that `data` now refers to the validation frame.
data = pd.read_csv("validationSet.csv", encoding="latin1")
data = data.ffill()
data.tail(10)

In [None]:
# Group the validation tokens into sentences; `sentences` now holds the
# validation sentences as lists of (word, tag) tuples.
getSen = getSentence(data)
sentences = getSen.sentences

In [None]:
# Keep only the word of every (word, tag) pair, sentence by sentence.
X_te = [[word for word, _tag in sent] for sent in sentences]

In [None]:
# Bring every validation sentence to exactly max_len tokens: longer sentences
# are truncated, shorter ones are right-padded with the "__PAD__" token.
# A slice plus explicit padding replaces the former bare `except`, which used
# exceptions as control flow and would have hidden unrelated errors.
X_te = [seq[:max_len] + ["__PAD__"] * (max_len - len(seq)) for seq in X_te]

In [None]:
# Translate the tag of every (word, tag) pair into its integer id.
# A tag missing from tag2id raises KeyError here.
y_te = [[tag2id[tag] for _word, tag in sent] for sent in sentences]

In [None]:
# Pad/truncate the label sequences at the end to max_len; padded positions
# receive the id of the "O" tag.
y_te = pad_sequences(maxlen=max_len, sequences=y_te, padding="post", value=tag2id["O"], truncating = 'post')

In [None]:
# Batch size shared by the ELMo wrapper, training and prediction.
batch_size = 32

In [None]:
# Create a TensorFlow session and register it as the Keras backend session.
sess = tf.Session()
K.set_session(sess)

ELMo Embedding

In [None]:
# Load the ELMo module from TF Hub with trainable=True, then run the global
# variable and lookup-table initializers in the session.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [None]:
def ElmoEmbedding(x):
    """Return the "elmo" output of the hub module for a batch of token rows.

    Args:
        x: tensor of tokens, one row per sentence, cast to tf.string here.
            assumes shape (batch, max_len) as declared by the Input layer —
            TODO confirm if this function is reused elsewhere.

    Returns:
        The "elmo" entry of the module's output dictionary.

    Every row is reported to the module as max_len tokens long. The length
    vector is sized from the batch actually received rather than from the
    global batch_size, so partial batches work as well. The former axis-less
    tf.squeeze was dropped because it would also collapse a batch of one.
    """
    tokens = tf.cast(x, tf.string)
    # One length entry per row present in this batch.
    sequence_len = tf.fill(tf.shape(tokens)[:1], max_len)
    return elmo_model(inputs={
                            "tokens": tokens,
                            "sequence_len": sequence_len
                      },
                      signature="tokens",
                      as_dict=True)["elmo"]

Building deep learning neural network

In [None]:
# Network input: max_len string tokens per sentence.
input_text = Input(shape=(max_len,), dtype=tf.string)
# ELMo lookup wrapped in a Lambda layer; the declared output is 1024 features
# per token.
embedding = Lambda(ElmoEmbedding, output_shape=(None, 1024))(input_text)
# Two stacked bidirectional LSTMs (512 units per direction) that both return
# the full sequence, with input and recurrent dropout of 0.2.
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding)
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
# Per-token softmax over the tagCount tag classes.
out = TimeDistributed(Dense(tagCount, activation="softmax"))(x)

In [None]:
# Assemble the Keras model from the token input to the per-token softmax.
model = Model(input_text, out)

In [None]:
# Integer tag ids are used as targets, hence the sparse cross-entropy loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [None]:
# model.summary()

In [None]:
# Cut both sets to a whole number of batches: the first 944 batches of the
# training data and the last 103 batches of the validation data.
n_train = 944 * batch_size
n_val = 103 * batch_size
X_tr, y_tr = X_tr[:n_train], y_tr[:n_train]
X_val, y_val = X_te[-n_val:], y_te[-n_val:]
# Append a trailing axis of size 1 to the label arrays.
y_tr = y_tr.reshape(y_tr.shape[:2] + (1,))
y_val = y_val.reshape(y_val.shape[:2] + (1,))

In [None]:
# Train for 5 epochs, evaluating on the validation slice after each epoch.
history = model.fit(np.array(X_tr), y_tr, validation_data=(np.array(X_val), y_val),
                    batch_size=batch_size, epochs=5, verbose=1)

In [None]:
# Per-epoch metrics recorded by fit(), one column per metric.
hist = pd.DataFrame(history.history)

In [None]:
# Training vs. validation accuracy per epoch. The curves carry labels so that
# plt.legend() has entries to show, and the axes are named so the figure can
# be read on its own.
# NOTE(review): some Keras releases name these columns "acc"/"val_acc" —
# confirm against hist.columns.
plt.figure(figsize=(12,12))
plt.plot(hist["accuracy"], label="Training accuracy")
plt.plot(hist["val_accuracy"], label="Validation accuracy")
plt.title("Learning curves")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

Test Set

In [None]:
# Load the token-level test annotations and forward-fill blank cells from the
# row above. DataFrame.ffill() is used instead of fillna(method="ffill"),
# whose `method` argument is deprecated in recent pandas releases.
# Note that `data` now refers to the test frame.
data = pd.read_csv("testSet.csv", encoding="latin1")
data = data.ffill()
data.tail(10)

In [None]:
# Group the test tokens into sentences; `sentences` now holds the test
# sentences as lists of (word, tag) tuples.
getSen = getSentence(data)
sentences = getSen.sentences

In [None]:
# Keep only the word of every (word, tag) pair, sentence by sentence.
X_test = [[word for word, _tag in sent] for sent in sentences]

In [None]:
# Bring every test sentence to exactly max_len tokens: longer sentences are
# truncated, shorter ones are right-padded with the "__PAD__" token.
# A slice plus explicit padding replaces the former bare `except`, which used
# exceptions as control flow and would have hidden unrelated errors.
X_test = [seq[:max_len] + ["__PAD__"] * (max_len - len(seq)) for seq in X_test]

In [None]:
# Translate the tag of every (word, tag) pair into its integer id.
# A tag missing from tag2id raises KeyError here.
y_test = [[tag2id[tag] for _word, tag in sent] for sent in sentences]

In [None]:
# Pad/truncate the label sequences at the end to max_len; padded positions
# receive the id of the "O" tag.
y_test = pad_sequences(maxlen=max_len, sequences=y_test, padding="post", value=tag2id["O"], truncating = 'post')

In [None]:
# Dump the test predictions to "test_output_wSentid": a header line, then for
# every sentence one line per non-padding token (sentence id, word, true tag,
# predicted tag, separated by "@#armo"), each sentence closed by a line of 30
# "=" characters.
#
# Changes compared with the previous version of this cell:
#  * the file is opened with "w" inside a `with` block, so a re-run replaces
#    the file instead of appending a second header and duplicate rows (which
#    the parsing cells would count as tokens), and the handle is always closed;
#  * each batch is predicted once and all of its rows are used, instead of
#    predicting batch_size rows per sentence and keeping only the first one;
#  * the sentence limit is bounded by the number of test sentences available.
n_sentences = min(3022, len(X_test))
header = "{:6}{:30}@#armo{:15}@#armo({})".format("sent_id","Word", "True", "Pred")
separator = "="*30

print(header)
print(separator)
with open("test_output_wSentid", "w") as fileToWrite:
    fileToWrite.write(header)
    fileToWrite.write("\n")
    fileToWrite.write(separator)
    fileToWrite.write("\n")

    for start in range(0, n_sentences, batch_size):
        stop = min(start + batch_size, n_sentences)
        batch = X_test[start:stop]
        # Always feed exactly batch_size rows; filler rows of padding tokens
        # are predicted but never written out.
        batch = batch + [["__PAD__"] * max_len] * (batch_size - len(batch))
        batch_pred = np.argmax(model.predict(np.array(batch), batch_size=batch_size), axis=-1)

        for i in range(start, stop):
            for w, true, pred in zip(X_test[i], y_test[i], batch_pred[i - start]):
                if w != "__PAD__":
                    fileToWrite.write("{:6} {:30}@#armo{:15}@#armo({})".format(i+1, w, tags[true], tags[pred]))
                    fileToWrite.write("\n")
            fileToWrite.write(separator)
            fileToWrite.write("\n")

In [None]:
!head -n 30 test_output_wSentid

In [None]:
# Re-open the prediction dump for reading.
fileRead = open("test_output_wSentid", "r")

In [None]:
# Read every line of the prediction dump, then release the file handle,
# which was previously left open for the rest of the session.
f2 = fileRead.readlines()
fileRead.close()

In [None]:
# Number of lines in the prediction dump (header and separators included).
len(f2)

In [None]:
# Token-level tp/fp/tn/fn counts plus sentence-level exact-match statistics,
# parsed from the prediction dump in f2. Line 0 is the header; a line of 30
# "=" characters closes every sentence (and follows the header).
#
# Fix: `allZero` is now reset at every sentence boundary. Previously it was
# only reset after an exactly matched sentence, so a sentence containing
# tagged tokens AND prediction errors left it False, and the following
# all-"O" sentences were reported as attribution sentences.

#Actually source, cue or content and predicted as the same
tp = 0
#Actually O but, predicted as source, cue or content
fp = 0
#Actually O and predicted as the same
tn = 0
#Actually source, cue or content but, predicted as O
fn = 0
tpAR = 0
tpARList = []
# fp + fn as of the previous sentence boundary
current_fp_fn =0
# The counters below start at -1 because the separator right after the header
# is processed like a sentence boundary and bumps each of them once.
exact_sent_cnt =-1
senCount = -1
arSenIds = []
# True while the current sentence has shown only "O" as the true tag
allZero = True
allZeroSenID = []
attriSenID = []
allZeroMatched = -1
total = len(f2)
for index in range(1, total):
    i = f2[index]
    if (i == "==============================\n"):
        senCount += 1
        new_fp_fn_cnt = fp + fn
        # No new fp/fn since the previous boundary: every token of this
        # sentence was predicted correctly.
        if (new_fp_fn_cnt == current_fp_fn):
            if allZero == True:
                allZeroMatched += 1
                if senCount>0:
                    allZeroSenID.append(senCount)
            else:
                if senCount>0:
                    attriSenID.append(senCount)
            exact_sent_cnt+=1
            if senCount>0:
                arSenIds.append(senCount)
        current_fp_fn = new_fp_fn_cnt
        # Start the next sentence with a clean slate.
        allZero = True
        continue
    tok = i.split('@#armo')
    try:
        actual = tok[1].strip()
        # Drop the leading "(" and the trailing ")\n" around the prediction.
        predicted = tok[2][1:-2].strip()
        if actual == 'O':
            if actual == predicted:
                tn += 1
            else:
                fp += 1
        else:
            allZero = False
            if actual == predicted:
                tp += 1
            else:
                fn += 1
    except IndexError:
        # Malformed line: show it and stop parsing.
        print(tok)
        break
print("Total sentences= "+str(senCount))
print("Exact words match sentences ="+str(exact_sent_cnt))
print("Exact matching sentences Id list")
print(arSenIds[0:20])
print ("Number of sentences with all O = "+str(allZeroMatched))
print("All O sentence IDs")
print(allZeroSenID[0:10])
print("Attribution relations sentence IDs")
print(attriSenID[0:10])

In [None]:
# Creating confusion matrix
import numpy as np
# conf[row][col]: row = predicted label, col = actual label, both ordered as
# source, cue, content, O.
label_pos = {'source': 0, 'cue': 1, 'content': 2, 'O': 3}
conf = np.zeros((4,4))
total = len(f2)
for index in range(1, total):
    i = f2[index]
    if (i == "==============================\n"):
        continue
    tok = i.split('@#armo')
    try:
        actual = tok[1].strip()
        # Drop the leading "(" and the trailing ")\n" around the prediction.
        predicted = tok[2][1:-2].strip()
    except IndexError:
        # Malformed line: show it and stop parsing.
        print(tok)
        break
    if actual not in label_pos:
        # Lines whose true label is none of the four are not counted.
        continue
    # A prediction outside the known labels is booked under "O" (row 3),
    # except when the true label is "O" itself, where it goes to "content"
    # (row 2).
    fallback_row = 2 if actual == 'O' else 3
    conf[label_pos.get(predicted, fallback_row)][label_pos[actual]] += 1
print(conf)

In [None]:
# Token-wise performance
# Collect the true and predicted label of every token line (header and
# sentence separators skipped) and print the per-label report.
y_true = []
y_pred = []
for line in f2[1:]:
    if line == "==============================\n":
        continue
    fields = line.split('@#armo')
    y_true.append(fields[1].strip())
    # Drop the leading "(" and the trailing ")\n" around the prediction.
    y_pred.append(fields[2][1:-2].strip())
print(classification_report(y_true, y_pred))