# Contents in this notebook
<ol>
  <li> <b> <em> This notebook has codes involving NLP based features extraction and simple ANN and a deep embedding based model </em></b></li> 
  <li> <b> <em> It also has codes for ELMo and Infersent Sentence embedding. Even BERT codes will be added later for embedding extraction </em></b></li>

# Downloading packaes

In [2]:
! pip install --upgrade allennlp

from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision


Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/d6/37/e2418f74df007e9516013828205462021e330c27aed8ffc64c6fca44c7d6/allennlp-0.8.1-py3-none-any.whl (5.5MB)
[K    100% |████████████████████████████████| 5.5MB 6.3MB/s 
Collecting parsimonious==0.8.0 (from allennlp)
  Downloading https://files.pythonhosted.org/packages/4a/89/32c55944cd30dff856f16859ee325b13c83c260d0c56c0eed511e8063c87/parsimonious-0.8.0.tar.gz
Collecting pytorch-pretrained-bert==0.3.0 (from allennlp)
  Downloading https://files.pythonhosted.org/packages/26/85/de4dd7e018a197280752881adf7b4142886f20155145f641f7c803c0018a/pytorch_pretrained_bert-0.3.0-py3-none-any.whl
Collecting responses>=0.7 (from allennlp)
  Downloading https://files.pythonhosted.org/packages/2f/d8/a77cbd4cb8366ad0e275f2c642b50b401da22a2f5714e003e499fddca106/responses-0.10.5-py2.py3-none-any.whl
Collecting matplotlib==2.2.3 (from allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/9e/59/f235ab21bbe7b7c6570

# Import packaes and mount gdrive

In [0]:
import os
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
import tensorflow as tf

In [4]:
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
TPU_WORKER = "gprc://" + os.environ['COLAB_TPU_ADDR']

In [0]:
data = pd.read_csv("gdrive/My Drive/ASAG Work/All Data/SemEval_till_POSNew.csv",sep=',',usecols=['question','ref_answer','stu_answer','train/test','accuracy'])

In [0]:
data['question'] = data['question'].apply(lambda x: str(x).split())
data['ref_answer'] = data['ref_answer'].apply(lambda x: str(x).split())
data['stu_answer'] = data['stu_answer'].apply(lambda x: str(x).split())

In [0]:
qlen = len(max(data['question'].tolist(), key=len))
rlen = len(max(data['ref_answer'].tolist(), key=len))
slen = len(max(data['stu_answer'].tolist(), key=len))

In [0]:
qlen,rlen,slen

(186, 53, 110)

In [0]:
def padding(data,max_len,col_name):
    for i in range(len(data)):
        if i%5000 == 0:
            print(i)
        pad_len = max_len - len(data[col_name][i])
        l = ["<pad>" for i in range(pad_len)]
        data[col_name][i] = data[col_name][i] + l
    return data

In [0]:
data = padding(data,qlen,'question')
data = padding(data,rlen,'ref_answer')
data = padding(data,slen,'stu_answer')

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
0
5000
10000
15000
20000
25000
30000
35000
40000
45000
0
5000
10000
15000
20000
25000
30000
35000
40000
45000


In [0]:
def get_req_data(data,tr_ts,n_class):
    indices = data[(data['train/test'].str.contains(tr_ts)) & (data['train/test'].str.contains(str(n_class)))].index.tolist()
    d = data.loc[indices,:].reset_index(drop=True)
    q = d.loc[:,'question']
    r = d.loc[:,'ref_answer']
    s = d.loc[:,'stu_answer']
    y = d.loc[:,'accuracy']
    return q,r,s,y

In [0]:
n_classes = 5
qtrain,rtrain,strain,ytrain = get_req_data(data,'train',n_classes)
qtest1,rtest1,stest1,ytest1 = get_req_data(data,'unseen-answers',n_classes)
qtest2,rtest2,stest2,ytest2 = get_req_data(data,'unseen-questions',n_classes)
qtest3,rtest3,stest3,ytest3 = get_req_data(data,'unseen-domains',n_classes)

In [0]:
un_acc = ytrain.unique()
lab_to_int, int_to_lab = {},{}
for n,u in enumerate(un_acc):
    lab_to_int[u] = n
    int_to_lab[n] = u

In [0]:
ytrain = ytrain.replace(list(int_to_lab.values()), list(int_to_lab.keys()))
ytest1 = ytest1.replace(list(int_to_lab.values()), list(int_to_lab.keys()))
ytest2 = ytest2.replace(list(int_to_lab.values()), list(int_to_lab.keys()))
ytest3 = ytest3.replace(list(int_to_lab.values()), list(int_to_lab.keys()))

# Embeddings

## ELMO embeddings

<center> **ELMO Model** <br>
<img src="https://tsenghungchen.github.io/posts/elmo/taglm.png" alt="drawing" width="700"/> </center>

In [0]:
from allennlp.modules.elmo import Elmo, batch_to_ids

! wget "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
! wget "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

options_file = "./elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "./elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, 2, dropout=0)
elmo = elmo.cuda()

In [0]:
import torch
word2vec = {}

In [0]:
def padded(l,n):
    for k in range(len(l),n):
        l.append('<pad>')
    return l

In [0]:
q,r,s = [],[],[]
for i in range(len(data)):
    q.append(padded(data['question'][i].split(),186))
    r.append(padded(data['ref_answer'][i].split(),53))
    s.append(padded(data['stu_answer'][i].split(),110))

In [0]:
len(max(q,key=len)), len(max(r,key=len)), len(max(s,key=len))

(186, 53, 110)

In [0]:
for i in range(0,len(w),200):
    print(i)
    sentences = w[i:i+200]
    character_ids = batch_to_ids(sentences).cuda()
    embeddings = np.float16(elmo(character_ids)['elmo_representations'][0].cpu().detach().numpy())
    for j in range(len(embeddings)):
        for k in range(len(embeddings[j])):
            if k < len(w[i+j]):
                if w[i+j][k] in word2vec.keys():
                    word2vec[w[i+j][k]] = np.mean([word2vec[w[i+j][k]],embeddings[j][k]],axis=0)
                else:
                    word2vec[w[i+j][k]] = embeddings[j][k]
            else:
                if '<pad>' in word2vec.keys():
                    word2vec['<pad>'] = np.mean([word2vec['<pad>'],embeddings[j][k]],axis=0)
                else:
                    word2vec['<pad>'] = embeddings[j][k]

In [0]:
print(len(word2vec.keys()))

7416


In [0]:
l = []
for k,v in word2vec.items():
    p = [k] + v.tolist()
    l.append(p)

In [0]:
w2v = pd.DataFrame(l)

In [0]:
w2v.to_csv("gdrive/My Drive/ASAG Work/All Data/elmo_semeval_embeddings.csv",index=None)

In [0]:
# use batch_to_ids to convert sentences to character ids
sentences = [['First', 'sentence', '.'], ['Another', '.']]
character_ids = batch_to_ids(sentences)

embeddings = elmo(character_ids)
embeddings = embeddings['elmo_representations'][0]

# embeddings is a list of 2 tensors of shapes (batch_size,max_words,1024)

In [0]:
from allennlp.commands.elmo import ElmoEmbedder

! wget "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
! wget "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

options_file = "./elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "./elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = ElmoEmbedder(options_file, weight_file, 0)

embeddings = np.float16(np.array(elmo.embed_batch(data))[:,2,:,:])

## Bert Embeddings

<center> **ELMO Model** <br>
<img src="https://1.bp.blogspot.com/-RLAbr6kPNUo/W9is5FwUXmI/AAAAAAAADeU/5y9466Zoyoc96vqLjbruLK8i_t8qEdHnQCLcBGAs/s1600/image3.png" alt="drawing" width="700"/> </center>

In [0]:
! mkdir ./tmp
! echo 'Who was Jim Henson ? ||| Jim Henson was a puppeteer' > ./tmp/inputs.txt

%run gdrive/My\ Drive/Microsoft\ AI/bert-master/extract_features.py \
  --input_file=./tmp/inputs.txt \
  --output_file=./tmp/output.jsonl \
  --vocab_file=./gdrive/My\ Drive/Microsoft\ AI/bert-master/vocab.txt \
  --bert_config_file=./gdrive/My\ Drive/Microsoft\ AI/bert-master/bert_config.json \
  --init_checkpoint=./gdrive/My\ Drive/Microsoft\ AI/bert-master/bert_model.ckpt \
  --layers=-1 \
  --max_seq_length=32 \
  --batch_size=8

# Deep NLP Models

In [0]:
from allennlp.modules.elmo import Elmo, batch_to_ids

! wget "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
! wget "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

options_file = "./elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "./elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, 2, dropout=0)
elmo = elmo.cuda()

--2019-01-28 19:58:50--  https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 54.231.185.48
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|54.231.185.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 336 [application/json]
Saving to: ‘elmo_2x4096_512_2048cnn_2xhighway_options.json’


2019-01-28 19:58:50 (16.9 MB/s) - ‘elmo_2x4096_512_2048cnn_2xhighway_options.json’ saved [336/336]

--2019-01-28 19:58:53--  https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.209.104
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.209.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 374434792 (357M) 

01/28/2019 19:59:13 - INFO - allennlp.modules.elmo -   Initializing ELMo


In [0]:
class EmbedLoader(tf.keras.utils.Sequence):
    def __init__(self,elmo,qpad,rpad,spad,y_data,batch_size,n_classes):
        self.elmo = elmo
        self.ques = qpad
        self.refs = rpad
        self.stud = spad
        self.label = y_data
        self.batch_size = batch_size
        self.n_classes = n_classes
    
    def __len__(self):
        return int(np.ceil(len(self.ques) / float(self.batch_size)))
    
    def __getitem__(self,idx):
        # get question embeddings
        character_ids = batch_to_ids(self.ques[idx*self.batch_size:(idx+1)*self.batch_size]).cuda()
        qembed = np.float16(self.elmo(character_ids)['elmo_representations'][0].cpu().detach().numpy())
        # get reference embeddings
        character_ids = batch_to_ids(self.refs[idx*self.batch_size:(idx+1)*self.batch_size]).cuda()
        rembed = np.float16(self.elmo(character_ids)['elmo_representations'][0].cpu().detach().numpy())
        # get student embeddings
        character_ids = batch_to_ids(self.stud[idx*self.batch_size:(idx+1)*self.batch_size]).cuda()
        sembed = np.float16(self.elmo(character_ids)['elmo_representations'][0].cpu().detach().numpy())
        
        label = self.label[idx*self.batch_size:(idx+1)*self.batch_size]
        
        return [qembed,rembed,sembed], label

In [0]:
qtrain.shape, qtest1.shape, qtest2.shape, qtest3.shape

((8910,), (979,), (1552,), (4562,))

In [0]:
train_loader = EmbedLoader(elmo,qtrain,rtrain,strain,ytrain,32,n_classes)
test1_loader = EmbedLoader(elmo,qtest1,rtest1,stest1,ytest1,32,n_classes)
test2_loader = EmbedLoader(elmo,qtest2,rtest2,stest2,ytest2,32,n_classes)
test3_loader = EmbedLoader(elmo,qtest3,rtest3,stest3,ytest3,32,n_classes)

In [0]:
def deep_nlp_model(qlen,rlen,slen,n_classes):
    
    qinp = tf.keras.layers.Input(shape=(qlen,1024))
    rinp = tf.keras.layers.Input(shape=(rlen,1024))
    sinp = tf.keras.layers.Input(shape=(slen,1024))
    
    q = tf.keras.layers.Conv1D(300,1)(qinp)
    q = tf.keras.layers.Conv1D(64,32)(q)
    q = tf.keras.layers.MaxPool1D(2)(q)
    q = tf.keras.layers.Conv1D(32,32)(q)
    q = tf.keras.layers.MaxPool1D(2)(q)
    q = tf.keras.layers.LSTM(units=32,recurrent_dropout=0.2,return_sequences=True)(q)
    q = tf.keras.layers.LSTM(units=32,recurrent_dropout=0.2,return_sequences=True)(q)
    q_embed = tf.keras.layers.LSTM(units=32,recurrent_dropout=0.2)(q)
    
    r = tf.keras.layers.Conv1D(300,1)(rinp)
    r = tf.keras.layers.Conv1D(32,16)(r)
    r = tf.keras.layers.MaxPool1D(2)(r)
    r = tf.keras.layers.LSTM(units=32,recurrent_dropout=0.2,return_sequences=True)(r)
    r = tf.keras.layers.LSTM(units=32,recurrent_dropout=0.2,return_sequences=True)(r)
    r_embed = tf.keras.layers.LSTM(units=32,recurrent_dropout=0.2)(r)
    
    s = tf.keras.layers.Conv1D(300,1)(sinp)
    s = tf.keras.layers.Conv1D(64,32)(s)
    s = tf.keras.layers.MaxPool1D(2)(s)
    s = tf.keras.layers.Conv1D(32,16)(s)
    s = tf.keras.layers.MaxPool1D(2)(s)
    s = tf.keras.layers.LSTM(units=32,recurrent_dropout=0.2,return_sequences=True)(s)
    s = tf.keras.layers.LSTM(units=32,recurrent_dropout=0.2,return_sequences=True)(s)
    s_embed = tf.keras.layers.LSTM(units=32,recurrent_dropout=0.2)(s)
    
    add_qr = tf.keras.layers.Add()([q_embed,r_embed])
    add_rs = tf.keras.layers.Add()([r_embed,s_embed])
    sub_rs = tf.keras.layers.Subtract()([s_embed,r_embed])
    sub_srq = tf.keras.layers.Subtract()([s_embed,add_qr])
    mul_rs = tf.keras.layers.Multiply()([r_embed,s_embed])
    
#     concat = tf.keras.layers.Concatenate()([add_qr,add_rs,sub_rs,sub_srq,mul_rs,q_embed,r_embed,s_embed])
    concat = tf.keras.layers.Concatenate()([q_embed,r_embed,s_embed])
    
    x = tf.keras.layers.Dense(256,activation='relu')(concat)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(256,activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(256,activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(n_classes,activation='softmax')(x)
    
    model = tf.keras.models.Model(inputs=[qinp,rinp,sinp],outputs=[out])
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.01),loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    return model

In [0]:
keras_model = deep_nlp_model(qlen,rlen,slen,n_classes)
keras_model.summary()
keras_model.fit_generator(train_loader,epochs=10)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 186, 1024)    0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 110, 1024)    0                                            
__________________________________________________________________________________________________
conv1d_24 (Conv1D)              (None, 186, 300)     307500      input_16[0][0]                   
__________________________________________________________________________________________________
conv1d_29 (Conv1D)              (None, 110, 300)     307500      input_18[0][0]                   
__________________________________________________________________________________________________
conv1d_25 

In [0]:
from sklearn.metrics import f1_score,precision_score,recall_score
def get_res(keras_model,test1_loader,test2_loader,test3_loader,method,n_classes):
    results = []
    probs = keras_model.predict_generator(test1_loader)
    y_pred = np.argmax(probs,axis=1)
    if n_classes == 2:
        f1 = f1_score(ytest1,y_pred,labels=list(int_to_lab.keys()),average='binary')
        pr = precision_score(ytest1,y_pred,labels=list(int_to_lab.keys()),average='binary')
        re = recall_score(ytest1,y_pred,labels=list(int_to_lab.keys()),average='binary')
    else:
        f1 = f1_score(ytest1,y_pred,labels=list(int_to_lab.keys()),average=method)
        pr = precision_score(ytest1,y_pred,labels=list(int_to_lab.keys()),average=method)
        re = recall_score(ytest1,y_pred,labels=list(int_to_lab.keys()),average=method)
    results.extend([pr,re,f1])

    probs = keras_model.predict_generator(test2_loader)
    y_pred = np.argmax(probs,axis=1)
    if n_classes == 2:
        f1 = f1_score(ytest2,y_pred,labels=list(int_to_lab.keys()),average='binary')
        pr = precision_score(ytest2,y_pred,labels=list(int_to_lab.keys()),average='binary')
        re = recall_score(ytest2,y_pred,labels=list(int_to_lab.keys()),average='binary')
    else:
        f1 = f1_score(ytest2,y_pred,labels=list(int_to_lab.keys()),average=method)
        pr = precision_score(ytest2,y_pred,labels=list(int_to_lab.keys()),average=method)
        re = recall_score(ytest2,y_pred,labels=list(int_to_lab.keys()),average=method)
    results.extend([pr,re,f1])

    probs = keras_model.predict_generator(test3_loader)
    y_pred = np.argmax(probs,axis=1)
    if n_classes == 2:
        f1 = f1_score(ytest3,y_pred,labels=list(int_to_lab.keys()),average='binary')
        pr = precision_score(ytest3,y_pred,labels=list(int_to_lab.keys()),average='binary')
        re = recall_score(ytest3,y_pred,labels=list(int_to_lab.keys()),average='binary')
    else:
        f1 = f1_score(ytest3,y_pred,labels=list(int_to_lab.keys()),average=method)
        pr = precision_score(ytest3,y_pred,labels=list(int_to_lab.keys()),average=method)
        re = recall_score(ytest3,y_pred,labels=list(int_to_lab.keys()),average=method)
    results.extend([pr,re,f1])
    
    return results

In [0]:
get_res(keras_model,test1_loader,test2_loader,test3_loader,"macro",n_classes)

In [0]:
get_res(keras_model,test1_loader,test2_loader,test3_loader,"weighted",n_classes)

In [0]:
keras_model = deep_nlp_model(qlen,rlen,slen,n_classes)
keras_model.summary()
tf.keras.backend.clear_session()
tpu_model = tf.contrib.tpu.keras_to_tpu_model(keras_model, 
                                              strategy=tf.contrib.tpu.TPUDistributionStrategy(
                                                  tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))

tpu_model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.01),loss='sparse_categorical_crossentropy',metrics=['accuracy'])
tpu_model.fit_generator([qtrain,rtrain,strain],ytrain,epochs=1)#,callbacks=[tf.keras.callbacks.EarlyStopping('loss',0.1,10)])
tpu_model.save_weights('./model.h5', overwrite=True)