In [0]:
from __future__ import division

import os
import sys
import time
import nltk
import math
import json
import random
import pickle
import zipfile
import gensim
import numpy as np
import pandas as pd
#nltk.download('all')
import tensorflow as tf
from google.colab import drive
from collections import Counter
from keras.utils import Sequence
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [65]:
# Mount Google Drive inside the Colab VM at /content/gdrive.
# NOTE(review): later cells open "gdrive/My Drive/..." as a relative path, which
# presumably relies on the working directory being /content - confirm.
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# gRPC address of the TPU worker, built from the COLAB_TPU_ADDR environment
# variable. The lookup raises KeyError when that variable is not set.
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']

In [0]:
def tokenizer(df):
  """Split the 'query' and 'response' text columns into whitespace tokens.

  Each cell is stripped and split on runs of whitespace, so a string becomes
  a list of token strings. The frame is modified in place and also returned.

  Values are read by iterating over the columns rather than by looking up
  df[col][i] for i in range(n): the latter is a label lookup and raises
  KeyError on any frame whose index is not exactly 0..n-1.

  Args:
    df: DataFrame with string-valued 'query' and 'response' columns.

  Returns:
    The same DataFrame object, with both columns replaced by token lists.
  """
  for column in ('query', 'response'):
    df[column] = [text.strip().split() for text in df[column]]
  return df

In [0]:
def get_word_dict(df):
  """Build the vocabulary and both id maps from tokenised query/response columns.

  Tokens receive consecutive integer ids starting at 0 in order of first
  appearance, scanning each row's 'query' tokens and then its 'response'
  tokens.

  Membership is tested against the word->id dict instead of the vocabulary
  list, so building the vocabulary no longer costs a list scan per token.
  Rows are visited by iterating the columns, so the frame's index labels do
  not matter.

  Args:
    df: DataFrame whose 'query' and 'response' cells are iterables of
      hashable tokens.

  Returns:
    (w, w2i, i2w): list of unique tokens in first-seen order, dict mapping
    token -> id, and dict mapping id -> token.
  """
  w = []
  w2i = {}
  i2w = {}

  for query_tokens, response_tokens in zip(df['query'], df['response']):
    for tokens in (query_tokens, response_tokens):
      for token in tokens:
        if token not in w2i:
          # The next free id equals the number of tokens seen so far.
          token_id = len(w)
          w.append(token)
          w2i[token] = token_id
          i2w[token_id] = token
  return w,w2i,i2w

In [0]:
def convert_to_ids(df,w2i):
  """Replace every token in 'query' and 'response' with its integer id.

  The token lists stored in the frame are rewritten in place (same list
  objects), and the frame is returned for convenience.

  Rows are visited by iterating the columns, so the frame does not need a
  0..n-1 index. Each list is replaced in a single slice assignment, so a
  missing token raises KeyError before that list has been touched.

  Args:
    df: DataFrame whose 'query' and 'response' cells are mutable token lists.
    w2i: mapping token -> id.

  Returns:
    The same DataFrame object.

  Raises:
    KeyError: if a token is not present in w2i.
  """
  for query_tokens, response_tokens in zip(df['query'], df['response']):
    query_tokens[:] = [w2i[token] for token in query_tokens]
    response_tokens[:] = [w2i[token] for token in response_tokens]
  return df

In [0]:
def padding(df,pad_val,q_max,r_max):
  """Right-pad every 'query' / 'response' list with pad_val up to a target length.

  Lists shorter than the target are extended in place; lists already at or
  above the target are left unchanged (nothing is truncated).

  Rows are visited by iterating the columns, so the frame does not need a
  0..n-1 index.

  Args:
    df: DataFrame whose 'query' and 'response' cells are lists.
    pad_val: value appended as padding.
    q_max: target length for 'query' lists.
    r_max: target length for 'response' lists.

  Returns:
    The same DataFrame object.
  """
  for column, target_len in (('query', q_max), ('response', r_max)):
    for tokens in df[column]:
      # A non-positive multiplier yields an empty list, so long rows are untouched.
      tokens.extend([pad_val] * (target_len - len(tokens)))
  return df

In [0]:
def find_max(data):
  """Return the longest 'query' length and the longest 'response' length.

  Rows are visited by iterating the columns, so the frame does not need a
  0..n-1 index.

  Args:
    data: DataFrame whose 'query' and 'response' cells support len().

  Returns:
    (q_max, r_max). Each value is -1 when the frame has no rows.
  """
  # Seeding with -1 keeps the empty-frame result at (-1, -1).
  q_max = max([-1] + [len(tokens) for tokens in data['query']])
  r_max = max([-1] + [len(tokens) for tokens in data['response']])
  return q_max,r_max

In [0]:
# Load the previously saved id->word map, word->id map and vocabulary list.
# NOTE: JSON object keys are always strings, so int2word loaded here is keyed
# by '0', '1', ... rather than by ints.
# NOTE(review): these files are written by the "get_word_dict" cell further
# down, so on a fresh run they must already exist on Drive.
with open("gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/int2word.json","r") as fp:
  int2word = json.load(fp)
  
with open("gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/word2int.json","r") as fp:
  word2int = json.load(fp)
  
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open("gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/vocabulary.pkl","rb") as fp:
  vocab = pickle.load(fp)

In [0]:
# Parse the GloVe text file into {word: float16 vector}.
# Each line is "<word> <v1> <v2> ...": the first field is the token, the
# remaining fields are its embedding coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('gdrive/My Drive/Microsoft AI/Junk/Just For now/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float16')
        embeddings_index[word] = coefs

In [0]:
# Words that are both in our vocabulary and in the pretrained GloVe index.
comm = list(set(vocab) & set(embeddings_index))

In [0]:
# Zero-initialised float16 embedding table with 100 columns and
# len(vocab)+2 rows; rows never filled in later stay all-zero.
# NOTE(review): the purpose of the 2 extra rows is not visible here - confirm.
emb_matrix = np.zeros((len(vocab)+2,100),dtype=np.float16)

In [72]:
# Sanity check: display (rows, embedding_dim) of the embedding table.
emb_matrix.shape

(102593, 100)

In [0]:
# Copy each pretrained vector into the row addressed by that word's id.
# Words without a pretrained vector keep their all-zero row.
for word in comm:
  emb_matrix[word2int[word]] = embeddings_index[word]

In [0]:
# Load the cleaned training CSV and keep only the first 100000 rows.
df = pd.read_csv("./gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/clean_data_latest_#.csv",sep=',').iloc[:100000]
# Rename the five columns positionally, then drop the two id columns so only
# 'query', 'response' and 'label' remain.
df.columns = ['queryID','query','response','label','labelID']
df = df.drop(columns=['queryID','labelID'])

In [0]:
# Split 'query' and 'response' strings into token lists (modifies df in place,
# so re-running this cell on already-tokenised data will fail).
df = tokenizer(df)

In [0]:
# Rebuild the vocabulary and id maps from the tokenised training frame and
# persist them next to the data.
vocab, word2int, int2word = get_word_dict(df)

# Fixed: this path used to point at ".../Microsoft AI/mFinalised Version/...",
# a typo for the "Finalised Version" directory that every other cell reads
# from and writes to.
# json.dump stores the int keys of int2word as strings.
with open("gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/int2word.json","w") as fp:
  json.dump(int2word,fp)
  
with open("gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/word2int.json","w") as fp:
  json.dump(word2int,fp)
  
with open("gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/vocabulary.pkl","wb") as fp:
  pickle.dump(vocab,fp)

In [0]:
# Replace every token with its integer id (rewrites the token lists in place).
df = convert_to_ids(df,word2int)

In [0]:
# Longest query / response length in the frame.
q_max, r_max = find_max(df)
# Padding is currently disabled.
# NOTE(review): the model below is built for fixed lengths (17 and 175);
# confirm the sequences already have uniform length without this step.
# df = padding(df,word2int['nan'],q_max,r_max)

In [0]:
# Wrap every token id in its own one-element list, so each sequence becomes a
# list of [id] entries (a trailing axis of size 1).
qry = [[[token_id] for token_id in row] for row in df['query']]
rsp = [[[token_id] for token_id in row] for row in df['response']]

In [0]:
# Cache the encoded query / response sequences on Drive.
with open("./gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/query_pad_encoded_#.pkl","wb") as fp:
  pickle.dump(qry,fp)
  
with open("./gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/response_pad_encoded_#.pkl","wb") as fp:
  pickle.dump(rsp,fp)

In [0]:
# Reload the cached encoded sequences (same files the previous cell writes).
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open("./gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/query_pad_encoded_#.pkl","rb") as fp:
  qry = pickle.load(fp)
  
with open("./gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/response_pad_encoded_#.pkl","rb") as fp:
  rsp = pickle.load(fp)

In [0]:
# Positional train/test split: the first `split` rows train, the rest test.
# The rows are taken in file order; nothing is shuffled here.
split = 70000

q_train_df = np.array(qry[:split])
q_test_df = np.array(qry[split:])

r_train_df = np.array(rsp[:split])
r_test_df = np.array(rsp[split:])

train_labels = df['label'].values[:split]
test_labels = df['label'].values[split:]

In [0]:
def model_create(q_shape,r_shape,vocab_size):
  """Build the two-branch BiLSTM query/response classifier (uncompiled).

  Each input goes through its own Embedding layer initialised from the
  global `emb_matrix`, a Reshape to (sequence_length, 100), and two stacked
  bidirectional LSTMs with 128 units per direction. The two branch outputs
  are concatenated and passed through Dense layers of 1024, 512 and 256
  (relu) and 128 (sigmoid) units, ending in a single sigmoid output.

  Side effects: prints the two post-reshape shapes and the model summary.

  Args:
    q_shape: per-sample shape of the query input; q_shape[0] is used as the
      sequence length.
    r_shape: per-sample shape of the response input; r_shape[0] is used as
      the sequence length.
    vocab_size: number of vocabulary words; the embedding tables get
      vocab_size+2 rows.

  Returns:
    A tf.keras Model taking [query, response] and producing one output.
  """
  layers = tf.keras.layers
  int_shape = tf.keras.backend.int_shape
  emb_rows = vocab_size+2
  emb_dim = 100

  query_in = layers.Input(shape=q_shape)
  response_in = layers.Input(shape=r_shape)

  # Layers are created in a fixed order (query first, then response at each
  # stage) so the auto-generated layer names and the weight order stay stable.
  # The Reshape collapses the embedding output to (sequence_length, 100).
  q = layers.Embedding(emb_rows,emb_dim,input_length=q_shape[0],weights=[emb_matrix])(query_in)
  q = layers.Reshape((q_shape[0],emb_dim))(q)
  r = layers.Embedding(emb_rows,emb_dim,input_length=r_shape[0],weights=[emb_matrix])(response_in)
  r = layers.Reshape((r_shape[0],emb_dim))(r)
  print(int_shape(q),int_shape(r))

  # Query encoder: the first BiLSTM emits the full sequence, the second only
  # its final state.
  q = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(q)
  q = layers.Bidirectional(layers.LSTM(128))(q)

  # Response encoder, same structure.
  r = layers.Bidirectional(layers.LSTM(128,return_sequences=True))(r)
  r = layers.Bidirectional(layers.LSTM(128))(r)

  x = layers.Concatenate(axis=-1)([q,r])

  # Fully connected head.
  for units, activation in ((1024,'relu'),(512,'relu'),(256,'relu'),(128,'sigmoid')):
    x = layers.Dense(units,activation=activation)(x)
  out = layers.Dense(1,activation='sigmoid')(x)

  model = tf.keras.models.Model(inputs=[query_in,response_in],outputs=[out])
  model.summary()
  return model

In [84]:
keras_model = model_create((17,1),(175,1),len(vocab))
tf.keras.backend.clear_session()
tpu_model = tf.contrib.tpu.keras_to_tpu_model(keras_model, 
                                              strategy=tf.contrib.tpu.TPUDistributionStrategy(
                                                  tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))

tpu_model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.001),loss='binary_crossentropy',metrics=['accuracy'])
tpu_model.fit([q_train_df,r_train_df],train_labels,epochs=10,batch_size=128,validation_data=([q_test_df,r_test_df],test_labels))

tpu_model.save_weights('./embeddings_bilstm-model.h5', overwrite=True)
! cp -fR "./embeddings_bilstm-model.h5" "./gdrive/My Drive/Microsoft AI/Junk/"

(None, 17, 100) (None, 175, 100)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 17, 1)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 175, 1)       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 17, 1, 100)   10259300    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 175, 1, 100)  10259300    input_2[0][0]                    
____________________________________________________________________________

UnavailableError: ignored

In [0]:
# Load the cleaned test CSV; it has four columns (no 'label' column).
data2 = pd.read_csv("./gdrive/My Drive/Microsoft AI/Finalised Version/Hashed/Latest Data/clean_test_data_#.csv",sep=',')
# Rename the columns positionally, then drop the two id columns so only
# 'query' and 'response' remain.
data2.columns = ['queryID','query','response','labelID']
data2 = data2.drop(columns=['queryID','labelID'])

In [0]:
# Split the test 'query' and 'response' strings into token lists (in place).
data2 = tokenizer(data2)