In [None]:
# %load data.py




import gzip
import os
import re
import tarfile
import argparse

from six.moves import urllib

from tensorflow.python.platform import gfile
import pandas as pd
from tqdm import *
import numpy as np
from os.path import join as pjoin
from spacy.lang.en import English
from nltk.tokenize import word_tokenize
from random import shuffle

# Reserved vocabulary symbols (bytes). create_vocabulary writes them first, in
# this order, so their line numbers in the vocabulary file match the *_ID
# constants below.
_PAD = b"<pad>"
_SOS = b"<sos>"
_UNK = b"<unk>"
_START_VOCAB = [_PAD, _SOS, _UNK]

# Token ids of the reserved symbols (their positions in _START_VOCAB).
PAD_ID = 0
SOS_ID = 1
UNK_ID = 2

def setup_args():
    """Parse the command-line options of the preprocessing script.

    Returns:
        argparse.Namespace with ``source_dir``, ``data_dir``, ``glove_dir``,
        ``vocab_dir``, ``glove_dim`` (int), ``random_init`` (bool),
        ``split`` (float), ``tokenizer`` (str), ``question_lower`` (int) and
        ``question_upper`` (int).
    """

    def _str2bool(value):
        # argparse's ``type=bool`` turns every non-empty string (including
        # "False") into True, so boolean options are parsed explicitly.
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ("1", "true", "t", "yes", "y"):
            return True
        if text in ("0", "false", "f", "no", "n", ""):
            return False
        raise argparse.ArgumentTypeError("expected a boolean, got %r" % (value,))

    parser = argparse.ArgumentParser()
    vocab_dir = os.path.join("data", "quora")
    glove_dir = os.path.join("download", "dwr")
    data_dir = os.path.join("download", "quora")
    source_dir = os.path.join("data", "quora")
    parser.add_argument("--source_dir", default=source_dir)
    parser.add_argument("--data_dir", default=data_dir)
    parser.add_argument("--glove_dir", default=glove_dir)
    parser.add_argument("--vocab_dir", default=vocab_dir)
    parser.add_argument("--glove_dim", default=100, type=int)
    parser.add_argument("--random_init", default=True, type=_str2bool)
    parser.add_argument("--split", default=0.8, type=float)
    parser.add_argument("--tokenizer", default='spaCy')
    # Without ``type=int`` values given on the command line stay strings and
    # cannot be compared against word counts.
    # this is to discard len(q1) + len(q2) < threshold
    parser.add_argument("--question_lower", default=5, type=int)
    # this is to discard len(q1) or len(q2) > threshold
    parser.add_argument("--question_upper", default=70, type=int)
    return parser.parse_args()


def create_txt(data_path, train_path, val_path, split, question_lower, question_upper):
    """Split the question-pair CSV into train/validation text files.

    Rows with a missing ``question1`` or ``question2`` are dropped, the rest is
    shuffled, and the first ``split`` fraction becomes the training set. Pairs
    in which either question has more than ``question_upper``
    whitespace-separated words are discarded.

    Files written (UTF-8):
        train_path: question1_train.txt, question2_train.txt, labels_train.txt
        val_path:   question1_val.txt, question2_val.txt, labels_val.txt
    Question files hold one question per line (embedded newlines removed);
    label files hold the ``is_duplicate`` values concatenated on one line with
    no separator.

    Args:
        data_path: CSV with ``question1``, ``question2`` and ``is_duplicate`` columns.
        train_path: directory receiving the training files.
        val_path: directory receiving the validation files.
        split: fraction of the rows used for training.
        question_lower: accepted for interface compatibility; currently unused.
        question_upper: maximum number of words allowed per question.
    """
    # The value may arrive as a string when it comes straight from the CLI.
    question_upper = int(question_upper)

    xin = pd.read_csv(data_path)
    xin_notnull = xin[~(xin.question1.isnull() | xin.question2.isnull())]
    xin_notnull = xin_notnull.sample(frac=1).reset_index(drop=True)
    splitid = int(xin_notnull.shape[0] * split)

    def _filtered(frame):
        # Returns (question1 text, question2 text, labels text) for the pairs
        # of ``frame`` that respect the length limit, keeping them aligned.
        first = frame.question1.str.replace("\n", "").tolist()
        second = frame.question2.str.replace("\n", "").tolist()
        labels = frame.is_duplicate.tolist()
        kept = [
            (q1, q2, label)
            for q1, q2, label in zip(first, second, labels)
            if len(q1.split()) <= question_upper and len(q2.split()) <= question_upper
        ]
        return (
            "".join(q1 + "\n" for q1, _, _ in kept),
            "".join(q2 + "\n" for _, q2, _ in kept),
            "".join(str(label) for _, _, label in kept),
        )

    def _write(directory, filename, text):
        # os.path.join instead of a hand-written "\\" keeps the path portable.
        with open(os.path.join(directory, filename), 'w', encoding='utf-8') as handle:
            handle.write(text)

    question1_train, question2_train, labels_train = _filtered(xin_notnull[:splitid])
    question1_val, question2_val, labels_val = _filtered(xin_notnull[splitid:])

    _write(train_path, "question1_train.txt", question1_train)
    _write(train_path, "question2_train.txt", question2_train)
    _write(val_path, "question1_val.txt", question1_val)
    _write(val_path, "question2_val.txt", question2_val)
    _write(train_path, "labels_train.txt", labels_train)
    _write(val_path, "labels_val.txt", labels_val)


def initialize_vocabulary(vocabulary_path):
    '''Load a vocabulary file that stores one token per line.

    arguments:
        vocabulary_path: vocabulary file with a token in each line
    returns:
        vocab, rev_vocab
        vocab: a dictionary mapping each token to its line index
        rev_vocab: a list of all the tokens in vocabulary_path, in file order
        (rev_vocab[i] is the token whose id is i)
    raises:
        ValueError: if vocabulary_path does not exist
    '''
    print('initializing vocabulary')
    if not gfile.Exists(vocabulary_path):
        # Format the message; passing the path as a second argument left the
        # "%s" placeholder unfilled.
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)
    with gfile.GFile(vocabulary_path, mode="r") as f:
        rev_vocab = [line.strip('\n') for line in f.readlines()]
    vocab = {token: idx for idx, token in enumerate(rev_vocab)}
    return vocab, rev_vocab


def process_glove(args, vocab_list, save_path, size=4e5, random_init=True):
    """
    Build the embedding matrix for ``vocab_list`` from a GloVe text file and
    save it compressed as ``save_path`` + ".npz" under the key ``glove``.

    Row ``i`` of the matrix is the vector of ``vocab_list[i]``. A GloVe word
    fills the rows of its exact, capitalized and upper-cased spellings when
    those appear in the vocabulary. Nothing is done if the output file already
    exists.

    :param args: namespace providing ``glove_dir`` and ``glove_dim``
    :param vocab_list: [vocab]. a list of vocab
    :param save_path: output path without the ".npz" suffix
    :param size: expected number of lines in the GloVe file (progress bar only)
    :param random_init: initialise rows with standard-normal noise when True,
                        zeros otherwise
    :return: None
    """
    print('procesing glove')
    if gfile.Exists(save_path + ".npz"):
        return

    glove_path = os.path.join(args.glove_dir, "glove.6B.{}d.txt".format(args.glove_dim))
    if random_init:
        glove = np.random.randn(len(vocab_list), args.glove_dim)
    else:
        glove = np.zeros((len(vocab_list), args.glove_dim))

    # Token -> first row index (what list.index returned), so each lookup is
    # O(1) instead of a scan of the whole vocabulary per GloVe line.
    row_of = {}
    for idx, token in enumerate(vocab_list):
        row_of.setdefault(token, idx)

    matched_rows = set()
    with open(glove_path, 'r', encoding='utf-8') as fh:
        for line in tqdm(fh, total=int(size)):
            array = line.strip().split(" ")
            word = array[0]
            vector = list(map(float, array[1:]))
            # A set collapses spellings that coincide (e.g. digits), which
            # were previously counted more than once.
            for variant in {word, word.capitalize(), word.upper()}:
                idx = row_of.get(variant)
                if idx is not None:
                    glove[idx, :] = vector
                    matched_rows.add(idx)

    found = len(matched_rows)
    print(("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab_list), glove_path)))
    np.savez_compressed(save_path, glove=glove)
    print(("saved trimmed glove matrix at: {}".format(save_path)))


def create_vocabulary(vocabulary_path, data_paths, tokenizer=None):
    '''Count the tokens of every file in data_paths and write the vocabulary,
    one token per line, to vocabulary_path: the reserved start symbols first,
    then the tokens by decreasing frequency. Does nothing if vocabulary_path
    already exists.'''
    print('creating vocabulary')
    if gfile.Exists(vocabulary_path):
        return

    print(("Creating vocabulary %s from data %s" % (vocabulary_path, str(data_paths))))
    counts = {}
    for path in data_paths:
        with open(path, mode="rb") as handle:
            for line_no, line in enumerate(handle, start=1):
                if line_no % 100000 == 0:
                    print(("processing line %d" % line_no))
                pieces = tokenizer(line) if tokenizer else basic_tokenizer(line)
                for piece in pieces:
                    counts[piece] = counts.get(piece, 0) + 1

    # Stable sort: tokens with equal counts keep first-seen order.
    by_frequency = sorted(counts, key=lambda piece: -counts[piece])
    vocab_list = _START_VOCAB + by_frequency
    print(("Vocabulary size: %d" % len(vocab_list)))
    with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for entry in vocab_list:
            vocab_file.write(entry + b"\n")


def sentence_to_token_ids(sentence, vocabulary, tokenizer=None):
    '''Convert sentence to the list of its token ids according to vocabulary.

    sentence:   text to convert
    vocabulary: dict mapping token -> id
    tokenizer:  optional callable returning the tokens of a sentence, either
                as objects exposing the text in ``orth_`` or as plain strings;
                whitespace-only tokens are dropped. Without a tokenizer the
                sentence is split on whitespace.
    returns:    list of ids; tokens missing from vocabulary map to UNK_ID'''
    if tokenizer:
        # Accept both token objects (text in ``orth_``) and plain strings, so
        # a string-returning tokenizer no longer fails on ``.orth_``.
        texts = [getattr(token, "orth_", token) for token in tokenizer(sentence)]
        words = [text for text in texts if not text.isspace()]
    else:
        words = basic_tokenizer(sentence)
    return [vocabulary.get(w, UNK_ID) for w in words]


def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None):
    '''Write to target_path one line of space-separated token ids for every
    line of data_path, using the vocabulary stored at vocabulary_path.
    Does nothing if target_path already exists.'''
    print('converting data to token ids')
    if gfile.Exists(target_path):
        return

    print(("Tokenizing data in %s" % data_path))
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as source:
        with gfile.GFile(target_path, mode="w") as sink:
            for line_no, line in enumerate(source, start=1):
                if line_no % 5000 == 0:
                    print(("tokenizing line %d" % line_no))
                ids = sentence_to_token_ids(line, vocab, tokenizer)
                sink.write(" ".join(map(str, ids)) + "\n")
                    
def get_tokenizer(name):
    '''Return the tokenizer selected by name.

    name: 'spaCy' (a new ``English()`` pipeline) or 'word_tokenize'
          (nltk's ``word_tokenize`` function)
    raises ValueError for any other name (previously an UnboundLocalError)'''
    if name == 'spaCy':
        return English()
    if name == 'word_tokenize':
        return word_tokenize
    raise ValueError("Unknown tokenizer %r; expected 'spaCy' or 'word_tokenize'" % (name,))

def basic_tokenizer(sentence):
    '''Split sentence on runs of whitespace and return the pieces as a list.'''
    tokens = sentence.split()
    return tokens

if __name__ == '__main__':
    # Preprocessing pipeline: split the CSV into text files, build the
    # vocabulary, trim GloVe to it, then convert every question file to ids.
    args = setup_args()
    tokenizer = get_tokenizer(args.tokenizer)
    vocab_path = pjoin(args.vocab_dir, "vocab.dat")
    data_path = pjoin(args.data_dir, "train.csv")
    # Training and validation files share the same directory.
    train_path = pjoin(args.source_dir)
    valid_path = pjoin(args.source_dir)
    split = args.split

    create_txt(data_path, train_path, valid_path, split, args.question_lower, args.question_upper)

    question_files = ("question1_train.txt", "question2_train.txt",
                      "question1_val.txt", "question2_val.txt")
    create_vocabulary(vocab_path, [pjoin(args.source_dir, name) for name in question_files])
    vocab, rev_vocab = initialize_vocabulary(vocab_path)

    glove_prefix = args.source_dir + "/glove.trimmed.{}".format(args.glove_dim)
    process_glove(args, rev_vocab, glove_prefix, random_init=args.random_init)

    # Same order as before: train q1, train q2, val q1, val q2.
    for base_dir, subset in ((train_path, "train"), (valid_path, "val")):
        for question in ("question1", "question2"):
            text_file = base_dir + "/{}_{}.txt".format(question, subset)
            ids_file = base_dir + "/.ids.{}.{}".format(subset, question)
            data_to_token_ids(text_file, ids_file, vocab_path, tokenizer=tokenizer)

In [311]:
# %load "../code/utils/general_utils.py"
import sys
import time
import numpy as np


def get_minibatches(data, minibatch_size, shuffle=True):
    """
    Iterates through the provided data one minibatch at a time. You can use this function to
    iterate through data in minibatches as follows:

        for inputs_minibatch in get_minibatches(inputs, minibatch_size):
            ...

    Or with multiple data sources:

        for inputs_minibatch, labels_minibatch in get_minibatches([inputs, labels], minibatch_size):
            ...

    Args:
        data: there are two possible values:
            - a list or numpy array
            - a list where each element is either a list or numpy array
        minibatch_size: the maximum number of items in a minibatch
        shuffle: whether to randomize the order of returned data
    Returns:
        minibatches: the return value depends on data:
            - If data is a list/array it yields the next minibatch of data.
            - If data a list of lists/arrays it returns the next minibatch of each element in the
              list. This can be used to iterate through multiple data sources
              (e.g., features and labels) at the same time.
        Empty data yields no minibatches.

    """
    # The length check keeps ``data[0]`` from raising IndexError on ``[]``.
    list_data = (isinstance(data, list) and len(data) > 0
                 and isinstance(data[0], (list, np.ndarray)))
    data_size = len(data[0]) if list_data else len(data)
    indices = np.arange(data_size)
    if shuffle:
        np.random.shuffle(indices)
    for minibatch_start in np.arange(0, data_size, minibatch_size):
        minibatch_indices = indices[minibatch_start:minibatch_start + minibatch_size]
        if list_data:
            yield [minibatch(d, minibatch_indices) for d in data]
        else:
            yield minibatch(data, minibatch_indices)


def minibatch(data, minibatch_idx):
    """Pick the entries of ``data`` at the positions in ``minibatch_idx``.

    A numpy array is indexed directly (result is an array); anything else is
    indexed element by element and returned as a list.
    """
    if type(data) is np.ndarray:
        return data[minibatch_idx]
    return [data[position] for position in minibatch_idx]


def test_all_close(name, actual, expected):
    """Check that two arrays have the same shape and agree within 1e-6.

    Prints ``(name, "passed!")`` on success; raises ValueError describing the
    shape or value mismatch otherwise.
    """
    if actual.shape != expected.shape:
        raise ValueError("{:} failed, expected output to have shape {:} but has shape {:}"
                         .format(name, expected.shape, actual.shape))
    largest_abs_error = np.amax(np.fabs(actual - expected))
    if largest_abs_error > 1e-6:
        raise ValueError("{:} failed, expected {:} but value is {:}".format(name, expected, actual))
    print((name, "passed!"))

In [316]:
abcd.shape

NameError: name 'abcd' is not defined

In [22]:
import numpy as np
np.array([1,2,3,4]) == np.array([2,2,4,4])

array([False,  True, False,  True])

In [34]:
%%writefile "model.py"

import time
import logging
from datetime import datetime
import numpy as np
# from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from utils.general_utils import get_minibatches
import os
import pickle
import functools
import copy

logging.basicConfig(level=logging.INFO)

# Variable-name constants; nothing in the visible code of this module
# references them.
_BIAS_VARIABLE_NAME = "bias"
_WEIGHTS_VARIABLE_NAME = "kernel"
def get_optimizer(opt):
    """Return the TensorFlow optimizer class selected by ``opt``.

    :param opt: "adam" or "sgd"
    :return: ``tf.train.AdamOptimizer`` or
             ``tf.train.GradientDescentOptimizer`` (the class, not an instance)
    :raises ValueError: for any other name
    """
    if opt == "adam":
        return tf.train.AdamOptimizer
    if opt == "sgd":
        return tf.train.GradientDescentOptimizer
    # An explicit raise still fires under ``python -O``, where ``assert False``
    # would be stripped and the function would fail on an unbound local.
    raise ValueError("Unknown optimizer %r; expected 'adam' or 'sgd'" % (opt,))
 
        
class Encoder():
    """Bidirectional LSTM encoder with ``encoder_size`` units per direction."""

    def __init__(self, encoder_size):
        print("building encoder")
        self.size = encoder_size

    def encode(self, inputs, masks):
        """
        Run a bidirectional LSTM over ``inputs``.

        :param inputs: Symbolic representations of your input
         masks: per-position mask; its sum along axis 1 is used as the
                sequence length so the RNN does not iterate through masked steps
        :return: (outputs, final_state) where each is the forward and backward
                 result concatenated along the last axis.
        """
        with tf.variable_scope("encoder"):
            # number of unmasked steps per example
            lengths = tf.reduce_sum(masks, axis=1)

            forward_cell = tf.contrib.rnn.LSTMCell(self.size)
            backward_cell = tf.contrib.rnn.LSTMCell(self.size)

            outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
                forward_cell,
                backward_cell,
                inputs,
                sequence_length=lengths,
                dtype=tf.float32,
                parallel_iterations=256)

            forward_out, backward_out = outputs[0], outputs[1]
            forward_state, backward_state = final_states[0], final_states[1]
            output_lstm = tf.concat([forward_out, backward_out], axis=-1)
            final_state_lstm = tf.concat([forward_state, backward_state], axis=-1)
            print(output_lstm)
            return output_lstm, final_state_lstm
    
    

class QSystem(object):
    """Duplicate-question classifier: embeds two questions, encodes both with
    the same encoder and predicts one of two classes from their combination."""

    def __init__(self, encoder, pretrained_embeddings, config, train_flag = True):
        """
        Build the placeholders, model graph, loss, train op and saver.

        :param encoder: object whose ``encode(inputs, masks)`` returns
                        ``(outputs, final_state)``
        :param pretrained_embeddings: embedding matrix; wrapped in a
                        ``tf.constant``, so training does not update it
        :param config: dict providing 'embed_dim', 'optimizer',
                        'minibatch_size', 'learning_rate', 'max_grad_norm',
                        'max_sent_len' and 'encoder_size'
        :param train_flag: not used by the current implementation
        """
        print("building question similarity calculator")
        self.encoder = encoder

        self.pretrained_embeddings_vars = tf.constant(pretrained_embeddings, dtype = tf.float32)

        self.embed_dim = config['embed_dim']
        self.optimizer = config['optimizer']
        self.minibatch_size = config['minibatch_size']
        self.learning_rate = config['learning_rate']
        self.max_grad_norm = config['max_grad_norm']
        self.max_sent_len = config['max_sent_len']
        self.encoder_size = config['encoder_size']

        # ==== set up placeholder tokens ========
        self.q1_masks = tf.placeholder(tf.int32, shape = [None, self.max_sent_len])
        self.q2_masks = tf.placeholder(tf.int32, shape = [None, self.max_sent_len])

        self.q1_id = tf.placeholder(tf.int32, shape = [None, self.max_sent_len]) #batch_size x max_sent_len
        self.q2_id = tf.placeholder(tf.int32, shape = [None, self.max_sent_len]) #batch_size x max_sent_len

        self.a = tf.placeholder(tf.int32, shape = [None, ]) #batch_size
        # ==== assemble pieces ====
        with tf.variable_scope("qa"):
            self.embed_lookup()
            self.setup_system()
            self.setup_loss()
            self.make_optimizer()
            self.saver = self.saver_prot()

    def setup_system(self):
        """
        Encode both questions with shared weights, combine the encodings as
        [q1, q2, q1 * q2, q1 - q2], flatten, and apply one linear layer that
        produces two logits per example. Sets ``self.logits`` and
        ``self.prediction``.
        :return:
        """
        with tf.variable_scope("questionRNN", reuse=tf.AUTO_REUSE):
            encoder_q1, final_state_q1 = self.encoder.encode(self.q1, self.q1_masks)
            encoder_q2, final_state_q2 = self.encoder.encode(self.q2, self.q2_masks)
        dot = tf.multiply(encoder_q1,encoder_q2)
        diff = encoder_q1 - encoder_q2
        batch = tf.shape(diff)[0]
        similarity = tf.concat([encoder_q1, encoder_q2, dot, diff], axis = -1)
        # 2 directions x 4 concatenated pieces per time step
        similarity = tf.reshape(similarity, [batch, self.max_sent_len*self.encoder_size*2*4])
        with tf.variable_scope("affine", regularizer = tf.contrib.layers.l2_regularizer(0.001)):
            self.logits = tf.contrib.layers.fully_connected(similarity, 2, activation_fn=None) #batch, 2

        self.prediction = tf.argmax(self.logits, axis = 1) #batch

    def setup_loss(self):
        """Set ``self.loss`` to the mean sparse softmax cross-entropy."""
        # NOTE(review): the l2 regularizer attached to the "affine" scope is
        # not added to the loss here.
        with tf.variable_scope("loss"):
            self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels = self.a, logits = self.logits))

    def make_optimizer(self):
        """Create ``self.optimizer_op`` (one training step with gradients
        clipped to ``max_grad_norm``) and ``self.global_norm`` (the gradient
        norm before clipping)."""
        _optimizer_op = get_optimizer(self.optimizer)(self.learning_rate)
        gradients, variables = zip(*_optimizer_op.compute_gradients(self.loss))
        clipped_gradients, self.global_norm = tf.clip_by_global_norm(gradients, self.max_grad_norm)
        # Apply the clipped gradients; previously the unclipped ones were
        # applied and the op was then replaced by a plain minimize().
        self.optimizer_op = _optimizer_op.apply_gradients(zip(clipped_gradients, variables))

    def embed_lookup(self):
        """Look up the embeddings of both question id tensors."""
        self.q1 = tf.nn.embedding_lookup(self.pretrained_embeddings_vars, self.q1_id) #batch,q_max_len,embed_dim
        self.q2 = tf.nn.embedding_lookup(self.pretrained_embeddings_vars, self.q2_id)

    def validate(self, session, valid_dataset):
        """
        Compute the loss on ``valid_dataset`` ([q1, q2, a]) in a single run.

        :return: one-element list holding the loss
        """
        input_feed = self.feed_dict(valid_dataset)
        output_feed = [self.loss]
        valid_cost = session.run(output_feed, input_feed)

        return valid_cost

    def predict(self, session, prediction_dataset):
        """Return the predicted class of every pair in ``prediction_dataset``
        ([q1, q2, a]); the whole dataset is fed in one run."""
        input_feed = self.feed_dict(prediction_dataset)
        predictions = session.run(self.prediction, input_feed)
        return predictions

    def evaluate_answer(self, session, dataset, log=False):
        """
        dataset: [q1, q2, a]
        naive accuracy measure

        :return: fraction of correct predictions as a float (0.0 for an empty
                 dataset)
        """
        predictions = np.array(self.predict(session, dataset))
        gold = np.array(dataset[2])
        # Dividing by ``gold.shape`` (a tuple) produced a one-element array;
        # the mean gives the scalar directly.
        accuracy = float(np.mean(predictions == gold)) if gold.shape[0] else 0.0

        if log:
            logging.info("accuracy: {}".format(accuracy))
        return accuracy

    def pad(self, datalist):
        '''Truncate/pad q1 and q2 to exactly max_sent_len ids and build masks.
        Params: datalist: [q1,q2]
                where q1, q2: [[w1, w2, ..], [w6,w7, ..]]
        Returns: padded, masks - each a list [for_q1, for_q2]; padding uses
                 id 0 and a mask entry is 1 where the id is non-zero'''
        m_len = self.max_sent_len
        padded = []
        for questions in (datalist[0], datalist[1]):
            # Truncate to max_sent_len (previously a hard-coded 70).
            rows = [list(ids[:m_len]) for ids in questions]
            padded.append([row + [0]*(m_len-len(row)) for row in rows])
        masks = [[[1 if t != 0 else 0 for t in k] for k in j] for j in padded]
        return padded, masks

    def feed_dict(self, dataset_feed, mode = 'train'):
        '''dataset_feed: ([q1,q2,a]); the labels are fed only when mode is
        'train'. Returns the placeholder -> value dict.'''
        input_feed = {}
        if mode == 'train':
            input_feed[self.a] = dataset_feed[2]

        padded, padded_masks = self.pad(dataset_feed[:2])
        input_feed[self.q1_id], input_feed[self.q2_id] = padded
        input_feed[self.q1_masks], input_feed[self.q2_masks]= padded_masks
        return input_feed

    def run_epoch(self, dataset, sess):
        '''dataset is a list [q1, q2, a]. Runs one training step per minibatch
        and returns the mean minibatch loss (0.0 if there were none).'''
        n_minibatches = 0.
        total_loss = 0.
        for dataset_mini in get_minibatches(dataset, self.minibatch_size):
            n_minibatches += 1
            feed_dict = self.feed_dict(dataset_mini)
            output = [self.optimizer_op , self.loss, self.global_norm]
            _, loss, global_norm = sess.run(output, feed_dict)
            print("n_minibatch = {}".format(n_minibatches), "loss: {}".format(loss), "global_norm{}".format(global_norm))
            total_loss += loss
        if not n_minibatches:
            return 0.0
        return total_loss/n_minibatches

    def saver_prot(self):
        """Return a new ``tf.train.Saver``."""
        return tf.train.Saver()

    def train(self, session, dataset,sent_max_len, epochs, train_dir, test):
        """
        Main training loop.
        :param session: it should be passed in from train.py
               dataset: a list of train and val:
                        [[train_q1, train_q2, train_a], [val_q1, val_q2, val_a]]
               sent_max_len: not used by the current implementation
               epochs: number of passes over the training data
               train_dir: path to the directory where you save the model checkpoint
               test: not used by the current implementation
        :return: best_score
        """
        results_path = os.path.join(train_dir, "{:%Y%m%d_%H%M%S}".format(datetime.now()))
        tic = time.time()
        params = tf.trainable_variables()
        # Static shapes give the same count without needing a default session.
        num_params = sum(int(np.prod(v.get_shape().as_list())) for v in params)
        toc = time.time()
        logging.info("Number of params: %d (retreival took %f secs)" % (num_params, toc - tic))

        dataset_train = dataset[0]
        best_score = 0

        for epoch in range(epochs):
            logging.info("Epoch %d out of %d", epoch + 1, epochs)
            logging.info("Best score so far: " + str(best_score))
            loss = self.run_epoch(dataset_train, session)
            print(len(dataset_train[0]),len(dataset_train[1]),len(dataset_train[2]))
            # NOTE(review): the score used for checkpointing is computed on
            # the training set; dataset[1] (validation) is not used here.
            accuracy = self.evaluate_answer(session, dataset_train, log=True)
            logging.info("loss: " + str(loss) + " accuracy: "+str(accuracy))
            if accuracy > best_score:
                best_score = accuracy
                logging.info("New best score! Saving model in %s", results_path)
                self.saver.save(session, results_path)
            print("")

        return best_score
    

Overwriting model.py


In [39]:
%%writefile train.py




import os
import json

import tensorflow as tf
import numpy as np

from model import Encoder, QSystem
from os.path import join as pjoin
import argparse

import logging
import data.py
logging.basicConfig(level=logging.INFO)


parser = argparse.ArgumentParser()
# ``type = bool`` would turn every non-empty string (even "False") into True,
# so the flag's text is interpreted explicitly.
parser.add_argument("--test", default = False,
                    type = lambda text: str(text).strip().lower() in ("1", "true", "t", "yes", "y"))
args = parser.parse_args()


class _FLAGS():
    """Fixed training settings (hyper-parameters and paths) exposed as
    instance attributes."""

    def __init__(self):
        settings = (
            ("learning_rate", 0.001),
            ("max_gradient_norm", 5.0),
            ("dropout", 0.15),
            ("batch_size", 32),
            ("epochs", 60),
            ("state_size", 10),
            ("embedding_size", 300),
            ("data_dir", "data/quora"),
            ("train_dir", "train"),
            ("load_train_dir", ""),
            ("log_dir", "log"),
            ("optimizer", "adam"),
            ("vocab_path", "data/quora/vocab.dat"),
            ("max_sent_len", 70),
        )
        # Assigned in this order so vars(self) lists them as declared.
        for option, value in settings:
            setattr(self, option, value)
FLAGS = _FLAGS()

def initialize_model(session, model, train_dir):
    """Restore ``model`` from the latest checkpoint in ``train_dir`` when one
    exists on disk; otherwise initialise all variables. Returns ``model``."""
    ckpt = tf.train.get_checkpoint_state(train_dir)
    v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""
    checkpoint_on_disk = ckpt and (
        tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path))

    if not checkpoint_on_disk:
        logging.info("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
        logging.info('Num params: %d' % sum(v.get_shape().num_elements() for v in tf.trainable_variables()))
        return model

    logging.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
    model.saver.restore(session, ckpt.model_checkpoint_path)
    return model

def get_device_name():
    """Print and return the GPU device name reported by TensorFlow, or 'CPU'
    when TensorFlow reports an empty name."""
    # Query once instead of three times per call.
    gpu_name = tf.test.gpu_device_name()
    print('device in use:', gpu_name)
    return 'CPU' if gpu_name == '' else gpu_name

def get_normalized_train_dir(train_dir):
    """
    Point the symlink ``dir_node_wordvar`` (in the current working directory)
    at ``train_dir`` and return the link name.

    Any existing link of that name is replaced, and ``train_dir`` is created
    if it does not exist.
    """
    global_train_dir = 'dir_node_wordvar'
    # lexists also sees dangling links, which os.path.exists reports as absent.
    if os.path.lexists(global_train_dir):
        if os.path.islink(global_train_dir):
            # The shell ``rmdir`` used before cannot remove a symlink on
            # POSIX, which made every second call fail in os.symlink.
            try:
                os.unlink(global_train_dir)
            except OSError:
                # presumably only needed where directory links must be
                # removed with rmdir
                os.rmdir(global_train_dir)
        else:
            os.rmdir(global_train_dir)
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    os.symlink(os.path.abspath(train_dir), global_train_dir, True)
    return global_train_dir

def get_pretrained_embeddings(embed_path):
    """Load the array stored under the key ``glove`` in the ``.npz`` archive
    at ``embed_path``."""
    # The context manager closes the archive's file handle after reading.
    with np.load(embed_path) as archive:
        return archive['glove']


def main():
    """Build the model from FLAGS, load the preprocessed id/label files from
    ``FLAGS.data_dir`` and train."""
    print('Device in use {}'.format(get_device_name()))

    data_dir = FLAGS.data_dir
    embed_path = pjoin(data_dir, "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    pretrained_embeddings = get_pretrained_embeddings(embed_path)
    config = {
        'embed_dim': FLAGS.embedding_size,
        'optimizer': FLAGS.optimizer,
        'minibatch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate,
        'max_grad_norm': FLAGS.max_gradient_norm,
        'max_sent_len': FLAGS.max_sent_len,
        'encoder_size': FLAGS.state_size,
    }

    encoder = Encoder(FLAGS.state_size)
    qsystem = QSystem(encoder, pretrained_embeddings, config)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print((vars(FLAGS)))

    def _read_ids(filename):
        # One question per line, token ids separated by whitespace.
        with open(pjoin(data_dir, filename), encoding = 'utf8') as handle:
            return [[int(tok) for tok in line.split()] for line in handle]

    def _read_labels(filename):
        # Labels are single digits concatenated on the first line; an empty
        # file yields an empty list instead of an IndexError.
        with open(pjoin(data_dir, filename), encoding = 'utf8') as handle:
            return [int(ch) for ch in handle.readline().strip()]

    tq1 = _read_ids(".ids.train.question1")
    tq2 = _read_ids(".ids.train.question2")
    vq1 = _read_ids(".ids.val.question1")
    vq2 = _read_ids(".ids.val.question2")
    ta = _read_labels("labels_train.txt")
    va = _read_labels("labels_val.txt")

    # Hard-coded window of examples used for training, kept as in the
    # original script.
    window = slice(1560, 51560)
    dataset = [[tq1[window], tq2[window], ta[window]],
               [vq1[window], vq2[window], va[window]]]

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qsystem, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qsystem.train(sess, dataset, FLAGS.max_sent_len, FLAGS.epochs, save_train_dir, test = args.test)

if __name__ == "__main__":
    main()


Overwriting train.py


In [None]:
# %load data.py




import gzip
import os
import re
import tarfile
import argparse

from six.moves import urllib

from tensorflow.python.platform import gfile
import pandas as pd
from tqdm import *
import numpy as np
from os.path import join as pjoin
from spacy.lang.en import English
from nltk.tokenize import word_tokenize
from random import shuffle

# Reserved vocabulary symbols (bytes). create_vocabulary writes them first, in
# this order, so their line numbers in the vocabulary file match the *_ID
# constants below.
_PAD = b"<pad>"
_SOS = b"<sos>"
_UNK = b"<unk>"
_START_VOCAB = [_PAD, _SOS, _UNK]

# Token ids of the reserved symbols (their positions in _START_VOCAB).
PAD_ID = 0
SOS_ID = 1
UNK_ID = 2

def setup_args():
    """Parse the command-line options of the preprocessing script.

    Returns:
        argparse.Namespace with ``source_dir``, ``data_dir``, ``glove_dir``,
        ``vocab_dir``, ``glove_dim`` (int), ``random_init`` (bool),
        ``split`` (float), ``tokenizer`` (str), ``question_lower`` (int) and
        ``question_upper`` (int).
    """

    def _str2bool(value):
        # argparse's ``type=bool`` turns every non-empty string (including
        # "False") into True, so boolean options are parsed explicitly.
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ("1", "true", "t", "yes", "y"):
            return True
        if text in ("0", "false", "f", "no", "n", ""):
            return False
        raise argparse.ArgumentTypeError("expected a boolean, got %r" % (value,))

    parser = argparse.ArgumentParser()
    vocab_dir = os.path.join("data", "quora")
    glove_dir = os.path.join("download", "dwr")
    data_dir = os.path.join("download", "quora")
    source_dir = os.path.join("data", "quora")
    parser.add_argument("--source_dir", default=source_dir)
    parser.add_argument("--data_dir", default=data_dir)
    parser.add_argument("--glove_dir", default=glove_dir)
    parser.add_argument("--vocab_dir", default=vocab_dir)
    parser.add_argument("--glove_dim", default=100, type=int)
    parser.add_argument("--random_init", default=True, type=_str2bool)
    parser.add_argument("--split", default=0.8, type=float)
    parser.add_argument("--tokenizer", default='spaCy')
    # Without ``type=int`` values given on the command line stay strings and
    # cannot be compared against word counts.
    # this is to discard len(q1) + len(q2) < threshold
    parser.add_argument("--question_lower", default=5, type=int)
    # this is to discard len(q1) or len(q2) > threshold
    parser.add_argument("--question_upper", default=70, type=int)
    return parser.parse_args()


def create_txt(data_path, train_path, val_path, split, question_lower, question_upper):
    """Split the question-pair CSV into train/validation text files.

    The CSV at ``data_path`` is read with pandas, rows whose ``question1`` or
    ``question2`` is null are dropped, and the remaining rows are shuffled.
    The first ``split`` fraction becomes the training set and the rest the
    validation set. A pair is discarded when either question has more than
    ``question_upper`` whitespace-separated tokens.

    Files written (UTF-8):
        ``train_path``: question1_train.txt, question2_train.txt, labels_train.txt
        ``val_path``:   question1_val.txt, question2_val.txt, labels_val.txt

    Question files hold one question per line (embedded newlines removed).
    Label files hold the ``is_duplicate`` values of the kept pairs
    concatenated without any separator.

    Args:
        data_path: path of the input CSV.
        train_path: directory receiving the training files.
        val_path: directory receiving the validation files.
        split: fraction of rows assigned to the training set.
        question_lower: accepted for interface compatibility; no lower-bound
            filtering is applied by this function.
        question_upper: maximum number of tokens allowed per question.
    """
    # Tolerate a threshold that arrives as a string from the command line.
    question_upper = int(question_upper)

    xin = pd.read_csv(data_path)
    xin_notnull = xin[~(xin.question1.isnull() | xin.question2.isnull())]
    xin_notnull = xin_notnull.sample(frac=1).reset_index(drop=True)
    splitid = int(xin_notnull.shape[0] * split)

    def _prepare(frame):
        # Returns (question1 text, question2 text, label text) for one split,
        # keeping only the pairs that satisfy the upper length bound.
        q1_lines = (frame.question1.str.replace("\n", "") + "\n").tolist()
        q2_lines = (frame.question2.str.replace("\n", "") + "\n").tolist()
        labels = frame.is_duplicate.tolist()
        keep = [
            len(q1.split()) <= question_upper and len(q2.split()) <= question_upper
            for q1, q2 in zip(q1_lines, q2_lines)
        ]
        q1_text = "".join(q for q, ok in zip(q1_lines, keep) if ok)
        q2_text = "".join(q for q, ok in zip(q2_lines, keep) if ok)
        label_text = "".join(str(lab) for lab, ok in zip(labels, keep) if ok)
        return q1_text, q2_text, label_text

    question1_train, question2_train, labels_train = _prepare(xin_notnull[:splitid])
    question1_val, question2_val, labels_val = _prepare(xin_notnull[splitid:])

    # os.path.join instead of a hand-written "\" separator, so the files land
    # inside the target directory on every platform.
    outputs = [
        (os.path.join(train_path, "question1_train.txt"), question1_train),
        (os.path.join(train_path, "question2_train.txt"), question2_train),
        (os.path.join(val_path, "question1_val.txt"), question1_val),
        (os.path.join(val_path, "question2_val.txt"), question2_val),
        (os.path.join(train_path, "labels_train.txt"), labels_train),
        (os.path.join(val_path, "labels_val.txt"), labels_val),
    ]
    for out_path, text in outputs:
        with open(out_path, 'w', encoding='utf-8') as out_file:
            out_file.write(text)


def initialize_vocabulary(vocabulary_path):
    """Load a vocabulary file and build the token <-> id mappings.

    Args:
        vocabulary_path: vocabulary file with one token per line.

    Returns:
        (vocab, rev_vocab) where
            rev_vocab: list of the tokens in file order (trailing newline
                removed), so ``rev_vocab[idx]`` is the token with id ``idx``;
            vocab: dict mapping each token to its line index.

    Raises:
        ValueError: if ``vocabulary_path`` does not exist.
    """
    print('initializing vocabulary')
    if not gfile.Exists(vocabulary_path):
        # Format the message here: passing the path as a second argument to
        # ValueError would leave the "%s" placeholder unfilled.
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)

    with gfile.GFile(vocabulary_path, mode="r") as f:
        rev_vocab = [line.strip('\n') for line in f.readlines()]
    # A token that occurs on several lines keeps the index of its last line.
    vocab = {token: idx for idx, token in enumerate(rev_vocab)}
    return vocab, rev_vocab


def process_glove(args, vocab_list, save_path, size=4e5, random_init=True):
    """Build and save the embedding matrix for the tokens in ``vocab_list``.

    Row ``i`` of the matrix is the GloVe vector for ``vocab_list[i]``. Each
    GloVe word is matched against the vocabulary as written, capitalised and
    upper-cased. Rows without a match keep their initial value. Nothing is
    done when ``save_path + ".npz"`` already exists.

    :param args: namespace providing ``glove_dir`` and ``glove_dim``
    :param vocab_list: [vocab]. a list of vocab
    :param save_path: output path; the matrix is stored under the key
        ``glove`` with ``np.savez_compressed``
    :param size: number of lines expected in the GloVe file (progress bar only)
    :param random_init: initialise rows from a standard normal when True,
        zeros otherwise
    :return: None
    """
    print('procesing glove')
    if gfile.Exists(save_path + ".npz"):
        return

    glove_path = os.path.join(args.glove_dir, "glove.6B.{}d.txt".format(args.glove_dim))
    if random_init:
        glove = np.random.randn(len(vocab_list), args.glove_dim)
    else:
        glove = np.zeros((len(vocab_list), args.glove_dim))

    # Token -> row index, built once so each lookup is O(1) instead of a scan
    # of the whole list; setdefault keeps the first index of a repeated token,
    # matching list.index().
    row_of = {}
    for idx, token in enumerate(vocab_list):
        row_of.setdefault(token, idx)

    found = 0
    with open(glove_path, 'r', encoding='utf-8') as fh:
        for line in tqdm(fh, total=size):
            array = line.strip().split(" ")
            word = array[0]
            vector = list(map(float, array[1:]))
            # dict.fromkeys drops repeated spellings (e.g. punctuation, whose
            # three case variants are identical) so a row is counted once.
            for variant in dict.fromkeys((word, word.capitalize(), word.upper())):
                idx = row_of.get(variant)
                if idx is not None:
                    glove[idx, :] = vector
                    found += 1

    print(("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab_list), glove_path)))
    np.savez_compressed(save_path, glove=glove)
    print(("saved trimmed glove matrix at: {}".format(save_path)))


def create_vocabulary(vocabulary_path, data_paths, tokenizer=None):
    """Write a frequency-ordered vocabulary built from ``data_paths``.

    Every file in ``data_paths`` is read in binary mode and tokenised line by
    line (with ``tokenizer`` when given, otherwise ``basic_tokenizer``). The
    distinct tokens are sorted by descending count, prefixed with
    ``_START_VOCAB``, and written one per line to ``vocabulary_path``.
    Nothing is done when ``vocabulary_path`` already exists.
    """
    print('creating vocabulary')
    if gfile.Exists(vocabulary_path):
        return

    print(("Creating vocabulary %s from data %s" % (vocabulary_path, str(data_paths))))
    frequencies = {}
    for path in data_paths:
        with open(path, mode="rb") as handle:
            for line_no, line in enumerate(handle, start=1):
                if line_no % 100000 == 0:
                    print(("processing line %d" % line_no))
                if tokenizer:
                    tokens = tokenizer(line)
                else:
                    tokens = basic_tokenizer(line)
                for token in tokens:
                    frequencies[token] = frequencies.get(token, 0) + 1

    ranked_tokens = sorted(frequencies, key=frequencies.get, reverse=True)
    vocab_list = _START_VOCAB + ranked_tokens
    print(("Vocabulary size: %d" % len(vocab_list)))
    with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for entry in vocab_list:
            vocab_file.write(entry + b"\n")


def sentence_to_token_ids(sentence, vocabulary, tokenizer=None):
    """Convert ``sentence`` to a list of token ids.

    Args:
        sentence: text to convert.
        vocabulary: dict mapping token -> id.
        tokenizer: optional callable returning the tokens of ``sentence``.
            Tokens may be objects exposing their text as ``orth_`` (as the
            spaCy branch of get_tokenizer produces) or plain strings (as
            ``word_tokenize`` produces). When omitted, ``basic_tokenizer``
            is used.

    Returns:
        List of ids; tokens absent from ``vocabulary`` map to ``UNK_ID``.
    """
    if tokenizer:
        # Fall back to the token itself when it has no ``orth_`` attribute,
        # so string-returning tokenizers work as well.
        texts = [getattr(token, "orth_", token) for token in tokenizer(sentence)]
        words = [text for text in texts if not text.isspace()]
    else:
        words = basic_tokenizer(sentence)
    return [vocabulary.get(w, UNK_ID) for w in words]


def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None):
    """Translate a text file into a file of space-separated token ids.

    Each line of ``data_path`` is converted with ``sentence_to_token_ids``
    using the vocabulary loaded from ``vocabulary_path`` and written as one
    line to ``target_path``. Nothing is done when ``target_path`` already
    exists.
    """
    print('converting data to token ids')
    if gfile.Exists(target_path):
        return

    print(("Tokenizing data in %s" % data_path))
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file, \
            gfile.GFile(target_path, mode="w") as tokens_file:
        for line_no, line in enumerate(data_file, start=1):
            if line_no % 5000 == 0:
                print(("tokenizing line %d" % line_no))
            ids = sentence_to_token_ids(line, vocab, tokenizer)
            tokens_file.write(" ".join(map(str, ids)) + "\n")

                    
def get_tokenizer(name):
    """Return the tokenizer selected by ``name``.

    Args:
        name: ``'spaCy'`` for a new ``English()`` pipeline, or
            ``'word_tokenize'`` for NLTK's ``word_tokenize`` function.

    Raises:
        ValueError: if ``name`` is not one of the supported values.
    """
    if name == 'spaCy':
        return English()
    if name == 'word_tokenize':
        return word_tokenize
    # Fail with a clear message instead of an UnboundLocalError.
    raise ValueError(
        "Unknown tokenizer %r; expected 'spaCy' or 'word_tokenize'." % (name,))

def basic_tokenizer(sentence):
    """Split ``sentence`` on runs of whitespace and return the pieces as a list."""
    pieces = sentence.split()
    return pieces

# Preprocessing pipeline: split the CSV into text files, build the vocabulary,
# trim GloVe to that vocabulary, then convert every question file to token ids.
if __name__ == '__main__':
    args = setup_args()
    tokenizer = get_tokenizer(args.tokenizer)
    vocab_path = pjoin(args.vocab_dir, "vocab.dat")
    data_path = pjoin(args.data_dir, "train.csv")
    # Training and validation files share the same output directory.
    train_path = pjoin(args.source_dir)
    valid_path = pjoin(args.source_dir)

    split = args.split

    create_txt(data_path, train_path, valid_path, split, args.question_lower, args.question_upper)

    create_vocabulary(vocab_path,
                      [pjoin(args.source_dir, "question1_train.txt"),
                       pjoin(args.source_dir, "question2_train.txt"),
                       pjoin(args.source_dir, "question1_val.txt"),
                       pjoin(args.source_dir, "question2_val.txt")])
    vocab, rev_vocab = initialize_vocabulary(vocab_path)

    process_glove(args, rev_vocab, args.source_dir + "/glove.trimmed.{}".format(args.glove_dim),
                  random_init=args.random_init)

    question1_train_ids_path = train_path + "/.ids.train.question1"
    question2_train_ids_path = train_path + "/.ids.train.question2"
    data_to_token_ids(train_path + "/question1_train.txt", question1_train_ids_path, vocab_path, tokenizer=tokenizer)
    data_to_token_ids(train_path + "/question2_train.txt", question2_train_ids_path, vocab_path, tokenizer=tokenizer)

    # The original script repeated this validation step with the undefined
    # names v_path and test_path, which raised NameError; the single pass
    # below covers both validation files.
    question1_val_ids_path = valid_path + "/.ids.val.question1"
    question2_val_ids_path = valid_path + "/.ids.val.question2"
    data_to_token_ids(valid_path + "/question1_val.txt", question1_val_ids_path, vocab_path, tokenizer=tokenizer)
    data_to_token_ids(valid_path + "/question2_val.txt", question2_val_ids_path, vocab_path, tokenizer=tokenizer)

In [None]:
# %load "../../nlp/cs224n/assignment4/assignment4/code/traincolab_dmn_wordvar.py"




import os
import json

import tensorflow as tf
import numpy as np

from qa_model_dmn_wordvar import Encoder, QASystem, Decoder, Config, EpisodicMemoryCell, EpisodicMemory
from os.path import join as pjoin

import logging

logging.basicConfig(level=logging.INFO)

class _FLAGS():
    """Plain container for the training settings read by this script.

    The values are stored as instance attributes (not class attributes) so
    that ``vars(FLAGS)`` lists all of them in declaration order.
    """

    def __init__(self):
        settings = (
            ("learning_rate", 0.0003),
            ("max_gradient_norm", 5.0),
            ("dropout", 0.15),
            ("batch_size", 128),
            ("epochs", 40),
            ("state_size", 250),
            ("output_size", 750),
            ("embedding_size", 100),
            ("data_dir", "data/squad"),
            ("train_dir", "traindmn_wordvar"),
            ("load_train_dir", ""),
            ("log_dir", "log"),
            ("optimizer", "adam"),
            ("print_every", 1),
            ("keep", 0),
            ("vocab_path", "data/squad/vocab.dat"),
            ("embed_path", ""),
            ("evaluation_size", 500),
            ("n_layers", 5),
            ("attention_hidden_units", 500),
        )
        for attr_name, attr_value in settings:
            setattr(self, attr_name, attr_value)


# Module-wide settings instance used by the functions below.
FLAGS = _FLAGS()

def initialize_model(session, model, train_dir):
    """Restore ``model`` from a checkpoint in ``train_dir`` or initialise it.

    When ``tf.train.get_checkpoint_state`` reports a checkpoint whose path (or
    that path plus ``".index"``) exists, the parameters are restored through
    ``model.saver``. Otherwise the global variables initialiser is run and the
    number of trainable parameters is logged. Returns ``model``.
    """
    ckpt = tf.train.get_checkpoint_state(train_dir)

    restorable = False
    if ckpt:
        v2_path = ckpt.model_checkpoint_path + ".index"
        restorable = tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path)

    if not restorable:
        logging.info("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
        param_count = sum(v.get_shape().num_elements() for v in tf.trainable_variables())
        logging.info('Num params: %d' % param_count)
        return model

    logging.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
    model.saver.restore(session, ckpt.model_checkpoint_path)
    return model

def get_device_name():
    """Print the GPU device name reported by TensorFlow and return the device.

    Returns 'CPU' when ``tf.test.gpu_device_name()`` is the empty string,
    otherwise the reported GPU device name.
    """
    print('device in use:',tf.test.gpu_device_name() )
    if tf.test.gpu_device_name() == '':
        return 'CPU'
    return tf.test.gpu_device_name()

def get_normalized_train_dir(train_dir):
    """Point the fixed symlink ``dir_node_wordvar`` at ``train_dir``.

    The link is created relative to the current working directory and replaces
    any previous entry of that name. ``train_dir`` is created when it does not
    exist yet. Routing checkpoints through one fixed link name keeps the paths
    stored in a checkpoint stable even when the real directory changes.

    Args:
        train_dir: directory that holds (or will hold) the checkpoints.

    Returns:
        The link name, ``'dir_node_wordvar'``.
    """
    global_train_dir = 'dir_node_wordvar'
    # lexists() is also True for a dangling symlink, which exists() reports as
    # missing; without it a stale link would make os.symlink below fail with
    # FileExistsError.
    if os.path.lexists(global_train_dir):
        os.unlink(global_train_dir) #forlinux
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    os.symlink(os.path.abspath(train_dir), global_train_dir, True)
    return global_train_dir

def get_pretrained_embeddings(embed_path):
    """Load the embedding matrix stored under the key ``glove`` in an .npz file.

    Args:
        embed_path: path of the ``.npz`` archive.

    Returns:
        The array saved as ``glove``.
    """
    # np.load returns a lazy archive object holding an open file handle; the
    # context manager closes it once the array has been read.
    with np.load(embed_path) as archive:
        return archive['glove']


def main():
    """Assemble the model from FLAGS, load the token-id data and train.

    Steps:
      1. Load the trimmed GloVe matrix (``FLAGS.embed_path`` or the default
         file inside ``FLAGS.data_dir``).
      2. Build Config, EpisodicMemoryCell, EpisodicMemory, Encoder, Decoder
         and QASystem from the FLAGS values.
      3. Add a file handler writing to ``FLAGS.log_dir``/log.txt.
      4. Collect the vocabulary line indices holding the token ``"."``.
      5. Read the train/val question, context and span files from
         ``FLAGS.data_dir`` and call ``qa.train`` inside a TensorFlow session.
    """
    print('Device in use {}'.format(get_device_name()))

    embed_path = FLAGS.embed_path or pjoin(FLAGS.data_dir, "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    pretrained_embeddings = get_pretrained_embeddings(embed_path)

    config = Config(FLAGS.embedding_size, FLAGS.evaluation_size, FLAGS.optimizer, FLAGS.batch_size,
                    FLAGS.learning_rate, FLAGS.max_gradient_norm)
    cell = EpisodicMemoryCell(2*FLAGS.state_size)
    episodicmemory = EpisodicMemory(cell, FLAGS.n_layers, FLAGS.state_size, FLAGS.attention_hidden_units)
    encoder = Encoder(FLAGS.state_size, vocab_dim=FLAGS.embedding_size)
    decoder = Decoder(decoder_size=FLAGS.output_size)

    qa = QASystem(encoder, decoder, episodicmemory, pretrained_embeddings, config)

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    # Vocabulary line indices (i.e. token ids) whose token is a full stop.
    with open(vocab_path, encoding='utf8') as vocab_file:
        period_location = [idx for idx, token in enumerate(vocab_file) if token == '.\n']

    print((vars(FLAGS)))

    def _read_lines(file_name):
        # Read one data file from FLAGS.data_dir as a list of lines.
        with open(pjoin(FLAGS.data_dir, file_name), encoding='utf8') as handle:
            return handle.readlines()

    # dataset layout: [[questions, contexts, spans] for train, same for val]
    dataset = [
        [_read_lines(name) for name in ("train.ids.question", "train.ids.context", "train.span")],
        [_read_lines(name) for name in ("val.ids.question", "val.ids.context", "val.span")],
    ]

    with tf.Session() as sess:
        load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir)
        initialize_model(sess, qa, load_train_dir)

        save_train_dir = get_normalized_train_dir(FLAGS.train_dir)
        qa.train(sess, dataset, FLAGS.epochs, period_location, save_train_dir)

# Script entry point: start training when this file is executed directly.
if __name__ == "__main__":
    main()
#     tf.app.run()