# Preparation

[View in Colaboratory](https://colab.research.google.com/github/stikbuf/Language_Modeling/blob/master/Keras_Character_Aware_Neural_Language_Models.ipynb)

## Configure the cloud environment


### Mount Google Drive

In [None]:
# Install a Drive FUSE wrapper.
# https://github.com/astrada/google-drive-ocamlfuse
# The `!` lines are shell commands executed by the notebook, not Python.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse


# Generate auth tokens for Colab
from google.colab import auth
auth.authenticate_user()


# Generate creds for the Drive FUSE library.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First invocation: only the line containing the authorization URL is shown (grep URL).
# NOTE(review): the client id/secret and the verification code below are interpolated
# into shell command lines; do not share this cell's output.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt (without echo) for the verification code and pipe it to the second invocation.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [None]:
# If you get a "Transport endpoint is not connected." error, the line below unmounts the drive first
# so that it can be mounted again.
# See https://stackoverflow.com/questions/49588113/google-colab-script-throws-transport-endpoint-is-not-connected?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
!fusermount -u drive

# Create a directory and mount Google Drive using that directory.
!mkdir -p drive
!google-drive-ocamlfuse drive
# `a` captures the output lines of `ls`; an empty listing is treated as a failed mount.
a = !ls drive/
print('Files in Drive:', a)
assert a!=[], 'Drive should not be empty!'

In [None]:
local_path='./drive/share_with_me/AI/Character-aware_LM/'
#local_path='./'
import sys
sys.path.append(local_path)
!ls './drive/share_with_me/AI/Character-aware_LM/'

In [None]:
import tensorflow as tf
# Uncomment the assertion to stop immediately when the runtime has no GPU.
#assert tf.test.gpu_device_name() != '', "GPU not avaliable!"
# The cell output is the GPU device name reported by TensorFlow.
tf.test.gpu_device_name()

## Load data (Penn Tree bank -- PTB)

In [None]:
# Base directory used from here on for the 'data' and 'model/' folders.
# This replaces any value assigned to local_path by an earlier cell.
local_path='./'

In [None]:
from __future__ import print_function
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" # Use single card. THIS LINE MUST BE RUN BEFORE TENSORFLOW IS IMPORTED
import pylab
%pylab inline
import matplotlib.pyplot as plt
import matplotlib  
%matplotlib inline
import tensorflow as tf
import numpy as np
import pandas as pd
import random

from reader import ptb_raw_data, ptb_producer # by Google

In [None]:
# Read the three PTB splits (sequences of word ids) together with the word -> id vocabulary.
train_data, valid_data, test_data, word_to_id = ptb_raw_data(local_path + 'data') # tokens
# Reverse lookup table: id -> word.
id_to_word = {idx: word for word, idx in word_to_id.items()}
voc_size = len(id_to_word)
voc_size

In [None]:
# Report the number of tokens in each split, then peek at the first ten training ids.
print('Train data size: {0}, Valid data size: {1}, Test data size: {2}\n'.format(
    *(len(split) for split in (train_data, valid_data, test_data))))
print('train/val/test_data is a list, some elements in train_data is', train_data[:10])

In [None]:
# Append the sequence start ('<SS>') and end ('<EE>') tokens to the vocabulary.
# Each token is added only if it is missing, so re-running this cell does not
# keep growing the vocabulary (the unguarded version added two ids per run).
for special_token in ('<SS>', '<EE>'):
    if special_token not in word_to_id:
        # Next free id; assumes the existing ids are 0 .. len(id_to_word) - 1.
        id_to_word[len(id_to_word)] = special_token
# Rebuild the word -> id map so that it includes the new tokens.
word_to_id = dict((v, k) for k, v in id_to_word.items())
voc_size = len(id_to_word)
voc_size

In [None]:
# Vocabulary as a table indexed by word, with a single 'id' column in ascending id order.
word_id = (pd.DataFrame.from_dict(word_to_id, orient='index')
             .sort_values(by=0, ascending=True)
             .rename(columns={0: 'id'}))
print(word_id.head())
print(word_id.tail())

In [None]:
# Reverse table: indexed by id, with a single 'word' column.
id_word = pd.DataFrame.from_dict(id_to_word, orient='index').rename(columns={0: 'word'})
print(id_word.head())
print(id_word.tail())

In [None]:
# First 50 training tokens rendered back as text.
' '.join(id_to_word[token_id] for token_id in train_data[:50])

In [None]:
# Number of corpus words in each training window (the generators add '<SS>' / '<EE>' around it).
seq_len = 35

# RNN baseline

## data generator

In [None]:
import random

from tensorflow.python.keras.utils import to_categorical 

def gen_word_word(batch_size=128, dataset='train'):
    """Endless generator of shuffled word-level (X, Y) batches.

    Every sample is a window of `seq_len` consecutive corpus words wrapped as
    ['<SS>'] + window + ['<EE>']. X is that sequence without its last element,
    Y is the same sequence shifted one step left and one-hot encoded over
    `voc_size` classes.

    batch_size: number of windows per batch.
    dataset: 'train', 'valid' or 'test'.
    yields: (X, Y) with X of shape (batch_size, seq_len + 1) holding word ids and
            Y of shape (batch_size, seq_len + 1, voc_size).
    """
    assert dataset in ('train', 'valid', 'test'), 'Dataset must be train or valid or test.'

    corpus = {'train':train_data, 'valid':valid_data, 'test':test_data}[dataset]

    while True:
        # One pass = every window start position, visited in a fresh random order.
        starts = list(range(len(corpus)-seq_len-1))
        random.shuffle(starts)
        for offset in range(0, len(starts) - batch_size, batch_size):
            head, tail = [word_to_id['<SS>']], [word_to_id['<EE>']]
            windows = np.array([head + corpus[i:i+seq_len] + tail
                                for i in starts[offset:offset+batch_size]])
            inputs, targets = windows[:,:-1], windows[:,1:]
            yield inputs, to_categorical(targets, num_classes=voc_size)

## Keras model

In [None]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import GRU, Dense, Embedding, InputLayer, Dropout
from tensorflow.python.keras.optimizers import RMSprop

Add dropout between layers, see [Recurrent Neural Network Regularization](https://arxiv.org/abs/1409.2329)

In [None]:
embedding_size = 128

# Word-level baseline: embedding -> two stacked GRUs (with dropout after each)
# -> per-position softmax over the vocabulary.
model = Sequential([
    Embedding(input_dim=voc_size,
              output_dim=embedding_size,
              name='inputEmbedding'),
    GRU(units=128, return_sequences=True),
    Dropout(0.3),
    GRU(units=64, return_sequences=True),
    Dropout(0.2),
    Dense(voc_size, activation='softmax'),
])

If we denote $w_{1:T} = [w_1, w_2,...,w_T ]$ to be the sequence of words in the training corpus, training involves minimizing
the negative log-likelihood ($NLL$)
$$NLL = - \sum_{t=1}^{T} \log Pr (w_t | w_{1:t-1})$$
i.e. the cross-entropy loss (without averaging).  
As is standard in language modeling, we use perplexity (PPL) to evaluate the performance of our models. The perplexity of a model over a sequence $[w_1, w_2,...,w_T ]$ is given by
$$PPL = e^\frac{NLL}{T} = e^{ave (Crossentropy)}$$
where $NLL/Crossentropy$ is calculated over the test set.


In [None]:
# perplexity
def PPL(y_true, y_pred):
    """Perplexity metric: exp of the categorical cross-entropy averaged over all positions of the batch."""
    cross_entropy = tf.keras.backend.categorical_crossentropy(y_true, y_pred)
    mean_cross_entropy = tf.reduce_mean(cross_entropy)
    return tf.exp(mean_cross_entropy)

def ACC(y_true, y_pred):
    """Accuracy metric: fraction of positions whose arg-max class (axis 2) matches the target's."""
    true_ids = tf.argmax(y_true, axis = 2)
    pred_ids = tf.argmax(y_pred, axis = 2)
    hits = tf.cast(tf.equal(true_ids, pred_ids), tf.float32)
    return tf.reduce_mean(hits)

In [None]:
# RMSprop with an explicit learning rate; accuracy and perplexity are tracked next to the loss.
optimizer = RMSprop(lr=1e-3)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=[ACC, PPL])

In [None]:
# Print the layer-by-layer summary of the baseline model.
model.summary()

In [None]:
from tensorflow.python.keras.callbacks import TensorBoard, ModelCheckpoint
import os
# Make sure the checkpoint directory exists (makedirs also creates missing parents).
if not os.path.exists(local_path + 'model/'):
    os.makedirs(local_path + 'model/')

path_model = local_path + 'model/model.keras'
tensorboard = TensorBoard(log_dir='log')
# Save the model only when validation perplexity improves (lower is better).
# save_best_only takes a boolean; the string 'True' only worked because it is truthy.
checkpoint = ModelCheckpoint(filepath=path_model, verbose=1,
                             monitor='val_PPL', mode='min', save_best_only=True)

callback_lists=[tensorboard,checkpoint]

In [None]:
# Train the baseline. Both generators are endless, so an "epoch" is simply
# steps_per_epoch training batches followed by validation_steps validation batches.
history = model.fit_generator(generator=gen_word_word(), 
                           steps_per_epoch=50, epochs=125,
                           callbacks=callback_lists,
                           validation_data=gen_word_word(dataset='valid'),
                           validation_steps=30)

In [None]:
# One row per epoch, one column per logged quantity.
logs = pd.DataFrame(history.history)

In [None]:
print(logs.columns)
pylab.rcParams['figure.figsize'] = (13, 8)
# Row 0 (the first epoch) is left out of the plot; the author notes this makes the figure prettier.
logs.loc[1:,['PPL','val_PPL']].plot() # start with 1 makes the figure prettier

In [None]:
# path_model = local_path + 'model/model.keras'
# model.save(path_model)

In [None]:
from tensorflow.python.keras.models import load_model

# The custom metric functions must be supplied by name so the saved model can be deserialized.
model_restore = load_model(path_model, custom_objects={'ACC':ACC,'PPL': PPL})

In [None]:
# Continue training the restored model for a few more epochs with the same callbacks.
# Note that `history` is overwritten by this run.
history = model_restore.fit_generator(generator=gen_word_word(), 
                           steps_per_epoch=50, epochs=3,
                           callbacks=callback_lists,
                           validation_data=gen_word_word(dataset='valid'),
                           validation_steps=30)

In [None]:
# Training / validation perplexity of the continued run, all epochs included.
logs = pd.DataFrame(history.history)
print(logs.columns)
pylab.rcParams['figure.figsize'] = (13, 8)
logs.loc[:,['PPL','val_PPL']].plot()

In [None]:
def predict_seq(model, preSeq=None, genLen=seq_len, power=1):
    """ Generate genLen words one at a time, continuing an optional leading sequence.
        arg:
            model: Keras model used to predict. It is fed the word ids generated so far
                   (shape (1, current_length)) and its output at the last position is
                   used as the distribution of the next word.
            preSeq: sequence of word ids or None. The leading sequence; '<SS>' is
                    prepended automatically and the caller's object is not modified.
            genLen: int. Number of words to generate.
            power: float or np.inf. Probability power: the predicted probabilities are
                   raised to this power and renormalised before sampling.
                   If power is equal to np.inf, then an argmax will be used.
        returns:
            (ids, text): the full id sequence (including '<SS>' and the leading
            sequence) and the same sequence joined into a space-separated string.
    """
    start = [word_to_id['<SS>']]
    # list(...) so that tuples / arrays are concatenated rather than added element-wise.
    preSeq = start if preSeq is None else start + list(preSeq)

    for _ in range(genLen):
        inputSeq = np.array([preSeq])
        # The prediction for the next word sits at the last input position.
        prob = model.predict(inputSeq)[0, -1, :]
        if power==np.inf:
            pred = np.argmax(prob)
        else:
            # Renormalise in float64 so the probabilities sum to 1 as np.random.choice requires.
            prob = np.power(prob.astype(np.float64), power)
            prob = prob / np.sum(prob)
            pred = np.random.choice(len(prob), p=prob)
        preSeq.append(int(pred))

    return preSeq, ' '.join([id_to_word[wordId] for wordId in preSeq])

In [None]:
# Sample a sentence from the baseline model (power=1 samples from the unmodified distribution).
_, seq = predict_seq(model, power=1)
seq

# Character aware model

[Character-Aware Neural Language Models -- arxiv-1508.06615 -- AAAI 2016](https://arxiv.org/abs/1508.06615)

[Ref: Github/jarfo/kchar](https://github.com/jarfo/kchar)


![model](https://github.com/stikbuf/Language_Modeling/blob/master/Character%20aware.png?raw=true)


## Load data

### convert to text

In [None]:
# Turn every split from word ids back into word strings and pool all three.
train_data_text, valid_data_text, test_data_text = (
    [id_to_word[idx] for idx in split]
    for split in (train_data, valid_data, test_data))
total_data_text = train_data_text + valid_data_text + test_data_text

# Word lengths over the pooled corpus.
ds = pd.Series([len(word) for word in total_data_text])
maxWordLen = int(ds.max()) + 2 # longest word plus the start and end characters

plt.figure(figsize=(10,5))
ds.plot.hist(bins=range(1, maxWordLen))
plt.title('word length distribution, max={0}, min={1}'.
          format(ds.max(), ds.min()))

### merge chars

In [None]:
# Every character occurrence of the pooled corpus (input of the frequency plot).
chars = [char for word in total_data_text for char in word]

ds = pd.Series(chars)
plt.figure(figsize=(15,10))
matplotlib.rc('xtick', labelsize=20)
matplotlib.rc('ytick', labelsize=15)
ds.value_counts().plot.bar()
plt.title('character distribution, total {0} characters(without \'S\' and \'E\')'.format(len(set(chars))))
plt.show()

# 'S' for word leading char, 'E' for word ending char, space for padding.
# The vocabulary is sorted so that every character gets the same id in every
# kernel session. The iteration order of a set of strings can change between
# Python processes, which would make a saved character-aware model unusable
# after a restart.
chars = sorted(set(chars) | {'S', 'E', ' '})
id_to_chars = dict(enumerate(chars))
chars_to_id = dict((v, k) for k,v in id_to_chars.items())
num_chars = len(chars)

print('number of chars:', num_chars, '\n')
print(chars, '\n')
print(id_to_chars, '\n')
print(chars_to_id, '\n')

In [None]:
def word_to_charId(wordId):
    """Encode the vocabulary word with id `wordId` as a list of character ids.

    The word is centred with spaces to maxWordLen - 2 characters and wrapped in
    the 'S' (start) and 'E' (end) markers before each character is looked up.
    """
    centred = id_to_word[wordId].center(maxWordLen - 2)
    return [chars_to_id[char] for char in 'S' + centred + 'E']

def wordSeq_charSeq(bWordSeq):
    """Convert a (batch, seqLen) array of word ids into a (batch, seqLen, chars-per-word) array of character ids."""
    nBatch, nWords = bWordSeq.shape
    perWord = [word_to_charId(wordId) for wordId in bWordSeq.ravel()]
    return np.array(perWord).reshape(nBatch, nWords, -1)

# Example: character ids of the padded word 'the'.
word_to_charId(word_to_id['the']) # 

In [None]:
import random

from tensorflow.python.keras.utils import to_categorical 

def gen_char_word(batch_size=128, dataset='train'):
    """Endless generator of shuffled (X, Y) batches with character-level inputs.

    Windows are built exactly as in the word-level generator
    (['<SS>'] + seq_len corpus words + ['<EE>']); the inputs are then expanded
    from word ids to per-word character ids with wordSeq_charSeq, while the
    targets stay one-hot encoded words.

    batch_size: number of windows per batch.
    dataset: 'train', 'valid' or 'test'.
    yields: (X, Y) with X of shape (batch_size, seq_len + 1, chars-per-word) and
            Y of shape (batch_size, seq_len + 1, voc_size).
    """
    assert dataset in ('train', 'valid', 'test'), 'Dataset must be train or valid or test.'

    corpus = {'train':train_data, 'valid':valid_data, 'test':test_data}[dataset]

    while True:
        # Fresh random order of all window start positions for every pass.
        starts = list(range(len(corpus)-seq_len-1))
        random.shuffle(starts)
        for offset in range(0, len(starts) - batch_size, batch_size):
            head, tail = [word_to_id['<SS>']], [word_to_id['<EE>']]
            windows = np.array([head + corpus[i:i+seq_len] + tail
                                for i in starts[offset:offset+batch_size]])
            targets = to_categorical(windows[:,1:], num_classes=voc_size)
            yield wordSeq_charSeq(windows[:,:-1]), targets

In [None]:
# Shape of one target vector: batch [1] -> first sample [0] -> third position [2].
print(next(gen_char_word(batch_size=1, dataset='train'))[1][0][2].shape)

## option

In [None]:
class Option():
    """Hyper-parameters of the character-aware language model."""
    def __init__(self):
        # Training
        self.batch_size = 20
        self.dropout = 0.5
        self.learing_rate = 1e-5  # attribute name kept as spelled; other cells read it

        # Sequence / vocabulary sizes taken from the data cells
        self.seq_length = seq_len + 1
        self.max_word_l = maxWordLen # Include Start and End character
        self.char_vocab_size = num_chars
        self.word_vocab_size = voc_size

        # Character CNN: kernels[i] is the width of the filters counted in feature_maps[i]
        self.char_vec_size = 15
        self.kernels = [1,2,3,4,5,6,7]
        self.feature_maps = [50,100,150,200,200,200,200]

        # Layers on top of the CNN
        self.highway_layers = 2
        self.num_lstm_layers = 2
        self.rnn_size = 128

opt = Option()

## CNN
![CNN part](https://github.com/stikbuf/Language_Modeling/blob/master/Character%20aware-CNN.png?raw=true)

In [None]:
from tensorflow.python.keras.layers import Conv2D, MaxPooling2D, Concatenate, Reshape

def CNN(seq_length, length, feature_maps, kernels, x):
    """Character CNN with max-over-time pooling, applied to every word of a sequence.

    seq_length: number of words per sequence.
    length: number of characters per word.
    feature_maps: number of filters for each kernel width.
    kernels: kernel widths, paired element-wise with feature_maps.
    x: input tensor; expected to be (batch, seq_length, length, char_vec_size) -- TODO confirm.
    returns: tensor reshaped to (seq_length, sum(feature_maps)) per sample.
    """
    pooled = []
    for nFilters, width in zip(feature_maps, kernels):
        # A (1, width) kernel slides along the characters of each word only.
        conv = Conv2D(nFilters, (1, width), activation='tanh', data_format="channels_last")(x)
        # Pool over all length - width + 1 valid positions, leaving one value per filter.
        pooled.append(MaxPooling2D((1, length - width + 1), data_format="channels_last")(conv))

    merged = Concatenate()(pooled)
    return Reshape((seq_length, sum(feature_maps)))(merged)

## Highway Network  
[Srivastava et al.](https://arxiv.org/abs/1505.00387)

Input vector is $\textbf{y}$, then layer output $\textbf{z}$ is
$$\textbf{z = t} \odot g(\textbf{W}_H\textbf{y}+\textbf{b}_H) + \textbf{(1 - t)} \odot \textbf{y}$$
where 
$$\textbf{t} = \sigma(\textbf{W}_T\textbf{y}+\textbf{b}_T)$$

$\textbf{t}$ is called the
*transform gate*, and $(\textbf{1}-\textbf{t})$ is called the *carry gate*. 
Similar to the memory cells in LSTM networks, highway layers allow for training of deep networks by adaptively carrying some dimensions of the input directly to the output. By construction the dimensions of $\textbf{y}$ and $\textbf{z}$ have to match, and hence $\textbf{W}_T$ and $\textbf{W}_H$ are square matrices. **Basically, a highway layer is a dense layer with residual connection modulated by an adaptive gate.**

A keras model is also a keras layer! So you can combine some keras layers to design your own layer. This is useful when combining with TimeDistributed wrapper. [See section 3 in this blog](https://keunwoochoi.wordpress.com/2016/11/18/for-beginners-writing-a-custom-keras-layer/)

In [None]:
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Dense, Activation, Multiply, Add, Lambda, Input
from tensorflow.python.keras.initializers import Constant
from tensorflow.python.keras.models import Model

class LambdaWithShape(Lambda):
    """Lambda layer that reports its output shape as identical to its input shape."""
    def compute_output_shape(self, input_shape):
        # Intended for functions that do not change the shape of their input.
        output_shape = input_shape
        return output_shape

def Highway(value, nLayers, activation='tanh', gateBias=-3):
    """Stack nLayers highway layers on top of `value`.

    Each layer computes z = t * g(W_H y + b_H) + (1 - t) * y with
    t = sigmoid(W_T y + b_T), where y is the layer input.

    value: input tensor; its last dimension is kept by every layer.
    nLayers: number of highway layers.
    activation: non-linearity g applied to the transformed input.
    gateBias: initial value of the transform-gate bias b_T. A negative value
              biases the layer towards carrying its input through unchanged.
    returns: output tensor of the last highway layer.
    """
    dim = K.int_shape(value)[-1]
    gateBiasInitalizer = Constant(gateBias)
    for _ in range(nLayers):
        # Transform gate t.
        tGate = Dense(units=dim, bias_initializer=gateBiasInitalizer)(value)
        tGate = Activation('sigmoid')(tGate)
        # Carry gate 1 - t.
        cGate = LambdaWithShape(lambda x: 1.0-x)(tGate) # I do not specify output_shape
        # Candidate g(W_H y + b_H). The activation must be applied to the Dense
        # output; applying it to `value` left W_H / b_H out of the model entirely.
        # Only the gate uses the negative bias; the transform keeps the default.
        transformed = Dense(units=dim)(value)
        transformed = Activation(activation)(transformed)
        transformedGate = Multiply()([tGate, transformed])
        identityGate = Multiply()([cGate, value])
        value = Add()([transformedGate, identityGate])
    return value

# Wrap the highway stack in a Model so that it can be reused as a single layer.
# Its input is one feature vector of size sum(opt.feature_maps).
inputs = Input((sum(opt.feature_maps),))
HighwayLayer = Model(inputs=inputs, outputs=Highway(inputs, nLayers=opt.highway_layers))

## Model

In [None]:
from tensorflow.python.keras.layers import Input, Embedding, GRU, Dropout, BatchNormalization, TimeDistributed
#from tensorflow.python.keras.optimizers import SGD

#chars = Input(batch_shape=(opt.batch_size, opt.seq_length, opt.max_word_l), name='chars')
# Character-id input: opt.seq_length words per sample, opt.max_word_l character ids per word.
# NOTE(review): this rebinds the notebook-level name `chars`, which an earlier cell used for the character list.
chars = Input(shape=(opt.seq_length, opt.max_word_l), name='chars') # will get a warning if you do not specify batch_shape
chars_embedding = Embedding(opt.char_vocab_size, opt.char_vec_size, name='chars_embedding')(chars)
cnn = CNN(opt.seq_length, opt.max_word_l, opt.feature_maps, opt.kernels, chars_embedding)
x = cnn
inputs = chars

x = BatchNormalization()(x)

# Apply the highway sub-model to every word position independently.
x = TimeDistributed(HighwayLayer)(x)
# Kept so that a feature-extractor model can be built on this intermediate tensor.
highway = x

# Recurrent stack (GRU layers), each followed by dropout when opt.dropout > 0.
for l in range(opt.num_lstm_layers):
    #x = GRU(opt.rnn_size, return_sequences=True, stateful=True)(x)
    x = GRU(opt.rnn_size, return_sequences=True, stateful=False)(x)

    if opt.dropout > 0:
        x = Dropout(opt.dropout)(x)
        
# Per-position distribution over the word vocabulary.
output = Dense(opt.word_vocab_size, activation='softmax')(x)

modelCAware = Model(inputs=inputs, outputs=output)
modelCAware.summary()    

In [None]:
# Sub-models sharing the layers of modelCAware: they output the per-word
# features right after the CNN and right after the highway layers, respectively.
modelConvWordFeatureBeforeHighway = Model(inputs=inputs, outputs=cnn)
modelConvWordFeatureAfterHighway = Model(inputs=inputs, outputs=highway)

In [None]:
# perplexity
def PPL(y_true, y_pred):
    """Perplexity metric: exp of the batch-mean categorical cross-entropy."""
    per_position_loss = tf.keras.backend.categorical_crossentropy(y_true, y_pred)
    return tf.exp(tf.reduce_mean(per_position_loss))

def ACC(y_true, y_pred):
    """Accuracy metric: share of positions where the arg-max over axis 2 of the prediction equals that of the target."""
    target_ids = tf.argmax(y_true, axis = 2)
    predicted_ids = tf.argmax(y_pred, axis = 2)
    matches = tf.equal(target_ids, predicted_ids)
    return tf.reduce_mean(tf.cast(matches, tf.float32))

In [None]:
from tensorflow.python.keras.optimizers import RMSprop

# Pass the configured optimizer object to compile(). The string 'rmsprop' would
# build a fresh RMSprop with Keras' default learning rate and silently ignore
# opt.learing_rate.
optimizer = RMSprop(lr=opt.learing_rate)
modelCAware.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[ACC, PPL])

#### Tensorboard histograms
Keras has a bug in TensorBoard visualization, see https://github.com/keras-team/keras/issues/3358  
DO NOT set show_hist_gram=True  unless you want TensorBoard visualization

In [None]:
# True: log TensorBoard histograms (validation data is then passed as in-memory arrays).
# False: plain TensorBoard logging with a validation generator.
show_hist_gram = True 

In [None]:
from tensorflow.python.keras.callbacks import TensorBoard, ModelCheckpoint
import os
# Make sure the checkpoint directory exists (makedirs also creates missing parents).
if not os.path.exists(local_path + 'model/'):
    os.makedirs(local_path + 'model/')

path_model = local_path + 'model/modelAware.keras'
if show_hist_gram:
    # Histogram (and gradient) summaries are written once per epoch.
    tensorboard = TensorBoard(log_dir='log', histogram_freq=1, write_grads=True)
else:
    tensorboard = TensorBoard(log_dir='log')
# Save the model only when validation perplexity improves (lower is better).
# save_best_only takes a boolean; the string 'True' only worked because it is truthy.
checkpoint = ModelCheckpoint(filepath=path_model, verbose=1,
                             monitor='val_PPL', mode='min', save_best_only=True)

callback_lists=[tensorboard,checkpoint]

In [None]:
if show_hist_gram:
    # With histograms enabled Keras rejects a validation generator
    # ("ValueError: If printing histograms, validation_data must be provided, and cannot be a generator."),
    # so one validation batch is drawn up front and passed as arrays.
    history = modelCAware.fit_generator(generator=gen_char_word(batch_size=opt.batch_size), 
                           steps_per_epoch=50, epochs=5,
                           callbacks=callback_lists, 
                           #validation_data=gen_char_word(batch_size=opt.batch_size, dataset='valid'),  # not allowed together with histograms
                           validation_data=next(gen_char_word(batch_size=(len(valid_data)-seq_len-2)//100, dataset='valid')),  # //100 to avoid memory error.
                           validation_steps=None) 
else:
    # Without histograms the validation data can be streamed from a generator.
    history = modelCAware.fit_generator(generator=gen_char_word(batch_size=opt.batch_size), 
                           steps_per_epoch=50, epochs=5,
                           callbacks=callback_lists, 
                           validation_data=gen_char_word(batch_size=opt.batch_size, dataset='valid'),
                           #validation_data=next(gen_char_word(batch_size=(len(valid_data)-seq_len-2)//100, dataset='valid')),  # //100 to avoid memory error. DO NOT use this line unless you want TensorBoard visualization
                           validation_steps=30)

In [None]:
# Training / validation perplexity per epoch; row 0 (the first epoch) is left out of the plot.
logs = pd.DataFrame(history.history)
print(logs.columns)
pylab.rcParams['figure.figsize'] = (13, 8)
logs.loc[1:,['PPL','val_PPL']].plot()
plt.show()

## Word feature analysis
The convolution and the highway network can be viewed as a word feature extractor. Let's establish some intuition by the following experiments.

In [None]:
def extractWordFeature(words, extcModel=modelConvWordFeatureAfterHighway):
    """
    Compute one feature vector per word with a character-level extractor model.

    words: List of word text. Words are lower-cased; words longer than
           opt.max_word_l - 2 characters are cut to that length.
    extcModel: Extractor model. It is fed inputs of shape
               (1, opt.seq_length, opt.max_word_l) holding character ids.

    returns: Pandas dataframe. Index is the word, corresponding to the feature vector.
    """
    wordLen = opt.max_word_l
    wordsPerPass = opt.seq_length

    # Cut, lower-case, centre with spaces, wrap in 'S'/'E', then map characters to ids.
    encoded = []
    for word in words:
        framed = 'S' + word[:wordLen - 2].lower().center(wordLen - 2) + 'E'
        encoded.append([chars_to_id[char] for char in framed])

    # Feed the words in groups of opt.seq_length; the last group is filled up with
    # all-padding rows whose outputs are discarded.
    features = []
    for first in range(0, len(encoded), wordsPerPass):
        group = encoded[first:first + wordsPerPass]
        validNum = len(group)
        group = group + [[chars_to_id[' ']] * wordLen] * (wordsPerPass - validNum)
        prediction = extcModel.predict(np.expand_dims(np.array(group), 0))
        features.append(prediction[0, :validNum, :])

    # return a dataframe
    features = pd.DataFrame(data=np.vstack(features), index=words)
    features.index.name = 'featVecs'
    features.columns.name = 'vecDims'

    return features

In [None]:
# Hand-picked probe words: inflections and misspellings of 'look', look-alikes,
# and unrelated words, to see which ones the character features place close together.
sampleWordList = ['look','looks','looked','looking','lok','looooooook',
                  'lk','loop','lock','locked','cook','see','observation',
                  'hear','run','reading','news','book','computer',
                  'programming','python','java','lisp','c#','matlab','jupyter']
# Every word of the vocabulary table, in the table's order.
vocWordList = list(word_id.index)
print(vocWordList[:5], '......', vocWordList[-4:])

In [None]:
from scipy.spatial.distance import cdist

def sortedWordsByDistance(queryWord, Words, metric='cosine', 
                          extcModel=modelConvWordFeatureBeforeHighway):
    """
    Rank Words by the distance of their feature vectors to that of queryWord.

    queryWord: Single query word
    Words: Words to compare
    metric: distance metric passed to scipy's cdist -- 'cosine', 'euclidean', 'correlation', ...
    extcModel: Extractor model used to compute the word features.

    returns: Pandas series indexed by word, holding the distances in ascending order.
    """
    queryFeat = extractWordFeature([queryWord], extcModel=extcModel)
    candidateFeats = extractWordFeature(Words, extcModel=extcModel)

    # cdist returns a (1, len(Words)) matrix; row 0 holds the distances to the query.
    distances = cdist(queryFeat, candidateFeats, metric=metric)[0]
    return pd.Series(distances, index=Words).sort_values(ascending=True)
    
    
# Sample words ordered from closest to farthest from 'look' (features taken before the highway layers).
sortedDis = sortedWordsByDistance('look', sampleWordList)
sortedDis

In [None]:
# Feature vectors of the sample words, taken after the highway layers.
smpFeatures = extractWordFeature(sampleWordList, extcModel=modelConvWordFeatureAfterHighway)
smpFeatures

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def featureVis(feautres, usePCA=True):
    """
    Plot word features in 2-D with t-SNE, optionally after a PCA reduction.

    feautres: A pandas dataframe. Index should be words, values should be word features.
    usePCA: If True, first reduce the features with PCA to about one seventh of
            their dimensionality (never more components than there are samples).
    """
    wordList = feautres.index
    featTransed = feautres.values
    nSamples, nFeatures = featTransed.shape
    if usePCA:
        # PCA cannot extract more components than min(n_samples, n_features).
        nComponents = max(1, min(nFeatures // 7, nSamples, nFeatures))
        featTransed = PCA(n_components=nComponents).fit_transform(featTransed)
    # t-SNE needs perplexity < n_samples; 30 is scikit-learn's default.
    perplexity = float(max(1, min(30, nSamples - 1)))
    featTransed = TSNE(n_components=2, perplexity=perplexity).fit_transform(featTransed)
    featTransed = pd.DataFrame(featTransed, index=wordList)
    featTransed.index.name = 'featVecs'
    featTransed.columns.name = 'vecDims'

    pylab.rcParams['figure.figsize'] = (13, 8)
    axes = featTransed.plot.scatter(x=0, y=1)
    # Label points row by row; a label-based lookup would return several rows
    # for a word that occurs more than once in the index.
    for txt, (xPos, yPos) in zip(wordList, featTransed.values):
        axes.annotate(txt, (xPos, yPos))
        
# 2-D map of the sample-word features.
featureVis(smpFeatures, usePCA=True)

In [None]:
# Features (after the highway layers) for every vocabulary word; show the first 1000 rows.
vocFeatures = extractWordFeature(vocWordList, extcModel=modelConvWordFeatureAfterHighway)
vocFeatures.head(1000)

In [None]:
%%time
# Visualise a random subset of 200 vocabulary words (a different subset on every run).
sample = vocFeatures.sample(200)
featureVis(sample, usePCA=True)

# Gated CNN model

[Language Modeling with Gated Convolutional Networks -- arxiv-1612.08083 -- Facebook AI Research](https://arxiv.org/abs/1612.08083)
![Gated CNN model](https://github.com/stikbuf/Language_Modeling/blob/master/Gated%20CNN.png?raw=true)

## Data generator

Same as RNN baseline.

In [None]:
import random

from tensorflow.python.keras.utils import to_categorical 

def gen_word_word(batch_size=128, dataset='train'):
    """Endless generator of shuffled word-level (X, Y) batches (same as the RNN baseline's).

    Each sample is ['<SS>'] + seq_len consecutive corpus words + ['<EE>'];
    X drops the last element, Y drops the first and is one-hot encoded.

    batch_size: number of windows per batch.
    dataset: 'train', 'valid' or 'test'.
    yields: (X, Y) with X of shape (batch_size, seq_len + 1) holding word ids and
            Y of shape (batch_size, seq_len + 1, voc_size).
    """
    assert dataset in ('train', 'valid', 'test'), 'Dataset must be train or valid or test.'

    tokens = {'train':train_data, 'valid':valid_data, 'test':test_data}[dataset]

    while True:
        # Reshuffle all window start positions at the beginning of every pass.
        order = list(range(len(tokens)-seq_len-1))
        random.shuffle(order)
        for first in range(0, len(order) - batch_size, batch_size):
            opening, closing = [word_to_id['<SS>']], [word_to_id['<EE>']]
            batch = np.array([opening + tokens[i:i+seq_len] + closing
                              for i in order[first:first+batch_size]])
            yield batch[:,:-1], to_categorical(batch[:,1:], num_classes=voc_size)

## Imports

The Keras version bundled with TensorFlow (`tf.keras`) does not support causal convolution, i.e. the `padding` parameter of `Conv1D` cannot be `'causal'` (see [WaveNet: A Generative Model for Raw Audio, section 2.1](https://arxiv.org/abs/1609.03499)), so we use the standalone Keras package instead.

![causal convolutional layers](https://github.com/stikbuf/Language_Modeling/blob/dev/figures/causal_convolution.png?raw=true)

I recommend reading section 2 of the paper.

In [None]:
import keras
keras.__version__

## Model

Model config
![model configuration](https://github.com/stikbuf/Language_Modeling/blob/dev/figures/gCNNConfig.png?raw=true)

In [None]:
# Each configuration is a tuple of chunks; a chunk is a list of blocks and a
# block is a list of (kernel_size, filters) units.

# embedding_size=128 in the paper
GCNN_13 = (
    [ [(4, 1268)] ]*1,
    [ [(4,1268), (4,1268)] ]*12,
)

GCNN_14B = (
    [ [(5, 512)] ]*1,
    [ [(1,128), (5,128), (1,512)] ]*3,
    [ [(1,512), (5,512), (1,1024)] ]*3,
    [ [(1,1024), (5,1024), (1,2048)] ]*6,
    [ [(1,1024), (5,1024), (1,4096)] ]*1,
)

GCNN_9 = (
    [ [(4, 807)] ]*1,
    [ [(4,807), (4,807)] ]*4,
)

GCNN_8B = (
    [ [(1, 512)] ]*1,
    [ [(1,128), (5,128), (1,512)] ]*3,
    [ [(1,256), (5,256), (1,512)] ]*3,
    [ [(1,1024), (1,1024), (1,2048)] ]*1,
)

# embedding_size=208 in the paper
GCNN_8 = (
    [ [(4, 900)] ]*1,
    [ [(4,900)] ]*7,
)

GCNN_14 = (
    [ [(6, 850)] ]*3,
    [ [(1,850)] ]*1,
    [ [(5,850)] ]*4,
    [ [(1,850)] ]*1,
    [ [(4,850)] ]*3,
    [ [(4,1024)] ]*1,
    [ [(4,2048)] ]*1,
)

# Configuration used to build the model below.
modelParam = GCNN_8B
modelParam

I call:  
"(4, 1268)" a unit  
"[(4, 1268), (4, 1268)]" a block  
"[(4, 1268), (4, 1268)] * 12" a chunk  
A model is made by chunks.

GLU layer:
$$h_l(X) = (\textbf{X} * \textbf{W} + \textbf{b}) \otimes \sigma(\textbf{X} * \textbf{V} + \textbf{c})$$
implemented in `gatedCNNUnit()`

In [None]:
from keras.layers import Conv1D, Multiply, Add, Input, Dense, Embedding
from keras.models import Model
from keras import backend as K

def gatedCNNUnit(kernel_size=3, filters=1024, input_shape=None):
    """Build one gated linear unit as a Keras model.

    Two causal 1-D convolutions with identical settings read the same input:
    one linear, one with a sigmoid activation acting as the gate. The output
    is their element-wise product.

    kernel_size: width of both convolutions.
    filters: number of output channels of both convolutions.
    input_shape: shape of one input sample (without the batch axis).
    """
    convArgs = dict(filters=filters, kernel_size=kernel_size, data_format='channels_last',
                    strides=1, padding='causal')
    inputs = Input(shape=input_shape)
    linear = Conv1D(**convArgs)(inputs)
    gate = Conv1D(activation='sigmoid', **convArgs)(inputs)
    return Model(inputs=inputs, outputs=Multiply()([linear, gate]))
    
    
def gatedCNNChunk(chunk, input_shape):
    """Build a chunk (a sequence of residual blocks of gated units) as a Keras model.

    chunk: list of blocks; each block is a list of (kernel_size, filters) units.
    input_shape: shape of one input sample (without the batch axis).
    """
    origin = Input(shape=input_shape)
    shortcut = origin
    for block in chunk:
        x = shortcut
        for kernel_size, filters in block:
            x = gatedCNNUnit(kernel_size=kernel_size, filters=filters, input_shape=K.int_shape(x)[1:])(x)
        outChannels = K.int_shape(x)[-1]
        if K.int_shape(shortcut)[-1] != outChannels: # see Eqn.(2).in arxiv-1512.03385
            # Project the shortcut so that its channel count matches the block output.
            shortcut = Dense(outChannels)(shortcut)
        x = Add()([shortcut, x])
        shortcut = x
    return Model(inputs=origin, outputs=x)

In [None]:
embedding_size = 128

# Word ids of any sequence length -> embedding -> one sub-model per chunk of
# modelParam -> per-position softmax over the vocabulary.
inputs = Input(shape=(None,))
x = Embedding(input_dim=voc_size,
              output_dim=embedding_size)(inputs)
for chunk in modelParam:
    # Each chunk is built for the per-sample shape of the tensor it receives.
    x = gatedCNNChunk(chunk, K.int_shape(x)[1:])(x)

x = Dense(voc_size, activation='softmax')(x)
gCNNModel = Model(inputs=inputs, outputs=x)
gCNNModel.summary()

In [None]:
# perplexity
def PPL(y_true, y_pred):
    """Perplexity metric: exponential of the mean categorical cross-entropy of the batch."""
    losses = tf.keras.backend.categorical_crossentropy(y_true, y_pred)
    average_loss = tf.reduce_mean(losses)
    return tf.exp(average_loss)

def ACC(y_true, y_pred):
    """Accuracy metric: mean over all positions of (arg-max of target == arg-max of prediction), taken along axis 2."""
    expected = tf.argmax(y_true, axis = 2)
    predicted = tf.argmax(y_pred, axis = 2)
    correct = tf.cast(tf.equal(expected, predicted), tf.float32)
    return tf.reduce_mean(correct)

In [None]:
from keras.optimizers import RMSprop

# RMSprop with an explicit learning rate; accuracy and perplexity are tracked next to the loss.
optimizer = RMSprop(lr=1e-4)
gCNNModel.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=[ACC, PPL])

In [None]:
from keras.callbacks import TensorBoard, ModelCheckpoint
import os
# Make sure the checkpoint directory exists (makedirs also creates missing parents).
if not os.path.exists(local_path + 'model/'):
    os.makedirs(local_path + 'model/')

# Dedicated file name: 'model/model.keras' is already used by the RNN baseline's
# checkpoint, which would otherwise be overwritten by this model.
path_model = local_path + 'model/modelGCNN.keras'
tensorboard = TensorBoard(log_dir='log')
# Save the model only when validation perplexity improves (lower is better).
# save_best_only takes a boolean; the string 'True' only worked because it is truthy.
checkpoint = ModelCheckpoint(filepath=path_model, verbose=1,
                             monitor='val_PPL', mode='min', save_best_only=True)

callback_lists=[tensorboard,checkpoint]

In [None]:
# Train the gated CNN. Training batches hold 16 windows; the validation
# generator uses its default batch size.
history = gCNNModel.fit_generator(generator=gen_word_word(batch_size=16), 
                           steps_per_epoch=100, epochs=30,
                           callbacks=callback_lists,
                           validation_data=gen_word_word(dataset='valid'),
                           validation_steps=15)

In [None]:
# Training / validation perplexity per epoch; row 0 (the first epoch) is left out of the plot.
logs = pd.DataFrame(history.history)
print(logs.columns)
pylab.rcParams['figure.figsize'] = (13, 8)
logs.loc[1:,['PPL','val_PPL']].plot()
plt.show()

In [None]:
def predict_seq(model, preSeq=None, genLen=seq_len, power=1):
    """ Generate genLen words one at a time, continuing an optional leading sequence.
        arg:
            model: Keras model used to predict. It is fed the word ids generated so far
                   (shape (1, current_length)) and its output at the last position is
                   used as the distribution of the next word.
            preSeq: sequence of word ids or None. The leading sequence; '<SS>' is
                    prepended automatically and the caller's object is not modified.
            genLen: int. Number of words to generate.
            power: float or np.inf. Probability power: the predicted probabilities are
                   raised to this power and renormalised before sampling.
                   If power is equal to np.inf, then an argmax will be used.
        returns:
            (ids, text): the full id sequence (including '<SS>' and the leading
            sequence) and the same sequence joined into a space-separated string.
    """
    start = [word_to_id['<SS>']]
    # list(...) so that tuples / arrays are concatenated rather than added element-wise.
    preSeq = start if preSeq is None else start + list(preSeq)

    for _ in range(genLen):
        inputSeq = np.array([preSeq])
        # The prediction for the next word sits at the last input position.
        prob = model.predict(inputSeq)[0, -1, :]
        if power==np.inf:
            pred = np.argmax(prob)
        else:
            # Renormalise in float64 so the probabilities sum to 1 as np.random.choice requires.
            prob = np.power(prob.astype(np.float64), power)
            prob = prob / np.sum(prob)
            pred = np.random.choice(len(prob), p=prob)
        preSeq.append(int(pred))

    return preSeq, ' '.join([id_to_word[wordId] for wordId in preSeq])

In [None]:
# Sample a sentence from the gated CNN model (power=1 samples from the unmodified distribution).
_, seq = predict_seq(gCNNModel, power=1)
seq

# Memory networks model

## Option

In [None]:
class OptionMemNet():
    """Hyper-parameters of the memory-network model."""
    def __init__(self):
        # Batch layout: number of sequences per batch and positions per sequence.
        self.batch_size, self.seq_length = 128, seq_len + 1
        # Regularisation and optimisation (attribute name kept as spelled).
        self.dropout, self.learing_rate = 0.5, 1e-5

optMemNet = OptionMemNet()

## Data generator

Unlike in the previous generators, the input X is one-hot (categorical) encoded here as well, not only the target Y.

In [None]:
import random

from tensorflow.python.keras.utils import to_categorical 

def gen_word_word(batch_size=128, dataset='train'):
    """Endless generator of shuffled (X, Y) batches with one-hot inputs and targets.

    Each sample is ['<SS>'] + seq_len consecutive corpus words + ['<EE>'];
    X drops the last element, Y drops the first. Here both X and Y are one-hot
    encoded over `voc_size` classes.

    batch_size: number of windows per batch.
    dataset: 'train', 'valid' or 'test'.
    yields: (X, Y), both of shape (batch_size, seq_len + 1, voc_size).
    """
    assert dataset in ('train', 'valid', 'test'), 'Dataset must be train or valid or test.'

    tokens = {'train':train_data, 'valid':valid_data, 'test':test_data}[dataset]

    while True:
        # Reshuffle all window start positions at the beginning of every pass.
        order = list(range(len(tokens)-seq_len-1))
        random.shuffle(order)
        for first in range(0, len(order) - batch_size, batch_size):
            opening, closing = [word_to_id['<SS>']], [word_to_id['<EE>']]
            batch = np.array([opening + tokens[i:i+seq_len] + closing
                              for i in order[first:first+batch_size]])
            oneHotInputs = to_categorical(batch[:,:-1], num_classes=voc_size)
            oneHotTargets = to_categorical(batch[:,1:], num_classes=voc_size)
            yield oneHotInputs, oneHotTargets

## tensorflow model

In [None]:
import tensorflow as tf

# Feed points of the graph. Both tensors have the fixed static shape
# (batch_size, seq_length, voc_size) taken from the MemNet options.
inputs = tf.placeholder(
    dtype=tf.float32,
    shape=[optMemNet.batch_size, optMemNet.seq_length, voc_size],
    name='input',
)
y_true = tf.placeholder(
    dtype=tf.float32,
    shape=[optMemNet.batch_size, optMemNet.seq_length, voc_size],
    name='tureWords',
)

def batch_matMul(X, Y):
    """Multiply every matrix of a 3-D batch by one shared 2-D matrix.

    X: tensor whose static shape unpacks into three dimensions (batch, rows, inner).
    Y: tensor whose static shape unpacks into two dimensions (inner, cols).

    X is flattened to 2-D, multiplied by Y and reshaped to (-1, rows, cols).
    The inner size used for the flattening is the one read from Y.
    """
    _, rows, _ = X.get_shape()
    inner, cols = Y.get_shape()

    flat = tf.reshape(X, [-1, inner])
    product = tf.matmul(flat, Y)
    return tf.reshape(product, [-1, rows, cols])

def MemoryLayer(name, extMemory, inMemorySize=128, fitOutW=None, updateHopH=False, RNNLike=True):
    """
    Keras-like factory for one memory hop.
    Returns a function that maps a query tensor (batch, seqLen, querySize) to the hop output.

    name: variable scope that holds this hop's own weights (embB, plus H / W when enabled).
    extMemory: external memory tensor, shape (batch, seqLen, extMemorySize).
    inMemorySize: width of the internal embeddings of the query and of the memory.
    fitOutW: int or None. A falsy value disables it. An int adds a final linear
             transformation W that projects the output to that many dimensions.
    updateHopH: boolean. Whether the embedded query u goes through a linear
                transformation H before it is added to the memory read-out.
                see Section 2.2 RNN-like
    RNNLike: boolean. If True, the memory embeddings embA / embC are created in the
             common 'MEMNET' scope (opened with AUTO_REUSE), so hops built this way
             share them; otherwise they are created under `name`.
    """
    # Unpacking into three names also requires extMemory to be rank 3.
    _, _, extMemorySize = extMemory.get_shape()
    print('shape of extMemory:', extMemory.get_shape())
    memoryScope = 'MEMNET' if RNNLike else name

    def xavierWeight(varName, rows, cols):
        # float32 weight matrix of shape (rows, cols) with Xavier initialisation.
        return tf.get_variable(varName, shape=(rows, cols), dtype=tf.float32,
                               initializer=tf.contrib.layers.xavier_initializer())

    def Layer(inputTensor):
        # Embed the query: (batch, seqLen, querySize) -> (batch, seqLen, inMemorySize).
        with tf.variable_scope(name):
            _, _, querySize = inputTensor.get_shape()
            print('shape of inputTensor:', inputTensor.get_shape())
            query = batch_matMul(inputTensor, xavierWeight('embB', querySize, inMemorySize))
            print('shape of u:', query.get_shape())

        with tf.variable_scope(memoryScope, reuse=tf.AUTO_REUSE):
            # Input-side memory embedding, shape (batch, seqLen, inMemorySize).
            keys = batch_matMul(extMemory, xavierWeight('embA', extMemorySize, inMemorySize))
            print('shape of M:', keys.get_shape())

            # Match scores are (batch, memory position, query position);
            # the softmax normalises over the memory positions (axis 1).
            scores = tf.matmul(keys, tf.transpose(query, perm=(0,2,1)))
            attention = tf.nn.softmax(scores, axis=1)
            print('shape of p:', attention.get_shape())

            # Output-side memory embedding, shape (batch, seqLen, inMemorySize).
            values = batch_matMul(extMemory, xavierWeight('embC', extMemorySize, inMemorySize))
            print('shape of C:', values.get_shape())

        with tf.variable_scope(name):
            # Attention-weighted sum of the output embeddings for every query position.
            output = tf.matmul(tf.transpose(attention, perm=(0,2,1)), values)
            if updateHopH:
                query = batch_matMul(query, tf.get_variable('H', shape=(inMemorySize, inMemorySize)))
            # Add the (possibly transformed) query, then apply ReLU: (batch, seqLen, inMemorySize).
            output = tf.nn.relu(tf.add(output, query))

            if fitOutW:
                output = batch_matMul(output, tf.get_variable('W', shape=(inMemorySize, fitOutW)))
            print('shape of O:', output.get_shape())
            return output

    return Layer

# Two stacked memory hops over the one-hot input sequence.
# Hop 1 is queried with a constant tensor of 0.1 shaped like `inputs` (see paper);
# hop 2 is queried with hop 1's output and projects to voc_size, so `x` ends up
# holding the per-position vocabulary logits.
constant_query = 0.1 * tf.ones(shape=inputs.get_shape())
x = MemoryLayer('Mem1', inputs, updateHopH=True)(constant_query)
x = MemoryLayer('Mem2', inputs, fitOutW=voc_size)(x)

In [None]:
# Softmax cross-entropy between the logits `x` and the one-hot targets `y_true`.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=x, labels=y_true)
# Scalar loss: mean of the cross-entropy over all of its elements.
loss = tf.reduce_mean(cross_entropy)
# Perplexity, computed as exp of the mean cross-entropy.
PPL = tf.exp(loss)

# NOTE(review): the learning rate is hard-coded to 1e-3 here, while OptionMemNet
# stores 1e-5 (attribute `learing_rate`), which this cell does not read -- confirm
# which value is intended.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)

In [None]:
session = tf.Session()
session.run(tf.global_variables_initializer())

# Create the batch generator once and keep pulling from it. Every new
# gen_word_word(...) builds and shuffles the full index list before its first
# batch, so re-creating it per step would repeat that work for a single batch.
train_batches = gen_word_word(batch_size=optMemNet.batch_size)

for i in range(630):
    x_batch, y_true_batch = next(train_batches)
    feed_dict_train = {inputs: x_batch, y_true: y_true_batch}
    # One optimisation step; the perplexity of the same batch is fetched for logging.
    _, trainPPL = session.run([optimizer, PPL], feed_dict=feed_dict_train)
    print('On iter {0} with training PPL: {1}'.format(i, trainPPL))