# cross-validation script for keras models

This notebook demonstrates grid-search cross-validation of a multi-input Keras model.

In [1]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# restrict GPU usage here - just for testing
# NOTE(review): this cell runs after keras was imported in the first cell.
# CUDA_VISIBLE_DEVICES is presumably only honoured if it is set before the
# backend initialises the GPU - confirm the restriction actually takes effect.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

## data construction

We will compare sentences from two different genres of the Brown corpus, as provided by NLTK.

The model accepts two pieces of text and tries to classify the pair as `same-source==1` or `different-source==0`.

In [3]:
from nltk.corpus import brown
# keep only sentences with more than 30 tokens from each genre
government, religion = (
    [sent for sent in brown.sents(categories=[genre]) if len(sent) > 30]
    for genre in ('government', 'religion')
)

In [4]:
# generate some samples
def generateData(a, b, samples=1000, seed=None):
    """Sample random pairs of items labelled by whether they share a source.

    For every sample, each side of the pair is drawn independently: first a
    source (``a`` or ``b``) is picked at random, then one item is picked
    uniformly from that source. The label is 1 when both sides were drawn
    from the same source and 0 otherwise.

    Args:
        a: first collection of items (must support ``len`` and indexing).
        b: second collection of items (must support ``len`` and indexing).
        samples: number of pairs to generate.
        seed: optional seed. When given, a private ``np.random.RandomState``
            is used so the result is reproducible; when ``None`` the global
            ``np.random`` state is used.

    Returns:
        A tuple ``(x1, x2, y)`` of three lists of length ``samples``: the
        left items, the right items and the 0/1 labels.

    Raises:
        ValueError: if ``samples`` is positive and ``a`` or ``b`` is empty.
    """
    # fail up front instead of only when an empty source happens to be drawn
    if samples > 0 and (len(a) == 0 or len(b) == 0):
        raise ValueError("generateData needs non-empty `a` and `b` to sample from")

    rng = np.random if seed is None else np.random.RandomState(seed)

    x1, x2, y = [], [], []
    for _ in range(samples):
        # decide whether to use the same or a different source for each side
        g = rng.randint(2)
        h = rng.randint(2)
        # reference the sources directly; no need to copy them on every draw
        left_src = a if g > 0 else b
        right_src = a if h > 0 else b
        x1.append(left_src[rng.randint(len(left_src))])
        x2.append(right_src[rng.randint(len(right_src))])
        y.append(1 if g == h else 0)
    return x1, x2, y

In [5]:
# draw 500 labelled pairs from the two genres
x1, x2, y = generateData(a=government, b=religion, samples=500)

In [6]:
# character-tokenize
import re
def charTokenize(txt):
    """Turn token lists into lists of normalised characters.

    Each element of ``txt`` is an iterable of string tokens. The tokens are
    joined with single spaces, lower-cased, and every character that is not
    a digit, a lowercase ASCII letter or whitespace is removed.

    Returns:
        A list with one list of single-character strings per input element.
    """
    return [
        list(re.sub(r'[^0-9a-z\s]', '', ' '.join(tokens).lower()))
        for tokens in txt
    ]

In [7]:
# Rebind both sides in one tuple assignment: if either call fails, neither
# name is changed, so the cell can be re-run without double-tokenizing x1.
x1, x2 = charTokenize(x1), charTokenize(x2)

In [8]:
# index and pad the data
class CharIndexer:
    """Convert character sequences to fixed-length index arrays and back.

    Index 0 is reserved for padding and is rendered as the symbol ``'O'``.
    The characters in ``chars`` receive the indices 1..len(chars) in order.
    """

    def __init__(self, max_len=64, chars='abcdefghijklmnopqrstuvwxyz0123456789 '):
        """Build the lookup tables.

        Args:
            max_len: length every sequence is padded or truncated to.
            chars: the known characters, in index order (starting at 1).
        """
        chars = list(chars)
        self.char2idx = {c: i + 1 for i, c in enumerate(chars)}
        self.char2idx['O'] = 0
        self.idx2char = {i + 1: c for i, c in enumerate(chars)}
        self.idx2char[0] = 'O'
        self.max_len = max_len

    def transform(self, lol):
        """Index, truncate and right-pad each character sequence.

        Args:
            lol: iterable of character sequences (a "list of lists").

        Returns:
            An integer array of shape ``(len(lol), max_len)``; an empty input
            yields an empty array of shape ``(0, max_len)``.

        Raises:
            KeyError: if a sequence contains a character not in the vocabulary.
        """
        seqs = []
        for l in lol:
            # index the whole sequence first so unknown characters are always reported
            s = [self.char2idx[c] for c in l][:self.max_len]
            s += [0] * (self.max_len - len(s))
            seqs.append(s)
        if not seqs:
            # np.array([]) would be 1-D float; keep the 2-D integer contract
            return np.empty((0, self.max_len), dtype=int)
        return np.array(seqs)

    def inverse_transform(self, lol, no_pad=True):
        """Map index sequences back to lists of characters.

        Args:
            lol: iterable of index sequences.
            no_pad: when true, drop every ``'O'`` padding symbol from the output.

        Returns:
            A list with one list of characters per input sequence.

        Raises:
            KeyError: if an index is not in the vocabulary.
        """
        txt = []
        for row in lol:
            decoded = [self.idx2char[i] for i in row]
            if no_pad:
                decoded = [c for c in decoded if c != 'O']
            txt.append(decoded)
        return txt

In [9]:
idxr = CharIndexer()
# Rebind both sides in one tuple assignment: if either call fails, neither
# name is changed, so x1 and x2 never end up in different representations.
x1, x2 = idxr.transform(x1), idxr.transform(x2)

In [10]:
x1.shape

(500, 64)

In [11]:
# round-trip the first 10 pairs through the indexer to eyeball data and labels
for i in range(10):
    left_txt = ''.join(idxr.inverse_transform([x1[i]])[0])
    right_txt = ''.join(idxr.inverse_transform([x2[i]])[0])
    print(y[i], ":", left_txt, '\t', right_txt)

1 : as another thanksgiving draws near  let us take time out from th 	 in the event that agreement is not reached on the use of the rup
1 : in fact  during the first century bc  an extensive literature sp 	 thus  america   the most widely sung of the patriotic songs  was
0 : the action of the commission in allowing or denying any claim un 	 how explicit such factors have been historically is evident in a
0 :  2  realtors realize  of course  that they are involved in an in 	 mr devey will be responsible for the commercial expansion of vec
0 : it is not unfair to add on the other side that the crude and alm 	 and so  let us remember on this day not only to thank the almigh
0 : conduct engineering research and technical development work to d 	 after all this destruction of old literature  it should be obvio
1 : we should not allow the image of an immanent end brought about i 	 all this emphasis on centrality and on the number 5 as a symboli
1 : rather  such assignments are made  as they m

In [12]:
# check distribution of y's and reshape
# reshape(-1, 1) is idempotent, so re-running this cell keeps y at (n, 1)
# instead of stacking on another axis each time
y = np.asarray(y).reshape(-1, 1)
print(int((y == 0).sum()), int((y == 1).sum()))
y.shape

250 250


(500, 1)

## define hyperparameters to search over

If we name a constant's key in UPPERCASE, it won't be shown in the `verbose` printout.

In [13]:
configs = {
    # lowercase keys: hyperparameters with several candidate values
    'embed_size' : [100, 200],
    'cell_size': [100, 200],
    'drop_rate': [0.25, 0.5],
    # UPPERCASE keys: single-valued constants
    'OPTIMIZER' : ['adam'],
    # derived from the indexer so the model's input length always matches the padded data
    'MAXLEN': [idxr.max_len],
    'VOCAB' : [len(idxr.char2idx)]
}

## create model function

In [14]:
# model testing
from keras.models import Sequential, Model
from keras.layers import Input, Bidirectional, Embedding, Dropout, LSTM, Concatenate, Dense, Activation

def getModel(config):
    """Build and compile the two-input pair classifier.

    Both inputs pass through the same embedding layer and the same two
    stacked bidirectional LSTMs (with dropout before each LSTM). The two
    resulting vectors are concatenated and fed through two dense layers
    and a sigmoid.

    Args:
        config: mapping with the keys 'MAXLEN', 'VOCAB', 'embed_size',
            'cell_size', 'drop_rate' and 'OPTIMIZER'.

    Returns:
        The compiled model, using binary cross-entropy loss and the "acc" metric.
    """
    seq_len = config['MAXLEN']

    left_in = Input((seq_len,))
    right_in = Input((seq_len,))

    # layers created once and applied to both inputs, so their weights are shared
    char_embedding = Embedding(config['VOCAB'], config['embed_size'], input_length=seq_len)
    seq_rnn = Bidirectional(LSTM(config['cell_size'], return_sequences=True))
    vec_rnn = Bidirectional(LSTM(config['cell_size']))

    def encode(tokens):
        # embedding -> dropout -> sequence LSTM -> dropout -> final LSTM
        hidden = char_embedding(tokens)
        hidden = Dropout(config['drop_rate'])(hidden)
        hidden = seq_rnn(hidden)
        hidden = Dropout(config['drop_rate'])(hidden)
        return vec_rnn(hidden)

    left_vec = encode(left_in)
    right_vec = encode(right_in)

    # classifier head
    # NOTE(review): Dense(200) is given no activation, so the two Dense layers
    # appear to compose into a single linear map before the sigmoid - confirm
    # whether a non-linearity was intended here.
    merged = Concatenate(axis=1)([left_vec, right_vec])
    merged = Dense(200)(merged)
    logit = Dense(1)(merged)
    prob = Activation("sigmoid")(logit)

    model = Model(inputs=[left_in, right_in], outputs=prob)
    model.compile(optimizer=config['OPTIMIZER'], loss='binary_crossentropy', metrics=["acc"])
    return model

## define a custom evaluation function

Here we will use the F1 score, just for demonstration.

In [15]:
# custom score
from sklearn.metrics import f1_score

def evalF1(model, x_val, y_val, **kwargs):
    """Score a model on validation data with the F1 metric.

    The output of ``model.predict(x_val)`` is rounded with ``np.round`` and
    compared against ``y_val`` using ``f1_score``. Extra keyword arguments
    are accepted and ignored.
    """
    hard_preds = np.round(model.predict(x_val))
    return f1_score(y_val, hard_preds)

## instantiate a KerasGridSearchCV object and call fit to grid-search

In [16]:
from keras_extras.model_selection import KerasGridSearchCV

In [17]:
# grid search over `configs`, building models with getModel and scoring them with evalF1
# NOTE(review): k=3 presumably sets the number of folds (the log below shows 3 folds
# per configuration) - confirm against the keras_extras documentation.
mycv = KerasGridSearchCV(
    getModel,
    configs,
    eval_model=evalF1,
    epochs=1,
    k=3,
    verbose=False,
    k_verbose=0,
)

In [18]:
# run the search on the paired inputs; `best` is displayed in the next cell
best, trials, scores = mycv.fit(
    [x1, x2],
    y,
    verbose=True,
)



testing 1 :
embed_size :	 100
cell_size :	 100
drop_rate :	 0.25


training 1 th fold...
evaluating 1 th fold...


  'precision', 'predicted', average, warn_for)


evaluation score : 0.0

training 2 th fold...
evaluating 2 th fold...
evaluation score : 0.5680473372781065

training 3 th fold...
evaluating 3 th fold...
evaluation score : 0.0

final avg score : 0.1893491124260355

new best model with avg score 0.1893491124260355


testing 2 :
embed_size :	 100
cell_size :	 100
drop_rate :	 0.5


training 1 th fold...
evaluating 1 th fold...
evaluation score : 0.0

training 2 th fold...
evaluating 2 th fold...
evaluation score : 0.34615384615384615

training 3 th fold...
evaluating 3 th fold...
evaluation score : 0.5263157894736842

final avg score : 0.2908232118758434

new best model with avg score 0.2908232118758434


testing 3 :
embed_size :	 100
cell_size :	 200
drop_rate :	 0.25


training 1 th fold...
evaluating 1 th fold...
evaluation score : 0.08163265306122451

training 2 th fold...
evaluating 2 th fold...
evaluation score : 0.4675324675324675

training 3 th fold...
evaluating 3 th fold...
evaluation score : 0.6557377049180327

final avg sco

In [19]:
best

{'MAXLEN': 64,
 'OPTIMIZER': 'adam',
 'VOCAB': 38,
 'cell_size': 200,
 'drop_rate': 0.25,
 'embed_size': 200}