In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import json

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.activations import relu, sigmoid
from tensorflow.keras.layers import Dense, Dropout, Layer, Embedding, MaxPool1D, Conv1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
# Locations of the preprocessed inputs and of the training artifacts.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_Q1_DATA = 'q1_train.npy'
TRAIN_Q2_DATA = 'q2_train.npy'
TRAIN_LABEL_DATA = 'label_train.npy'
DATA_CONFIGS = 'data_configs.npy'

# Hand np.load the path instead of an inline open(...): numpy then owns the
# file handle and releases it, so no descriptor is left dangling in the kernel.
q1_data = np.load(DATA_IN_PATH + TRAIN_Q1_DATA)
q2_data = np.load(DATA_IN_PATH + TRAIN_Q2_DATA)
labels = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA)

# Despite its .npy extension, this file is parsed as JSON text.
# The context manager guarantees the handle is closed after parsing.
with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as config_file:
    prepro_configs = json.load(config_file)

In [3]:
class SentenceEmbedding(Layer):
    """Turn a sequence of word vectors into one vector per example.

    The layer applies, in order, a 1-D convolution, a max-pooling step and a
    dense projection, and finally removes axis 1 from the result.

    Keyword arguments read from ``kargs``:
        conv_num_filters: number of convolution filters.
        conv_window_size: kernel size of the convolution.
        max_pool_seq_len: pool size of the max-pooling layer (stride is 1).
        sent_embedding_dimension: number of units of the dense projection.
    """

    def __init__(self, **kargs):
        # Only the keys listed in the class docstring are used; nothing from
        # kargs is forwarded to Layer.__init__.
        super().__init__()

        self.conv = Conv1D(
            filters=kargs['conv_num_filters'],
            kernel_size=kargs['conv_window_size'],
            padding='same',
            activation=relu,
        )
        self.max_pool = MaxPool1D(pool_size=kargs['max_pool_seq_len'], strides=1)
        self.dense = Dense(units=kargs['sent_embedding_dimension'], activation=relu)

    def call(self, x):
        """Run conv -> max-pool -> dense on ``x`` and drop axis 1.

        ``x`` is presumably (batch, seq_len, embedding_dim) with
        seq_len == max_pool_seq_len, so that pooling leaves a single step on
        axis 1 -- TODO confirm against the caller.
        """
        convolved = self.conv(x)
        pooled = self.max_pool(convolved)
        projected = self.dense(pooled)

        # tf.squeeze with an explicit axis only succeeds if that axis has size 1.
        return tf.squeeze(projected, axis=1)

In [4]:
class SentenceSimilarityModel(tf.keras.Model):
    """Score a pair of token sequences with a single sigmoid output unit.

    Both inputs share one word-embedding layer and one dropout layer, but each
    is encoded by its own ``SentenceEmbedding`` instance. The two encodings are
    concatenated on the last axis and passed through a hidden dense layer,
    dropout, and a one-unit sigmoid layer.

    Keyword arguments read from ``kargs`` by this class:
        model_name: name given to the Keras model.
        vocab_size: input dimension of the embedding layer.
        word_embedding_dimension: output dimension of the embedding layer.
        hidden_dimension: number of units of the hidden dense layer.
        dropout_rate: rate of the shared dropout layer.
    The full ``kargs`` dict is also forwarded to both sentence encoders.
    """

    def __init__(self, **kargs):
        super().__init__(name=kargs['model_name'])

        self.word_embedding = Embedding(
            input_dim=kargs['vocab_size'],
            output_dim=kargs['word_embedding_dimension'],
        )
        self.base_encoder = SentenceEmbedding(**kargs)
        self.hypo_encoder = SentenceEmbedding(**kargs)
        self.dense = Dense(units=kargs['hidden_dimension'], activation=relu)
        # Named "logit" but it applies a sigmoid, so it emits values in (0, 1).
        self.logit = Dense(units=1, activation=sigmoid)
        self.dropout = Dropout(rate=kargs['dropout_rate'])

    def call(self, x):
        """Return the sigmoid output for the pair ``x = (first, second)``."""
        first_tokens, second_tokens = x

        # Embed both inputs first, then apply dropout to each, in that order.
        base_vectors = self.word_embedding(first_tokens)
        hypo_vectors = self.word_embedding(second_tokens)
        base_vectors = self.dropout(base_vectors)
        hypo_vectors = self.dropout(hypo_vectors)

        # Each side goes through its own encoder (weights are not shared).
        base_sentence = self.base_encoder(base_vectors)
        hypo_sentence = self.hypo_encoder(hypo_vectors)

        merged = tf.concat([base_sentence, hypo_sentence], axis=-1)
        hidden = self.dense(merged)
        hidden = self.dropout(hidden)

        return self.logit(hidden)

In [5]:
# Training-run settings.
model_name = 'cnn_similarity'
BATCH_SIZE = 1024
NUM_EPOCHS = 100
VALID_SPLIT = 0.1
MAX_LEN = 31

# Hyperparameters handed to the model constructor as keyword arguments.
kargs = dict(
    model_name=model_name,
    vocab_size=prepro_configs['vocab_size'],
    word_embedding_dimension=100,
    conv_num_filters=300,
    conv_window_size=3,
    max_pool_seq_len=MAX_LEN,  # pool size is tied to MAX_LEN
    sent_embedding_dimension=128,
    dropout_rate=0.2,
    hidden_dimension=200,
    output_dimension=1,
)

In [6]:
# Build the model from the hyperparameter dict and configure training.
model = SentenceSimilarityModel(**kargs)

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss=BinaryCrossentropy(),
    # compile() documents `metrics` as a list; a bare metric object is not
    # guaranteed to be accepted across Keras versions, so wrap it.
    # The metric is named 'accuracy', so its validation counterpart is
    # reported as 'val_accuracy'.
    metrics=[BinaryAccuracy(name='accuracy')],
)

In [7]:
# Stop training once val_accuracy has gone one epoch without improving by
# at least min_delta.
earlystop_callback = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.0001,
    patience=1,
)

checkpoint_path = DATA_OUT_PATH + model_name + '/weights.h5'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Make sure the checkpoint folder is present and report which case applied.
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
else:
    print("{} -- Folder already exists \n".format(checkpoint_dir))

# Write weights only, and only when val_accuracy beats the best value so far.
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

./data_out/cnn_similarity -- Folder already exists 



In [9]:
# Train on the question pairs; the validation set is carved out of the
# training arrays via validation_split.
history = model.fit(
    x=(q1_data, q2_data),
    y=labels,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_split=VALID_SPLIT,
    callbacks=[earlystop_callback, cp_callback],
)


Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.71929, saving model to ./data_out/cnn_similarity\weights.h5
Epoch 2/100
Epoch 2: val_accuracy improved from 0.71929 to 0.74817, saving model to ./data_out/cnn_similarity\weights.h5
Epoch 3/100
Epoch 3: val_accuracy improved from 0.74817 to 0.78274, saving model to ./data_out/cnn_similarity\weights.h5
Epoch 4/100
Epoch 4: val_accuracy improved from 0.78274 to 0.84032, saving model to ./data_out/cnn_similarity\weights.h5
Epoch 5/100
Epoch 5: val_accuracy did not improve from 0.84032
