#### Imports

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Input, Dense, Concatenate, BatchNormalization, Dropout
from tensorflow.keras.utils.np_utils import to_categorical
from tensorflow.keras.models import Model
from keras.utils import plot_model
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from PIL import Image

#### Variables

In [15]:
# --- Text / vocabulary settings ---
MAX_NO_WORDS = 200000      # upper bound handed to the Tokenizer as `num_words`
MAX_SEQUENCE_LEN = 30      # every question is padded/truncated to this many tokens
VOCAB_SIZE = 0  # will be updated after tokenizing
EMBEDDINGS_DIM = 300       # width of each word vector (Embedding `output_dim`)

# --- File paths ---
GLOVE_EMBEDDINGS_FILEPATH = ''  # TODO: point this at the pre-trained embeddings text file
MODEL_FILEPATH = 'dl_model.h5'
MODEL_ARCHITECTURE_FILEPATH = 'model_architecture.png'
# Template filled in by ModelCheckpoint with the epoch number and validation loss.
MODEL_CHECKPOINT_FILEPATH = 'lstm_model_three_.{epoch:02d}-{val_loss:.6f}.h5'

# --- Model / training hyper-parameters ---
VALIDATION_SPLIT = 0.3     # fraction of samples held out for validation/testing
RATE_DROP_LSTM = 0.17      # NOTE(review): not referenced by the cells visible here
RATE_DROP_DENSE = 0.25     # dropout rate applied around the dense layers
NUMBER_DENSE_UNITS = 64
ACTIVATION_FUNCTION = 'relu'
LEARNING_RATE_REDUCTION_FACTOR = 0.9   # ReduceLROnPlateau `factor`

# `patience` is a number of epochs, so it has to be a whole number. The values
# were previously 0.8 and 0.2; Keras compares a whole-epoch counter against
# patience, so both behaved like 1, which is now written explicitly.
# NOTE: despite its (misspelled) name, the first constant is consumed as the
# EarlyStopping patience; the name is kept so existing references keep working.
MIN_EPCOHS_NO_IMPROVEMENT_BEFORE_SAVING_CHECKPOINT = 1
MIN_EPOCHS_NO_IMPROVEMENT_BEFORE_REDUCING_LR = 1
MINIMUM_LR = 0.000001      # ReduceLROnPlateau `min_lr`

#### Functions

In [None]:
def read_embeddings_in_dict(filepath=None):
    """Load pre-trained word vectors from a whitespace-separated text file.

    Every non-empty line is expected to contain a token followed by the
    components of its vector.

    Args:
        filepath: Path of the embeddings file. When omitted, the module-level
            ``GLOVE_EMBEDDINGS_FILEPATH`` is used, so existing zero-argument
            calls behave as before.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` numpy array.

    Raises:
        OSError: if the file cannot be opened.
    """
    if filepath is None:
        filepath = GLOVE_EMBEDDINGS_FILEPATH

    embeddings = {}
    # `with` closes the handle even if a line fails to parse; the explicit
    # encoding avoids depending on the platform's default codec.
    with open(filepath, encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if not parts:
                continue  # blank line
            try:
                # Compact float32 arrays instead of lists of Python strings:
                # far smaller in memory and directly usable as matrix rows.
                vector = np.asarray(parts[1:], dtype='float32')
            except ValueError:
                # A component is not numeric (e.g. the token itself contained
                # whitespace), so the line cannot be parsed reliably; skip it.
                continue
            embeddings[parts[0]] = vector
    return embeddings

def filter_embeddings(tokenizer, loaded_embeddings: dict, vocab_size=None, embeddings_dim=None):
    """Build the embedding matrix for the words in the tokenizer's vocabulary.

    Row ``i`` holds the pre-trained vector of the word whose index in
    ``tokenizer.word_index`` is ``i``. Rows for words that are missing from
    ``loaded_embeddings`` stay all-zero.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping of word -> int index.
        loaded_embeddings: Mapping of word -> vector of length ``embeddings_dim``.
        vocab_size: Number of words in the vocabulary; defaults to the
            module-level ``VOCAB_SIZE``.
        embeddings_dim: Vector width; defaults to the module-level
            ``EMBEDDINGS_DIM``.

    Returns:
        ``float32`` numpy array of shape ``(vocab_size + 1, embeddings_dim)``.
    """
    if vocab_size is None:
        vocab_size = VOCAB_SIZE
    if embeddings_dim is None:
        embeddings_dim = EMBEDDINGS_DIM

    # np.zeros allocates the (rows, cols) matrix. np.array((rows, cols)) would
    # only wrap the two numbers in a length-2 vector. The extra row is for
    # index 0, which the Keras Tokenizer never assigns to a word.
    filtered_embeddings = np.zeros((vocab_size + 1, embeddings_dim), dtype='float32')
    for word, i in tokenizer.word_index.items():
        word_embedding = loaded_embeddings.get(word)
        if word_embedding is not None:
            filtered_embeddings[i] = word_embedding
    return filtered_embeddings

def create_length_features(questions_1: list, questions_2: list):
    """Derive three count features for every question pair.

    Both arguments are lists of token lists, paired up position by position.
    For each pair the features are: number of distinct tokens in the first
    question, number of distinct tokens in the second question, and number of
    distinct tokens the two questions share.

    Returns:
        ``float32`` numpy array with one ``[unique_1, unique_2, common]`` row per pair.
    """
    rows = []
    for first, second in zip(questions_1, questions_2):
        unique_first = set(first)
        unique_second = set(second)
        shared = unique_first & unique_second
        rows.append([len(unique_first), len(unique_second), len(shared)])
    return np.array(rows, dtype='float32')


#### Data Preprocessing

In [None]:

df = pd.read_csv('data/train.csv')

# pandas reads empty CSV fields as NaN (a float), which is not text and would
# break tokenization, so both question columns are coerced to plain strings.
question1_texts = df['question1'].fillna('').astype(str)
question2_texts = df['question2'].fillna('').astype(str)
questions = list(question1_texts) + list(question2_texts)

# Fit one tokenizer on both columns so the two questions share a vocabulary.
tokenizer = Tokenizer(num_words=MAX_NO_WORDS, filters='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(questions)
VOCAB_SIZE = len(tokenizer.word_counts) # unique words

# Convert each question to a sequence of word indices ...
sequences_1 = tokenizer.texts_to_sequences(question1_texts)
sequences_2 = tokenizer.texts_to_sequences(question2_texts)

# ... and pad/truncate every sequence to exactly MAX_SEQUENCE_LEN entries.
sequences_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LEN)
sequences_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LEN)

#### Preprocessing & Feature Engineering

In [None]:

# One-hot encode the binary duplicate label (two columns, one per class).
target_col_categorized = to_categorical(df['is_duplicate'], num_classes=2)

# creating embeddings: load the pre-trained vectors, then keep only the rows
# needed for our vocabulary. The raw dictionary gets its own name so that
# `embeddings` always refers to the final matrix.
loaded_embeddings = read_embeddings_in_dict()
print(f"Embeddings vocabulary size: {len(loaded_embeddings)}. Our data vocabulary size: {VOCAB_SIZE}")
embeddings = filter_embeddings(tokenizer, loaded_embeddings)
print(f"Filtered embeddings vocabulary size: {len(embeddings)}.")
del loaded_embeddings  # the full dictionary is large and no longer needed

# developing length features
# The sequences are zero-padded at this point. Index 0 is padding, not a word,
# so it is removed first; otherwise it would be counted as a unique token and
# as a token "shared" by almost every pair.
length_feats = create_length_features(
    [sequence[sequence != 0] for sequence in sequences_1],
    [sequence[sequence != 0] for sequence in sequences_2],
)

# data splitting: hold out VALIDATION_SPLIT of the samples, with a fixed seed
# so the split is reproducible.
sequences_1train, sequences_1test, sequences_2train, sequences_2test, length_feats_train, length_feats_test, target_train, target_test = train_test_split(
    sequences_1,
    sequences_2,
    length_feats,
    target_col_categorized,
    test_size=VALIDATION_SPLIT,
    random_state=20
)

#### NN Architecture Setup

In [None]:
# NN architecture setup: one embedding + LSTM encoder shared by both questions,
# plus a small dense branch for the hand-made length features.

# creating embeddings layer (initialised from the pre-trained matrix and frozen)
embedding_layer = Embedding(input_dim=VOCAB_SIZE+1, output_dim=EMBEDDINGS_DIM, input_length=MAX_SEQUENCE_LEN, weights=[embeddings], trainable=False)

# creating lstm layer
# return_sequences is left at its default (False) so that each question is
# encoded as a single fixed-size vector. Emitting the full sequence would give
# a rank-3 tensor, which cannot be concatenated with the rank-2 output of the
# dense length-feature branch.
# `dropout` is applied to the layer inputs, `recurrent_dropout` to the
# recurrent state.
lstm_layer = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))

# encoding the 1st sentence
sequence_1_input = Input(shape=(MAX_SEQUENCE_LEN,), dtype='int32')
embeddings_sequence_1 = embedding_layer(sequence_1_input)
lstm_for_sequence_1 = lstm_layer(embeddings_sequence_1)

# encoding the 2nd sentence (same layers, hence shared weights)
sequence_2_input = Input(shape=(MAX_SEQUENCE_LEN,), dtype='int32')
embeddings_sequence_2 = embedding_layer(sequence_2_input)
lstm_for_sequence_2 = lstm_layer(embeddings_sequence_2)

# creating the length-features input and its dense projection
length_feats_input = Input(shape=(length_feats_train.shape[1],))
length_feats_dense = Dense(NUMBER_DENSE_UNITS // 2, activation=ACTIVATION_FUNCTION)(length_feats_input)

#### Building the model

In [None]:

# Merge the two question encodings with the dense length-feature branch.
# Concatenate is a layer: it has to be instantiated first and then called on
# the list of tensors. Passing the list to the constructor only builds a layer
# object and never joins anything.
# NOTE: Concatenate needs all inputs to agree in shape except on the concat
# axis, so every branch must be a (batch, features) tensor.
merged_layers = Concatenate()([lstm_for_sequence_1, lstm_for_sequence_2, length_feats_dense])
merged_layers = BatchNormalization()(merged_layers)
merged_layers = Dropout(RATE_DROP_DENSE)(merged_layers)
merged_layers = Dense(NUMBER_DENSE_UNITS, activation=ACTIVATION_FUNCTION)(merged_layers)
merged_layers = BatchNormalization()(merged_layers)
merged_layers = Dropout(RATE_DROP_DENSE)(merged_layers)
# Two output units, one per one-hot target column.
# NOTE(review): the targets are one-hot over two mutually exclusive classes, so
# softmax + categorical_crossentropy may be a better fit; left unchanged here.
output_layer = Dense(2, activation='sigmoid')(merged_layers)

model = Model(inputs=[sequence_1_input, sequence_2_input, length_feats_input], outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

In [None]:
# Print a textual, layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Render the model graph to an image file, then open that file so the picture
# is shown as this cell's output.
plot_model(model, to_file=MODEL_ARCHITECTURE_FILEPATH)
Image.open(MODEL_ARCHITECTURE_FILEPATH)

#### Early Stopper and Checkpoint saver

In [None]:

# Stops training once the monitored quantity has not improved for `patience`
# epochs (EarlyStopping's default monitor is used).
earlystopper = EarlyStopping(
    verbose=1,
    patience=MIN_EPCOHS_NO_IMPROVEMENT_BEFORE_SAVING_CHECKPOINT,
)

# Writes weights-only checkpoints, and only when the monitored quantity
# improves; the file name embeds the epoch number and validation loss.
checkpointer = ModelCheckpoint(
    filepath=MODEL_CHECKPOINT_FILEPATH,
    verbose=1,
    save_weights_only=True,
    save_best_only=True,
)

# Multiplies the learning rate by `factor` when val_loss plateaus, never going
# below `min_lr`.
lr_reducer = ReduceLROnPlateau(
    monitor='val_loss',
    factor=LEARNING_RATE_REDUCTION_FACTOR,
    patience=MIN_EPOCHS_NO_IMPROVEMENT_BEFORE_REDUCING_LR,
    min_lr=MINIMUM_LR,
    verbose=1,
)


#### Training Model

In [None]:
# Train on the training split and evaluate on the held-out split after every
# epoch. The early stopper, checkpointer and LR reducer only take effect when
# passed via `callbacks`; without them all 200 epochs would always run.
history = model.fit(
    [sequences_1train, sequences_2train, length_feats_train],
    target_train,
    validation_data=([sequences_1test, sequences_2test, length_feats_test], target_test),
    callbacks=[earlystopper, checkpointer, lr_reducer],
    epochs=200,
    batch_size=1024,
    verbose=1,
)

#### Saving Model

In [None]:
# Persist the trained model (as it stands after the last epoch run) to MODEL_FILEPATH.
model.save(MODEL_FILEPATH)

### Results

#### Loss and Accuracy

In [None]:
# Per-epoch metric curves recorded by Keras during training.
train_loss = history.history['loss']
val_loss = history.history['val_loss']
train_acc = history.history['acc']
val_acc = history.history['val_acc']

# Adjacent f-strings are joined by the parser, so each line of output ends
# right at its "\n". Backslash continuations inside one string would embed the
# next source line's indentation as trailing spaces in the printed text.
print(
    f"Training loss: {train_loss}\n"
    f"Test loss: {val_loss}\n"
    f"Train accuracy: {train_acc}\n"
    f"Test accuracy: {val_acc}"
)

#### Visuals

In [None]:
# Training vs. held-out loss per epoch.
plt.plot(train_loss, color='red', label='Train loss')
plt.plot(val_loss, color='blue', label='Test loss')
plt.title("Loss visualization")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()  # without this the `label=` values are never drawn
plt.savefig('Loss-visualization.png')  # save before show(), while the figure is still current
plt.show()

In [None]:
# Training vs. held-out accuracy per epoch (`plt` comes from the imports cell).
plt.plot(train_acc, color='red', label='Train accuracy')
plt.plot(val_acc, color='blue', label='Test accuracy')
plt.title("Accuracy visualization")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()  # without this the `label=` values are never drawn
plt.savefig('Accuracy-visualization.png')  # save before show(), while the figure is still current
plt.show()