In [1]:
import re
import pandas as pd
import numpy as np

from scipy import spatial

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import tensorflow as tf
# tf.config.run_functions_eagerly(False)
from tensorflow import keras
from tensorflow.keras import models, layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy

from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# !curl -LO https://raw.githubusercontent.com/MohamadMerchant/SNLI/master/data.tar.gz
# !tar -xvzf data.tar.gz

In [3]:
df_train = pd.read_csv('data/SNLI/train.csv')
df_dev = pd.read_csv('data/SNLI/dev.csv')
df_test = pd.read_csv('data/SNLI/test.csv')

In [4]:
df_test.head()

Unnamed: 0,similarity,sentence1,sentence2
0,neutral,This church choir sings to the masses as they ...,The church has cracks in the ceiling.
1,entailment,This church choir sings to the masses as they ...,The church is filled with song.
2,contradiction,This church choir sings to the masses as they ...,A choir singing at a baseball game.
3,neutral,"A woman with a green headscarf, blue shirt and...",The woman is young.
4,entailment,"A woman with a green headscarf, blue shirt and...",The woman is very happy.


In [5]:
df_train.head()

Unnamed: 0,similarity,sentence1,sentence2
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."
3,neutral,Children smiling and waving at camera,They are smiling at their parents
4,entailment,Children smiling and waving at camera,There are children present


In [6]:
similarity_map = {
    'neutral': np.nan,
    'contradiction': 0,
    'entailment': 1,
    '-': np.nan
}

In [7]:
df_train['similarity'] = df_train['similarity'].apply(lambda column: similarity_map[column])
df_dev['similarity'] = df_dev['similarity'].apply(lambda column: similarity_map[column])
df_test['similarity'] = df_test['similarity'].apply(lambda column: similarity_map[column])

In [8]:
df_train.dropna(axis=0, inplace=True)
df_dev.dropna(axis=0, inplace=True)
df_test.dropna(axis=0, inplace=True)

In [9]:
df_dev.head()

Unnamed: 0,similarity,sentence1,sentence2
1,1.0,Two women are embracing while holding to go pa...,Two woman are holding packages.
2,0.0,Two women are embracing while holding to go pa...,The men are fighting outside a deli.
3,1.0,"Two young children in blue jerseys, one with t...",Two kids in numbered jerseys wash their hands.
5,0.0,"Two young children in blue jerseys, one with t...",Two kids in jackets walk to school.
6,0.0,A man selling donuts to a customer during a wo...,A woman drinks her coffee in a small cafe.


In [10]:
df_train.similarity.value_counts()

1.0    183414
0.0    183185
Name: similarity, dtype: int64

In [11]:
df_dev.similarity.value_counts()

1.0    3329
0.0    3278
Name: similarity, dtype: int64

In [12]:
df_test.similarity.value_counts()

1.0    3368
0.0    3237
Name: similarity, dtype: int64

In [13]:
def clean_word(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'[^a-zA-Z ]', '', sentence)
    return sentence

In [14]:
df_train['sentence1'] = df_train['sentence1'].apply(clean_word)
df_train['sentence2'] = df_train['sentence2'].apply(clean_word)

df_dev['sentence1'] = df_dev['sentence1'].apply(clean_word)
df_dev['sentence2'] = df_dev['sentence2'].apply(clean_word)

df_test['sentence1'] = df_test['sentence1'].apply(clean_word)
df_test['sentence2'] = df_test['sentence2'].apply(clean_word)

In [15]:
df_dev.head()

Unnamed: 0,similarity,sentence1,sentence2
1,1.0,two women are embracing while holding to go pa...,two woman are holding packages
2,0.0,two women are embracing while holding to go pa...,the men are fighting outside a deli
3,1.0,two young children in blue jerseys one with th...,two kids in numbered jerseys wash their hands
5,0.0,two young children in blue jerseys one with th...,two kids in jackets walk to school
6,0.0,a man selling donuts to a customer during a wo...,a woman drinks her coffee in a small cafe


In [16]:
# df_train = df_train[:10000]

In [17]:
def create_dataset(df):
    unq_anchors = df['sentence1'].unique()
    anchors = []
    positives = []
    negatives = []
    for a in unq_anchors:
        pos = df.loc[(df['sentence1'] == a) & (df['similarity'] == 1)]
        neg = df.loc[(df['sentence1'] == a) & (df['similarity'] == 0)]
        positive_values = pos['sentence2'].values
        negative_values = neg['sentence2'].values
        if len(pos) <= len(neg):
            for idx in range(len(pos)):
                anchors.append(a)
                positives.append(positive_values[idx])
                negatives.append(negative_values[idx])
        elif len(pos) > len(neg):
            extra_negs = df.loc[(df['sentence1'] != a)]
            extra_neg_values = extra_negs['sentence2'].values
            negative_values = list(negative_values) + list(extra_neg_values[:(len(pos)-len(neg))+1])
            for idx in range(len(pos)):
                anchors.append(a)
                positives.append(positive_values[idx])
                negatives.append(negative_values[idx])
    return anchors, positives, negatives

In [None]:
# train_anchors, train_positives, train_negatives = create_dataset(df_train[:10000])
# dev_anchors, dev_positives, dev_negatives = create_dataset(df_dev)

In [18]:
def tokenize(sentences):
    tokens = []
    for sentence in sentences:
        tokens.append(word_tokenize(sentence))
    return tokens

def stem_word(sentences):
    tokens = []
    stemmer = PorterStemmer()
    for sentence in sentences:
        sentence_tokens = []
        for word in sentence:
            sentence_tokens.append(stemmer.stem(word))
        tokens.append(sentence_tokens)

    return tokens

def remove_stopwords(sentences):
    tokens = []
    stop = stopwords.words('english')
    for sentence in sentences:
        new_sen = []
        for word in sentence:
            if word not in stop:
                new_sen.append(word)
        tokens.append(new_sen)
    return tokens

def flatten(item_list):
    return [item for sublist in item_list for item in sublist]

def create_vocabulary(anchor, positive, negative):
    anchor = set(flatten(anchor))
    positive = set(flatten(positive))
    negative = set(flatten(negative))
    vocab = anchor.union(positive).union(negative)
    return sorted(list(vocab))

def create_mappings(vocab):
    word2idx = {word:idx+2 for idx,word in enumerate(vocab)}
    idx2word = {idx+2:word for idx, word in enumerate(vocab)}
    return word2idx, idx2word

def map_to_token(map_dict, tokens):
    all_tokens = []
    for sentence in tokens:
        sentence_tokens = []
        for word in sentence:
            if word in map_dict.keys():
                sentence_tokens.append(map_dict[word])
            else:
                sentence_tokens.append(1)
        all_tokens.append(sentence_tokens)
    return all_tokens

In [None]:
# train_anchor_tokens = tokenize(train_anchors)
# train_positive_tokens = tokenize(train_positives)
# train_negative_tokens = tokenize(train_negatives)

# train_anchor_tokens = remove_stopwords(train_anchor_tokens)
# train_positive_tokens = remove_stopwords(train_positive_tokens)
# train_negative_tokens = remove_stopwords(train_negative_tokens)

# train_anchor_tokens = stem_word(train_anchor_tokens)
# train_positive_tokens= stem_word(train_positive_tokens)
# train_negative_tokens = stem_word(train_negative_tokens)


# dev_anchor_tokens = tokenize(dev_anchors)
# dev_positive_tokens = tokenize(dev_positives)
# dev_negative_tokens = tokenize(dev_negatives)

# dev_anchor_tokens = remove_stopwords(dev_anchor_tokens)
# dev_positive_tokens = remove_stopwords(dev_positive_tokens)
# dev_negative_tokens = remove_stopwords(dev_negative_tokens)

# dev_anchor_tokens = stem_word(dev_anchor_tokens)
# dev_positive_tokens= stem_word(dev_positive_tokens)
# dev_negative_tokens = stem_word(dev_negative_tokens)

In [None]:
# vocab = create_vocabulary(train_anchor_tokens, train_positive_tokens, train_negative_tokens)

In [None]:
# word2idx, idx2word = create_mappings(vocab)

In [None]:
# train_anchor_maps = map_to_token(word2idx, train_anchor_tokens)
# train_positive_maps = map_to_token(word2idx, train_positive_tokens)
# train_negative_maps = map_to_token(word2idx, train_negative_tokens)

# dev_anchor_maps = map_to_token(word2idx, dev_anchor_tokens)
# dev_positive_maps = map_to_token(word2idx, dev_positive_tokens)
# dev_negative_maps = map_to_token(word2idx, dev_negative_tokens)

In [None]:
# anchor_train = pad_sequences(train_anchor_maps, maxlen=50)
# positive_train = pad_sequences(train_positive_maps, maxlen=50)
# negative_train = pad_sequences(train_negative_maps, maxlen=50)

# anchor_dev = pad_sequences(dev_anchor_maps, maxlen=50)
# positive_dev = pad_sequences(dev_positive_maps, maxlen=50)
# negative_dev = pad_sequences(dev_negative_maps, maxlen=50)

In [None]:
# anchor_train = np.array(anchor_train, dtype='object').astype('float')
# positive_train = np.array(positive_train, dtype='object').astype('float')
# negative_train = np.array(negative_train, dtype='object').astype('float')

# anchor_dev = np.array(anchor_dev, dtype='object').astype('float')
# positive_dev = np.array(positive_dev, dtype='object').astype('float')
# negative_dev = np.array(negative_dev, dtype='object').astype('float')

In [None]:
# y_train = np.ones((anchor_train.shape[0],1), dtype='float')
# y_dev = np.ones((anchor_dev.shape[0],1), dtype='float')

In [None]:
def identity_loss(y_true, y_pred):
    return K.mean(y_pred)

def triplet_loss(y_true,y_pred, alpha = 0.25):
    anchor = y_pred[:,:128]
    positive = y_pred[:,128:128*2]
    negative = y_pred[:,-128:]
    pos_dist = K.sum(K.square(anchor-positive),axis=1)
    neg_dist = K.sum(K.square(anchor-negative),axis=1)
    basic_loss = pos_dist-neg_dist+alpha
    loss = K.maximum(basic_loss,0.0)
    return loss

In [None]:
siamese_model = models.Sequential()
siamese_model.add(layers.Embedding(input_dim=len(vocab)+2, output_dim=256))
siamese_model.add(layers.Bidirectional(layers.LSTM(256, return_sequences=True)))
siamese_model.add(layers.GlobalAveragePooling1D())
siamese_model.add(layers.Dense(128, activation='relu'))

In [None]:
siamese_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         1059840   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 512)         1050624   
_________________________________________________________________
global_average_pooling1d (Gl (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               65664     
Total params: 2,176,128
Trainable params: 2,176,128
Non-trainable params: 0
_________________________________________________________________


In [None]:
anchor_input = keras.Input(shape=(50,))
positive_input = keras.Input(shape=(50,))
negative_input = keras.Input(shape=(50,))

anchor_output = siamese_model(anchor_input)
positive_output = siamese_model(positive_input)
negative_output = siamese_model(negative_input)

output = layers.Concatenate()([anchor_output, positive_output, negative_output])

model = models.Model(inputs=[anchor_input, positive_input, negative_input], outputs =output)

In [None]:
model.compile(
    loss= triplet_loss,
    optimizer = Adam()
)

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, 128)          2176128     input_1[0][0]                    
                                                                 input_2[0][0]                

In [None]:
model.fit([anchor_train, positive_train, negative_train],y_train,
          validation_data = ([anchor_dev, positive_dev, negative_dev],y_dev),
          epochs=5, 
          batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x22e6bf3c550>

In [None]:
def predict(model,input1,input2):
    input1 = input1.reshape(50,1)
    input2 = input2.reshape(50,1)
    output1 = model(input1)
    output2 = model(input2)
    # print(output1, output2)
    # distance = tf.keras.losses.CosineSimilarity(axis=1)
    similarity = cos_distance([output1, output2])
    # print(similarity)
    if similarity >= 0 :
        return 0
    else:
        return 1

def cos_distance(vectors):
    y_true, y_pred = vectors
    def l2_normalize(x, axis):
        norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=True))
        return K.sign(x) * K.maximum(K.abs(x), K.epsilon()) / K.maximum(norm, K.epsilon())
    y_true = l2_normalize(y_true, axis=1)
    y_pred = l2_normalize(y_pred, axis=1)
    return K.mean(1 - y_true * y_pred, axis=1)

In [None]:
# inference_model= models.load_model('siamese.h5')

In [None]:
sent1_tokenize = tokenize(df_test['sentence1'])
sent2_tokenize = tokenize(df_test['sentence2'])

sent1_tokenize = remove_stopwords(sent1_tokenize)
sent2_tokenize = remove_stopwords(sent2_tokenize)

sent1_tokenize = stem_word(sent1_tokenize)
sent2_tokenize = stem_word(sent2_tokenize)

sent1_tokenize = map_to_token(word2idx,sent1_tokenize)
sent2_tokenize = map_to_token(word2idx,sent2_tokenize)

sent1_tokenize = pad_sequences(sent1_tokenize, maxlen=50)
sent2_tokenize = pad_sequences(sent2_tokenize, maxlen=50)

In [None]:
# preds = []
# for idx in range(20):
#     preds.append(predict(siamese_model,sent1_tokenize[idx],sent2_tokenize[idx]))

In [None]:
y = df_test['similarity'].tolist()

In [None]:
preds = np.array(preds)

In [None]:
# sum(preds == 1)
# len(y)

In [None]:
np.sum(y == preds) / len(sent1_tokenize)

  np.sum(y == preds) / len(sent1_tokenize)


0.0

In [None]:
# for idx in range(100):
#     anchor_temp = anchor_train[idx].reshape(1,20)
#     positive_temp = positive_train[idx].reshape(1,20)
#     negative_temp = negative_train[idx].reshape(1,20)
#     print(s_model.predict([anchor_temp,positive_temp ,negative_temp]))

## Contrastive Loss Function

In [19]:
df_train.head()

Unnamed: 0,similarity,sentence1,sentence2
1,0.0,a person on a horse jumps over a broken down a...,a person is at a diner ordering an omelette
2,1.0,a person on a horse jumps over a broken down a...,a person is outdoors on a horse
4,1.0,children smiling and waving at camera,there are children present
5,0.0,children smiling and waving at camera,the kids are frowning
6,0.0,a boy is jumping on skateboard in the middle o...,the boy skates down the sidewalk


In [20]:
df_dev.head()

Unnamed: 0,similarity,sentence1,sentence2
1,1.0,two women are embracing while holding to go pa...,two woman are holding packages
2,0.0,two women are embracing while holding to go pa...,the men are fighting outside a deli
3,1.0,two young children in blue jerseys one with th...,two kids in numbered jerseys wash their hands
5,0.0,two young children in blue jerseys one with th...,two kids in jackets walk to school
6,0.0,a man selling donuts to a customer during a wo...,a woman drinks her coffee in a small cafe


In [21]:
df_test.head()

Unnamed: 0,similarity,sentence1,sentence2
1,1.0,this church choir sings to the masses as they ...,the church is filled with song
2,0.0,this church choir sings to the masses as they ...,a choir singing at a baseball game
4,1.0,a woman with a green headscarf blue shirt and ...,the woman is very happy
5,0.0,a woman with a green headscarf blue shirt and ...,the woman has been shot
6,1.0,an old man with a package poses in front of an...,a man poses in front of an ad


In [22]:
train_sent1 = df_train['sentence1'].tolist()
train_sent2 = df_train['sentence2'].tolist()

dev_sent1 = df_dev['sentence1'].tolist()
dev_sent2 = df_dev['sentence2'].tolist()

test_sent1 = df_test['sentence1'].tolist()
test_sent2 = df_test['sentence2'].tolist()

In [23]:
train_label = df_train['similarity'].tolist()
dev_label = df_dev['similarity'].tolist()
test_label = df_test['similarity'].tolist()

In [24]:
train_sent1 = train_sent1[:100000]
train_sent2 = train_sent2[:100000]
train_label = train_label[:100000]

In [25]:
train_sent1_tokens = tokenize(train_sent1)
train_sent2_tokens = tokenize(train_sent2)

dev_sent1_tokens = tokenize(dev_sent1)
dev_sent2_tokens = tokenize(dev_sent2)

test_sent1_tokens = tokenize(test_sent1)
test_sent2_tokens = tokenize(test_sent2)

In [26]:
train_sent1_tokens = remove_stopwords(train_sent1_tokens)
train_sent2_tokens = remove_stopwords(train_sent2_tokens)

dev_sent1_tokens = remove_stopwords(dev_sent1_tokens)
dev_sent2_tokens = remove_stopwords(dev_sent2_tokens)

test_sent1_tokens = remove_stopwords(test_sent1_tokens)
test_sent2_tokens = remove_stopwords(test_sent2_tokens)

In [27]:
train_sent1_tokens = stem_word(train_sent1_tokens)
train_sent2_tokens = stem_word(train_sent2_tokens)

dev_sent1_tokens = stem_word(dev_sent1_tokens)
dev_sent2_tokens = stem_word(dev_sent2_tokens)

test_sent1_tokens = stem_word(test_sent1_tokens)
test_sent2_tokens = stem_word(test_sent2_tokens)

In [28]:
def create_vocabulary_contra(sentence1, sentence2):
    sentence1 = set(flatten(sentence1))
    sentence2 = set(flatten(sentence2))
    vocab = sentence1.union(sentence2)
    return sorted(list(vocab))

In [29]:
vocab = create_vocabulary_contra(train_sent1_tokens, train_sent2_tokens)

In [30]:
word2idx, idx2word = create_mappings(vocab)

In [31]:
train_sent1_tokens = map_to_token(word2idx,train_sent1_tokens)
train_sent2_tokens = map_to_token(word2idx,train_sent2_tokens)

dev_sent1_tokens = map_to_token(word2idx,dev_sent1_tokens)
dev_sent2_tokens = map_to_token(word2idx,dev_sent2_tokens)

test_sent1_tokens = map_to_token(word2idx,test_sent1_tokens)
test_sent2_tokens = map_to_token(word2idx,test_sent2_tokens)

In [32]:
train_data1 = pad_sequences(train_sent1_tokens, maxlen=50)
train_data2 = pad_sequences(train_sent2_tokens,maxlen=50)

dev_data1 = pad_sequences(dev_sent1_tokens, maxlen=50)
dev_data2 = pad_sequences(dev_sent2_tokens,maxlen=50)

test_data1 = pad_sequences(test_sent1_tokens, maxlen=50)
test_data2 = pad_sequences(test_sent2_tokens,maxlen=50)

In [33]:
train_data1 = np.array(train_data1, dtype='object').astype('float')
train_data2 = np.array(train_data2, dtype='object').astype('float')
train_label = np.array(train_label).astype("float")

dev_data1 = np.array(dev_data1, dtype='object').astype('float')
dev_data2 = np.array(dev_data2, dtype='object').astype('float')
dev_label = np.array(dev_label).astype("float")

test_data1 = np.array(test_data1, dtype='object').astype('float')
test_data2 = np.array(test_data2, dtype='object').astype('float')
test_label = np.array(test_label).astype("float")

In [34]:
def contrastive_loss(y, preds, margin=1):
    # explicitly cast the true class label data type to the predicted
    # class label data type (otherwise we run the risk of having two
    # separate data types, causing TensorFlow to error out)
    y = tf.cast(y, preds.dtype)
    # calculate the contrastive loss between the true labels and
    # the predicted labels
    squaredPreds = K.square(preds)
    squaredMargin = K.square(K.maximum(margin - preds, 0))
    loss = K.mean(y * squaredPreds + (1 - y) * squaredMargin)
    # return the computed contrastive loss to the calling function
    return loss

In [35]:
def euclidean_distance(vectors):
    # unpack the vectors into separate lists
    (featsA, featsB) = vectors
    # compute the sum of squared distances between the vectors
    sumSquared = K.sum(K.square(featsA - featsB), axis=1, keepdims=True)
    # return the euclidean distance between the vectors
    return K.sqrt(K.maximum(sumSquared, K.epsilon()))

In [36]:
def cos_distance(vectors):
    y_true, y_pred = vectors
    def l2_normalize(x, axis):
        norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=True))
        return K.sign(x) * K.maximum(K.abs(x), K.epsilon()) / K.maximum(norm, K.epsilon())
    y_true = l2_normalize(y_true, axis=-1)
    y_pred = l2_normalize(y_pred, axis=-1)
    return K.mean(y_true * y_pred, axis=-1)

In [42]:
base_model = models.Sequential()
base_model.add(layers.Embedding(input_dim=len(vocab)+2, output_dim=100))
base_model.add(layers.LSTM(100, return_sequences=True))
base_model.add(layers.Dropout(0.3))
base_model.add(layers.GlobalAveragePooling1D())
base_model.add(layers.Dense(128, activation='relu'))
base_model.add(layers.Dense(100, activation='relu'))

In [43]:
base_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1127700   
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 100)         80400     
_________________________________________________________________
dropout (Dropout)            (None, None, 100)         0         
_________________________________________________________________
global_average_pooling1d_1 ( (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               12928     
_________________________________________________________________
dense_5 (Dense)              (None, 100)               12900     
Total params: 1,233,928
Trainable params: 1,233,928
Non-trainable params: 0
____________________________________________

In [44]:
input1 = keras.Input(shape=(train_data1.shape[1],))
input2 = keras.Input(shape=(train_data1.shape[1],))

encoding1 = base_model(input1)
encoding2 = base_model(input2)

distance = layers.Concatenate()([encoding1, encoding2])
# distance = layers.Lambda(euclidean_distance)([encoding1, encoding2])
dense1 = layers.Dense(128, activation='relu')(distance)
dropout1 = layers.Dropout(0.2)(dense1)
dense2 = layers.Dense(128, activation='relu')(dropout1)
final = layers.Dense(1, activation='sigmoid')(dense1)

model = models.Model(inputs = [input1, input2], outputs = final)

In [45]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 100)          1233928     input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 200)          0           sequential_1[0][0]         

In [46]:
model.compile(
    loss='binary_crossentropy', 
    optimizer=Adam(0.001),
    metrics=['accuracy']
    )

In [47]:
model.fit(
    [train_data1,train_data2],
    train_label,
    batch_size=64,
    validation_data=([dev_data1,dev_data2], dev_label), 
    epochs=5
    )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x22b79082e80>

In [48]:
model.save('binary_loss_model1.h5')

In [49]:
model.predict([test_data1,test_data2])[1][0]

0.39341095

In [50]:
def predict():
    preds = model.predict([test_data1,test_data2])
    preds_list = [1 if p[0] >= 0.5 else 0 for p in preds ]
    return preds_list

In [51]:
preds = predict()

In [52]:
sum(test_label == preds) / len(preds)

0.7990915972747918

In [53]:
from sklearn.metrics import classification_report


In [55]:
print(classification_report(test_label, preds))

              precision    recall  f1-score   support

         0.0       0.82      0.75      0.79      3237
         1.0       0.78      0.84      0.81      3368

    accuracy                           0.80      6605
   macro avg       0.80      0.80      0.80      6605
weighted avg       0.80      0.80      0.80      6605



In [56]:
from sklearn.metrics import confusion_matrix

In [58]:
conf_mat = confusion_matrix(test_label, preds)

In [59]:
import seaborn as sns

In [None]:
sns.heatmap(conf_mat)

In [None]:
def cos_distance(y_true, y_pred):
    #y_true = K.l2_normalize(y_true, axis=-1)
    #y_pred = K.l2_normalize(y_pred, axis=-1)
    return K.mean(1 - K.sum((y_true * y_pred), axis=-1))

## Triplet loss with just positive examples

In [None]:
df_train_pos = df_train.loc[(df_train['similarity'] == 1)]
df_dev_pos = df_dev.loc[(df_dev['similarity'] == 1)]

In [None]:
df_train_pos.drop_duplicates(inplace=True)
df_dev_pos.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [None]:
train_sent1 = df_train_pos['sentence1'].tolist()
train_sent2 = df_train_pos['sentence2'].tolist()

dev_sent1 = df_dev_pos['sentence1'].tolist()
dev_sent2 = df_dev_pos['sentence2'].tolist()

test_sent1 = df_test['sentence1'].tolist()
test_sent2 = df_test['sentence2'].tolist()

In [None]:
train_label = df_train_pos['similarity'].tolist()
dev_label = df_dev_pos['similarity'].tolist()
test_label = df_test['similarity'].tolist()

In [None]:
train_sent1 = train_sent1[:100000]
train_sent2 = train_sent2[:100000]
train_label = train_label[:100000]

In [None]:
train_sent1_tokens = tokenize(train_sent1)
train_sent2_tokens = tokenize(train_sent2)

dev_sent1_tokens = tokenize(dev_sent1)
dev_sent2_tokens = tokenize(dev_sent2)

test_sent1_tokens = tokenize(test_sent1)
test_sent2_tokens = tokenize(test_sent2)

In [None]:
train_sent1_tokens = remove_stopwords(train_sent1_tokens)
train_sent2_tokens = remove_stopwords(train_sent2_tokens)

dev_sent1_tokens = remove_stopwords(dev_sent1_tokens)
dev_sent2_tokens = remove_stopwords(dev_sent2_tokens)

test_sent1_tokens = remove_stopwords(test_sent1_tokens)
test_sent2_tokens = remove_stopwords(test_sent2_tokens)

In [None]:
train_sent1_tokens = stem_word(train_sent1_tokens)
train_sent2_tokens = stem_word(train_sent2_tokens)

dev_sent1_tokens = stem_word(dev_sent1_tokens)
dev_sent2_tokens = stem_word(dev_sent2_tokens)

test_sent1_tokens = stem_word(test_sent1_tokens)
test_sent2_tokens = stem_word(test_sent2_tokens)

In [None]:
vocab = create_vocabulary_contra(train_sent1_tokens, train_sent2_tokens)

In [None]:
word2idx, idx2word = create_mappings(vocab)

In [None]:
train_sent1_tokens = map_to_token(word2idx,train_sent1_tokens)
train_sent2_tokens = map_to_token(word2idx,train_sent2_tokens)

dev_sent1_tokens = map_to_token(word2idx,dev_sent1_tokens)
dev_sent2_tokens = map_to_token(word2idx,dev_sent2_tokens)

test_sent1_tokens = map_to_token(word2idx,test_sent1_tokens)
test_sent2_tokens = map_to_token(word2idx,test_sent2_tokens)

In [None]:
train_data1 = pad_sequences(train_sent1_tokens, maxlen=50)
train_data2 = pad_sequences(train_sent2_tokens,maxlen=50)

dev_data1 = pad_sequences(dev_sent1_tokens, maxlen=50)
dev_data2 = pad_sequences(dev_sent2_tokens,maxlen=50)

test_data1 = pad_sequences(test_sent1_tokens, maxlen=50)
test_data2 = pad_sequences(test_sent2_tokens,maxlen=50)

In [None]:
train_data1 = np.array(train_data1, dtype='object').astype('int32')
train_data2 = np.array(train_data2, dtype='object').astype('int32')
train_label = np.array(train_label).astype("int32")

dev_data1 = np.array(dev_data1, dtype='object').astype('int32')
dev_data2 = np.array(dev_data2, dtype='object').astype('int32')
dev_label = np.array(dev_label).astype("int32")

test_data1 = np.array(test_data1, dtype='object').astype('int32')
test_data2 = np.array(test_data2, dtype='object').astype('int32')
test_label = np.array(test_label).astype("int32")

In [None]:
def calculate_mean(x, axis=1):
    return K.mean(x, axis=axis)

def normalize(x):
        return x / K.sqrt(K.sum(x * x, axis=-1, keepdims=True))

In [None]:
base_model = tf.keras.Sequential()
base_model.add(tf.keras.layers.Embedding(input_dim=len(vocab)+2, output_dim=128))
base_model.add(tf.keras.layers.LSTM(128, return_sequences=True))
base_model.add(tf.keras.layers.Lambda(calculate_mean, name='mean'))
base_model.add(tf.keras.layers.Lambda(normalize, name='normalize'))

In [None]:
base_model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 128)         1497856   
_________________________________________________________________
lstm_9 (LSTM)                (None, None, 128)         131584    
_________________________________________________________________
mean (Lambda)                (None, 128)               0         
_________________________________________________________________
normalize (Lambda)           (None, 128)               0         
Total params: 1,629,440
Trainable params: 1,629,440
Non-trainable params: 0
_________________________________________________________________


In [None]:
input1 = tf.keras.layers.Input(shape=(50,))
input2 = tf.keras.layers.Input(shape=(50,))

encoding1 = base_model(input1)
encoding2 = base_model(input2)

merged = tf.keras.layers.Concatenate()([encoding1, encoding2])

model = tf.keras.Model(inputs = [input1, input2], outputs = merged)

In [None]:
model.summary()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           [(None, 50)]         0                                            
__________________________________________________________________________________________________
sequential_9 (Sequential)       (None, 128)          1629440     input_21[0][0]                   
                                                                 input_22[0][0]                   
__________________________________________________________________________________________________
concatenate_10 (Concatenate)    (None, 256)          0           sequential_9[0][0]        

In [None]:
def TripletLoss(margin=0.25):
    def triplet(y_true,y_pred):
      batch_size = tf.cast(tf.shape(y_true)[0], dtype=tf.float32)
      v1, v2 = y_pred[:,:128],y_pred[:,-128:]
      scores = K.dot(v1, K.transpose(v2))
      positive = tf.linalg.diag_part(scores)
      negative_without_positive = scores - 2 * tf.eye(batch_size)

      closest_negative = tf.reduce_max(negative_without_positive, axis=1)

      negative_zero_on_duplicate = scores * (1.0 - tf.eye(batch_size))
      
      mean_negative = K.sum(negative_zero_on_duplicate, axis=1) / (batch_size-1)
      
      triplet_loss1 = K.maximum(0.0, margin - positive + closest_negative)
      
      triplet_loss2 = K.maximum(0.0, margin - positive + mean_negative)
      
      triplet_loss = K.mean(triplet_loss1 + triplet_loss2)

      return triplet_loss
    return triplet

In [None]:
triplet_loss = TripletLoss()
model.compile(
    optimizer = Adam(0.001),
    loss = triplet_loss
)

In [None]:
history = model.fit(
    [train_data1,train_data2],
    train_label,
    batch_size=64,
    validation_data=([dev_data1,dev_data2], dev_label), 
    epochs=5
    )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
def predict(model, sentence1, sentence2, cosine=False):
    sentence1 = sentence1.reshape(1,50)
    sentence2 = sentence2.reshape(1,50)
    v = model.predict([sentence1, sentence2])
    v1, v2 = v[:,:128], v[:,-128:]
    similarity = np.dot(v1,v2.T)[0][0]
    if similarity >= 0.65:
        return 1
    return 0

In [None]:
preds = []
for idx in range(len(test_data1)):
    preds.append(predict(model,test_data1[idx], test_data2[idx]))

In [None]:
sum(test_label == preds) / len(test_label)

0.6825132475397426

In [None]:
model.save("Triplet.h5")