In [67]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import os
import numpy as np
import pandas as pd
import random as rnd
import tensorflow as tf

# Set random seeds
rnd.seed(34)

In [68]:
data = pd.read_csv("quora_duplicate_questions.csv")
N = len(data)
print('Number of question pairs: ', N)
data.head()

Number of question pairs:  345036


Unnamed: 0,index,id,qid1,qid2,question1,question2,is_duplicate
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
4,4,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1


In [69]:
N_train = 300000
N_test = 45036
data_train = data[:N_train]
data_test = data[N_train:N_train + N_test]
print("Train set:", len(data_train), "Test set:", len(data_test))
del (data)  # remove to free memory

Train set: 300000 Test set: 45036


In [70]:
td_index = data_train['is_duplicate'] == 1
td_index = [i for i, x in enumerate(td_index) if x]
print('Number of duplicate questions: ', len(td_index))
print('Indexes of first ten duplicate questions:', td_index[:10])

Number of duplicate questions:  119221
Indexes of first ten duplicate questions: [4, 8, 9, 10, 11, 12, 14, 16, 25, 27]


In [71]:
print(data_train['question1'][4])
print(data_train['question2'][4])
print('is_duplicate: ', data_train['is_duplicate'][4])

How can I be a good geologist?
What should I do to be a great geologist?
is_duplicate:  1


In [72]:
Q1_train = np.array(data_train['question1'][td_index])
Q2_train = np.array(data_train['question2'][td_index])

Q1_test = np.array(data_test['question1'])
Q2_test = np.array(data_test['question2'])
y_test  = np.array(data_test['is_duplicate'])

In [73]:
print('TRAINING QUESTIONS:\n')
print('Question 1: ', Q1_train[0])
print('Question 2: ', Q2_train[0], '\n')
print('Question 1: ', Q1_train[4])
print('Question 2: ', Q2_train[4], '\n')

print('TESTING QUESTIONS:\n')
print('Question 1: ', Q1_test[0])
print('Question 2: ', Q2_test[0], '\n')
print('is_duplicate =', y_test[0], '\n')

TRAINING QUESTIONS:

Question 1:  How can I be a good geologist?
Question 2:  What should I do to be a great geologist? 

Question 1:  What would a Trump presidency mean for current international master’s students on an F1 visa?
Question 2:  How will a Trump presidency affect the students presently in US or planning to study in US? 

TESTING QUESTIONS:

Question 1:  How can I move photos in Google Photos into folders or albums?
Question 2:  Can we make a hidden folder in Google photos? 

is_duplicate = 0 



In [74]:
# Splitting the data
cut_off = int(len(Q1_train) * 0.8)
train_Q1, train_Q2 = Q1_train[:cut_off], Q2_train[:cut_off]
val_Q1, val_Q2 = Q1_train[cut_off:], Q2_train[cut_off:]
print('Number of duplicate questions: ', len(Q1_train))
print("The length of the training set is:  ", len(train_Q1))
print("The length of the validation set is: ", len(val_Q1))

Number of duplicate questions:  119221
The length of the training set is:   95376
The length of the validation set is:  23845


In [75]:
tf.random.set_seed(0)
text_vectorization = tf.keras.layers.TextVectorization(output_mode='int',split='whitespace', standardize='strip_punctuation')
text_vectorization.adapt(np.concatenate((Q1_train,Q2_train)))

In [76]:
# GRADED FUNCTION: Siamese
def Siamese(text_vectorizer, vocab_size=36224, d_feature=128):
    """Returns a Siamese model.

    Args:
        text_vectorizer (TextVectorization): TextVectorization instance, already adapted to your training data.
        vocab_size (int, optional): Length of the vocabulary. Defaults to 56400.
        d_model (int, optional): Depth of the model. Defaults to 128.
        
    Returns:
        tf.model.Model: A Siamese model. 
    
    """
    ### START CODE HERE ###

    # Define the Siamese branch
    branch = tf.keras.models.Sequential(name='sequential') 
    # Add the text_vectorizer layer. This is the text_vectorizer you instantiated and trained before 
    branch.add(text_vectorizer)
    # Add the Embedding layer. Remember to call it 'embedding' using the parameter `name`
    branch.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=d_feature, name='embedding'))
    # Add the LSTM layer, recall from W2 that you want the LSTM layer to return sequences, not just one value. 
    # Remember to call it 'LSTM' using the parameter `name`
    branch.add(tf.keras.layers.LSTM(units=d_feature, name='LSTM', return_sequences=True))
    # Add the GlobalAveragePooling1D layer. Remember to call it 'mean' using the parameter `name`
    branch.add(tf.keras.layers.GlobalAveragePooling1D(name='mean'))
    # Add the normalizing layer using the Lambda function. Remember to call it 'out' using the parameter `name`
    branch.add(tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1), name='out'))
    
    # Define both inputs. Remember to call them 'input_1' and 'input_2' using the `name` parameter. 
    # Be mindful of the data type and size
    input1 = tf.keras.layers.Input(shape=(1,), dtype=tf.string, name='input_1')
    input2 = tf.keras.layers.Input(shape=(1,), dtype=tf.string, name='input_2')
    
    # Define the output of each branch of your Siamese network. Remember that both branches have the same coefficients, 
    # but they each receive different inputs.
    branch1 = branch(input1)
    branch2 = branch(input2)
    
    # Define the Concatenate layer. You should concatenate columns, you can fix this using the `axis` parameter. 
    # This layer is applied over the outputs of each branch of the Siamese network
    conc = tf.keras.layers.Concatenate(axis=1, name='conc_1_2')([branch1, branch2]) 
    
    ### END CODE HERE ###
    
    return tf.keras.models.Model(inputs=[input1, input2], outputs=conc, name="SiameseModel")

In [77]:
# Check your Siamese model
model = Siamese(text_vectorization, vocab_size=text_vectorization.vocabulary_size())
model.build(input_shape=((None, 1), (None, 1)))  # Set the input_shape argument
model.summary()

# Check the sequential branch inside the Siamese model
sequential_branch = model.get_layer(name='sequential')
sequential_branch.build(input_shape=(None,))  # Set the input_shape argument
sequential_branch.summary()

Model: "SiameseModel"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 sequential (Sequential)     (None, 128)                  4688896   ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 conc_1_2 (Concatenate)      (None, 256)                  0         ['sequential[0][0]'

In [78]:
# GRADED FUNCTION: TripletLossFn
def TripletLossFn(v1, v2, margin=0.25):
    """Custom Loss function.

    Args:
        v1 (numpy.ndarray or Tensor): Array with dimension (batch_size, model_dimension) associated with Q1.
        v2 (numpy.ndarray or Tensor): Array with dimension (batch_size, model_dimension) associated with Q2.
        margin (float, optional): Desired margin. Defaults to 0.25.

    Returns:
        triplet_loss (numpy.ndarray or Tensor)
    """

    # use `tf.linalg.matmul` to take the dot product of the two batches.
    # Don't forget to transpose the second argument using `transpose_b=True`
    scores = tf.linalg.matmul(v2, tf.transpose(v1, perm=[1, 0]))

    # calculate new batch size and cast it as the same datatype as scores.
    batch_size = tf.cast(tf.shape(v1)[0], scores.dtype)

    # use `tf.linalg.diag_part` to grab the cosine similarity of all positive examples
    positive = tf.linalg.diag_part(scores)

    # subtract the diagonal from scores. You can do this by creating a diagonal matrix with the values
    # of all positive examples using `tf.linalg.diag`
    negative_zero_on_duplicate = scores - tf.linalg.diag(positive)

    # use `tf.math.reduce_sum` on `negative_zero_on_duplicate` for `axis=1` and divide it by `(batch_size - 1)`
    mean_negative = tf.math.reduce_sum(negative_zero_on_duplicate, axis=1) / (batch_size - 1)

    # create a composition of two masks:
    # the first mask to extract the diagonal elements,
    # the second mask to extract elements in the negative_zero_on_duplicate matrix that are larger than the elements in the diagonal
    mask_exclude_positives = tf.math.logical_or(tf.eye(batch_size, dtype=tf.bool), (negative_zero_on_duplicate > tf.expand_dims(positive, 1)))

    # multiply `mask_exclude_positives` with 2.0 and subtract it out of `negative_zero_on_duplicate`
    negative_without_positive = tf.where(mask_exclude_positives, -2.0, negative_zero_on_duplicate)

    # take the row by row `max` of `negative_without_positive`.
    # Hint: `tf.math.reduce_max(negative_without_positive, axis=None)`
    closest_negative = tf.math.reduce_max(negative_without_positive, axis=1)

    # compute `tf.maximum` among 0.0 and `A`
    # A = subtract `positive` from `margin` and add `closest_negative`
    triplet_loss1 = tf.maximum(0.0, margin - positive + closest_negative)

    # compute `tf.maximum` among 0.0 and `B`
    # B = subtract `positive` from `margin` and add `mean_negative`
    triplet_loss2 = tf.maximum(0.0, margin - positive + mean_negative)

    # add the two losses together and take the `tf.math.reduce_sum` of it
    triplet_loss = tf.math.reduce_sum(triplet_loss1 + triplet_loss2)

    return triplet_loss

In [79]:
def TripletLoss(labels, out, margin=0.25):
    _, embedding_size = out.shape # get embedding size
    v1 = out[:,:int(embedding_size/2)] # Extract v1 from out
    v2 = out[:,int(embedding_size/2):] # Extract v2 from out
    return TripletLossFn(v1, v2, margin=margin)

In [80]:
train_dataset = tf.data.Dataset.from_tensor_slices(((train_Q1, train_Q2),tf.constant([1]*len(train_Q1))))
val_dataset = tf.data.Dataset.from_tensor_slices(((val_Q1, val_Q2),tf.constant([1]*len(val_Q1))))

In [81]:
# GRADED FUNCTION: train_model
def train_model(Siamese, TripletLoss, text_vectorizer, train_dataset, val_dataset, d_feature=128, lr=0.01, train_steps=5):
    """Training the Siamese Model

    Args:
        Siamese (function): Function that returns the Siamese model.
        TripletLoss (function): Function that defines the TripletLoss loss function.
        text_vectorizer: trained instance of `TextVecotrization` 
        train_dataset (tf.data.Dataset): Training dataset
        val_dataset (tf.data.Dataset): Validation dataset
        d_feature (int, optional) = size of the encoding. Defaults to 128.
        lr (float, optional): learning rate for optimizer. Defaults to 0.01
        train_steps (int): number of epochs
        
    Returns:
        tf.keras.Model
    """
    ## START CODE HERE ###

    # Instantiate your Siamese model
    model = Siamese(text_vectorizer,
                    vocab_size = text_vectorizer.vocabulary_size(), #set vocab_size accordingly to the size of your vocabulary
                    d_feature = d_feature)
    # Compile the model
    model.compile(loss=TripletLoss,
                  optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
            )
    # Train the model 
    model.fit(train_dataset,
              epochs = train_steps,
              validation_data = val_dataset,
             )
             
    ### END CODE HERE ###

    return model

In [None]:
train_steps = 25
batch_size = 256
train_generator = train_dataset.shuffle(len(train_Q1),
                                        seed=7, 
                                        reshuffle_each_iteration=True).batch(batch_size=batch_size)
val_generator = val_dataset.shuffle(len(val_Q1), 
                                   seed=7,
                                   reshuffle_each_iteration=True).batch(batch_size=batch_size)
model = train_model(Siamese, TripletLoss,text_vectorization, 
                                            train_generator, 
                                            val_generator, 
                                            train_steps=train_steps,)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25

In [None]:
# GRADED FUNCTION: classify
def classify(test_Q1, test_Q2, y_test, threshold, model, batch_size=64, verbose=True):
    """Function to test the accuracy of the model.

    Args:
        test_Q1 (numpy.ndarray): Array of Q1 questions. Each element of the array would be a string.
        test_Q2 (numpy.ndarray): Array of Q2 questions. Each element of the array would be a string.
        y_test (numpy.ndarray): Array of actual target.
        threshold (float): Desired threshold
        model (tensorflow.Keras.Model): The Siamese model.
        batch_size (int, optional): Size of the batches. Defaults to 64.

    Returns:
        float: Accuracy of the model
        numpy.array: confusion matrix
    """
    y_pred = []
    test_gen = tf.data.Dataset.from_tensor_slices(((test_Q1, test_Q2),None)).batch(batch_size=batch_size)
    
    ### START CODE HERE ###

    pred = model.predict(test_gen)
    _, n_feat = pred.shape
    v1 = pred[:, :n_feat // 2]
    v2 = pred[:, n_feat // 2:]
    
    # Compute the cosine similarity. Using `tf.math.reduce_sum`. 
    # Don't forget to use the appropriate axis argument.
    d  = tf.math.reduce_sum(v1 * v2, axis=1) / (tf.norm(v1, axis=1) * tf.norm(v2, axis=1))
    # Check if d>threshold to make predictions
    y_pred = tf.cast(d > threshold, tf.float64)
    # take the average of correct predictions to get the accuracy
    accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred, y_test), tf.float64))
    # compute the confusion matrix using `tf.math.confusion_matrix`
    cm = tf.math.confusion_matrix(y_test, y_pred, num_classes=2)
    
    ### END CODE HERE ###
    
    return accuracy, cm

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ensure that all elements are strings
Q1_test = [str(item) for item in Q1_test]
Q2_test = [str(item) for item in Q2_test]

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(Q1_test + Q2_test)

# Convert text to sequences
Q1_sequences = tokenizer.texts_to_sequences(Q1_test)
Q2_sequences = tokenizer.texts_to_sequences(Q2_test)

# Pad sequences to a fixed length
max_sequence_length = 100  # adjust as needed
Q1_padded = pad_sequences(Q1_sequences, maxlen=max_sequence_length)
Q2_padded = pad_sequences(Q2_sequences, maxlen=max_sequence_length)


In [None]:
# this takes around 1 minute
accuracy, cm = classify(Q1_test,Q2_test, y_test, 0.7, model,  batch_size = 512) 
print("Accuracy", accuracy.numpy())
print(f"Confusion matrix:\n{cm.numpy()}")

# UPTO ABOVE COURESERA CODE

In [44]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

from gensim.models import KeyedVectors

google_news_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Convert text to word embeddings
def text_to_embedding(text, embedding_dim):
    tokens = text.split(' ')
    vectors = [google_news_model[word] for word in tokens if word in google_news_model.key_to_index]
    vector_mean = np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)
    return vector_mean

# Apply the function to the questions
embedding_dim = 300
X = np.array([text_to_embedding(Q1_test[i], embedding_dim) for i in range(len(Q1_test))])
Y = np.array([text_to_embedding(Q2_test[i], embedding_dim) for i in range(len(Q2_test))])
X = np.concatenate((X, Y), axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_test, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Dense(128, input_dim=embedding_dim*2, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
predictions = model.predict(X_test)
predictions = [1 if p > 0.5 else 0 for p in predictions]

# Print the accuracy and confusion matrix
print("Accuracy: ", accuracy_score(y_test, predictions))
print("Confusion Matrix: \n", confusion_matrix(y_test, predictions))


Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy:  0.7253552397868561
Confusion Matrix: 
 [[4696  894]
 [1580 1838]]


# Janina ki but normal GoogleNewsVector apply kora

# ar ei nicher ta i g Siamese with LSTM

In [51]:
import numpy as np
from gensim.models import KeyedVectors
from keras.layers import Input, LSTM, Dense, Embedding, Dropout
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

def text_to_embedding(text, embedding_dim):
    tokens = text.split(' ')
    vectors = [google_news_model[word] for word in tokens if word in google_news_model.key_to_index]
    vector_mean = np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)
    return vector_mean

# Load Google's pre-trained Word2Vec model.
google_news_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# Set parameters
max_len = 100
embedding_dim = 300

# Convert texts to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(Q1_test)
Q1_seq = tokenizer.texts_to_sequences(Q1_test)
Q1_seq = pad_sequences(Q1_seq, maxlen=max_len)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(Q2_test)
Q2_seq = tokenizer.texts_to_sequences(Q2_test)
Q2_seq = pad_sequences(Q2_seq, maxlen=max_len)

# Create a Siamese network with LSTM
def create_siamese_lstm():
    inputs = Input(shape=(max_len,))
    embedding = Embedding(input_dim=45036, 
                          output_dim=google_news_model.vector_size,
                          trainable=False)(inputs)
    lstm = LSTM(64, return_sequences=True)(embedding)
    lstm = Dropout(0.2)(lstm)
    lstm = LSTM(32)(lstm)
    lstm = Dropout(0.2)(lstm)
    lstm = Dense(64, activation='relu')(lstm)
    model = Model(inputs, lstm)
    return model
    
# Compile and train the Siamese network
model = create_siamese_lstm()
model.compile(loss='mean_squared_error', optimizer='adam')

# Compute embeddings for questions
X = model.predict(Q1_seq)
Y = model.predict(Q2_seq)



In [60]:
def classify_with_embeddings(X, Y, y_test, threshold, verbose=True):
    # Compute cosine similarity scores between X and Y
    similarity_scores = [cosine_similarity([x], [y])[0, 0] for x, y in zip(X, Y)]
    
    # Convert similarity scores to binary predictions using the threshold
    y_pred = np.array(similarity_scores) > threshold

    # Compute accuracy and confusion matrix
    accuracy = np.mean(y_pred == y_test)
    confusion_matrix = np.array([[np.sum((y_test == 0) & (y_pred == 0)), np.sum((y_test == 0) & (y_pred == 1))],
                                 [np.sum((y_test == 1) & (y_pred == 0)), np.sum((y_test == 1) & (y_pred == 1))]])

    if verbose:
        print("Accuracy:", accuracy)
        print("Confusion Matrix:")
        print(confusion_matrix)

    return accuracy, confusion_matrix

In [59]:
print("Q1_seq shape:", Q1_seq.shape)
print("Q2_seq shape:", Q2_seq.shape)
print("X shape:", X.shape)
print("Y shape:", Y.shape)


Q1_seq shape: (45036, 100)
Q2_seq shape: (45036, 100)
X shape: (45036, 64)
Y shape: (45036, 64)


In [62]:
# Reshape y_test to match the shape of y_pred
y_test = np.reshape(y_test, (-1,))

# Use the function to compute accuracy
threshold = 0.5  # You can adjust the threshold as needed
accuracy, cm = classify_with_embeddings(X, Y, y_test, threshold)
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(cm)

ValueError: operands could not be broadcast together with shapes (45036,) (9008,) 