# Import Required Libraries

In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [59]:
df = pd.read_csv("/kaggle/input/quora-duplicate-questions-dataset/train.csv")

In [60]:
df.shape

(404290, 6)

In [61]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


# Random Sampling of 400k Rows from Dataset

In [62]:
new_df = df.sample(400000,random_state = 2)

In [63]:
dataset = new_df[['question1', 'question2', 'is_duplicate']]

In [64]:
print(dataset.shape)
dataset.head()

(400000, 3)


Unnamed: 0,question1,question2,is_duplicate
398782,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1
115086,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0
327711,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0
367788,Why do so many people in the U.S. hate the sou...,My boyfriend doesnt feel guilty when he hurts ...,0
151235,Consequences of Bhopal gas tragedy?,What was the reason behind the Bhopal gas trag...,0


# Text Preprocessing

In [65]:
def preprocess(q):
    q = str(q).lower().strip()

#Replace certain special characters with their equivalents
    q = q.replace('%','percent')
    q = q.replace('$','dollar')
    q = q.replace('@','at')
    q = q.replace('₹', 'rupee')
    q = q.replace('€', 'euro')

# the pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]','')

# replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000','b')
    q = q.replace(',000,000','m')
    q = q.replace(',000','k')
    q = re.sub(r'([0-9]+)000000000',r'\1b',q)
    q = re.sub(r'([0-9]+)000000',r'\1m',q)
    q = re.sub(r'([0-9]+)000',r'\1k',q)


# Decontracting Words
#https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python/19794953?newreg=c7cc89d538bf4ff5864e477a0a7e2442

    contractions = { 
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is / how does",
    "I'd": "I had / I would",
    "I'd've": "I would have",
    "I'll": "I shall / I will",
    "I'll've": "I shall have / I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []
    for word in q.split():
        if word in contractions:
            word = contractions[word]
        q_decontracted.append(word)

    q = " ".join(q_decontracted)
    q = q.replace("'ve"," have")
    q = q.replace("n't"," not")
    q = q.replace("'re"," are")
    q = q.replace("'ll"," will")
    # removing html tags

    q = BeautifulSoup(q)
    q = q.get_text()

    # Remove punctuation
    pattern = re.compile('\W')
    q = re.sub(pattern,' ',q).strip()

    return q

    
    


   
    

In [66]:
preprocess("I've already! wasn't  <b>done</b>?")

'i have already  was not done'

In [67]:
dataset['question1'] = dataset['question1'].apply(preprocess)
dataset['question2'] = dataset['question2'].apply(preprocess)

In [68]:
dataset.head()

Unnamed: 0,question1,question2,is_duplicate
398782,what is the best marketing automation tool for...,what is the best marketing automation tool for...,1
115086,i am poor but i want to invest what should i do,i am quite poor and i want to be very rich wh...,0
327711,i am from india and live abroad i met a guy f...,t i e t to thapar university to thapar univers...,0
367788,why do so many people in the u s hate the sou...,my boyfriend doesnt feel guilty when he hurts ...,0
151235,consequences of bhopal gas tragedy,what was the reason behind the bhopal gas tragedy,0


# Tokenization

In [69]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [70]:
tokenizer = Tokenizer()

In [71]:
dataset.head()

Unnamed: 0,question1,question2,is_duplicate
398782,what is the best marketing automation tool for...,what is the best marketing automation tool for...,1
115086,i am poor but i want to invest what should i do,i am quite poor and i want to be very rich wh...,0
327711,i am from india and live abroad i met a guy f...,t i e t to thapar university to thapar univers...,0
367788,why do so many people in the u s hate the sou...,my boyfriend doesnt feel guilty when he hurts ...,0
151235,consequences of bhopal gas tragedy,what was the reason behind the bhopal gas tragedy,0


# Tokenization & Padding of Question Pairs

In [72]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Step 1: Tokenizer fit on combined text of both question1 and question2
tokenizer.fit_on_texts(pd.concat([dataset['question1'], dataset['question2']]))

# Convert question1 and question2 into sequences of integers
q1_sequences = tokenizer.texts_to_sequences(dataset['question1'])
q2_sequences = tokenizer.texts_to_sequences(dataset['question2'])

# Step 2: Find maximum sequence length across both Q1 and Q2
max_len = max(max(map(len, q1_sequences)), max(map(len, q2_sequences)))

# Step 3: Pad all sequences to the same length (post-padding with zeros)
q1_padded = pad_sequences(q1_sequences, maxlen=max_len, padding='post')
q2_padded = pad_sequences(q2_sequences, maxlen=max_len, padding='post')

# Shapes after padding
print("Q1 shape:", q1_padded.shape)
print("Q2 shape:", q2_padded.shape)


Q1 shape: (400000, 253)
Q2 shape: (400000, 253)


In [76]:
# Combine question1 and question2 padded sequences
X = np.concatenate([q1_padded, q2_padded], axis=1)

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, dataset['is_duplicate'], test_size=0.1, random_state=42)

# Convert to numpy with correct dtypes
X_train = np.array(X_train).astype(np.int32)
X_test = np.array(X_test).astype(np.int32)
y_train = np.array(y_train).astype(np.float32)
y_test = np.array(y_test).astype(np.float32)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


X_train shape: (360000, 506)
y_train shape: (360000,)


# Training Word2Vec Embeddings for Quora Question Pairs

In [77]:
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import multiprocessing

# Training Word2Vec Embeddings for Quora Question Pairs

# Prepare corpus by combining 'question1' and 'question2' columns into a single list
# This ensures that the Word2Vec model learns word relationships across both fields
questions = pd.concat([dataset['question1'].astype(str), dataset['question2'].astype(str)], axis=0).tolist()

# Tokenize each question into a list of words (lowercased + punctuation removed)
# Example: "How are you?" -> ["how", "are", "you"]
tokenized_sentences = [text_to_word_sequence(q) for q in questions]

# Initialize and train Word2Vec model
# - vector_size: dimensionality of embeddings (common choices: 100 or 300)
# - window: context window size (number of words before/after target word)
# - min_count: ignore words with frequency < 2 (reduces noise)
# - workers: number of CPU cores used for training (parallelization)
# - sg: training algorithm (1 = Skip-gram, 0 = CBOW)
# - epochs: number of training iterations over the data
embedding_dim = 300
w2v_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=embedding_dim,
    window=5,
    min_count=2,
    workers=multiprocessing.cpu_count(),
    sg=1,   # Skip-gram model is better for capturing semantic meaning
    epochs=5
)

# Print confirmation and size of the learned vocabulary
print("Word2Vec training done. Vocabulary size:", len(w2v_model.wv))


Word2Vec training done. Vocabulary size: 52405


# Creating Embedding Matrix from Word2Vec for LSTM/ML Models

In [78]:
import numpy as np

# Total vocabulary size (all unique tokens from tokenizer)
# +1 is added for padding token (index 0, usually reserved)
vocab_size = len(tokenizer.word_index) + 1  

# Initialize embedding matrix with zeros
# Shape: (vocab_size, embedding_dim)
# Each row represents the embedding vector for one word
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Track how many words are Out-Of-Vocabulary (OOV) i.e., not found in trained Word2Vec
oov_count = 0

# Fill embedding matrix
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        # If the word exists in Word2Vec vocabulary, use its trained embedding
        embedding_matrix[i] = w2v_model.wv[word]
    else:
        # If not found (OOV), initialize a random vector
        # Random initialization helps model learn embeddings during training
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim, ))  
        oov_count += 1  # count OOV words

# Display embedding matrix details
print("Embedding matrix shape:", embedding_matrix.shape)   # Should be (vocab_size, embedding_dim)
print("Out of vocab words:", oov_count)


Embedding matrix shape: (86383, 300)
Out of vocab words: 33977


# BiLSTM Model for Quora Duplicate Question Pairs

In [80]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# -------------------------------
# BiLSTM Model for Quora Duplicate Question Pairs
# -------------------------------

# max_len = length of padded sequences (combined q1 + q2 tokens)
# This defines the fixed input size for the model
max_len = X_train.shape[1]  

# Initialize Sequential model
model = Sequential()

# Embedding layer (using pre-trained Word2Vec embeddings)
model.add(Embedding(
    input_dim=vocab_size,          # total vocabulary size
    output_dim=embedding_dim,      # embedding dimension (e.g., 300)
    weights=[embedding_matrix],    # pre-trained embedding matrix
    input_length=max_len,          # input length = max padded sequence length
    trainable=True                 # embeddings will be fine-tuned during training
))

# First Bidirectional LSTM layer
# return_sequences=True → passes full sequence to next LSTM
model.add(Bidirectional(LSTM(256, return_sequences=True)))

# Dropout to prevent overfitting
model.add(Dropout(0.3))

# Second Bidirectional LSTM layer
# No return_sequences → outputs final hidden state only
model.add(Bidirectional(LSTM(256)))

# Dense layer with ReLU activation (hidden layer for learning complex patterns)
model.add(Dense(256, activation='relu'))

# Dropout again for regularization
model.add(Dropout(0.4))

# Output layer → 1 neuron (binary classification: duplicate or not)
# Sigmoid activation gives probability between 0 and 1
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(
    loss='binary_crossentropy',    # suitable for binary classification
    optimizer='adam',              # adaptive optimizer
    metrics=['accuracy']           # track accuracy
)

# Print model summary to check architecture
model.summary()


# Training BiLSTM with EarlyStopping & ModelCheckpoint

In [82]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# EarlyStopping → stop training when validation loss stops improving
es = EarlyStopping(
    monitor='val_loss',        # monitor validation loss
    patience=3,                # stop if no improvement for 3 epochs
    restore_best_weights=True  # restore weights from the best epoch
)

# ModelCheckpoint → save best model based on validation accuracy
mc = ModelCheckpoint(
    'best_model.h5',           # filename to save best model
    monitor='val_accuracy',    # monitor validation accuracy
    save_best_only=True,       # save only the best model
    mode='max'                 # maximize validation accuracy
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=10,                 # number of training epochs
    batch_size=512,            # number of samples per batch
    validation_split=0.1,      # 10% of training data used for validation
    callbacks=[es, mc],        # use EarlyStopping & ModelCheckpoint
    verbose=1                  # 1 = progress bar with details
)


Epoch 1/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1041s[0m 2s/step - accuracy: 0.7159 - loss: 0.5558 - val_accuracy: 0.7515 - val_loss: 0.5031
Epoch 2/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1028s[0m 2s/step - accuracy: 0.7671 - loss: 0.4777 - val_accuracy: 0.7566 - val_loss: 0.4973
Epoch 3/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1027s[0m 2s/step - accuracy: 0.7968 - loss: 0.4198 - val_accuracy: 0.7632 - val_loss: 0.5110
Epoch 4/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1028s[0m 2s/step - accuracy: 0.8239 - loss: 0.3682 - val_accuracy: 0.7674 - val_loss: 0.5520
Epoch 5/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1025s[0m 2s/step - accuracy: 0.8451 - loss: 0.3271 - val_accuracy: 0.7586 - val_loss: 0.6299


In [83]:
# Load best saved model
from tensorflow.keras.models import load_model
best_model = load_model('best_model.h5')

# Evaluate on test set
loss, acc = best_model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {acc*100:.2f}%")


[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 59ms/step - accuracy: 0.7726 - loss: 0.5419
Test Accuracy: 77.05%
