# Import Required Libraries

In [None]:
# Basic imports
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# TensorFlow / Keras imports
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization, Lambda, concatenate, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Record the TensorFlow version this notebook was executed with.
tf_version = tf.__version__
print("TensorFlow version:", tf_version)


TensorFlow version: 2.19.0
TensorFlow version: 2.19.0


In [None]:
# Load dataset
# Built as a single chain so `df` is always a fresh frame derived from the CSV:
#   1. keep only the two question columns and the label,
#   2. drop rows with a missing value in any of them,
#   3. cast the label to int.
# This avoids mutating a column-subset in place (`dropna(inplace=True)` and a
# column assignment on a selection), which can raise SettingWithCopyWarning
# and makes the cell non-idempotent.
df = (
    pd.read_csv("train.csv")[['question1', 'question2', 'is_duplicate']]
    .dropna()
    .astype({'is_duplicate': int})
)

# Quick sanity checks: size, class balance, and a few raw examples.
print("Dataset shape:", df.shape)
print(df['is_duplicate'].value_counts(normalize=True))
print(df.head(3).to_dict(orient='records'))



Dataset shape: (404287, 3)
is_duplicate
0    0.630799
1    0.369201
Name: proportion, dtype: float64
[{'question1': 'What is the step by step guide to invest in share market in india?', 'question2': 'What is the step by step guide to invest in share market?', 'is_duplicate': 0}, {'question1': 'What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'question2': 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?', 'is_duplicate': 0}, {'question1': 'How can I increase the speed of my internet connection while using a VPN?', 'question2': 'How can Internet speed be increased by hacking through DNS?', 'is_duplicate': 0}]


# Preprocessing

In [None]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Contraction -> expansion lookup, built once instead of on every call.
# Note the key "'cause" (with apostrophe): a bare "cause" key would rewrite the
# ordinary word "cause" to "because".
_CONTRACTIONS = {
    "ain't":"is not","aren't":"are not","can't":"cannot","can't've":"cannot have","'cause":"because",
    "could've":"could have","couldn't":"could not","didn't":"did not","doesn't":"does not","don't":"do not",
    "hadn't":"had not","hasn't":"has not","haven't":"have not","he'd":"he would","he'll":"he will",
    "he's":"he is","how'd":"how did","how'll":"how will","how's":"how is","i'd":"i would","i'll":"i will",
    "i'm":"i am","isn't":"is not","it'd":"it would","it'll":"it will","it's":"it is","let's":"let us",
    "ma'am":"madam","mightn't":"might not","mustn't":"must not","shan't":"shall not","she'd":"she would",
    "she'll":"she will","she's":"she is","should've":"should have","shouldn't":"should not","that's":"that is",
    "there's":"there is","they'd":"they would","they'll":"they will","they're":"they are","they've":"they have",
    "wasn't":"was not","we'd":"we would","we're":"we are","we've":"we have","weren't":"were not","what'll":"what will",
    "what're":"what are","what's":"what is","what've":"what have","where's":"where is","who's":"who is",
    "won't":"will not","would've":"would have","wouldn't":"would not","you'd":"you would","you'll":"you will","you're":"you are"
}

# Digit runs ending in 9 / 6 / 3 zeros -> b / m / k. The negative lookahead
# stops a match from ending in the middle of a longer number, so "20001" is
# left alone instead of becoming "2k1".
_NUMBER_ABBREVIATIONS = (
    (re.compile(r'([0-9]+)000000000(?![0-9])'), r'\1b'),
    (re.compile(r'([0-9]+)000000(?![0-9])'), r'\1m'),
    (re.compile(r'([0-9]+)000(?![0-9])'), r'\1k'),
)

# Anything that is not a lowercase ASCII letter, a digit or a space.
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z ]')
_MULTI_SPACE_RE = re.compile(' +')


def preprocess(q):
    """Normalise one question string for tokenization.

    Steps, in order:
      1. cast to ``str``, lowercase, strip surrounding whitespace;
      2. map typographic apostrophes to ``'`` so contractions can be matched;
      3. spell out ``% $ @ ₹ €`` and drop the literal ``[math]`` token;
      4. abbreviate round numbers (``1,000`` / ``1000`` -> ``1k``, millions
         -> ``m``, billions -> ``b``);
      5. expand whitespace-delimited tokens found in the contraction table;
      6. strip HTML markup with BeautifulSoup;
      7. replace every character outside ``[0-9a-z ]`` with a space and
         collapse runs of spaces.

    Parameters
    ----------
    q : any
        The raw question; it is converted with ``str(q)`` first.

    Returns
    -------
    str
        The cleaned text, containing only ``a-z``, ``0-9`` and single spaces.
    """
    q = str(q).lower().strip()

    # Curly quotes would otherwise bypass the contraction lookup ("what’s").
    q = q.replace('’', "'").replace('‘', "'")

    # Replace special characters
    q = q.replace('%','percent').replace('$','dollar').replace('@','at').replace('₹','rupee').replace('€','euro')

    # Remove math token
    q = q.replace('[math]','')

    # Numbers to k/m/b (comma-grouped first, then plain digit runs)
    q = q.replace(',000,000,000','b').replace(',000,000','m').replace(',000','k')
    for pattern, replacement in _NUMBER_ABBREVIATIONS:
        q = pattern.sub(replacement, q)

    # Contractions: only exact whitespace-delimited tokens are expanded.
    q = " ".join(_CONTRACTIONS.get(word, word) for word in q.split())

    # Remove HTML tags
    q = BeautifulSoup(q, "html.parser").get_text()

    # Keep only a-z, digits and spaces. This single pass also covers
    # punctuation, tabs and newlines, so no separate \W / whitespace pass is
    # needed; every removed character becomes one space, as before.
    q = _BAD_SYMBOLS_RE.sub(' ', q)

    # collapse multiple spaces
    q = _MULTI_SPACE_RE.sub(' ', q).strip()

    return q




In [None]:
# Apply preprocessing
# Clean both question columns with the same normalisation function.
for column in ('question1', 'question2'):
    df[column] = df[column].apply(preprocess)

In [None]:
# Display the first three rows of df to eyeball the cleaned question text.
df.head(3)

Unnamed: 0,question1,question2,is_duplicate
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,what is the story of kohinoor koh i noor diamond,what would happen if the indian government sto...,0
2,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0


# Tokenization

In [None]:
# Tokenization
import pickle

# Upper bound on the number of distinct words the tokenizer will emit ids for.
MAX_NB_WORDS = 200000

# One shared vocabulary is fitted over both question columns; words outside
# the kept vocabulary map to the '__OOV__' token.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token='__OOV__')
all_questions = pd.concat([df['question1'], df['question2']])
tokenizer.fit_on_texts(all_questions)

# --- Save tokenizer for later use ---
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Encode each question column as lists of integer word ids.
q1_sequences, q2_sequences = (
    tokenizer.texts_to_sequences(df[column])
    for column in ('question1', 'question2')
)

In [None]:
# Token count of every encoded question (all of q1 followed by all of q2)
lengths = [
    len(seq)
    for sequences in (q1_sequences, q2_sequences)
    for seq in sequences
]

import numpy as np

# 95th percentile of the length distribution
p95_length = np.percentile(lengths, 95)
print("95th percentile:", p95_length)

# Longest encoded question
longest = max(lengths)
print("Max length:", longest)


95th percentile: 23.0
Max length: 247


In [None]:
# Pad / truncate every encoded question to a fixed length
import gc

# Fixed sequence length (hard-coded, not computed). The previous cell reports
# a 95th-percentile length of 23 tokens, so 30 covers the vast majority of
# questions; longer ones are cut at the end.
max_len = 30

# Post-padding and post-truncation: tokens are kept from the start of the
# question and zeros are appended up to max_len.
q1_padded = pad_sequences(q1_sequences, maxlen=max_len, padding='post', truncating='post')
q2_padded = pad_sequences(q2_sequences, maxlen=max_len, padding='post', truncating='post')

# Labels
labels = df['is_duplicate'].values

print("Q1 shape:", q1_padded.shape)
print("Q2 shape:", q2_padded.shape)
print("Labels shape:", labels.shape)

# Free memory: remove the names themselves so the raw token lists can be
# reclaimed. (Putting them in a temporary list and rebinding that list to None
# leaves q1_sequences / q2_sequences alive and frees nothing.)
# Re-running this cell therefore requires re-running the tokenization cell.
del q1_sequences, q2_sequences
gc.collect()


Q1 shape: (404287, 30)
Q2 shape: (404287, 30)
Labels shape: (404287,)


0

In [None]:
# ===============================
# Hyperparameters & Seed
# ===============================
# Seed Python, NumPy and TensorFlow RNGs so runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

LSTM_UNITS = 256        # units per LSTM direction
DROPOUT_RATE = 0.2      # used for LSTM dropout, recurrent dropout and Dense dropout
EMB_TRAINABLE = False   # keep the pre-trained embedding weights frozen
BATCH_SIZE = 512
EPOCHS = 20
EMBEDDING_DIM = 100     # must match the GloVe file that is loaded (100d)

# Rows in the embedding matrix: every tokenizer index (plus row 0 for padding),
# capped by the same MAX_NB_WORDS limit the tokenizer was created with, so the
# two limits cannot drift apart.
vocab_size = min(MAX_NB_WORDS, len(tokenizer.word_index)+1)


# ===============================
# Load GloVe embeddings & build embedding matrix
# ===============================
import os
GLOVE_DIR = "./glove"  # adjust path if needed
os.makedirs(GLOVE_DIR, exist_ok=True)

glove_path = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
if not os.path.exists(glove_path):
    print('Downloading GloVe embeddings...')
    !wget -q http://nlp.stanford.edu/data/glove.6B.zip -P {GLOVE_DIR}
    !unzip -o {os.path.join(GLOVE_DIR,"glove.6B.zip")} -d {GLOVE_DIR}
else:
    print('GloVe already present')

# Build embedding index
emb_index = {}
with open(glove_path, 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        emb_index[word] = coefs

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    vec = emb_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec

emb_index = None  # free memory
print('Embedding matrix shape:', embedding_matrix.shape)

# ===============================
# Train/validation split
# ===============================
from sklearn.model_selection import train_test_split

# Hold out 10% of the pairs for validation; stratifying on the label keeps the
# class ratio the same in both parts, and the fixed seed makes the split
# repeatable.
split_parts = train_test_split(
    q1_padded,
    q2_padded,
    labels,
    test_size=0.1,
    random_state=SEED,
    stratify=labels,
)
X_q1_train, X_q1_val, X_q2_train, X_q2_val, y_train, y_val = split_parts

n_train, n_val = len(y_train), len(y_val)
print('Train size:', n_train, 'Val size:', n_val)



Downloading GloVe embeddings...
Archive:  ./glove/glove.6B.zip
  inflating: ./glove/glove.6B.50d.txt  
  inflating: ./glove/glove.6B.100d.txt  
  inflating: ./glove/glove.6B.200d.txt  
  inflating: ./glove/glove.6B.300d.txt  
Embedding matrix shape: (85876, 100)
Train size: 363858 Val size: 40429


# Model architecture with GloVe embeddings

In [None]:
# Model architecture with GloVe embeddings
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization, Lambda, concatenate,Subtract, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

# Two integer-id inputs, one per question, each of length max_len.
q1_in = Input(shape=(max_len,))
q2_in = Input(shape=(max_len,))

# Embedding layer (shared), initialised from the GloVe matrix.
# The sequence length is already fixed by the Input layers, so the deprecated
# `input_length` argument is not passed.
# NOTE(review): mask_zero is left at its default, so padded positions are fed
# to the LSTM like real tokens — confirm this is intended.
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=EMB_TRAINABLE,
                            name='embedding')

# Shared encoder (BiLSTM): the same weights encode both questions.
# NOTE(review): a non-zero recurrent_dropout is documented to rule out the
# fused cuDNN LSTM kernel — verify if training speed matters.
shared_lstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=False, dropout=DROPOUT_RATE, recurrent_dropout=DROPOUT_RATE))


# Encode both inputs
q1_emb = embedding_layer(q1_in)
q2_emb = embedding_layer(q2_in)

q1_vec = shared_lstm(q1_emb)
q2_vec = shared_lstm(q2_emb)

# Combine features: absolute difference, element-wise multiply, and concat original vectors
# Registering the custom layer lets the saved model be reloaded with
# load_model() without having to pass custom_objects.
@tf.keras.utils.register_keras_serializable(package='siamese')
class AbsLayer(tf.keras.layers.Layer):
    """Element-wise absolute value of the input tensor."""

    def call(self, inputs):
        return tf.abs(inputs)

diff = Subtract()([q1_vec, q2_vec])
abs_diff = AbsLayer()(diff)


mul = Multiply()([q1_vec,q2_vec])
merged = concatenate([q1_vec,q2_vec,abs_diff,mul])

# Dense layers for classification
x = BatchNormalization()(merged)
x = Dense(256, activation='relu')(x)
x = Dropout(DROPOUT_RATE)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(DROPOUT_RATE)(x)
# Single sigmoid unit: probability that the pair is a duplicate.
out = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[q1_in, q2_in], outputs=out)
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(1e-3), metrics=['accuracy'])
model.summary()

# ===============================
# Callbacks & Class Weights
# ===============================
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.utils import class_weight

checkpoint_path = 'siamese_bilstm_best.h5'

# Write the model to disk whenever validation loss reaches a new minimum.
best_model_saver = ModelCheckpoint(
    checkpoint_path, monitor='val_loss', save_best_only=True, verbose=1, mode='min'
)
# Stop after 5 epochs without improvement and roll back to the best weights.
early_stopper = EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, mode='min'
)
# Halve the learning rate after 4 stagnant epochs, never going below 1e-6.
lr_reducer = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=4, min_lr=1e-6, verbose=1
)
callbacks = [best_model_saver, early_stopper, lr_reducer]

# 'balanced' weights computed from the training labels, keyed by position in
# the sorted unique-label array.
balanced_weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_train), y=y_train
)
class_weights = dict(enumerate(balanced_weights))


# Model Training

In [None]:
# Train model
# Both questions of a pair are fed as a two-element input list.
train_inputs = [X_q1_train, X_q2_train]
val_inputs = [X_q1_val, X_q2_val]

history = model.fit(
    train_inputs,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(val_inputs, y_val),
    class_weight=class_weights,
    callbacks=callbacks,
    verbose=1,
)

# ===============================
# Save final model
# ===============================
# Saved after fit() returns, i.e. with whatever weights the callbacks left in
# place.
model.save('siamese_bilstm_final.keras')
print("Model saved successfully!")

Epoch 1/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325ms/step - accuracy: 0.6668 - loss: 0.5913
Epoch 1: val_loss improved from inf to 0.62532, saving model to siamese_bilstm_best.h5




[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 335ms/step - accuracy: 0.6669 - loss: 0.5912 - val_accuracy: 0.6515 - val_loss: 0.6253 - learning_rate: 0.0010
Epoch 2/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 322ms/step - accuracy: 0.7571 - loss: 0.4723
Epoch 2: val_loss improved from 0.62532 to 0.53487, saving model to siamese_bilstm_best.h5




[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 331ms/step - accuracy: 0.7571 - loss: 0.4723 - val_accuracy: 0.7215 - val_loss: 0.5349 - learning_rate: 0.0010
Epoch 3/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 324ms/step - accuracy: 0.7780 - loss: 0.4389
Epoch 3: val_loss did not improve from 0.53487
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 332ms/step - accuracy: 0.7780 - loss: 0.4389 - val_accuracy: 0.7325 - val_loss: 0.5539 - learning_rate: 0.0010
Epoch 4/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325ms/step - accuracy: 0.7919 - loss: 0.4154
Epoch 4: val_loss did not improve from 0.53487
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 331ms/step - accuracy: 0.7919 - loss: 0.4154 - val_accuracy: 0.7371 - val_loss: 0.5435 - learning_rate: 0.0010
Epoch 5/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━



[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 335ms/step - accuracy: 0.8029 - loss: 0.3975 - val_accuracy: 0.7406 - val_loss: 0.5340 - learning_rate: 0.0010
Epoch 6/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 326ms/step - accuracy: 0.8115 - loss: 0.3824
Epoch 6: val_loss improved from 0.53403 to 0.48469, saving model to siamese_bilstm_best.h5




[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 334ms/step - accuracy: 0.8115 - loss: 0.3824 - val_accuracy: 0.7679 - val_loss: 0.4847 - learning_rate: 0.0010
Epoch 7/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 324ms/step - accuracy: 0.8188 - loss: 0.3693
Epoch 7: val_loss did not improve from 0.48469
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 332ms/step - accuracy: 0.8188 - loss: 0.3693 - val_accuracy: 0.7648 - val_loss: 0.4930 - learning_rate: 0.0010
Epoch 8/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 322ms/step - accuracy: 0.8249 - loss: 0.3573
Epoch 8: val_loss did not improve from 0.48469
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 330ms/step - accuracy: 0.8249 - loss: 0.3573 - val_accuracy: 0.7700 - val_loss: 0.4955 - learning_rate: 0.0010
Epoch 9/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━



[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 345ms/step - accuracy: 0.8360 - loss: 0.3387 - val_accuracy: 0.7740 - val_loss: 0.4833 - learning_rate: 0.0010
Epoch 11/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325ms/step - accuracy: 0.8412 - loss: 0.3278
Epoch 11: val_loss improved from 0.48327 to 0.46911, saving model to siamese_bilstm_best.h5




[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 333ms/step - accuracy: 0.8412 - loss: 0.3278 - val_accuracy: 0.7826 - val_loss: 0.4691 - learning_rate: 0.0010
Epoch 12/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328ms/step - accuracy: 0.8456 - loss: 0.3206
Epoch 12: val_loss improved from 0.46911 to 0.44948, saving model to siamese_bilstm_best.h5




[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 337ms/step - accuracy: 0.8456 - loss: 0.3206 - val_accuracy: 0.7872 - val_loss: 0.4495 - learning_rate: 0.0010
Epoch 13/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 330ms/step - accuracy: 0.8492 - loss: 0.3132
Epoch 13: val_loss did not improve from 0.44948
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 337ms/step - accuracy: 0.8492 - loss: 0.3132 - val_accuracy: 0.7915 - val_loss: 0.4685 - learning_rate: 0.0010
Epoch 14/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step - accuracy: 0.8524 - loss: 0.3075
Epoch 14: val_loss improved from 0.44948 to 0.42874, saving model to siamese_bilstm_best.h5




[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 334ms/step - accuracy: 0.8524 - loss: 0.3075 - val_accuracy: 0.8035 - val_loss: 0.4287 - learning_rate: 0.0010
Epoch 15/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328ms/step - accuracy: 0.8572 - loss: 0.2984
Epoch 15: val_loss did not improve from 0.42874
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 337ms/step - accuracy: 0.8572 - loss: 0.2984 - val_accuracy: 0.7983 - val_loss: 0.4476 - learning_rate: 0.0010
Epoch 16/20
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step - accuracy: 0.8599 - loss: 0.2928
Epoch 16: val_loss did not improve from 0.42874
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 342ms/step - accuracy: 0.8599 - loss: 0.2928 - val_accuracy: 0.7935 - val_loss: 0.4602 - learning_rate: 0.0010
Epoch 17/20
[1m711/711[0m [32m━━━━━━━━━━━

# Evaluation

In [None]:
# Evaluation on validation set
val_preds = model.predict([X_q1_val, X_q2_val], batch_size=1024)
# Probabilities of at least 0.5 are labelled as duplicates.
val_preds_label = (val_preds.flatten() >= 0.5).astype(int)

# Headline metrics, printed in a fixed order.
headline_metrics = (
    ('Accuracy:', accuracy_score),
    ('F1:', f1_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
)
for caption, metric_fn in headline_metrics:
    print(caption, metric_fn(y_val, val_preds_label))

# Per-class breakdown.
print('\nClassification report:\n', classification_report(y_val, val_preds_label))


[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 105ms/step
Accuracy: 0.8035321180340844
F1: 0.7789989148882891
Precision: 0.666143231025458
Recall: 0.9378936084684443

Classification report:
               precision    recall  f1-score   support

           0       0.95      0.72      0.82     25503
           1       0.67      0.94      0.78     14926

    accuracy                           0.80     40429
   macro avg       0.81      0.83      0.80     40429
weighted avg       0.85      0.80      0.81     40429



# Duplicate Question Pair Prediction

In [None]:

def preprocess_single(q):
    """Turn one raw question into a padded id array ready for the model.

    The text is cleaned with ``preprocess``, encoded with the fitted
    ``tokenizer`` and padded/truncated exactly like the training data:
    length ``max_len``, ``padding='post'`` and ``truncating='post'``.
    Without the explicit ``truncating`` argument ``pad_sequences`` cuts long
    inputs at the front, so questions longer than ``max_len`` tokens would be
    encoded differently at inference time than during training.

    Parameters
    ----------
    q : str
        The raw question text.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, max_len)`` holding the token ids.
    """
    q = preprocess(q)
    seq = tokenizer.texts_to_sequences([q])
    pad = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    return pad

def predict_pair(q1,q2,thresh=0.5):
    """Score a pair of raw questions with the trained model.

    Parameters
    ----------
    q1, q2 : str
        The two questions to compare.
    thresh : float, default 0.5
        Probabilities greater than or equal to this value are reported as
        duplicates.

    Returns
    -------
    dict
        ``{'probability': float, 'is_duplicate': 0 or 1}``.
    """
    s1 = preprocess_single(q1)
    s2 = preprocess_single(q2)
    # verbose=0: suppress the per-call Keras progress bar, which otherwise
    # prints one line for every single-pair prediction.
    p = model.predict([s1,s2], verbose=0)[0,0]
    return {'probability': float(p), 'is_duplicate': int(p>=thresh)}

# Test examples
# Hand-written (question, question) pairs used as a quick smoke test of the
# trained model.
examples = [
    ("What is the capital of France?", "Which city is the capital of France?"),
    ("How to lose weight fast?", "What is the best way to lose weight in 2 weeks?"),
    ("Who is the president of the USA?", "Name the current US president"),
    ("What is AI?", "Explain artificial intelligence"),
    ("How can I cook pasta?", "What are some tips to make spaghetti?"),
    ("Best programming language for beginners?", "Which language should a new programmer learn first?"),
    ("Where is Mount Everest located?", "Which country has Mount Everest?"),
    ("What is the time in London?", "Tell me the population of London"),
    ("Can I lose weight without exercise?", "Is it possible to reduce weight by only dieting?"),
    ("Who won the FIFA World Cup in 2018?","Which country was the winner of FIFA 2018?"),
    ("Who is the founder of Microsoft?","Who started Microsoft company?"),
    ("What is the idea behind democracy?","What is the core idea of democracy?"),
    ("What is the idea behind democracy?","What is the core idea of sociology?")
]

# Print each pair next to the model's verdict.
for first_question, second_question in examples:
    verdict = predict_pair(first_question, second_question)
    print(first_question, "||", second_question, "->", verdict)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
What is the capital of France? || Which city is the capital of France? -> {'probability': 0.9271326065063477, 'is_duplicate': 1}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 201ms/step
How to lose weight fast? || What is the best way to lose weight in 2 weeks? -> {'probability': 0.8783103823661804, 'is_duplicate': 1}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step
Who is the president of the USA? || Name the current US president -> {'probability': 0.9656570553779602, 'is_duplicate': 1}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 261ms/step
What is AI? || Explain artificial intelligence -> {'probability': 0.010477392934262753, 'is_duplicate': 0}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
How can I cook pasta? || What are some tips to make spaghetti? -> {'probability': 0.26347556710243225, 'is_duplicate': 0}
[1m1/1[0m [32m━━