In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm



In [3]:
df = pd.read_csv("questions.csv")
df=df[6::7]
df = df[df['is_duplicate'].apply(lambda x: str(x).strip().isdigit())]

# Reset index after filtering
df.reset_index(drop=True, inplace=True)

# Convert 'is_duplicate' column to integer
df['is_duplicate'] = df['is_duplicate'].astype(int)
df["id"]=df.index
df


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
1,1,27,28,What was your first sexual experience like?,What was your first sexual experience?,1
2,2,41,42,Why do rockets look white?,Why are rockets and boosters painted white?,1
3,3,55,56,Does society place too much importance on sports?,How do sports contribute to the society?,0
4,4,69,70,What is the best travel website in spain?,What is the best travel website?,0
...,...,...,...,...,...,...
57759,57759,789738,789739,What is the difference between a psychologist ...,What is the difference between psychologist an...,1
57760,57760,789752,789753,How can I gain weight on my body?,What should I eat to gain weight?,1
57761,57761,789766,789767,What are the best new car products or inventio...,What are some mind-blowing vehicles tools that...,1
57762,57762,789780,789781,What is the minimum time required to build a f...,What is a cheaper and quicker way to build an ...,0


In [20]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    
    if pd.isnull(text):
        return ''

    # Lowercasing
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    clean_text = ' '.join(tokens)

    return clean_text

dff1['question1'] = dff1['question1'].apply(preprocess)
dff1['question2'] = dff1['question2'].apply(preprocess)

dff1.head


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
pip install fuzzywuzzy



In [23]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
from nltk.tokenize import word_tokenize
from fuzzywuzzy import fuzz

def extract_basic_features(df):
    dff1['q1_len_char'] = dff1['question1'].apply(len)
    dff1['q2_len_char'] = dff1['question2'].apply(len)
    dff1['q1_len_word'] = dff1['question1'].apply(lambda x: len(word_tokenize(x)))
    dff1['q2_len_word'] = dff1['question2'].apply(lambda x: len(word_tokenize(x)))
    dff1['char_diff'] = abs(dff1['q1_len_char'] - df['q2_len_char'])
    dff1['word_diff'] = abs(dff1['q1_len_word'] - df['q2_len_word'])

    def common_words_count(row):
        q1_words = set(word_tokenize(row['question1']))
        q2_words = set(word_tokenize(row['question2']))
        return len(q1_words.intersection(q2_words))

    dff1['common_words'] = dff1.apply(common_words_count, axis=1)

    dff1['fuzz_ratio'] = dff1.apply(lambda x: fuzz.ratio(x['question1'], x['question2']), axis=1)
    return dff1

dff1= extract_basic_features(dff1)

dff1.head()


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 11,Unnamed: 12,q1_len_char,q2_len_char,q1_len_word,q2_len_word,char_diff,word_diff,common_words,fuzz_ratio
0,0,13,14,buy tiago,keep childern active far phone video game,0,,,,,...,,,9,41,2,7,32,5,0,20
1,1,27,28,first sexual experience like,first sexual experience,1,,,,,...,,,28,23,4,3,5,1,3,90
2,2,41,42,rocket look white,rocket booster painted white,1,,,,,...,,,17,28,3,4,11,1,2,67
3,3,55,56,society place much importance sport,sport contribute society,0,,,,,...,,,35,24,5,3,11,2,2,24
4,4,69,70,best travel website spain,best travel website,0,,,,,...,,,25,19,4,3,6,1,3,86


In [13]:
pip install gensim




In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Combine questions for TF-IDF vectorization
combined_questions = dff1['question1'] + ' ' + dff1['question2']

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_questions)

# Compute Cosine Similarity between question pairs
cosine_sim = []
for i in range(len(dff1)):
    q1_tfidf = tfidf_matrix[i]  
    q2_tfidf = tfidf_matrix[i] 
    similarity = cosine_similarity(q1_tfidf, q2_tfidf)[0][0]
    cosine_sim.append(similarity)


dff1['cosine_similarity'] = cosine_sim
dff1


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 12,q1_len_char,q2_len_char,q1_len_word,q2_len_word,char_diff,word_diff,common_words,fuzz_ratio,cosine_similarity
0,0,13,14,buy tiago,keep childern active far phone video game,0,,,,,...,,9,41,2,7,32,5,0,20,1.0
1,1,27,28,first sexual experience like,first sexual experience,1,,,,,...,,28,23,4,3,5,1,3,90,1.0
2,2,41,42,rocket look white,rocket booster painted white,1,,,,,...,,17,28,3,4,11,1,2,67,1.0
3,3,55,56,society place much importance sport,sport contribute society,0,,,,,...,,35,24,5,3,11,2,2,24,1.0
4,4,69,70,best travel website spain,best travel website,0,,,,,...,,25,19,4,3,6,1,3,86,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57715,57715,789738,789739,difference psychologist psychiatrist,difference psychologist psychatist,1,,,,,...,,36,34,3,3,2,0,2,97,1.0
57716,57716,789752,789753,gain weight body,eat gain weight,1,,,,,...,,16,15,3,3,1,0,2,71,1.0
57717,57717,789766,789767,best new car product invention people know,mind blowing vehicle tool exist people know,1,,,,,...,,42,43,7,7,1,0,2,38,1.0
57718,57718,789780,789781,minimum time required build flyover km,cheaper quicker way build underpass flyover,0,,,,,...,,38,43,6,6,5,0,2,47,1.0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
combined_questions = dff1['question1'] + ' ' + dff1['question2']
count_vectorizer = CountVectorizer(tokenizer=lambda x: x.split())
count_matrix = count_vectorizer.fit_transform(combined_questions)

q1_indices = range(len(dff1))
q2_indices = range(len(dff1))  

# Calculate Jaccard similarity for each pair of questions
jaccard_sim = []
for i, j in zip(q1_indices, q2_indices):
    q1_tokens = set(count_matrix[i].indices)
    q2_tokens = set(count_matrix[j].indices)

    if len(q1_tokens.union(q2_tokens)) == 0:
        similarity = 0
    else:
        similarity = len(q1_tokens.intersection(q2_tokens)) / len(q1_tokens.union(q2_tokens))
    jaccard_sim.append(similarity)


dff1['jaccard_similarity'] = jaccard_sim
dff1





Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,q1_len_char,q2_len_char,q1_len_word,q2_len_word,char_diff,word_diff,common_words,fuzz_ratio,cosine_similarity,jaccard_similarity
0,6,13,14,buy tiago,keep childern active far phone video game,0,,,,,...,9,41,2,7,32,5,0,20,1.0,1.0
1,13,27,28,first sexual experience like,first sexual experience,1,,,,,...,28,23,4,3,5,1,3,90,1.0,1.0
2,20,41,42,rocket look white,rocket booster painted white,1,,,,,...,17,28,3,4,11,1,2,67,1.0,1.0
3,27,55,56,society place much importance sport,sport contribute society,0,,,,,...,35,24,5,3,11,2,2,24,1.0,1.0
4,34,69,70,best travel website spain,best travel website,0,,,,,...,25,19,4,3,6,1,3,86,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57759,404319,789738,789739,difference psychologist psychiatrist,difference psychologist psychatist,1,,,,,...,36,34,3,3,2,0,2,97,1.0,1.0
57760,404326,789752,789753,gain weight body,eat gain weight,1,,,,,...,16,15,3,3,1,0,2,71,1.0,1.0
57761,404333,789766,789767,best new car product invention people know,mind blowing vehicle tool exist people know,1,,,,,...,42,43,7,7,1,0,2,38,1.0,1.0
57762,404340,789780,789781,minimum time required build flyover km,cheaper quicker way build underpass flyover,0,,,,,...,38,43,6,6,5,0,2,47,1.0,1.0


In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
texts = dff1['question1'] + ' [SEP] ' + dff1['question2']
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
input_ids = []
attention_masks = []

for text in texts:
    encoded_dict = tokenizer.encode_plus(
                        text,                      
                        add_special_tokens = True, 
                        max_length = 64,           
                        padding='max_length',
                        truncation=True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',     
                   )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(dff1['is_duplicate'].values)

train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels,
                                                            random_state=42, test_size=0.2)
train_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=42, test_size=0.2)


batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',  
    num_labels = 2,  
    output_attentions = False, 
    output_hidden_states = False,  
)

model.to(device)
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,  
                  eps = 1e-8  
                )

epochs = 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# BERT training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        optimizer.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs.loss
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_dataloader)

    # Validation loop
    model.eval()
    val_loss = 0
    val_preds = []
    val_true = []

    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs.logits


        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, model.num_labels), b_labels.view(-1))
        val_loss += loss.item()
        val_preds.extend(torch.argmax(logits, dim=1).flatten().tolist())
        val_true.extend(b_labels.flatten().tolist())

    avg_val_loss = val_loss / len(val_dataloader)

    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Training Loss: {avg_train_loss:.4f}')
    print(f'Validation Loss: {avg_val_loss:.4f}')
    print(f'Validation Accuracy: {accuracy_score(val_true, val_preds):.4f}')
    print(classification_report(val_true, val_preds))


Epoch 1/3: 100%|██████████| 1443/1443 [07:33<00:00,  3.18it/s]


Epoch 1/3
Training Loss: 0.4544
Validation Loss: 0.3793
Validation Accuracy: 0.8222
              precision    recall  f1-score   support

           0       0.87      0.84      0.86      7380
           1       0.74      0.78      0.76      4164

    accuracy                           0.82     11544
   macro avg       0.81      0.81      0.81     11544
weighted avg       0.83      0.82      0.82     11544



Epoch 2/3: 100%|██████████| 1443/1443 [07:32<00:00,  3.19it/s]


Epoch 2/3
Training Loss: 0.3361
Validation Loss: 0.3737
Validation Accuracy: 0.8295
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      7380
           1       0.76      0.77      0.77      4164

    accuracy                           0.83     11544
   macro avg       0.81      0.82      0.82     11544
weighted avg       0.83      0.83      0.83     11544



Epoch 3/3: 100%|██████████| 1443/1443 [07:33<00:00,  3.18it/s]


Epoch 3/3
Training Loss: 0.2366
Validation Loss: 0.4741
Validation Accuracy: 0.8047
              precision    recall  f1-score   support

           0       0.92      0.76      0.83      7380
           1       0.67      0.88      0.77      4164

    accuracy                           0.80     11544
   macro avg       0.80      0.82      0.80     11544
weighted avg       0.83      0.80      0.81     11544



In [47]:
# Save final results
results = {
    "training_loss": avg_train_loss,
    "validation_loss": avg_val_loss,


}
results

{'training_loss': 0.23664526821824716, 'validation_loss': 0.4741022728610567}