In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
train_file_path = '/kaggle/input/quora-question-pairs/train.csv.zip'
test_file_path = '/kaggle/input/quora-question-pairs/test.csv'

In [4]:
train_df = pd.read_csv(train_file_path, compression='zip')
test_df = pd.read_csv(test_file_path)

In [5]:
df_train, df_val = train_test_split(train_df, test_size=0.1, random_state=42)

In [10]:
class CosineSimilarity(torch.nn.Module):
    def __init__(self, model_name):
        super(CosineSimilarity, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        outputs1 = self.model(input_ids1, attention_mask=attention_mask1)
        outputs2 = self.model(input_ids2, attention_mask=attention_mask2)
        embeddings1 = outputs1.last_hidden_state[:, 0, :]
        embeddings2 = outputs2.last_hidden_state[:, 0, :]
        similarity = F.cosine_similarity(embeddings1, embeddings2)
        return similarity

In [11]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    question1_list = df['question1'].astype(str).tolist()
    question2_list = df['question2'].astype(str).tolist()
    
    # Tokenize the inputs
    encodings = tokenizer(
        text=question1_list,
        text_pair=question2_list,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    
    input_ids1 = encodings['input_ids']
    attention_mask1 = encodings['attention_mask']
    input_ids2 = encodings['token_type_ids']  # Use token_type_ids as input_ids2
    attention_mask2 = encodings['attention_mask']
    
    labels = torch.tensor(df['is_duplicate'].values)
    
    dataset = TensorDataset(input_ids1, attention_mask1, input_ids2, attention_mask2, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [12]:
model_name = 'bert-base-uncased'
bi_encoder_cosine = CosineSimilarity(model_name)
tokenizer = bi_encoder_cosine.tokenizer
max_len = 128
batch_size = 8

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [13]:
train_loader = create_data_loader(df_train, tokenizer, max_len, batch_size)
val_loader = create_data_loader(df_val, tokenizer, max_len, batch_size)

In [14]:
optimizer = torch.optim.Adam(bi_encoder_cosine.parameters(), lr=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bi_encoder_cosine.to(device)

CosineSimilarity(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [16]:
for epoch in range(1):  # Example: 3 epochs
    bi_encoder_cosine.train()
    running_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
        input_ids1, attention_mask1, input_ids2, attention_mask2, labels = batch
        input_ids1, attention_mask1 = input_ids1.to(device), attention_mask1.to(device)
        input_ids2, attention_mask2 = input_ids2.to(device), attention_mask2.to(device)
        labels = labels.to(device).float()

        optimizer.zero_grad()
        outputs = bi_encoder_cosine(input_ids1, attention_mask1, input_ids2, attention_mask2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {running_loss / len(train_loader)}")

Epoch 1: 100%|██████████| 45483/45483 [4:29:33<00:00,  2.81it/s]  

Epoch 1 Loss: 0.457991462282048





In [1]:
bi_encoder_cosine.eval()
val_loss = 0.0
with torch.no_grad():
    for batch in tqdm(val_loader, desc='Validation'):
        input_ids1, attention_mask1, input_ids2, attention_mask2, labels = batch
        input_ids1, attention_mask1 = input_ids1.to(device), attention_mask1.to(device)
        input_ids2, attention_mask2 = input_ids2.to(device), attention_mask2.to(device)
        labels = labels.to(device).float()

        outputs = bi_encoder_cosine(input_ids1, attention_mask1, input_ids2, attention_mask2)
        loss = criterion(outputs, labels)
        val_loss += loss.item()

NameError: name 'bi_encoder_cosine' is not defined

In [None]:
 print(f"Validation Loss: {val_loss / len(val_loader)}")