In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import f1_score

In [3]:
train_file_path = '/kaggle/input/quora-question-pairs/train.csv.zip'
test_file_path = '/kaggle/input/quora-question-pairs/test.csv'

In [4]:
train_df = pd.read_csv(train_file_path, compression='zip')
test_df = pd.read_csv(test_file_path)

In [5]:
df_train, df_val = train_test_split(train_df, test_size=0.1, random_state=42)

In [6]:
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    # Convert questions to strings
    question1_list = df['question1'].astype(str).tolist()
    question2_list = df['question2'].astype(str).tolist()

    # Tokenize both questions together (cross-encoder)
    encodings = tokenizer(
        question1_list,
        question2_list,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    # Labels
    labels = torch.tensor(df['is_duplicate'].values)

    # Create TensorDataset and DataLoader
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [8]:
max_len = 128
batch_size = 16

In [9]:
train_data_loader = create_data_loader(df_train, tokenizer, max_len, batch_size)
val_data_loader = create_data_loader(df_val, tokenizer, max_len, batch_size)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [11]:
def train_model(model, data_loader, optimizer, device):
    model.train()
    for batch in tqdm(data_loader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [12]:
def evaluate_model(model, data_loader, device):
    model.eval()
    total_loss, total_correct = 0, 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

            # Get predictions
            preds = torch.argmax(outputs.logits, dim=1)
            total_correct += torch.sum(preds == labels)

    return total_loss / len(data_loader), total_correct.double() / len(data_loader.dataset)

In [13]:
epochs = 1
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_model(model, train_data_loader, optimizer, device)
    val_loss, val_acc = evaluate_model(model, val_data_loader, device)
    print(f"Validation loss: {val_loss}, Validation accuracy: {val_acc}")

Epoch 1/1


100%|██████████| 22742/22742 [2:08:08<00:00,  2.96it/s]


Validation loss: 0.24651506325199782, Validation accuracy: 0.8962625837888644
