# In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from datasets import load_dataset
from tqdm import tqdm

# Load the Quora Question Pairs dataset.
# This runs at import time; `dataset` is read by prepare_data() and
# calculate_sample_similarity() below via dataset['train'].
dataset = load_dataset("quora")

# Load pre-trained BERT model and tokenizer.
# Both are module-level globals: encode_sentence() and prepare_data() read
# `tokenizer`, while encode_sentence() and fine_tune_model() read `model`.
# NOTE(review): fine_tune_model() moves and updates this same `model` object
# in place, so embeddings computed afterwards come from the tuned weights.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Function to encode a sentence into embeddings
def encode_sentence(sentence):
    """Encode text into an embedding by mean-pooling BERT's last hidden layer.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        sentence: A string (or a list of strings) to tokenize; inputs are
            truncated to 128 tokens.

    Returns:
        numpy.ndarray: The average of the last-layer token vectors, taken
        over real (non-padding) tokens only, with singleton dimensions
        squeezed away and the data moved to CPU.
    """
    # The model may have been moved to a GPU by fine_tune_model(); the inputs
    # have to live on the same device or the forward pass fails.
    device = next(model.parameters()).device
    encoded = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Embeddings must be deterministic, so switch dropout off for the forward
    # pass and restore whatever mode the caller had afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    hidden = outputs.last_hidden_state
    # Weight each position by its attention mask so padding (present when a
    # list of sentences is encoded) does not dilute the mean. For a single
    # sentence the mask is all ones and this equals a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = (hidden * mask).sum(dim=1) / token_counts

    # .cpu() is required before .numpy() when the model runs on CUDA.
    return pooled.squeeze().cpu().numpy()

# Compute similarity between two sentences
def calculate_similarity(sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two sentences.

    Each sentence is embedded with ``encode_sentence`` and the two vectors
    are compared with scikit-learn's ``cosine_similarity``.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The single similarity score taken from the 1x1 result matrix.
    """
    vectors = [encode_sentence(text) for text in (sentence1, sentence2)]
    # cosine_similarity expects 2-D inputs, hence the one-row wrappers.
    score_matrix = cosine_similarity([vectors[0]], [vectors[1]])
    return score_matrix[0][0]

# Define a custom Dataset class
class QuoraDataset(Dataset):
    """Torch dataset that tokenizes Quora question pairs on access.

    Args:
        data: Indexable collection of row mappings. Each row holds the label
            under ``'is_duplicate'`` and the two questions either as flat
            ``'question1'`` / ``'question2'`` keys or nested as
            ``row['questions']['text']`` (the layout the Hugging Face
            ``quora`` dataset is documented to use).
        tokenizer: Callable invoked as ``tokenizer(q1, q2, truncation=...,
            padding=..., max_length=..., return_tensors='pt')``.
        max_length: Fixed sequence length every example is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of question pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the pair at ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` tensors of length
            ``max_length``, a scalar long ``'label'`` tensor, and
            ``'token_type_ids'`` when the tokenizer produces them.
        """
        row = self.data[idx]
        if 'question1' in row:
            question1, question2 = row['question1'], row['question2']
        else:
            # Nested layout: both texts sit in one sequence.
            question1, question2 = row['questions']['text'][:2]
        label = row['is_duplicate']

        # Pad to a fixed length. With padding=True a single pair is padded to
        # its own length (i.e. not at all), and the default DataLoader
        # collation then cannot stack examples of different lengths.
        inputs = self.tokenizer(
            question1,
            question2,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # squeeze(0) drops only the batch dimension added by return_tensors.
        item = {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            # int() also normalises boolean labels.
            'label': torch.tensor(int(label), dtype=torch.long),
        }
        # Segment ids tell BERT which tokens belong to which question; expose
        # them when available so the training loop can forward them.
        if 'token_type_ids' in inputs:
            item['token_type_ids'] = inputs['token_type_ids'].squeeze(0)
        return item

# Training function
def train_model(model, train_loader, optimizer, criterion, device):
    """Run one training epoch and report mean loss and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``. Its
            output must expose ``logits`` or, failing that, ``pooler_output``.
        train_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors, plus optional
            ``'token_type_ids'``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(avg_loss, accuracy)``; ``(0.0, 0.0)`` if the loader yields
        no batches.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    for batch in tqdm(train_loader, desc="Training", ncols=100):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward segment ids only when the batch carries them, so models
        # whose forward() does not accept the argument keep working.
        extra_inputs = {}
        if 'token_type_ids' in batch:
            extra_inputs['token_type_ids'] = batch['token_type_ids'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, **extra_inputs)
        # Prefer real class logits when the model has a classification head.
        # NOTE(review): with a bare BertModel the fallback is the pooled
        # [CLS] vector (hidden-size wide), not per-class scores - confirm
        # that training on it is intended.
        logits = getattr(outputs, 'logits', None)
        if logits is None:
            logits = outputs.pooler_output

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        num_batches += 1

    # Guard against an empty loader instead of raising ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = correct / total if total else 0.0
    return avg_loss, accuracy

# Evaluation function
def evaluate_model(model, val_loader, criterion, device):
    """Evaluate the model on a loader without updating weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``. Its
            output must expose ``logits`` or, failing that, ``pooler_output``.
        val_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors, plus optional
            ``'token_type_ids'``.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(avg_loss, accuracy)``; ``(0.0, 0.0)`` if the loader yields
        no batches.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating", ncols=100):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward segment ids only when the batch carries them, so models
            # whose forward() does not accept the argument keep working.
            extra_inputs = {}
            if 'token_type_ids' in batch:
                extra_inputs['token_type_ids'] = batch['token_type_ids'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, **extra_inputs)
            # Prefer real class logits when the model has a classification
            # head. NOTE(review): with a bare BertModel the fallback is the
            # pooled [CLS] vector (hidden-size wide), not per-class scores.
            logits = getattr(outputs, 'logits', None)
            if logits is None:
                logits = outputs.pooler_output

            loss = criterion(logits, labels)
            total_loss += loss.item()
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            num_batches += 1

    # Guard against an empty loader instead of raising ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = correct / total if total else 0.0
    return avg_loss, accuracy

# Data preprocessing and preparation
def prepare_data():
    """Build the training and validation DataLoaders from the global dataset.

    Reads ``dataset['train']``, splits it 80/20 with a fixed seed, wraps each
    part in a ``QuoraDataset`` and returns batch-size-16 loaders (only the
    training loader is shuffled).

    Returns:
        tuple: ``(train_loader, val_loader)``.
    """
    frame = pd.DataFrame(dataset['train'])

    # The Hugging Face `quora` dataset is documented to keep both texts in a
    # nested `questions` column ({'id': [...], 'text': [q1, q2]}). Flatten it
    # into the `question1` / `question2` columns QuoraDataset looks up; a
    # frame that already has those columns is left untouched.
    if 'question1' not in frame.columns and 'questions' in frame.columns:
        pair_texts = frame['questions'].map(lambda pair: pair['text'])
        frame['question1'] = pair_texts.map(lambda texts: texts[0])
        frame['question2'] = pair_texts.map(lambda texts: texts[1])

    # Fixed random_state keeps the split reproducible across runs.
    train_data, val_data = train_test_split(frame, test_size=0.2, random_state=42)

    # QuoraDataset indexes rows by integer position, so hand it plain lists
    # of row dicts rather than DataFrames.
    train_dataset = QuoraDataset(train_data.to_dict('records'), tokenizer)
    val_dataset = QuoraDataset(val_data.to_dict('records'), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    return train_loader, val_loader

# Fine-tune BERT model
def fine_tune_model():
    """Fine-tune the module-level BERT model on the Quora pairs for 3 epochs.

    Moves the global ``model`` to GPU when available, trains it with AdamW
    (lr 2e-5) and cross-entropy loss on the loaders from ``prepare_data``,
    prints train/validation metrics after each epoch, and finally writes the
    weights to ``fine_tuned_bert_model.pth``.

    NOTE(review): the global ``model`` is a bare ``BertModel`` and the
    train/eval loops read its ``pooler_output`` as logits; there is no
    dedicated classification layer - confirm this is intended.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Use PyTorch's AdamW: the transformers implementation is deprecated in
    # favour of it. eps and weight_decay are set explicitly to the
    # transformers defaults so the optimisation settings stay the same.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
    criterion = torch.nn.CrossEntropyLoss()

    train_loader, val_loader = prepare_data()

    for epoch in range(3):
        print(f"Epoch {epoch+1}")
        train_loss, train_accuracy = train_model(model, train_loader, optimizer, criterion, device)
        val_loss, val_accuracy = evaluate_model(model, val_loader, criterion, device)

        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

    # Persist only the weights; loading requires rebuilding the same model.
    torch.save(model.state_dict(), "fine_tuned_bert_model.pth")
    print("Model saved.")

# Example to calculate similarity on a few question pairs
def calculate_sample_similarity():
    """Print the first training pair, its label and its cosine similarity.

    Reads row 0 of ``dataset['train']`` and scores the two questions with
    ``calculate_similarity``. Nothing is returned; results go to stdout.
    """
    # Fetch the row once instead of re-indexing the dataset per field.
    sample = dataset['train'][0]
    if 'question1' in sample:
        question1, question2 = sample['question1'], sample['question2']
    else:
        # The Hugging Face `quora` dataset is documented to nest both texts
        # under questions.text rather than exposing flat columns.
        question1, question2 = sample['questions']['text'][:2]
    # int() so a boolean label prints as the 1/0 the message promises.
    label = int(sample['is_duplicate'])

    print(f"Question 1: {question1}")
    print(f"Question 2: {question2}")
    print(f"Ground Truth Label (1=Duplicate, 0=Not): {label}")

    similarity = calculate_similarity(question1, question2)
    print(f"Cosine Similarity Score: {similarity:.4f}")

# Main function to train and evaluate the model
def main():
    """Script entry point: print the similarity of one sample question pair.

    Fine-tuning is disabled by default; enable the call below to train the
    model before the sample is scored.
    """
    # fine_tune_model()

    calculate_sample_similarity()

# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
