In [None]:
# ✅ BERT for Quora Question Pairs: Duplicate Detection + Gradio App (Lightweight Version)

# 1. Install Dependencies
!pip install transformers datasets gradio -q

# 2. Import Libraries
import os

import gradio as gr
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
os.environ["WANDB_DISABLED"] = "true"  # Disable Weights & Biases logging

# 3. Load Dataset (very small subset for fast testing)
# Rows with any missing value are dropped before sampling.
df = pd.read_csv("train.csv").dropna()
# Fixed random_state keeps the subset identical between runs; the size is capped
# at the number of available rows so a smaller CSV does not raise in .sample().
df = df.sample(n=min(1000, len(df)), random_state=42)  # Even smaller sample for speed

df = df.rename(columns={'question1': 'text1', 'question2': 'text2', 'is_duplicate': 'label'})
# preserve_index=False: after .sample() the frame carries a shuffled, non-default
# index, which from_pandas would otherwise store as an extra
# `__index_level_0__` column in the dataset.
dataset = Dataset.from_pandas(df[['text1', 'text2', 'label']], preserve_index=False)

# 4. Tokenization
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize(example):
    """Encode the `text1` / `text2` fields of `example` as a sentence-pair input.

    Inputs longer than 128 tokens are truncated and shorter ones are padded up
    to 128, so every encoded row has the same length. The tokenizer's result is
    returned unchanged.
    """
    first_questions = example['text1']
    second_questions = example['text2']
    return tokenizer(
        first_questions,
        second_questions,
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Tokenize in batches (tokenize receives a batch because batched=True).
tokenized_dataset = dataset.map(tokenize, batched=True)
# Fixed seed: without it the 80/20 split is reshuffled on every execution, so
# the evaluation set -- and therefore the reported metrics -- would not be
# reproducible.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# 5. Load Model
# Two output labels. Loading this checkpoint prints a warning that the
# classifier weights are newly initialized, so the model has to be trained
# before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 6. Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Schedule: one pass over the training split, batches of 8 for train and eval.
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimizer settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # `eval_strategy` is the spelling used by newer transformers versions
    # (formerly `evaluation_strategy`). No checkpoints are saved.
    eval_strategy="epoch",
    save_strategy="no",
    # Logging: a loss line every 10 steps, no progress bars, no external reporters.
    logging_steps=10,
    disable_tqdm=True,
    report_to=[],  # prevent extra logging
)

In [None]:
# 7. Metrics
def compute_metrics(pred):
    """Return accuracy and F1 for one evaluation pass.

    `pred` must expose `predictions` (per-class scores; the arg-max over the
    last axis is used as the predicted class) and `label_ids` (the reference
    labels). The result is a dict with the keys "accuracy" and "f1".
    """
    predicted_classes = pred.predictions.argmax(-1)
    true_labels = pred.label_ids
    accuracy = accuracy_score(true_labels, predicted_classes)
    f1 = f1_score(true_labels, predicted_classes)
    return {"accuracy": accuracy, "f1": f1}

# 8. Trainer
# Wire the model, the training configuration, both dataset splits and the
# metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# 9. Train the model
trainer.train()

# 10. Evaluate
# Left as the last expression so the metrics dict is shown as the cell output.
trainer.evaluate()

{'loss': 0.6557, 'grad_norm': 4.085048198699951, 'learning_rate': 1.8200000000000002e-05, 'epoch': 0.1}
{'loss': 0.7016, 'grad_norm': 6.4664812088012695, 'learning_rate': 1.62e-05, 'epoch': 0.2}
{'loss': 0.6294, 'grad_norm': 8.57040786743164, 'learning_rate': 1.4200000000000001e-05, 'epoch': 0.3}
{'loss': 0.6219, 'grad_norm': 4.426288604736328, 'learning_rate': 1.22e-05, 'epoch': 0.4}
{'loss': 0.6362, 'grad_norm': 4.70740270614624, 'learning_rate': 1.02e-05, 'epoch': 0.5}
{'loss': 0.6042, 'grad_norm': 5.805069446563721, 'learning_rate': 8.2e-06, 'epoch': 0.6}
{'loss': 0.5785, 'grad_norm': 4.6322340965271, 'learning_rate': 6.200000000000001e-06, 'epoch': 0.7}
{'loss': 0.5483, 'grad_norm': 8.116912841796875, 'learning_rate': 4.2000000000000004e-06, 'epoch': 0.8}
{'loss': 0.5737, 'grad_norm': 2.651797294616699, 'learning_rate': 2.2e-06, 'epoch': 0.9}
{'loss': 0.5118, 'grad_norm': 10.44213581085205, 'learning_rate': 2.0000000000000002e-07, 'epoch': 1.0}
{'eval_loss': 0.5443378686904907, 'e

{'eval_loss': 0.5443378686904907,
 'eval_accuracy': 0.625,
 'eval_f1': 0.0,
 'eval_runtime': 86.1386,
 'eval_samples_per_second': 2.322,
 'eval_steps_per_second': 0.29,
 'epoch': 1.0}

In [None]:
# 11. Gradio Interface for Live Testing
def predict_duplicate(q1, q2):
    """Classify a pair of questions with the fine-tuned model.

    Args:
        q1: Text of the first question.
        q2: Text of the second question.

    Returns:
        A string of the form "<label> (Confidence: <p>)", where <label> is
        "Duplicate" or "Not Duplicate" and <p> is the model's probability for
        that label, rounded to three decimals.
    """
    # Make sure dropout is off for inference regardless of what ran before.
    model.eval()

    inputs = tokenizer(q1, q2, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Training may have moved the model to an accelerator; the inputs must live
    # on the same device or the forward pass fails.
    inputs = inputs.to(model.device)

    # No gradients are needed for prediction, so skip building the autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits

    # .cpu() is required before .numpy() when the logits are on an accelerator.
    probs = logits.softmax(dim=1).cpu().numpy()[0]

    is_duplicate = probs[1] > 0.5
    label = "Duplicate" if is_duplicate else "Not Duplicate"
    # Report the probability of the label actually shown; previously the
    # duplicate probability was printed even next to "Not Duplicate".
    confidence = round(float(probs[1] if is_duplicate else probs[0]), 3)
    return f"{label} (Confidence: {confidence})"

# Two free-text inputs, one per question; the prediction string is shown as plain text.
question_boxes = [gr.Textbox(label="Question 1"), gr.Textbox(label="Question 2")]

interface = gr.Interface(
    title="Quora Duplicate Question Detector",
    description="Enter two questions to check if they are duplicates using a BERT model.",
    fn=predict_duplicate,
    inputs=question_boxes,
    outputs="text",
)

# 12. Launch App
interface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://630719ee4339c06554.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


