## Log in to Hugging Face

In [None]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load variables from a local .env file (if any), then read the access token
# from the HUGGINGFACE_TOKEN environment variable.
load_dotenv()
token = os.environ.get("HUGGINGFACE_TOKEN")

# Authenticate with the Hugging Face Hub using the token read above; it is
# taken from the environment rather than pasted into the notebook.
login(token=token, add_to_git_credential=True)

## Imports

In [1]:
# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification
)

# datasets
import pandas as pd
from datasets import Dataset

## Device

In [2]:
# Pick the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda:0"  # Nvidia GPU
elif torch.backends.mps.is_available():
    device = "mps"  # Apple Silicon GPU
else:
    device = "cpu"
print(f"Device = {device}")

Device = cuda:0


## Hyperparameters

In [3]:
# Random seed
# NOTE(review): the seed is not applied anywhere in the visible code.
seed = 42

# Tokenizer arguments: pad and truncate each question pair to max_length
max_length = 64
padding = "max_length"
truncation = True

# Dtype used when loading the model weights (full float32 precision)
dtype = torch.float32

## Model

In [None]:
# Checkpoint identifier passed to from_pretrained for both the tokenizer and the model
model_id = "PathFinderKR/bert-finetuned-uncased-QQP"

In [None]:
# Tokenizer and classifier are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Loading options for the two-label sequence classifier.
model_kwargs = dict(
    num_labels=2,
    device_map=device,
    torch_dtype=dtype,
    # attn_implementation="flash_attention_2" is left out: not supported
)
model = AutoModelForSequenceClassification.from_pretrained(model_id, **model_kwargs)

In [None]:
# Display the loaded model (a notebook renders the value of a cell's last expression)
model

## Dataset

In [4]:
# Directory containing the CSV files read below (test.csv, sample_submission.csv).
# File paths are built by plain string concatenation, so the trailing slash is required.
dataset_path = "data/"

In [5]:
# Read the test split from CSV into a DataFrame
test_csv_path = dataset_path + "test.csv"
test_df = pd.read_csv(test_csv_path)
# Wrap the DataFrame as a Hugging Face Dataset
raw_test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Tokenize question pairs with the shared tokenizer settings
def tokenize_test_questions(example):
    """Tokenize the ``question1``/``question2`` entries of ``example`` as pairs.

    Uses the module-level ``max_length``, ``padding`` and ``truncation``
    settings and returns the tokenizer's output unchanged.
    """
    tokenizer_options = {
        "max_length": max_length,
        "padding": padding,
        "truncation": truncation,
    }
    return tokenizer(example["question1"], example["question2"], **tokenizer_options)
# Apply the tokenizer to the whole test set; batched=True makes `map` call the
# function on batches of rows instead of one row at a time.
# NOTE(review): tokenized_test_dataset is not used by the visible inference
# code, which re-tokenizes each pair inside predict() - confirm this step is still needed.
tokenized_test_dataset = raw_test_dataset.map(tokenize_test_questions, batched=True)

## Submission

In [6]:
# Read the submission template whose "is_duplicate" column is filled in below
submission_csv_path = dataset_path + "sample_submission.csv"
submission_df = pd.read_csv(submission_csv_path)

In [None]:
def predict(question1, question2):
    """Classify a single question pair as duplicate or not.

    Args:
        question1: Text of the first question.
        question2: Text of the second question.

    Returns:
        A dict with two entries:
            "probabilities": NumPy array with the softmax of the model's
                logits over the last dimension (expected shape (1, 2) for
                one pair with num_labels=2 - confirm).
            "is_duplicate": True when index 1 has the highest probability.
    """
    # Use the same length limit as tokenize_test_questions so over-long pairs
    # are truncated instead of being fed to the model at full length.
    inputs = tokenizer(
        question1,
        question2,
        max_length=max_length,
        truncation=truncation,
        return_tensors="pt",
    )
    # Move every input tensor to the device the model lives on.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Pure inference: disable autograd so no graph is built per call.
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

    return {
        "probabilities": probabilities.detach().cpu().numpy(),
        # argmax over the flattened tensor; equals the label index for one pair
        "is_duplicate": bool(torch.argmax(probabilities) == 1),
    }

In [None]:
# Run inference on every test question pair and store the predicted labels in
# the submission frame (it is written to disk in a later step).
# NOTE(review): predict() returns Python bools, so this column holds True/False
# rather than 0/1 - confirm this matches the expected submission format.
question_pairs = zip(test_df["question1"], test_df["question2"])
submission_df["is_duplicate"] = [
    predict(*pair)["is_duplicate"] for pair in question_pairs
]

In [None]:
# Write the filled-in submission frame to submission.csv in the working
# directory, without the DataFrame index column.
submission_df.to_csv("submission.csv", index=False)