## Login to Huggingface

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token=token, # ADD YOUR TOKEN HERE
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


## Imports

In [2]:
# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification
)

# datasets
import pandas as pd
from datasets import Dataset

## Device

In [3]:
device = (
    "cuda:0" if torch.cuda.is_available() else # Nvidia GPU
    "mps" if torch.backends.mps.is_available() else # Apple Silicon GPU
    "cpu"
)
print(f"Device = {device}")

Device = cuda:0


## Hyperparameters

In [4]:
# seed
seed=42

# Tokenizer arguments
max_length=64
padding="max_length"
truncation=True

# mixed precision
dtype=torch.float32

## Model

In [5]:
model_id = "PathFinderKR/bert-finetuned-uncased-QQP"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
    device_map=device,
    #attn_implementation="flash_attention_2", # not supported
    torch_dtype=dtype
)

In [7]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

## Dataset

In [8]:
# Dataset Path
dataset_path = "data/"

In [9]:
# Load the test dataset
test_df = pd.read_csv(dataset_path + "test.csv")
# convert dataframe into huggingface dataset
raw_test_dataset = Dataset.from_pandas(test_df)

## Submission

In [10]:
# Load the Submission dataset
submission_df = pd.read_csv(dataset_path + "sample_submission.csv")

In [11]:
def predict(question1, question2):
    inputs = tokenizer(question1, question2, return_tensors="pt")
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    return {
        "probabilities": probabilities.detach().cpu().numpy(),
        "is_duplicate": bool(torch.argmax(probabilities) == 1)
    }

In [12]:
# Inference the test dataset and save the submission to sample_submission.csv
submission_df["is_duplicate"] = [
    predict(question1, question2)["is_duplicate"]
    for question1, question2 in zip(test_df["question1"], test_df["question2"])
]

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
# Save the submission
submission_df.to_csv("submission.csv", index=False)