In [268]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm

In [ ]:
# Load the held-out essays; `test` keeps every column (the "id" column is
# needed later for the submission file).
test = pd.read_csv(filepath_or_buffer="../datasets/test_essays.csv")
# The raw essay strings are the only model input.
X_test = test.loc[:, "text"]

In [269]:
# --- Inference configuration -------------------------------------------------
MODEL_PATH = "/kaggle/input/detect-ai-text-deberta-v3-large/pytorch/large/1"
MAX_LEN = 1024  # token budget per essay; longer inputs are truncated
BATCH_SIZE = 16
# Prefer the GPU, but fall back to CPU so the cell still runs without one.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# `max_position_embeddings` overrides the value stored in the checkpoint's
# config so that it matches the MAX_LEN used for tokenization below.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH, max_position_embeddings=MAX_LEN
).to(DEVICE)
model.eval()  # inference only: make sure dropout is disabled

# Collect, per essay, the softmax probability of class index 1.
y_pred = []
with torch.no_grad():
    for i in tqdm(range(0, len(X_test), BATCH_SIZE)):
        # .iloc makes the positional slicing explicit, independent of the index.
        batch_texts = X_test.iloc[i : i + BATCH_SIZE].tolist()
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        ).to(DEVICE)
        logits = model(**inputs).logits
        # torch.softmax is numerically stable; the previous hand-written
        # exp(x) / sum(exp(x)) overflows to inf/nan for large-magnitude logits.
        probs = torch.softmax(logits.float(), dim=-1)
        y_pred.extend(probs[:, 1].cpu().numpy())

# Drop the references and release cached GPU memory for any later cells.
del tokenizer
del model
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [276]:
# Pair every essay id with its predicted probability and write the
# submission file, omitting the DataFrame index.
submission = pd.DataFrame({"id": test["id"], "generated": y_pred})
submission.to_csv(path_or_buf="submission.csv", index=False)