In [None]:
import pandas as pd
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader

In [None]:
# Competition test essays. The cell ends with the frame so the notebook renders it.
TEST_ESSAYS_CSV = '../input/llm-detect-ai-generated-text/test_essays.csv'
test = pd.read_csv(TEST_ESSAYS_CSV)
test

In [None]:
# Essays as plain Python strings, in the same row order as `test`.
# Missing values become empty strings so the tokenizer is always handed `str`
# input (a NaN in the column would otherwise reach it as a float).
test_text = test['text'].fillna('').astype(str).tolist()
# Preview only the first few essays instead of echoing the whole list.
test_text[:3]

In [None]:
class LLMDDatasetTest(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one raw text per item (no labels).

    Args:
        texts: Indexable, sized collection of texts; item ``idx`` is passed to
            the tokenizer as-is.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding='max_length',
            return_tensors='pt')`` and expected to return a mapping whose values
            are tensors with a leading batch axis of size 1.
        max_length: Length every sequence is truncated/padded to. Defaults to
            256, the value that was previously hard-coded.
    """

    def __init__(self, texts, tokenizer, max_length=256):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the tokenizer's outputs for text ``idx`` without the batch axis."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )
        # Drop only the leading batch axis. A bare squeeze() would also remove
        # any other size-1 axis (e.g. a sequence axis when max_length == 1).
        return {key: val.squeeze(0) for key, val in encoding.items()}

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

In [None]:
# Class names in index order: position i in LABELS is class id i.
LABELS = ['generated', 'human']
# Forward (id -> name) and reverse (name -> id) lookups derived from LABELS.
id2label = dict(enumerate(LABELS))
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
class My_TextClassifier_Model():
    """Load a saved sequence-classification model and score a list of texts.

    Args:
        pretrained_transformer_name: Path or name passed to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForSequenceClassification.from_pretrained``.
        texts: Texts to score. When ``None`` (the default) the notebook-level
            ``test_text`` list is used, which is what this class always did.
        max_samples: Only the first ``max_samples`` texts are kept
            (default 100000, the previously hard-coded cap).
    """

    def __init__(self, pretrained_transformer_name='../input/save-model/save_model/',
                 texts=None, max_samples=100000):
        if texts is None:
            # Backward-compatible fallback to the module-level list.
            texts = test_text
        test_texts = texts[:max_samples]

        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_transformer_name)
        self.test_dataset = LLMDDatasetTest(test_texts, self.tokenizer)
        self.model = AutoModelForSequenceClassification.from_pretrained(pretrained_transformer_name,
                                                                        num_labels=len(LABELS),
                                                                        id2label=id2label,
                                                                        label2id=label2id)
        # Use the GPU when one is present; inference() follows the model's device.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def inference(self, batch_size=16):
        """Return the softmax probability of class index 0 for every dataset item.

        Class index 0 is the first entry of ``LABELS`` ('generated'). Items are
        processed in dataset order (``shuffle=False``), so the returned list
        lines up with the input texts.

        Args:
            batch_size: Number of items per forward pass (default 16).

        Returns:
            list[float]: One probability per dataset item.
        """
        loader = DataLoader(self.test_dataset, batch_size=batch_size, shuffle=False)
        # Read the device from the model itself so this also works if the model
        # was moved after construction.
        device = next(self.model.parameters()).device
        # Make sure dropout and similar train-time layers are disabled.
        self.model.eval()
        predictions = []

        with torch.no_grad():
            for batch in loader:
                outputs = self.model(input_ids=batch['input_ids'].to(device),
                                     attention_mask=batch['attention_mask'].to(device))
                # torch.softmax is numerically stable; a hand-rolled
                # exp(x) / sum(exp(x)) overflows to inf/inf = NaN for large logits.
                probs = torch.softmax(outputs.logits.float(), dim=1)
                predictions.extend(probs[:, 0].cpu().tolist())

        return predictions

In [None]:
# Build the inference wrapper: loads the tokenizer and classifier from the
# saved-model directory and wraps the notebook-level `test_text` essays in a dataset.
classification_trainer = My_TextClassifier_Model(pretrained_transformer_name='../input/save-model/save_model/')

In [None]:
# Score every test essay. `preds` is a list of floats holding the softmax
# probability of class index 0 ('generated' in LABELS), in dataset order.
preds = classification_trainer.inference()

In [None]:
# Pair each essay id with its predicted probability. Ending the cell with the
# frame renders it for a visual check.
results = pd.DataFrame({"id": test["id"], "generated": preds})
results

In [None]:
# Write the submission file to the working directory. index=False keeps the
# DataFrame index out of the CSV, so only the `id` and `generated` columns are written.
results.to_csv('./submission.csv', index=False)