In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

In [None]:
class ToxicDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized encodings with optional labels.

    Args:
        encodings: Mapping from feature name to an indexable collection of
            per-example values. It must contain an ``'input_ids'`` entry,
            whose length defines the dataset size.
        labels: Optional indexable collection of per-example labels. When
            omitted, items carry no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from ``'input_ids'``."""
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding key."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])

        # Unlabeled data: return the features only.
        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
def tokenize_function(examples, tokenizer, max_length=128):
    """Tokenize the ``'string'`` field of ``examples`` with ``tokenizer``.

    Args:
        examples: Mapping with a ``'string'`` entry holding the text(s) to
            tokenize.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``.
        max_length: Value forwarded as ``max_length``. Defaults to 128, the
            value that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    return tokenizer(
        examples['string'],
        truncation=True,
        padding=True,
        max_length=max_length,
    )


def main():
    """Score the test CSV with the fine-tuned BERT checkpoint and save predictions.

    Reads ``test_x.csv`` from ``./kaggle_data``, tokenizes its ``string``
    column, runs the checkpoint in ``./results/checkpoint-8408/`` over it and
    writes ``./prediction.csv`` with two columns: ``ID`` (the DataFrame row
    index) and ``pred`` (1 when the class-1 probability exceeds 0.5, else 0).
    """
    data_dir = "./kaggle_data"
    checkpoint_dir = "./results/checkpoint-8408/"
    output_path = "./prediction.csv"

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(device)

    test_data = pd.read_csv(f'{data_dir}/test_x.csv')  # Replace with your test file path
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the chained in-place form acts on a temporary and
    # does not update the frame under pandas Copy-on-Write.
    # astype(str) keeps cells that were parsed as numbers usable as text.
    test_data['string'] = test_data['string'].fillna('Missing').astype(str)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    test_encodings = tokenizer(list(test_data['string']), truncation=True, padding=True, max_length=256)
    test_dataset = ToxicDataset(test_encodings)

    model = BertForSequenceClassification.from_pretrained(checkpoint_dir, num_labels=2).to(device)
    model.eval()

    # Only predict() is called, so training-only options (learning rate,
    # epochs, save/eval strategies, best-model reloading) are left out, and
    # the unlabeled test set is not registered as a train/eval dataset.
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_eval_batch_size=512,
    )
    trainer = Trainer(model=model, args=training_args)

    # Use the Trainer for prediction
    logits = trainer.predict(test_dataset).predictions
    positive_prob = torch.softmax(torch.tensor(logits), dim=1)[:, 1]
    test_predictions = (positive_prob > 0.5).int().tolist()  # Threshold at 0.5

    # IDs are the row positions of the test frame.
    test_ids = test_data.index.tolist()

    # Save predictions in the required format
    pred_df = pd.DataFrame({'ID': test_ids, 'pred': test_predictions})
    pred_df.to_csv(output_path, index=False)
    print("Predictions saved to prediction.csv")

In [None]:
# Entry point: run the inference pipeline when this file is executed directly
# (script or notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()