In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn import metrics
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_from_disk

In [None]:
# Run identifier: train() interpolates it into the output filename
# (submission_{attempt}.csv), so bump it to avoid overwriting an earlier submission.
attempt = 4

In [None]:
# Read the pre-cleaned dataset from disk and, in the same expression, encode the
# `genre` column as a class-label feature so it can be used for stratification
# and for mapping predicted ids back to names later on.
data = load_from_disk('cleaned_data/').class_encode_column('genre')

# Show one training row as a quick sanity check of the columns.
data['train'][0]

In [None]:
# Prefer the GPU when one is visible; print the choice so it shows in the cell output.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# "bert-base-uncased/" is resolved as a path here — presumably a local checkpoint
# directory sitting next to the notebook (confirm it exists before running).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased/", use_fast=True, do_lower_case=True)

# One output logit per genre. `num_classes` is the public ClassLabel attribute for
# the label count; the private `_int2str` list is an implementation detail.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased/",
    num_labels=data['train'].features["genre"].num_classes,
).to(device)

In [None]:
# Longest tokenised training text (special tokens included). Only the text column
# is iterated, so the other columns are not materialised for every row.
# `default=0` keeps the result at 0 for an empty training split.
max_len = max(
    (
        len(tokenizer.encode(text, add_special_tokens=True))
        for text in data['train']['final_text']
    ),
    default=0,
)

print(f'Max sentence len - {max_len}')

# `max_len` is later used as the padding/truncation length. The model only has
# position embeddings for `max_position_embeddings` tokens, so anything longer
# would fail in the forward pass; clamp to that limit.
position_limit = model.config.max_position_embeddings
if max_len > position_limit:
    print(f'Capping max_len at the model limit - {position_limit}')
    max_len = position_limit

In [None]:
class ClassificationDataset:
    """Map-style dataset that tokenises one row per lookup.

    Every item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
    as ``torch.long`` CPU tensors, padded/truncated to a fixed length.

    Args:
        data: Indexable, sized collection of rows exposing the keys
            ``"final_text"`` and ``"genre"`` (``genre`` must convert to ``int``).
        tokenizer: Callable accepting ``(text, max_length=..., padding=...,
            truncation=...)`` and returning a mapping with ``"input_ids"`` and
            ``"attention_mask"``.
        max_length: Fixed sequence length. When ``None`` (the default) the
            notebook-level ``max_len`` global is read at lookup time, which is
            what the two-argument constructor has always done.
    """

    def __init__(self, data, tokenizer, max_length=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped data."""
        return len(self.data)

    def _sequence_length(self):
        """Return the padding/truncation length, capped at the tokenizer's limit."""
        length = max_len if self.max_length is None else self.max_length
        # Tokenizers honour an explicit max_length even when it exceeds what the
        # paired model accepts, so clamp to the advertised limit when there is one.
        limit = getattr(self.tokenizer, "model_max_length", None)
        return length if limit is None else min(length, limit)

    def __getitem__(self, item):
        """Tokenise row ``item`` and return its model inputs and label."""
        # Read the row once rather than once per field.
        row = self.data[item]
        encoded = self.tokenizer(
            str(row["final_text"]),
            max_length=self._sequence_length(),
            padding="max_length",
            truncation=True,
        )

        # Tensors deliberately stay on the CPU: the Trainer moves each batch to
        # its own device, and CUDA tensors produced inside a dataset cannot be
        # pinned or handed across DataLoader worker processes.
        return {
            "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(int(row["genre"]), dtype=torch.long),
        }


def compute_metrics(eval_pred):
    """Score an evaluation pass by plain accuracy.

    Args:
        eval_pred: Pair ``(scores, labels)``; ``scores`` holds one row of
            per-class scores per example and ``labels`` the reference class ids.

    Returns:
        A one-entry dict, ``{"accuracy": <value>}``.
    """
    scores, gold = eval_pred
    # The predicted class is the highest-scoring column of each row.
    predicted = np.argmax(scores, axis=1)
    return {"accuracy": metrics.accuracy_score(gold, predicted)}


def train(ds, seed=42):
    """Fine-tune the notebook's ``model`` on ``ds`` and write a submission CSV.

    Holds out 10% of ``ds["train"]`` (stratified by ``genre``) for validation,
    trains for one epoch with per-epoch evaluation and checkpointing, predicts
    on ``ds["test"]`` and writes ``submission_{attempt}.csv`` containing the
    test ``id`` column and the predicted genre name.

    Relies on the notebook-level names ``model``, ``tokenizer``, ``attempt``,
    ``ClassificationDataset`` and ``compute_metrics``.

    Args:
        ds: Mapping with ``"train"`` and ``"test"`` splits. ``genre`` must be a
            class-label feature on the train split and the test split must
            expose an ``"id"`` column.
        seed: Seed for the train/validation split and for the Trainer, so that
            reruns validate on the same rows. 42 matches the Trainer's default.
    """
    ds_test = ds["test"]

    # Seeded so the validation rows (and hence the reported accuracy) are
    # comparable between runs.
    split = ds["train"].train_test_split(
        test_size=0.1, stratify_by_column="genre", seed=seed
    )
    ds_train = split["train"]
    ds_val = split["test"]

    train_dataset = ClassificationDataset(ds_train, tokenizer)
    valid_dataset = ClassificationDataset(ds_val, tokenizer)
    test_dataset = ClassificationDataset(ds_test, tokenizer)

    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` — confirm against the installed
    # version before upgrading.
    args = TrainingArguments(
        "model",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none",
        save_total_limit=1,
        seed=seed,
    )

    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    logits = trainer.predict(test_dataset).predictions
    pred_ids = np.argmax(logits, axis=1)

    # generate submission file
    # Map ids to names before building the frame: writing strings into an
    # already-integer column through `.loc[:, ...]` is an incompatible-dtype
    # assignment. `.tolist()` yields plain Python ints for int2str.
    genre_names = ds_train.features["genre"].int2str(pred_ids.tolist())
    submission = pd.DataFrame({"id": ds_test["id"], "genre": genre_names})
    submission.to_csv(f"submission_{attempt}.csv", index=False)


In [None]:
# Run the full pipeline: fine-tune on data["train"], predict data["test"],
# and write submission_{attempt}.csv to the working directory.
train(data)