In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import  AutoModelForSequenceClassification, AutoTokenizer

In [None]:
checkpoint = "bert-base-uncased"
NUM_LABELS = 15  # size of the classification head; TODO confirm it matches the number of distinct label_int values
SEED = 42

# Seed torch before the model is built: the classification head added on top of
# the pretrained encoder is freshly initialised at this point, so without a
# seed its starting weights (and hence the results) differ on every fresh run.
torch.manual_seed(SEED)

classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=NUM_LABELS)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
from datasets import Dataset, DatasetDict
# Read each split from its CSV file, then bundle both under the split names
# used by the rest of the notebook ("train" and "eval").
train_split = Dataset.from_csv("train.csv")
eval_split = Dataset.from_csv("eval.csv")
raw_datasets = DatasetDict({"train": train_split, "eval": eval_split})

In [None]:
def tokenize_batch(batch):
    """Tokenize the `textual_embedding` column of one batch of rows.

    Sequences are truncated but deliberately NOT padded here. Padding inside a
    batched `map` pads every row to the longest row of its map chunk and
    stores that padding in the dataset; the `DataCollatorWithPadding` handed
    to the Trainer already pads each training batch to its own longest
    sequence, so padding here only adds wasted tokens.

    Args:
        batch: Mapping of column name to a list of values, as supplied by
            `map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch's `textual_embedding` texts.
    """
    return tokenizer(batch["textual_embedding"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_batch, batched=True)

In [None]:
# Drop the raw text column now that it has been tokenized, and rename the
# integer target column from "label_int" to "labels".
# NOTE(review): this cell rebinds `tokenized_datasets`, so running it a second
# time without re-running the tokenization cell will presumably fail because
# the columns are already gone/renamed.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["textual_embedding"])
    .rename_column("label_int", "labels")
)

In [None]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
import numpy as np
import evaluate


# Collator that pads the examples of a batch together using `tokenizer`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration: a single epoch, evaluation once per epoch, no
# checkpoint saving and no reporting integrations; outputs go to "test-trainer".
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="test-trainer",
    num_train_epochs=1,
    evaluation_strategy="epoch",
    weight_decay=5e-4,
    save_strategy="no",
    report_to="none",
)


def compute_metrics(eval_preds):
    """Score one evaluation pass with the "accuracy" metric.

    Args:
        eval_preds: A two-item iterable ``(logits, labels)``. The predicted
            class of each row is the argmax of ``logits`` over its last axis.

    Returns:
        Whatever ``compute`` on the loaded "accuracy" metric returns for the
        predicted classes measured against ``labels``.
    """
    accuracy = evaluate.load("accuracy")
    scores, references = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy.compute(references=references, predictions=predicted_classes)


# Wire the model, training configuration, both dataset splits, the padding
# collator, the tokenizer and the metric function into a single Trainer.
trainer = Trainer(
    model=classifier,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the training loop configured on `trainer`; as the last expression in the
# cell, the value returned by `train()` is shown as the cell output.
trainer.train()

In [None]:
save_path = "save_folder"

# Write the model first, then the tokenizer, into the same directory.
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(save_path)

print(f"Model and tokenizer saved to {save_path}")