In [None]:
import numpy as np
import pandas as pd
import json
import torch
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import transformers
import datasets
import evaluate

In [None]:
# Stream the raw arXiv metadata snapshot (one JSON document per line) and write
# a slimmed-down JSON-lines file containing only the fields used further down.
# Both files are opened with an explicit UTF-8 encoding so the result does not
# depend on the platform's default locale.
with open("./data/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as f_in, \
     open("./data/data.json", "w", encoding="utf-8") as f_out:
    for i, line in enumerate(f_in):
        # Progress report every 100k input lines (including line 0).
        if not i % 100000:
            print(f"Processed {i} lines.")
        try:
            data = json.loads(line)
            # Only the first whitespace-separated category is kept.
            category = data["categories"].split()[0]
            parsed_data = {
                "id": data["id"],
                "update_date": data["update_date"],
                "title": data["title"],
                # Abstracts are cut to their first 512 characters.
                "abstract": data["abstract"][:512],
                "categories": category,
            }
        except (json.JSONDecodeError, KeyError, IndexError, TypeError, AttributeError):
            # Malformed JSON, a missing field, an empty category list or a
            # field of the wrong type: report the record and skip it.
            # Anything else (KeyboardInterrupt, I/O failures, ...) propagates.
            print(f"Line {i} is bad!")
            continue
        # The write is outside the try block so that output errors are not
        # mistaken for bad input lines.
        json.dump(parsed_data, f_out)
        f_out.write('\n')
        

In [None]:
# Load the slimmed-down JSON-lines file produced above.
# "id" is pinned to str: with default dtype inference pandas may coerce a
# string column whose values all look numeric (e.g. "0704.0001") to float,
# which would drop leading zeros from the identifiers.
df = pd.read_json("./data/data.json", lines=True, dtype={"id": str})

In [None]:
# Map every category string to an integer class id; the fitted encoder is kept
# in `le` so the ids can be translated back to category names later.
le = LabelEncoder()
category_ids = le.fit_transform(df["categories"])
df["label"] = category_ids

In [None]:
# Model input: the title followed by the abstract, separated by one space.
df["text"] = df["title"].str.cat(df["abstract"], sep=" ")

In [None]:
# Checkpoint identifier shared by the tokenizer and the classification model.
model_name: str = "distilbert/distilbert-base-cased"

In [None]:
# Tokenizer matching the checkpoint named in `model_name`.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name,
)

In [None]:
# Keep only the columns needed downstream and hold out 10% of the rows as a
# test split. The shuffle is seeded (same value as the training seed) so the
# split is reproducible from one run to the next.
dataset = datasets.Dataset.from_pandas(
    df[["id", "text", "categories", "label"]]
).train_test_split(test_size=0.1, shuffle=True, seed=42)

In [None]:
def tokenize_function(row):
    """Tokenize the ``"text"`` entry of ``row`` with the notebook's tokenizer.

    The tokenizer is called with truncation enabled and ``padding="max_length"``.
    When used through ``map(..., batched=True)``, ``row`` is a batch of rows.

    Returns whatever the tokenizer returns for that input.
    """
    texts = row["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [None]:
# Tokenize every split, handing tokenize_function batches of 2000 rows.
dataset = dataset.map(
    tokenize_function,
    batch_size=2000,
    batched=True,
)

In [None]:
# Persist the tokenized splits so they can be reloaded without re-tokenizing.
dataset.save_to_disk(
    "./data/distilbert-base-cased-dataset"
)

In [None]:
# Reload the tokenized splits saved in the previous step.
dataset = datasets.load_from_disk(
    "./data/distilbert-base-cased-dataset/"
)

In [None]:
# Sanity check: show which fields the first training example carries.
(dataset["train"][0]).keys()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Size the classification head from the labels actually present in the data
# instead of a hard-coded constant. Labels are assumed to be contiguous
# integers starting at 0 (which is what LabelEncoder produces), so the head
# needs max(label) + 1 outputs. Reading the labels from `dataset` keeps this
# cell usable when the tokenized dataset was reloaded from disk.
num_labels = 1 + max(max(split.unique("label")) for split in dataset.values())

# Pretrained encoder with a new sequence-classification head, moved to the
# selected device.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
).to(device)

In [None]:
# Accuracy metric object used by compute_metrics.
metric = evaluate.load(
    "accuracy"
)

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the module-level ``metric``.

    The predicted class of each example is the argmax of its logits over the
    last axis; predictions and reference labels are then passed to
    ``metric.compute`` and its result is returned unchanged.
    """
    scores, gold_labels = eval_pred
    return metric.compute(
        references=gold_labels,
        predictions=np.argmax(scores, axis=-1),
    )

In [None]:
training_args = transformers.TrainingArguments(
    # --- output / checkpoints ---
    output_dir="./my_model",
    overwrite_output_dir=True,
    save_strategy="epoch",
    # save_steps=1,
    save_total_limit=2,
    save_safetensors=True,  # safetensors instead of torch.save / torch.load
    save_only_model=False,  # also save optimizer, scheduler, RNG state, ...
    load_best_model_at_end=False,
    # resume_from_checkpoint=...,

    # --- optimisation ---
    num_train_epochs=5,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1,
    # auto_find_batch_size=...,
    optim="adamw_torch",
    # optim_args=...,
    learning_rate=7e-5,
    lr_scheduler_type="cosine",
    # lr_scheduler_kwargs={},
    # warmup_ratio=0.03125,
    # warmup_steps=10,
    label_smoothing_factor=0.0,

    # --- logging ---
    log_level="error",
    # logging_dir="output_dir/runs/CURRENT_DATETIME_HOSTNAME"  # TensorBoard logs (default)
    logging_strategy="steps",
    logging_steps=100,
    disable_tqdm=True,

    # --- evaluation (currently disabled) ---
    # eval_strategy="epoch",
    # eval_steps=32,

    # --- hardware / reproducibility ---
    use_cpu=False,
    # bf16=True,  # use bf16 instead of fp32
    seed=42,
)

In [None]:
# Wire the model, the training configuration and both dataset splits into a
# Trainer, then run training. compute_metrics is only invoked when an
# evaluation is performed.
trainer = transformers.Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

trainer.train()

In [None]:
# Class id -> category name, in the order stored by the fitted encoder,
# and the inverse mapping category name -> class id.
id2label = dict(enumerate(le.classes_))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Attach the label mappings to the model config so they are stored with it.
model.config.label2id = label2id
model.config.id2label = id2label

In [None]:
# Write the model (including the updated config) to the "updated-model" directory.
model.save_pretrained(save_directory="updated-model")