# Установка и обновление необходимых библиотек

In [1]:
# !pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121

In [2]:
# !pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.0-py3-none-win_amd64.whl

In [3]:
# !pip install -q datasets transformers accelerate loralib sentencepiece gradio fire peft wandb evaluate loguru scikit-learn pandas

In [4]:
import zipfile
import json
import datetime

import torch
from torch import nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset
import peft
from peft import LoraConfig, get_peft_model
import evaluate
import wandb
from sklearn.model_selection import train_test_split
import pandas as pd
from loguru import logger
import numpy as np

In [5]:
# Sanity check: report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


# Подготовка набора данных для обучения и валидации модели

## Чтение файлов

In [6]:
# Archive holding the JSON samples. A raw string is used because the Windows
# path contains backslash sequences (\M, \G, \g, \d, \s) that are invalid
# escapes in a normal string literal and trigger warnings on current Python.
# The handle is intentionally left open: later cells keep reading from it.
data_zip = zipfile.ZipFile(r'D:\My_projects\Grapix\garpix-solution\data\small_boxes.zip', 'r')

In [7]:
# Load the first archive member so its JSON structure can be inspected.
first_member = data_zip.namelist()[0]
with data_zip.open(first_member) as file:
    json_file = json.load(file)

## Формирование датасета

In [8]:
def json_to_array(json):
    """Extract box features and the density target from one parsed sample.

    Args:
        json: Parsed sample (dict) with a ``first_visual`` entry that holds
            the ``boxes`` list and ``calculation_info.density_percent``.

    Returns:
        Tuple ``(features, target)``: ``features`` is a flat NumPy array with
        mass, width, height and length of every box, box after box;
        ``target`` is the stored ``density_percent`` value.

    Raises:
        KeyError: If an expected key or box column is missing.
    """
    first_visual = json['first_visual']
    feature_columns = ['mass', 'size.width', 'size.height', 'size.length']

    # json_normalize flattens the nested ``size`` dict into dotted column names.
    box_table = pd.json_normalize(first_visual['boxes'])
    features = box_table[feature_columns].to_numpy().flatten()

    return features, first_visual['calculation_info']['density_percent']

In [9]:
# Model inputs (box features rendered as space-separated integers) and the
# rounded density targets, one entry per archive member that loads cleanly.
arr = []
targets = []

for data_info in data_zip.filelist:
    try:
        # Parsing happens inside the try block so that a malformed member is
        # logged and skipped like any other bad file instead of aborting the loop.
        with data_zip.open(data_info.filename) as file:
            json_file = json.loads(file.read())
        boxes, target = json_to_array(json_file)

        # The limit applies to the number of flattened feature values.
        # NOTE(review): this is not a token count; confirm that relying on the
        # tokenizer's truncation for longer texts is acceptable.
        if len(boxes) > 512:
            logger.info(f'{data_info.filename} length is more than 512. Skip.')
            continue

        # Join explicitly instead of post-processing np.array2string output:
        # same text for float arrays, and no padding or line-wrap artefacts
        # when the array has an integer dtype.
        boxes_text = ' '.join(str(int(value)) for value in boxes)
        rounded_target = round(target)
    except Exception as e:
        logger.warning(f'\nError {e} \n In file {data_info.filename}')
        continue

    # Append only after both values were computed so the lists stay aligned.
    arr.append(boxes_text)
    targets.append(rounded_target)

In [10]:
# Display how many inputs and targets were collected; the two counts should match.
len(arr), len(targets)

(4999, 4999)

## Создание pandas таблицы

In [11]:
# Start from an empty frame; its columns are filled in by the next cell.
df = pd.DataFrame()

In [12]:
# Text inputs plus their targets; the targets are stored as floats.
df['text'] = arr
df['labels'] = np.asarray(targets, dtype=float)

In [13]:
# Needed for multi-class classification: one-hot encode the labels.
# The encoding is derived from `targets` rather than from df['labels'], so the
# cell can be re-run after that column has already been replaced by vectors.
labels_float = pd.Series(targets, index=df.index, dtype=float)

# Class index = order of first appearance of each distinct label value.
label_enum = {label: index for index, label in enumerate(labels_float.unique())}
num_classes = len(set(targets))

df['labels'] = labels_float.apply(
    lambda label: [1.0 if label_enum[label] == i else 0.0 for i in range(num_classes)]
)

## Разделение на train / val выборки

In [14]:
# Hold out 20% of the rows for validation (fixed seed for a repeatable split),
# then wrap both parts as Hugging Face datasets.
train, val = train_test_split(df, shuffle=True, random_state=42, test_size=0.2)
train_data, val_data = (Dataset.from_pandas(frame) for frame in (train, val))

### Конвертация набора данных в формат Hugging Face, токенизация

In [15]:
# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are padded to, and truncated at, ``max_length=512`` and the
    encodings are requested as PyTorch tensors.
    """
    encoding_options = dict(
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    return tokenizer(examples['text'], **encoding_options)

In [16]:
# Tokenize both splits in batches.
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True) for split in (train_data, val_data)
)

Map:   0%|          | 0/3999 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [17]:
# Drop the raw text and the '__index_level_0__' column (presumably the pandas
# index carried over by Dataset.from_pandas). Only columns that are still
# present are removed, so re-running this cell does not fail.
unused_columns = ['__index_level_0__', 'text']

train_dataset = train_dataset.remove_columns(
    [name for name in unused_columns if name in train_dataset.column_names]
)
val_dataset = val_dataset.remove_columns(
    [name for name in unused_columns if name in val_dataset.column_names]
)

In [18]:
# Return the model inputs and the labels as PyTorch tensors for both splits.
# The validation split lists 'labels' as well, so it is formatted the same
# way as the training split.
tensor_columns = ["input_ids", 'token_type_ids', 'attention_mask', 'labels']

train_dataset.set_format("pt", columns=tensor_columns, output_all_columns=True)
val_dataset.set_format("pt", columns=tensor_columns, output_all_columns=True)

## Загрузка модели

In [19]:
# BERT with a freshly initialised classification head, one output per
# distinct rounded target value.
# NOTE(review): the labels are one-hot vectors with a single active class, yet
# the head is configured as "multi_label_classification" - confirm this is
# intended rather than single-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(set(targets)),
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Применение LoRA-адаптеров

In [20]:
# LoRA settings for sequence classification: adapters are attached to the
# modules named "query" and "value"; bias terms are left out of the adapters.
peft_config = LoraConfig(
    task_type=peft.TaskType.SEQ_CLS,
    target_modules=["query", "value"],
    r=64,
    lora_alpha=128,
    lora_dropout=0.1,
    bias="none",
)

In [21]:
# Wrap the base model with the LoRA adapters described by `peft_config`.
peft_bert = get_peft_model(model, peft_config)

bin d:\My_projects\Grapix\garpix-solution\.venv\lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


In [22]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Counts the elements of every tensor yielded by ``model.named_parameters()``
    and prints the trainable count, the total count and the trainable share
    in percent. Raises ZeroDivisionError for a model without parameters.
    """
    # (element count, requires_grad) for every parameter tensor.
    counts = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_param = sum(size for size, _ in counts)
    trainable_params = sum(size for size, is_trainable in counts if is_trainable)
    share = 100 * trainable_params / all_param

    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {share}"
    )

In [23]:
# Show how small the trainable share is once the LoRA adapters are attached.
print_trainable_parameters(peft_bert)

trainable params: 2427737 || all params: 111978418 || trainable%: 2.1680400950118797


## Обучение модели

In [24]:
# Timestamped run name, used to tell training runs apart in the tracker.
timestamp = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
run_name = f'{timestamp} Garpix bert_peft_training'
print(run_name)

02/26/2024 13:22:56 Garpix bert_peft_training


In [25]:
training_args = TrainingArguments(
    # Where results go and how the run is tracked.
    output_dir="test_trainer",
    run_name=run_name,
    report_to='wandb',
    push_to_hub=False,
    # Hardware and optimisation.
    no_cuda=False,
    learning_rate=1e-4,
    num_train_epochs=10,
    # Evaluate and log on the same 250-step cadence.
    evaluation_strategy='steps',
    eval_steps=250,
    logging_strategy="steps",
    logging_steps=250,
)

# Accuracy metric used by compute_metrics during evaluation.
metric = evaluate.load("accuracy")

In [26]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; both are reduced to class
            indices with an argmax over the last axis before comparison.

    Returns:
        The result of ``metric.compute`` for the predicted and reference
        class indices.
    """
    logits, one_hot_labels = eval_pred

    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=np.argmax(one_hot_labels, axis=-1),
    )

In [27]:
# Bundle the LoRA-wrapped model, both datasets and the metric callback.
trainer_kwargs = dict(
    model=peft_bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Start training; metrics are reported to Weights & Biases (report_to='wandb').
trainer.train()

https://api.wandb.ai/links/ramzes/ou4jsysx

![Alt text](image.png)