In [62]:
# !pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121

In [63]:
# !pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.0-py3-none-win_amd64.whl

In [64]:
# !pip install -q datasets transformers accelerate loralib sentencepiece gradio fire peft wandb peft

In [65]:
import zipfile
import json
import datetime

import torch
from torch import nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset
import peft
from peft import PeftModelForSequenceClassification, LoraConfig, get_peft_model
import evaluate
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from loguru import logger
import numpy as np

In [66]:
print(torch.cuda.is_available())

True


In [67]:
data_zip = zipfile.ZipFile('D:\My_projects\Grapix\garpix-solution\data\small_boxes.zip', 'r')

In [68]:
with data_zip.open(data_zip.filelist[0].filename) as file:
    json_file = json.loads(file.read())

In [69]:
boxes = pd.json_normalize(json_file['first_visual']['boxes'])[[
    'mass','size.width','size.height', 'size.length'
    ]].to_numpy().flatten()

In [70]:
target = json_file['first_visual']['calculation_info']['density_percent']

In [71]:
def json_to_array(json):
    
    boxes = pd.json_normalize(json['first_visual']['boxes'])[[
        'mass', 'size.width', 'size.height', 'size.length'
        ]]
    target = json['first_visual']['calculation_info']['density_percent']
    
    return boxes.to_numpy().flatten(), target

In [72]:
arr = []
targets = []

for data_info in data_zip.filelist:
    with data_zip.open(data_info.filename) as file:
        json_file = json.loads(file.read())
        try:
            boxes, target = json_to_array(json_file)
            if len(boxes) <= 512:
                boxes = np.array2string(
                    boxes, separator=' ', formatter={'float_kind': lambda x: str(int(x))}
                ).replace('[', '').replace(']', '').replace('\n', '')
                arr.append(str(boxes))
                targets.append(round(target))
            else:
                logger.info(f'{data_info.filename} length is more than 512. Skip.')
                pass
        except Exception as e:
            logger.warning(f'\nError {e} \n In file {data_info.filename}')
            pass

In [73]:
len(arr), len(targets)

(4999, 4999)

In [74]:
df = pd.DataFrame()

In [75]:
df['text'] = arr
df['labels'] = targets
df['labels'] = df['labels'].astype(float)

In [76]:
label_enum = {k:j for j, k in enumerate(df['labels'].unique())}
label_enum

{77.0: 0,
 76.0: 1,
 74.0: 2,
 56.0: 3,
 87.0: 4,
 54.0: 5,
 100.0: 6,
 73.0: 7,
 58.0: 8,
 62.0: 9,
 68.0: 10,
 69.0: 11,
 71.0: 12,
 43.0: 13,
 40.0: 14,
 82.0: 15,
 79.0: 16,
 75.0: 17,
 81.0: 18,
 65.0: 19,
 85.0: 20,
 67.0: 21,
 61.0: 22,
 37.0: 23,
 84.0: 24,
 83.0: 25,
 59.0: 26,
 45.0: 27,
 80.0: 28,
 48.0: 29,
 89.0: 30,
 64.0: 31,
 63.0: 32,
 51.0: 33,
 72.0: 34,
 88.0: 35,
 66.0: 36,
 49.0: 37,
 78.0: 38,
 90.0: 39,
 57.0: 40,
 70.0: 41,
 53.0: 42,
 46.0: 43,
 35.0: 44,
 86.0: 45,
 55.0: 46,
 44.0: 47,
 50.0: 48,
 36.0: 49,
 60.0: 50,
 38.0: 51,
 41.0: 52,
 47.0: 53,
 91.0: 54,
 92.0: 55,
 52.0: 56,
 33.0: 57,
 30.0: 58,
 19.0: 59,
 93.0: 60,
 94.0: 61,
 42.0: 62,
 31.0: 63,
 34.0: 64,
 98.0: 65,
 20.0: 66,
 96.0: 67,
 97.0: 68,
 95.0: 69,
 39.0: 70,
 26.0: 71,
 29.0: 72,
 16.0: 73,
 24.0: 74,
 8.0: 75,
 32.0: 76,
 9.0: 77,
 21.0: 78,
 23.0: 79,
 25.0: 80,
 7.0: 81,
 28.0: 82,
 13.0: 83,
 10.0: 84,
 11.0: 85,
 22.0: 86,
 27.0: 87,
 18.0: 88}

In [77]:
df['labels'] = df['labels'].apply(lambda x: [1.0 if label_enum[x]==i else 0.0 for i in range(len(set(targets)))])

In [78]:
train, val = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
train_data = Dataset.from_pandas(train)
val_data = Dataset.from_pandas(val)

In [79]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512, return_tensors="pt")

In [80]:
train_dataset = train_data.map(tokenize_function, batched=True)
val_dataset = val_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/3999 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [81]:
train_dataset = train_dataset.remove_columns(['__index_level_0__', 'text'])
val_dataset = val_dataset.remove_columns(['__index_level_0__', 'text'])

In [113]:
train_dataset.set_format("pt", columns=["input_ids", 'token_type_ids', 'attention_mask'], output_all_columns=True)
val_dataset.set_format("pt", columns=["input_ids", 'token_type_ids', 'attention_mask'], output_all_columns=True)

In [86]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(set(targets)))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
peft_config = LoraConfig(
    task_type=peft.TaskType.SEQ_CLS,
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none"
)

In [88]:
peft_former = get_peft_model(model, peft_config)

In [89]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [90]:
print_trainable_parameters(model)

trainable params: 363353 || all params: 109914034 || trainable%: 0.3305792597876992


In [91]:
training_args = TrainingArguments(output_dir="test_trainer",
                                  no_cuda=False,
                                  learning_rate=1e-3,
                                  push_to_hub=False,
                                  num_train_epochs=1,
                                  evaluation_strategy='steps',
                                  eval_steps=50,
                                  logging_strategy="steps",
                                  logging_steps=50
                                  )

metric = evaluate.load("accuracy")

In [92]:
def compute_metrics(eval_pred):

    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)

    return metric.compute(predictions=predictions, references=labels)

In [93]:
"cuda:0" if torch.cuda.is_available() else "cpu"

'cuda:0'

In [101]:
type(train_dataset[0]['input_ids'])

torch.Tensor

In [115]:
inputs = train_dataset[0]

In [116]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [117]:
input_ids = inputs['input_ids'].unsqueeze(0)
attention_mask = inputs['attention_mask'].unsqueeze(0)
token_type_ids = inputs['token_type_ids'].unsqueeze(0)

# NOW I STOPPED HERE! CHECK EVAL FUNC

In [118]:
with torch.no_grad():
    logits = model(input_ids, attention_mask, token_type_ids).logits

In [119]:
logits

tensor([[ 0.4693, -0.0622,  0.7178,  0.1627, -0.1421, -0.1235,  0.2952,  0.0544,
          0.6805,  0.6063, -0.4417, -0.3302, -0.0539, -0.3465,  0.2902,  0.1370,
         -0.1958,  0.0670, -0.5611,  0.1078, -0.1611, -0.8442, -0.5220,  0.2288,
         -0.0096,  0.6315, -0.1905, -0.5507,  0.2417,  0.1591,  0.1722, -0.2233,
         -0.3620,  0.2168, -0.6344,  0.1465, -0.3930, -0.1170,  0.4615,  0.6546,
         -0.1103,  0.6494,  0.2672,  0.0044, -0.4229,  0.2915, -0.0084, -0.1462,
          0.1175,  0.4064, -0.3995,  0.1832, -0.0530, -0.8782,  0.4476,  0.6353,
         -0.2529, -0.5343,  0.4150, -0.3722, -0.1825,  0.6289, -0.3182, -0.2007,
         -0.0403,  0.2832, -0.0419,  0.3674, -0.2599,  0.4915, -0.3517,  0.2709,
          0.0086, -0.3790, -0.2798, -0.1293,  0.0945,  0.5888, -0.0504,  0.2047,
          0.1439, -0.5780, -0.1910, -0.2142, -0.2117, -0.1576, -0.0979, -0.1142,
         -0.1970]])

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [34]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrama30765[0m ([33mramzes[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/500 [00:00<?, ?it/s]

{'loss': 0.1036, 'learning_rate': 0.0009000000000000001, 'epoch': 0.1}


  0%|          | 0/125 [00:00<?, ?it/s]

ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)},
Input predictions: [38 38 38 38 36 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 36 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 36 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 36 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 36 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 36 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 36 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 36 38
 38 38 38 38 36 38 38 38 38 38 36 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 36 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 36 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 36 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 36
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 36 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 36 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 36 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
 38 38 38 38 38 38 36 38 38 38 38 38 38 38 38 38],
Input references: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]