In [108]:
import io
import json
import os
import zipfile

import numpy as np
import pandas as pd
import torch
from loguru import logger
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

# Create dataset from files

In [277]:
# Outer archive: every entry is itself a zip of JSON files (open archive with archives).
# The handle stays open at notebook scope because a later cell iterates its entries.
data_zip = zipfile.ZipFile('data.zip', mode='r')

In [278]:
def json_to_array(json):
    """Turn one parsed packing-result document into a flat feature vector and its target.

    Args:
        json: Parsed JSON document (a dict). It must contain
            ``data_result.boxes`` (a list of box records with ``mass`` and a
            nested ``size`` holding ``width``/``height``/``length``) and
            ``data_result.cargo_space.calculation_info.density_percent``.
            The parameter name shadows the ``json`` module inside this function.

    Returns:
        tuple: ``(features, target)`` where ``features`` is a 1-D NumPy array
        laid out as ``mass, width, height, length`` for each box in order, and
        ``target`` is the raw ``density_percent`` value.

    Raises:
        KeyError: If any of the expected keys or columns is missing.
    """
    result = json['data_result']
    feature_columns = ['mass', 'size.width', 'size.height', 'size.length']

    # json_normalize flattens the nested "size" dict into dotted column names.
    box_frame = pd.json_normalize(result['boxes'])
    features = box_frame[feature_columns].to_numpy().flatten()

    density = result['cargo_space']['calculation_info']['density_percent']
    return features, density

In [279]:
arr = []      # one space-separated string of integer box features per sample
targets = []  # density percent rounded to the nearest integer, aligned with `arr`

for data_info in data_zip.filelist: # for all archives inside
    if data_info.is_dir():
        continue  # folder entries carry no payload

    try:
        # Read the nested archive out of data.zip itself instead of opening a
        # same-named file from the working directory, so this cell no longer
        # depends on data.zip having been extracted beforehand.
        jsons_zip = zipfile.ZipFile(io.BytesIO(data_zip.read(data_info)), 'r') # open archive with jsons
    except zipfile.BadZipFile as e:
        logger.warning(f'\nError {e} \n In file {data_info.filename}')
        continue

    with jsons_zip:  # close the nested archive once all its members are read
        for jsons_zip_info in jsons_zip.filelist:
            if jsons_zip_info.is_dir():
                continue

            try:
                with jsons_zip.open(jsons_zip_info.filename) as file: # open json
                    json_file = json.load(file)
                boxes, target = json_to_array(json_file)

                # 512 counts flattened feature values, not tokenizer tokens.
                if len(boxes) > 512:
                    logger.info(f'{jsons_zip_info.filename} length is more than 512. Skip.')
                    continue

                features_text = ' '.join(str(int(value)) for value in boxes)
                label = round(target)
            except Exception as e:
                # Best effort: a malformed file is reported and skipped so one
                # bad sample does not abort the whole load.
                logger.warning(f'\nError {e} \n In file {jsons_zip_info.filename}')
                continue

            # Append only after both values were computed, so `arr` and
            # `targets` can never get out of step.
            arr.append(features_text)
            targets.append(label)

[32m2024-02-09 17:52:54.041[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1m33377.json length is more than 512. Skip.[0m
[32m2024-02-09 17:52:54.483[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1m34766.json length is more than 512. Skip.[0m
[32m2024-02-09 17:52:54.813[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1m34944.json length is more than 512. Skip.[0m


In [333]:
from transformers import AutoTokenizer
import numpy as np
from sklearn.model_selection import train_test_split

# WordPiece tokenizer matching the "bert-base-uncased" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)

In [425]:
class SequenceDataset(Dataset):
    """Tokenized text samples paired with integer class labels.

    All samples are tokenized once, up front, padded/truncated to
    ``max_length`` tokens.

    Args:
        arr: Sequence of input strings.
        targets: Sequence of raw label values, one per entry of ``arr``.
        tokenizer: A Hugging Face tokenizer, or any callable accepting
            ``(texts, padding=, truncation=, max_length=, return_tensors=)``
            and returning a mapping with ``input_ids`` and ``attention_mask``.
        label2id: Optional existing ``{raw label: class index}`` mapping.
            Pass the training dataset's mapping when building a validation
            dataset so both use the same class indices. When omitted, the
            mapping is derived from the sorted unique values of ``targets``.
        max_length: Token length every sample is padded/truncated to.

    Attributes:
        labels: Set of raw label values covered by the mapping.
        label2id / id2label: Raw label <-> class index mappings.
        encodings: Tokenizer output for the whole of ``arr``.

    Raises:
        ValueError: If ``arr`` and ``targets`` differ in length, or if a
            supplied ``label2id`` does not cover every value in ``targets``.
    """

    def __init__(self, arr, targets, tokenizer, label2id=None, max_length=512) -> None:
        super().__init__()

        if len(arr) != len(targets):
            raise ValueError(
                f"arr and targets must have the same length, got {len(arr)} and {len(targets)}"
            )

        self.arr = arr
        self.targets = targets
        self.tokenizer = tokenizer

        if label2id is None:
            self.labels = set(targets)
            # Sort so the class index of a label does not depend on set
            # iteration order.
            self.label2id = {label: idx for idx, label in enumerate(sorted(self.labels))}
        else:
            unknown = sorted(set(targets) - set(label2id))
            if unknown:
                raise ValueError(f"targets contain labels missing from label2id: {unknown}")
            self.label2id = dict(label2id)
            self.labels = set(self.label2id)

        self.id2label = {idx: label for label, idx in self.label2id.items()}

        # Class index per sample: the value losses such as CrossEntropyLoss expect.
        self.label_ids = [self.label2id[target] for target in targets]

        self.encodings = tokenizer(arr,
                                   padding="max_length",
                                   truncation=True,
                                   max_length=max_length,
                                   return_tensors="pt")

    def __getitem__(self, idx):
        """Return one sample.

        Keys: ``input_ids``, ``attention_masks``, ``targets`` (the raw label
        value) and ``labels`` (its class index under ``label2id``).
        """
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_masks': self.encodings['attention_mask'][idx],
            'targets': self.targets[idx],
            'labels': self.label_ids[idx],
        }

    def __len__(self):
        """Number of samples."""
        return len(self.targets)

In [422]:
# 80/20 train/validation split with a fixed seed.
# not enough data for stratifying
train_arr, val_arr, train_labels, val_labels = train_test_split(
    arr,
    targets,
    train_size=0.8,
    random_state=42,
)

In [426]:
# Dataset & dataloader
# NOTE(review): each dataset derives its label <-> index mapping from its own
# targets, so the validation mapping can differ from the training one.
train_dataset = SequenceDataset(train_arr, train_labels, tokenizer)
val_dataset = SequenceDataset(val_arr, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Validation is not shuffled: order does not affect the metrics, and a fixed
# order keeps per-batch results comparable between epochs.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [428]:
# Sanity-check the batch layout; show only the shape instead of dumping the tensor.
next(iter(train_loader))['input_ids'].shape

torch.Size([16, 512])

In [378]:
from transformers import AutoModelForSequenceClassification

# Every sample carries exactly one density class, so configure the head as
# single-label (softmax / cross-entropy). "multi_label_classification" would
# make the model's built-in loss a per-class sigmoid / BCE, which does not
# match how predictions are decoded in this notebook (softmax + argmax).
# The number of output classes is taken from the size of `id2label`.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="single_label_classification",
    id2label=train_dataset.id2label,
    label2id=train_dataset.label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [449]:
# Display the module tree of the loaded classifier.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [452]:
# Smoke test: run one training batch through the untrained classifier head.
batch = next(iter(train_loader))

# NOTE: `targets` rebinds the notebook-level list of dataset targets to this
# batch's target tensor; re-run the dataset cell before re-running the split.
inp, msk, targets = batch['input_ids'], batch['attention_masks'], batch['targets']

# Inference only, so skip building the autograd graph.
with torch.no_grad():
    logits = model(inp, msk).logits

In [453]:
# Raw (pre-softmax) scores for the sampled batch: one row per sample, one column per class.
logits

tensor([[-8.1324e-02,  1.5877e-01,  1.8534e-01, -2.2440e-01, -1.0671e-01,
         -7.3002e-01, -8.6762e-02, -2.6212e-01,  3.8471e-02,  9.5836e-02,
         -5.1918e-02, -5.5732e-02, -1.3814e-01, -4.2400e-01,  2.8129e-01,
         -5.9642e-01,  1.9677e-02,  9.6603e-02, -3.1720e-01, -3.1919e-01,
         -2.2364e-01, -1.8494e-01, -4.9326e-02,  8.6304e-02,  1.6323e-01,
          6.5074e-02,  3.1736e-02, -8.1918e-02, -2.3737e-01, -7.5522e-02,
         -9.2556e-02, -6.7558e-02,  1.3388e-01,  5.1918e-01, -1.6889e-01,
         -1.4325e-01,  2.4757e-03,  3.2576e-01, -7.1208e-01, -1.7321e-01,
         -1.4190e-01, -2.4352e-01,  4.9416e-01,  1.1689e-01,  3.7894e-01,
          1.4997e-02, -1.1204e-01, -1.3797e-01, -1.4962e-01,  1.5934e-01,
         -1.6081e-01, -2.1723e-01, -1.4116e-01,  1.8200e-01],
        [-3.9210e-02,  1.9848e-01,  1.9677e-01, -3.0167e-01, -4.3886e-02,
         -7.3666e-01, -2.5710e-02, -2.2825e-01,  1.1192e-02,  1.3202e-01,
         -2.9506e-02,  4.2047e-02, -1.1006e-01, -4

In [467]:
# Normalise each row of logits into a probability distribution over classes.
probs = logits.softmax(dim=-1)
probs

tensor([[0.0176, 0.0224, 0.0230, 0.0153, 0.0172, 0.0092, 0.0175, 0.0147, 0.0199,
         0.0211, 0.0182, 0.0181, 0.0167, 0.0125, 0.0253, 0.0105, 0.0195, 0.0211,
         0.0139, 0.0139, 0.0153, 0.0159, 0.0182, 0.0209, 0.0225, 0.0204, 0.0197,
         0.0176, 0.0151, 0.0177, 0.0174, 0.0179, 0.0219, 0.0321, 0.0162, 0.0166,
         0.0192, 0.0265, 0.0094, 0.0161, 0.0166, 0.0150, 0.0314, 0.0215, 0.0279,
         0.0194, 0.0171, 0.0167, 0.0165, 0.0224, 0.0163, 0.0154, 0.0166, 0.0229],
        [0.0182, 0.0231, 0.0231, 0.0140, 0.0181, 0.0091, 0.0185, 0.0151, 0.0192,
         0.0216, 0.0184, 0.0198, 0.0170, 0.0120, 0.0248, 0.0103, 0.0191, 0.0208,
         0.0137, 0.0134, 0.0157, 0.0155, 0.0179, 0.0201, 0.0227, 0.0205, 0.0193,
         0.0170, 0.0150, 0.0177, 0.0181, 0.0183, 0.0210, 0.0316, 0.0157, 0.0159,
         0.0198, 0.0265, 0.0090, 0.0151, 0.0166, 0.0152, 0.0322, 0.0211, 0.0289,
         0.0208, 0.0173, 0.0170, 0.0158, 0.0214, 0.0170, 0.0155, 0.0172, 0.0222],
        [0.0173, 0.0227, 0

In [468]:
# Most probable class index for every sample in the batch.
idxs = torch.argmax(probs, dim=1)
idxs

tensor([33, 42, 33, 33, 42, 33, 33, 42, 42, 42, 33, 42, 42, 33, 33, 42])

In [469]:
# Target values of the sampled batch (`targets` was rebound to batch['targets'] above).
targets

tensor([65, 78, 64, 75, 71, 56, 70, 80, 78, 48, 74, 57, 77, 63, 57, 72])

In [470]:
# Translate predicted class indices back to raw label values.
[train_dataset.id2label[class_id] for class_id in idxs.tolist()]  # prediction

[77, 86, 77, 77, 86, 77, 77, 86, 86, 86, 77, 86, 86, 77, 77, 86]

## Train

### Freeze all layers except the last two (pooler and classifier)

In [411]:
import torch.nn as nn
import pytorch_lightning as pl
from sklearn.metrics import accuracy_score, f1_score

In [300]:
# `requires_grad_` is a method: it must be called. Assigning to it freezes
# nothing and replaces the method with a bool on this instance.
model.requires_grad_(False)

# Re-enable gradients only for the pooler and the classification head.
model.bert.pooler.requires_grad_(True)
model.classifier.requires_grad_(True)

In [327]:
# Hyperparameters for the training run.
num_of_epochs: int = 25
learning_rate: float = 1e-6
hidden_layers: int = 8

In [472]:
class LightningBert(pl.LightningModule):
    """Lightning wrapper around a sequence-classification model.

    Batches are expected to be dicts with ``input_ids``, ``attention_masks``
    and (for training/validation) ``targets`` holding raw label values.

    Args:
        hparams: Accepted for backward compatibility; not used.
        model: Model whose forward call returns an object with ``.logits``.
        train_id2label: ``{class index: raw label}`` mapping the classifier
            head was built with. It defines the meaning of every logit
            column, so it is used for encoding targets and decoding
            predictions in training, validation and prediction alike.
        val_id2label: Stored for backward compatibility. It is only used as
            the class mapping when ``train_id2label`` is not given, because
            decoding with a different mapping than the head was built with
            would mislabel predictions.
    """

    def __init__(self, hparams=None, model=None, train_id2label=None, val_id2label=None):
        super().__init__()
        self.model = model
        self.metric = f1_score
        self.criterion = nn.CrossEntropyLoss()
        self.train_id2label = train_id2label
        self.val_id2label = val_id2label

        id2label = train_id2label if train_id2label is not None else val_id2label
        self.id2label = dict(id2label) if id2label is not None else {}
        self.label2id = {label: idx for idx, label in self.id2label.items()}

    def forward(self, x, attention_mask=None):
        """Return classification logits.

        Args:
            x: Either a tensor of input ids, or a mapping holding
                ``input_ids`` plus ``attention_mask``/``attention_masks``.
            attention_mask: Mask tensor when ``x`` is a tensor.

        A single un-batched sample (1-D ids) gets a batch dimension added.
        """
        if torch.is_tensor(x):
            input_ids = x
        else:
            input_ids = x['input_ids']
            if attention_mask is None:
                # The dataset emits 'attention_masks'; tokenizers emit 'attention_mask'.
                mask_key = 'attention_mask' if 'attention_mask' in x else 'attention_masks'
                attention_mask = x[mask_key]

        if input_ids.dim() == 1:
            input_ids = input_ids.unsqueeze(0)
            if attention_mask is not None:
                attention_mask = attention_mask.unsqueeze(0)

        return self.model(input_ids, attention_mask=attention_mask).logits

    def _encode_targets(self, targets, device):
        """Convert raw target values to (python list of raw values, tensor of class indices)."""
        raw = targets.tolist() if torch.is_tensor(targets) else list(targets)
        try:
            class_ids = [self.label2id[value] for value in raw]
        except KeyError as exc:
            raise ValueError(
                f"Target {exc.args[0]!r} has no class index in train_id2label; "
                "build the label mapping from the full set of targets."
            ) from exc
        return raw, torch.tensor(class_ids, dtype=torch.long, device=device)

    def _decode(self, logits):
        """Map the arg-max class index of every row of ``logits`` to its raw label."""
        return [self.id2label[class_id] for class_id in logits.argmax(dim=-1).tolist()]

    def _shared_step(self, batch, stage):
        """Compute and log loss and macro-F1 for one batch; ``stage`` prefixes the log names."""
        logits = self.forward(batch['input_ids'], batch['attention_masks'])

        # CrossEntropyLoss needs class indices in [0, num_classes), not raw label values.
        raw_targets, class_ids = self._encode_targets(batch['targets'], logits.device)
        loss = self.criterion(logits, class_ids)

        # f1_score takes (y_true, y_pred) as plain Python values, and multiclass
        # input needs an explicit `average`. zero_division=0 silences warnings
        # for classes absent from a small batch.
        preds = self._decode(logits)
        score = float(self.metric(raw_targets, preds, average='macro', zero_division=0))

        self.log(f"{stage}_loss", loss, on_epoch=True)
        self.log(f"{stage}_f1", score, on_epoch=True)
        return loss, score

    def training_step(self, batch, batch_idx=None):
        """Run one optimisation step; logs ``train_loss`` and ``train_f1``."""
        loss, score = self._shared_step(batch, "train")
        return {'loss': loss, 'train_f1': score}

    def validation_step(self, batch, batch_idx=None):
        """Evaluate one validation batch; logs ``val_loss`` and ``val_f1``."""
        loss, score = self._shared_step(batch, "val")
        return {'loss': loss, 'val_f1': score}

    def predict_step(self, batch, batch_idx=None, dataloader_idx=0):
        """Return the predicted raw label for every sample in ``batch``."""
        logits = self.forward(batch['input_ids'], batch['attention_masks'])
        return self._decode(logits)

    def configure_optimizers(self):
        """Adam (lr=1e-5) with the learning rate halved when ``val_loss`` plateaus for 5 epochs."""
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-5)
        # `verbose` is deprecated on PyTorch LR schedulers, so it is not passed.
        plateau_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                                    mode='min',
                                                                    factor=0.5,
                                                                    patience=5)
        return {
            'optimizer': optimizer,
            'lr_scheduler': plateau_scheduler,
            'monitor': 'val_loss'
        }