# Logins

In [2]:
# from huggingface_hub import notebook_login
# notebook_login()

# Settings

In [None]:
# checkpoint = 'google-bert/bert-base-multilingual-cased'
checkpoint = 'amberoad/bert-multilingual-passage-reranking-msmarco'

# my_model_name = 'category_predictor_for_household_equipments'
my_model_name = 'category_predictor'

# dataset_name = 'Vampyrian/products_with_category_household_equipments'
dataset_name = 'Vampyrian/products_with_category'

# Prepare dataset

In [3]:
from datasets import load_dataset, Dataset

In [4]:
dataset = load_dataset('Vampyrian/products_with_category_household_equipments')

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 26056
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 3453
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 3453
    })
})

In [6]:
dataset = dataset.remove_columns('label')

In [7]:
unique_categories = set(dataset['train']['label_text'])
id2label = {idx: label for idx, label in enumerate(unique_categories)}
label2id = {label: idx for idx, label in enumerate(unique_categories)}

In [8]:
print(len(id2label))
print('----')
print("id2label:", id2label)
print('----')
print("label2id:", label2id)

79
----
id2label: {0: 'Dulkių siurblių maišai', 1: 'Skrudintuvai', 2: 'Pjaustyklės', 3: 'Dulkių siurbliai robotai', 4: 'Dulkių siurbliai', 5: 'Ledų gaminimo aparatai', 6: 'Elektriniai šildytuvai', 7: 'Pieno šaldytuvai', 8: 'Plakikliai mikseriai', 9: 'Kavos aparatai', 10: 'Kaitlentės', 11: 'Cikloninės krosnelės', 12: 'Oro jonizatoriai', 13: 'Ozono generatoriai', 14: 'Oro drėkintuvai', 15: 'Barzdaskutės', 16: 'Oro kondicionieriai', 17: 'Plaukų džiovintuvai', 18: 'Plaukų formavimo prietaisai', 19: 'Cukraus vatos aparatai', 20: 'Daugiafunkciniai puodai', 21: 'Šilumos siurbliai oras-oras', 22: 'Geoterminiai šilumos siurbliai', 23: 'Dujiniai šildytuvai', 24: 'Hepa filtrai', 25: 'Blenderiai', 26: 'Konvekciniai radiatoriai', 27: 'Vakuumatoriai', 28: 'Griliai ir keptuvės', 29: 'Epiliatoriai', 30: 'Rekuperatoriai', 31: 'Džiovyklės', 32: 'Kavamalės', 33: 'Virtuvinės svarstyklės', 34: 'Indaplovės', 35: 'Oro valytuvai', 36: 'Oro šildytuvai', 37: 'Ventiliatoriniai šildytuvai', 38: 'Šaldytuvai', 39: 

In [9]:
def replace_category_with_id(example):
    example['label'] = label2id[example.pop('label_text')]
    return example

In [10]:
dataset = dataset.map(replace_category_with_id)

Map:   0%|          | 0/26056 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [11]:
dataset = dataset.shuffle(seed=42)

In [12]:
dataset['train'][12]

{'text': 'BAGGED VACUUM CLEANER ERIS 750W', 'label': 4}

# Tokenizing

In [14]:
from transformers import AutoTokenizer

In [15]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [16]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [17]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/26056 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [18]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluate

In [19]:
import evaluate
accuracy = evaluate.load("accuracy")

In [20]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# Train

In [21]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification

In [23]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(id2label), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
training_args = TrainingArguments(
    output_dir=my_model_name,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[codecarbon INFO @ 17:00:30] [setup] RAM Tracking...
[codecarbon INFO @ 17:00:30] [setup] GPU Tracking...
[codecarbon INFO @ 17:00:30] No GPU found.
[codecarbon INFO @ 17:00:30] [setup] CPU Tracking...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environm

In [26]:
trainer.train()

Epoch,Training Loss,Validation Loss


[codecarbon INFO @ 17:00:49] Energy consumed for RAM : 0.000025 kWh. RAM Power : 6.0 W
[codecarbon INFO @ 17:00:49] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 17:00:49] 0.000202 kWh of electricity used since the beginning.
[codecarbon INFO @ 17:01:04] Energy consumed for RAM : 0.000050 kWh. RAM Power : 6.0 W
[codecarbon INFO @ 17:01:04] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 17:01:04] 0.000404 kWh of electricity used since the beginning.
[codecarbon INFO @ 17:01:19] Energy consumed for RAM : 0.000075 kWh. RAM Power : 6.0 W
[codecarbon INFO @ 17:01:19] Energy consumed for all CPUs : 0.000531 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 17:01:19] 0.000606 kWh of electricity used since the beginning.
[codecarbon INFO @ 17:01:34] Energy consumed for RAM : 0.000100 kWh. RAM Power : 6.0 W
[codecarbon INFO @ 17:01:34] Energy consumed for all CPUs : 0.000709 kWh. Total CPU Power : 42.5 W
[codeca

RuntimeError: MPS backend out of memory (MPS allocated: 4.12 GB, other allocations: 13.75 GB, max allowed: 18.13 GB). Tried to allocate 350.24 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [68]:
trainer.push_to_hub()

events.out.tfevents.1730988942.Zilvinass-Mac-mini.local.3613.1:   0%|          | 0.00/13.3k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Vampyrian/my_awesome_model/commit/045ed970b31b6989136faea0dcce80c11a42f7e3', commit_message='End of training', commit_description='', oid='045ed970b31b6989136faea0dcce80c11a42f7e3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Vampyrian/my_awesome_model', endpoint='https://huggingface.co', repo_type='model', repo_id='Vampyrian/my_awesome_model'), pr_revision=None, pr_num=None)

In [None]:
tokenizer.push_to_hub(my_model_name)

# Testing

In [None]:
# from transformers import pipeline

In [75]:
# classifier = pipeline("text-classification", model="Vampyrian/my_awesome_model")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [77]:
# classifier('Coleman 3000000396 turistinė viryklė Skystojo kuro viryklė')

[{'label': 'Viryklės', 'score': 0.9028318524360657}]