# Logins

In [49]:
from dotenv import load_dotenv
import os

load_dotenv()

hf_token = os.getenv('HF_TOKEN')

In [7]:
from huggingface_hub import login
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Settings

In [53]:
# checkpoint = 'google-bert/bert-base-multilingual-cased'
checkpoint = 'amberoad/bert-multilingual-passage-reranking-msmarco'

is_test = 0

if is_test == 1: 
    my_model_name = 'category_predictor_for_household_equipments'
    dataset_name = 'Vampyrian/products_with_category_household_equipments'
else:
    my_model_name = 'category_predictor'
    dataset_name = 'Vampyrian/products_with_category'
    
print(my_model_name)
print(dataset_name)

category_predictor
Vampyrian/products_with_category


# Prepare dataset

In [9]:
from datasets import load_dataset, Dataset

In [10]:
dataset = load_dataset(dataset_name)

train.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

validation.json:   0%|          | 0.00/303k [00:00<?, ?B/s]

test.json:   0%|          | 0.00/305k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16824 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2153 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2153 [00:00<?, ? examples/s]

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 16824
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2153
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2153
    })
})

In [12]:
dataset = dataset.remove_columns('label')

In [13]:
unique_categories = set(dataset['train']['label_text'])
id2label = {idx: label for idx, label in enumerate(unique_categories)}
label2id = {label: idx for idx, label in enumerate(unique_categories)}

In [14]:
# print(len(id2label))
# print('----')
# print("id2label:", id2label)
# print('----')
# print("label2id:", label2id)

In [15]:
def replace_category_with_id(example):
    example['label'] = label2id[example.pop('label_text')]
    return example

In [16]:
dataset = dataset.map(replace_category_with_id)

Map:   0%|          | 0/16824 [00:00<?, ? examples/s]

Map:   0%|          | 0/2153 [00:00<?, ? examples/s]

Map:   0%|          | 0/2153 [00:00<?, ? examples/s]

In [17]:
# dataset = dataset.shuffle(seed=42)

In [24]:
index = 1561
print(dataset['train'][index])
print(id2label[dataset['train'][index]['label']])

{'text': 'Brock vaflinė 19,2cm WM 3001', 'label': 75}
Vaflinės


# Tokenizing

In [25]:
from transformers import AutoTokenizer

In [26]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [27]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [28]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/16824 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2153 [00:00<?, ? examples/s]

Map:   0%|          | 0/2153 [00:00<?, ? examples/s]

In [29]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluate

In [30]:
import evaluate
accuracy = evaluate.load("accuracy")

In [31]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# Train

In [32]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification

In [33]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(id2label), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at amberoad/bert-multilingual-passage-reranking-msmarco and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([79, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([79]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
training_args = TrainingArguments(
    output_dir=my_model_name,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

In [38]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [36]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9672,0.913175,0.787274
2,0.6297,0.73086,0.837901
3,0.4797,0.727262,0.844868


TrainOutput(global_step=12618, training_loss=1.0228470146117525, metrics={'train_runtime': 3471.0903, 'train_samples_per_second': 14.541, 'train_steps_per_second': 3.635, 'total_flos': 598388784722424.0, 'train_loss': 1.0228470146117525, 'epoch': 3.0})

In [39]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1731129095.Zilvinass-Mac-mini.local.2154.0:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Vampyrian/category_predictor_for_household_equipments/commit/76427cc813cc099d49357b943384fd0e8e0eb277', commit_message='End of training', commit_description='', oid='76427cc813cc099d49357b943384fd0e8e0eb277', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Vampyrian/category_predictor_for_household_equipments', endpoint='https://huggingface.co', repo_type='model', repo_id='Vampyrian/category_predictor_for_household_equipments'), pr_revision=None, pr_num=None)

In [40]:
tokenizer.push_to_hub(my_model_name)

README.md:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Vampyrian/category_predictor_for_household_equipments/commit/4e6d41c3154d973ffe8119f7b20a89e8f59e8718', commit_message='Upload tokenizer', commit_description='', oid='4e6d41c3154d973ffe8119f7b20a89e8f59e8718', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Vampyrian/category_predictor_for_household_equipments', endpoint='https://huggingface.co', repo_type='model', repo_id='Vampyrian/category_predictor_for_household_equipments'), pr_revision=None, pr_num=None)

# Testing

In [41]:
from transformers import pipeline

In [43]:
trained_model_name_on_hugging_face = 'Vampyrian/' + my_model_name

'Vampyrian/category_predictor_for_household_equipments'

In [44]:
classifier = pipeline("text-classification", model=trained_model_name_on_hugging_face)

config.json:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [48]:
classifier('virykle labai gera')

[{'label': 'Viryklės', 'score': 0.8829935193061829}]