In [1]:
pip install transformers==4.45.2 sentence-transformers==3.1.1

Collecting transformers==4.45.2
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.4/44.4 kB 2.1 MB/s eta 0:00:00
Collecting sentence-transformers==3.1.1
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers==4.45.2)
  Downloading tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers==3.1.1)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers==3.1.1)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers==3.1.1)
  Downloading nvidia_cufft_cu

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
import pandas as pd
import numpy as np
import torch
import datasets
import transformers
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

In [10]:
import peft

In [11]:
# Print the installed version of each core library, one per line.
print(
    *(library.__version__ for library in (transformers, torch, datasets, peft)),
    sep='\n',
)

4.45.2
2.6.0+cu124
3.6.0
0.14.0


In [None]:
# Read the training split from the Kaggle input directory and preview its first 10 rows.
train_data = pd.read_parquet(
    path='/kaggle/input/avito-transformer-task/data/df_train.parquet',
)
train_data.head(n=10)

In [None]:
# Read the validation split from the Kaggle input directory and preview its first 10 rows.
valid_data = pd.read_parquet(
    path='/kaggle/input/avito-transformer-task/data/df_valid.parquet',
)
valid_data.head(n=10)

In [None]:
# Use the first CUDA device when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Wrap both pandas frames as Hugging Face datasets, keyed by split name.
dataset = DatasetDict({
    split: Dataset.from_pandas(frame)
    for split, frame in (('train', train_data), ('valid', valid_data))
})
dataset

In [None]:
# Keep only training rows whose description has at least 10 whitespace-separated
# tokens; the validation split is left untouched.
dataset['train'] = dataset['train'].filter(
    lambda row: 10 <= len(row['description'].split())
)
dataset

In [None]:
def log_price(example):
    """Return a one-key mapping holding the natural log of ``example['price']``.

    Args:
        example: Mapping for a single row; its ``'price'`` entry is read.

    Returns:
        ``{'log_price': np.log(price)}``.
    """
    price = example['price']
    return dict(log_price=np.log(price))

# Add a `log_price` column to every split by applying log_price row by row.
dataset = dataset.map(function=log_price)

In [None]:
# Rename the regression target column from `log_price` to `labels`.
dataset = dataset.rename_column('log_price', 'labels')
dataset

In [None]:
# Checkpoint identifier; the tokenizer is loaded from it here.
model_name = 'cointegrated/rubert-tiny2'
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of rows as ``city + sep_token + description``.

    Args:
        examples: Batch mapping with parallel ``'city'`` and ``'description'``
            sequences of strings.

    Returns:
        The tokenizer's encoding for the batch, truncated to at most 512
        tokens per row and left unpadded.
    """
    separator = tokenizer.special_tokens_map['sep_token']
    texts = [
        city + separator + description
        for city, description in zip(examples['city'], examples['description'])
    ]
    # Padding is left to the DataCollatorWithPadding passed to the trainers, which
    # pads each mini-batch to its own longest row. Padding here would instead pad
    # every row to the longest row of the whole map batch and waste compute.
    return tokenizer(texts, padding=False, truncation=True, max_length=512)

In [None]:
# Tokenize every split, handing rows to tokenize_function in batches.
dataset = dataset.map(function=tokenize_function, batched=True)
dataset

In [None]:
# Drop everything except the tokenizer outputs and the regression target.
dataset = dataset.select_columns(
    column_names=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
)
dataset

In [None]:
# Collator that pads each mini-batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
class RegressionTrainer(Trainer):
    """Trainer whose loss is the mean absolute error between logits and labels."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the L1 loss for one batch.

        Args:
            model: Model invoked as ``model(**inputs)``.
            inputs: Batch mapping; its ``'labels'`` entry is the regression target.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
            num_items_in_batch: Accepted and ignored. Newer ``Trainer`` releases pass
                this keyword to ``compute_loss``; taking it here keeps the override
                usable with them while remaining valid for the older signature.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get('labels')
        # Labels stay in `inputs`, so the model still receives them in the forward call.
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Squeeze both sides so the logits and the labels are subtracted elementwise.
        loss = torch.mean(torch.abs(logits.squeeze() - labels.squeeze()))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Hyperparameters for the fine-tuning run; checkpoints go to the Kaggle working directory.
training_args = TrainingArguments(
    output_dir='/kaggle/working/',
    report_to=[],  # no experiment-tracking integrations
    num_train_epochs=3, warmup_steps=100,
    optim='adamw_torch', learning_rate=1e-4, weight_decay=1e-2,
    # Request fp16 mixed precision only when a CUDA device is present, so the
    # configuration stays usable on the CPU fallback this notebook allows.
    fp16=torch.cuda.is_available(), max_grad_norm=1.0, gradient_accumulation_steps=1,
    per_device_train_batch_size=64, per_device_eval_batch_size=128,
    do_eval=True, eval_strategy='steps', eval_steps=500, dataloader_num_workers=4,
)

In [None]:
# First 10 rows of each split, collected in a plain dict keyed by split name.
small_dataset = {
    split: dataset[split].select(range(10))
    for split in ('train', 'valid')
}
small_dataset

In [None]:
# A single output label gives the classification head one regression output.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)

# Fine-tune on the training split and evaluate on the validation split.
trainer = RegressionTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=dataset['valid'],
    train_dataset=dataset['train'],
)
trainer.train()

In [None]:
def mdape(y_true, y_pred):
    """Median absolute percentage error, in percent, rounded to 5 decimals.

    Args:
        y_true: Array-like of reference values; used as the denominator.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.

    Returns:
        ``median(|y_pred - y_true| / |y_true| * 100)`` rounded to 5 decimal places.
    """
    # Coerce to arrays so plain Python sequences work too; ndarrays pass through unchanged.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    percentage_error = np.abs((y_pred - y_true) / y_true) * 100
    return np.round(np.median(percentage_error), 5)

In [None]:
# Run the fine-tuned model over the validation split.
valid_true_n_pred = trainer.predict(dataset['valid'])

# The labels are log-prices, so exponentiate both sides before scoring.
y_pred = np.exp(valid_true_n_pred.predictions.squeeze())
y_test = np.exp(valid_true_n_pred.label_ids)

print(f'MdAPE: {mdape(y_test, y_pred)}')

In [None]:
from datasets import load_dataset

In [None]:
# Load the unlabeled test parquet as a DatasetDict with a single `test` split.
dataset_test = load_dataset(
    path='parquet',
    data_files={'test': '/kaggle/input/avito-transformer-task/data/df_test_no_target.parquet'},
)
dataset_test

In [None]:
# Tokenize the test split in batches, spread over 8 worker processes.
dataset_test = dataset_test.map(
    function=tokenize_function,
    batched=True,
    num_proc=8,
)
dataset_test

In [None]:
# Arguments for the inference-only Trainer: same output directory, no reporting integrations.
training_args = TrainingArguments(
    report_to=[],
    output_dir='/kaggle/working/',
)

In [None]:
# Build a plain Trainer around the current `model` to run inference on the test split.
data_collator = DataCollatorWithPadding(tokenizer)
trainer = Trainer(args=training_args, model=model, data_collator=data_collator)

y_logits = trainer.predict(
    test_dataset=dataset_test['test'],
    ignore_keys=['labels'],
).predictions

# Exponentiate the squeezed predictions and attach them as the `price_pred` column.
# NOTE: this rebinds `dataset_test` from the DatasetDict to its `test` split.
dataset_test = dataset_test['test'].add_column(
    name='price_pred',
    column=np.exp(y_logits.squeeze()),
)
dataset_test

In [None]:
# Switch the dataset's output format to polars, then write the id and the
# predicted price of every row to a CSV file.
dataset_test.set_format(type='polars')
(
    dataset_test
    .select_columns(['item_id', 'price_pred'])[:]
    .write_csv('test_preds.csv')
)