In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training table from the CSV sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer='prepared_data.csv')

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
# Load the rubert-tiny2 tokenizer and encoder from the Hugging Face hub.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
# Move the encoder to the GPU only when CUDA is actually available, so this
# cell also runs on CPU-only machines; embed_bert_cls sends its inputs to
# model.device, so it works with either placement.
if torch.cuda.is_available():
    model.cuda()

def embed_bert_cls(text, model, tokenizer):
    """Embed ``text`` using the encoder's hidden state at sequence position 0.

    The text is tokenized (with padding and truncation enabled), run through
    ``model`` with gradients disabled, and the last-layer hidden state at
    position 0 of every sequence is L2-normalized.

    Args:
        text: Input passed straight to ``tokenizer``.
        model: Callable encoder exposing ``.device`` and returning an object
            with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping of PyTorch tensors.

    Returns:
        A NumPy array holding the normalized vector of the FIRST sequence in
        the batch only (any further sequences are discarded).
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Put every input tensor on the same device as the model.
    inputs = {}
    for key, tensor in encoded.items():
        inputs[key] = tensor.to(model.device)

    with torch.no_grad():
        output = model(**inputs)

    # Position 0 along the sequence axis; presumably the [CLS] token for
    # BERT-style tokenizers, as the function name suggests.
    first_token_states = output.last_hidden_state[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(first_token_states)
    first_row = unit_vectors[0]
    return first_row.cpu().numpy()

# Smoke check: embed a short phrase and show the shape of the resulting vector.
print(embed_bert_cls('привет мир', model=model, tokenizer=tokenizer).shape)

(312,)


In [3]:
# Read the held-out table.
test = pd.read_csv(filepath_or_buffer='prepared_test.csv')

# Embed every description one row at a time; `args` forwards the encoder and
# tokenizer to embed_bert_cls after the cell value.
test_embeddings = test['description'].apply(embed_bert_cls, args=(model, tokenizer))

# Persist the per-row vectors so they can be reloaded without re-embedding.
np.save(file='test_embeddings.npy', arr=test_embeddings.values)

In [26]:
# Embed every training description one row at a time; `args` forwards the
# encoder and tokenizer to embed_bert_cls after the cell value.
embeddings = df['description'].apply(embed_bert_cls, args=(model, tokenizer))

In [38]:
# Persist the training embeddings so later cells can reload them from disk.
np.save(file='embeddings.npy', arr=embeddings.values)

In [48]:
# Attach the saved embeddings to the training frame as a single column.
# allow_pickle=True is presumably required because the file was written from a
# Series of per-row arrays (object dtype) — only load files you produced yourself.
df['text_embed'] = np.load(file='embeddings.npy', allow_pickle=True)

In [76]:
# Features: every column of `df` except the ones listed below.
X_train = df.drop(
    columns=[
        'id',
        'price',
        'actual_price',
        'start_date',
        'close_date',
        'sale_end_date',
        'latitude',
        'longitude',
        'description',
    ]
)
# Target: the `actual_price` column.
y_train = df['actual_price']

In [78]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'brand',
    'model',
    'generation',
    'modification',
    'equipment',
    'body_type',
    'color',
    'owners_count',
]
# Columns handed to CatBoost as embedding features.
embed_features = [
    'text_embed',
]

In [77]:
from catboost import CatBoostRegressor

In [79]:
# Gradient-boosted regressor: optimizes RMSE, reports MAPE, trains on GPU.
# NOTE: this rebinds `model`, the same name the BERT encoder was assigned to in
# an earlier cell; re-running the embedding cells after this one would pass the
# regressor to embed_bert_cls instead of the encoder.
# NOTE(review): devices='0:1' — confirm it matches the GPUs available.
model = CatBoostRegressor(
    loss_function='RMSE',
    random_seed=42,
    eval_metric='MAPE',
    task_type='GPU',
    devices='0:1',
    cat_features=cat_features,
    embedding_features=embed_features,
    depth=10,
    l2_leaf_reg=1,
    iterations=1000,
    learning_rate=0.05,
)

In [None]:
# Fit on the training features and target with verbose logging; no eval_set is passed.
model.fit(X_train, y=y_train, verbose=True)

