In [3]:
import pandas as pd
import numpy as np

In [28]:
df = pd.read_csv('prepared_data.csv')

In [29]:
df['text_emb'] = np.load('embeddings.npy', allow_pickle=True)

In [31]:
X_train, y_train = df.drop(['id', 'price', 'actual_price', 'start_date', 'close_date', 'sale_end_date', 'latitude', 'longitude', 'description'], axis=1), df['actual_price']

In [23]:
cat_features = ['brand', 'model', 'generation', 'modification', 'equipment', 'body_type', 'color', 'owners_count']
embed_features = ['text_emb']

In [11]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(loss_function='RMSE', random_seed=42, eval_metric='MAPE', task_type='GPU', devices='0:1', cat_features=cat_features, embedding_features=embed_features, depth=10, l2_leaf_reg=1, iterations=1000, learning_rate=0.05)

In [12]:
model.fit(X_train, y_train, verbose=True)

0:	learn: 2.2381253	total: 397ms	remaining: 6m 36s
1:	learn: 2.1482790	total: 486ms	remaining: 4m 2s
2:	learn: 2.0675958	total: 577ms	remaining: 3m 11s
3:	learn: 1.9923255	total: 662ms	remaining: 2m 44s
4:	learn: 1.9209355	total: 749ms	remaining: 2m 28s
5:	learn: 1.8433332	total: 842ms	remaining: 2m 19s
6:	learn: 1.7670525	total: 925ms	remaining: 2m 11s
7:	learn: 1.6971314	total: 1.02s	remaining: 2m 6s
8:	learn: 1.6327009	total: 1.11s	remaining: 2m 1s
9:	learn: 1.5754256	total: 1.2s	remaining: 1m 58s
10:	learn: 1.5131376	total: 1.28s	remaining: 1m 55s
11:	learn: 1.4596825	total: 1.37s	remaining: 1m 53s
12:	learn: 1.4042063	total: 1.46s	remaining: 1m 51s
13:	learn: 1.3608227	total: 1.55s	remaining: 1m 49s
14:	learn: 1.3135573	total: 1.63s	remaining: 1m 47s
15:	learn: 1.2662828	total: 1.72s	remaining: 1m 45s
16:	learn: 1.2186683	total: 1.8s	remaining: 1m 44s
17:	learn: 1.1759077	total: 1.89s	remaining: 1m 43s
18:	learn: 1.1331171	total: 1.98s	remaining: 1m 42s
19:	learn: 1.0936877	total:

<catboost.core.CatBoostRegressor at 0x2a9a888a7d0>

In [13]:
model.save_model('catboost_emb_model')

In [4]:
test = pd.read_csv('prepared_test.csv')

In [5]:
import torch
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
model.cuda()  # uncomment it if you have a GPU

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

print(embed_bert_cls('привет мир', model, tokenizer).shape)

(312,)


In [6]:
embeddings = test['description'].apply(lambda x: embed_bert_cls(x, model, tokenizer))

In [7]:
test['text_emb'] = embeddings

In [8]:
test

Unnamed: 0,id,sale_end_date,brand,model,generation,modification,equipment,body_type,color,year,...,is_carsharing,description,car_age,month,day,sale_year,day_of_week,day_of_year,distance_from_moscow,text_emb
0,436662,2022-07-24,Kia,Ceed,ED (2006—2010),1.6 MT (122 л.с.),Люкс,Хетчбэк,Серебряный,2009.0,...,0.0,жаркая пора в ключавто продаём автомобили деше...,13.0,7,24,2022,6,205,1666.587343,"[-0.0008980326, 0.0028284832, -0.012875804, -0..."
1,427226,2022-07-21,Nissan,Qashqai,II рестайлинг (2017—н. в.),1.2 DIG-T CVT (115 л.с.),SE,Внедорожник,Белый,2018.0,...,0.0,купим дорого ваш автомобиль в день обращения t...,4.0,7,21,2022,3,202,699.858552,"[-0.035715684, -0.023991501, -0.060391717, -0...."
2,153261,2022-07-24,Skoda,Kodiaq,I (2016—н. в.),2.0 TSI 4x4 DSG (180 л.с.),Unknown,Внедорожник,Чёрный,2021.0,...,0.0,23 и 24 июля weekend продаж автомобилей с проб...,1.0,7,24,2022,6,205,20.614456,"[0.024328541, 0.02273651, 0.0146195665, -0.054..."
3,411283,2022-07-13,Mercedes-Benz,GLC-класс,X253 (2015—2019),GLC 250 2.0 4MATIC 9G-Tronic (211 л.с.),Unknown,Внедорожник,Синий,2017.0,...,0.0,ночь продаж в рольф ясенево только 14 07 успей...,5.0,7,13,2022,2,194,17.501037,"[0.021716949, -0.0024255516, -0.014956146, -0...."
4,248112,2022-07-13,Kia,Cerato,III рестайлинг (2016—2020),1.6 AT (130 л.с.),Unknown,Седан,Серый,2017.0,...,0.0,успейте забрать с максимальной выгодой автомоб...,5.0,7,13,2022,2,194,647.247097,"[0.013503316, 0.012659598, -0.005661203, -0.05..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1606,142559,2022-07-30,Kia,Sportage,IV рестайлинг (2018—н. в.),2.0 MPI 4WD AT (150 л.с.),Unknown,Внедорожник,Красный,2019.0,...,0.0,не ждите пока купят ваш авто в рольф звезда ст...,3.0,7,30,2022,5,211,31.050962,"[0.04204547, -0.022820592, -0.01716095, -0.084..."
1607,435799,2022-07-26,ВАЗ (LADA),XRAY,I (2015—н. в.),1.8 AMT (122 л.с.),Exclusive,Хетчбэк,Серый,2018.0,...,0.0,асц честно автомобили с пробегом мы готовы вам...,4.0,7,26,2022,1,207,19.595866,"[-0.005243049, 0.008879315, 0.015105072, -0.09..."
1608,344058,2022-07-11,Toyota,RAV4,IV рестайлинг (2015—2019),2.0 CVT (146 л.с.),Comfort,Внедорожник,Чёрный,2018.0,...,0.0,на данном сайте представлены не все автомобили...,4.0,7,11,2022,0,192,623.765430,"[-0.018012553, 0.017068796, 0.006283274, -0.09..."
1609,97253,2022-07-27,BMW,X5,G05 (2018—н. в.),30d 3.0 xDrive Steptronic (249 л.с.),M Sport Pro,Внедорожник,Чёрный,2021.0,...,0.0,выкупим ваш в день обращения до от рыночной ст...,1.0,7,27,2022,2,208,31.784430,"[-0.040683582, 0.024216073, 0.03270462, -0.093..."


In [24]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(loss_function='RMSE', random_seed=42, eval_metric='MAPE', task_type='GPU', devices='0:1', cat_features=cat_features, embedding_features=embed_features, depth=10, l2_leaf_reg=1, iterations=1000, learning_rate=0.05)

In [32]:
X_train.columns

Index(['brand', 'model', 'generation', 'modification', 'equipment',
       'body_type', 'color', 'year', 'mileage', 'owners_count', 'crashes',
       'is_taxi', 'is_carsharing', 'car_age', 'month', 'day', 'sale_year',
       'day_of_week', 'day_of_year', 'distance_from_moscow', 'text_emb'],
      dtype='object')

In [33]:
test.columns

Index(['id', 'sale_end_date', 'brand', 'model', 'generation', 'modification',
       'equipment', 'body_type', 'color', 'year', 'mileage', 'owners_count',
       'latitude', 'longitude', 'crashes', 'is_taxi', 'is_carsharing',
       'description', 'car_age', 'month', 'day', 'sale_year', 'day_of_week',
       'day_of_year', 'distance_from_moscow', 'text_emb'],
      dtype='object')

In [25]:
model.load_model('catboost_emb_model')

<catboost.core.CatBoostRegressor at 0x169d67bfe10>

In [26]:
model.get_n_features_in()

0

In [35]:
pd.Series(model.predict(test.drop(['id', 'sale_end_date', 'description', 'latitude', 'longitude'], axis=1))).to_csv('solution6.csv', header=False, index=False)

In [21]:
X_train

Unnamed: 0,brand,model,generation,modification,equipment,body_type,color,year,mileage,owners_count,...,is_taxi,is_carsharing,car_age,month,day,sale_year,day_of_week,day_of_year,distance_from_moscow,text_emb
0,Hyundai,i40,I (2011—2015),2.0 AT (150 л.с.),Unknown,Седан,Белый,2014.0,106000,2,...,0.0,0.0,7.0,10,26,2021,1,299,1691.518471,"[0.026884185, -0.015162821, -0.024812264, -0.1..."
1,Hyundai,i40,I (2011—2015),2.0 AT (150 л.с.),Comfort,Седан,Чёрный,2013.0,179811,> 3,...,0.0,0.0,8.0,9,5,2021,6,248,1513.025798,"[-0.007450554, -0.024627624, -0.01478615, -0.0..."
2,Hyundai,Solaris,II (2017—2020),1.4 MT (100 л.с.),Active,Седан,Белый,2018.0,188000,1,...,0.0,0.0,3.0,7,20,2021,1,201,1515.653814,"[0.03736122, 0.0041526416, -0.045261025, -0.07..."
3,Mazda,Demio,DW (1997—2003),1.3 AT (83 л.с.),Базовая,Хетчбэк,Белый,2002.0,229346,> 3,...,0.0,0.0,19.0,10,26,2021,1,299,4417.209233,"[0.011051852, -0.016075635, 0.008649559, -0.07..."
4,Mazda,Demio,DW (1997—2003),1.3 AT (83 л.с.),Базовая,Хетчбэк,Белый,2003.0,202000,> 3,...,0.0,0.0,18.0,8,23,2021,0,235,1405.330865,"[0.027114373, 0.015217679, -0.048252683, -0.08..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478282,Ford,Focus,III рестайлинг (2014—2019),1.6 MT (105 л.с.),Unknown,Хетчбэк,Белый,2017.0,50480,1,...,0.0,0.0,4.0,11,26,2021,4,330,661.955812,"[-0.041036412, 0.0067323064, 0.06832428, -0.09..."
478283,Ford,Focus,III рестайлинг (2014—2019),1.6 MT (105 л.с.),Trend,Седан,Бежевый,2018.0,94300,1,...,0.0,0.0,3.0,9,28,2021,1,271,1465.579100,"[-0.0041431864, -0.016772578, -0.041577887, -0..."
478284,Ford,Focus,III рестайлинг (2014—2019),1.6 MT (105 л.с.),Trend,Универсал,Белый,2018.0,180837,3,...,0.0,0.0,3.0,8,28,2021,5,240,603.524155,"[-0.015547022, 0.017335007, 0.018243572, -0.07..."
478285,Ford,Focus,III рестайлинг (2014—2019),1.6 MT (105 л.с.),SYNC Edition,Седан,Синий,2017.0,47000,2,...,0.0,0.0,4.0,6,8,2021,1,159,841.666777,"[0.020203838, 0.014478373, -0.022698974, -0.07..."


In [None]:
model.predict(test.drop('description', axis=1))