In [1]:
import pandas as pd
import numpy as np

In [19]:
df = pd.read_csv('prepared_data.csv')

In [22]:
df['text_emb'] = np.load('embeddings.npy', allow_pickle=True)

In [3]:
df['text_emb'] = np.load('train_w2v.npy', allow_pickle=True).tolist()

In [9]:
df['img_emb'] = np.load('img_embeddings.npy', allow_pickle=True).tolist()

In [7]:
df['sale_end_date']

0         2021-10-26
1         2021-09-05
2         2021-07-20
3         2021-10-26
4         2021-08-23
             ...    
478282    2021-11-26
478283    2021-09-28
478284    2021-08-28
478285    2021-06-08
478286    2021-05-31
Name: sale_end_date, Length: 478287, dtype: object

In [10]:
# transform sale_end_date to day count from min of sale_end_date
df['sale_end_date'] = pd.to_datetime(df['sale_end_date'])
df['sale_end_date'] = (df['sale_end_date'] - df['sale_end_date'].min()).dt.days

In [11]:
X_train, y_train = df.drop(['id', 'price', 'actual_price', 'start_date', 'close_date', 'latitude', 'longitude', 'description'], axis=1), df['actual_price']

In [12]:
cat_features = ['brand', 'model', 'generation', 'modification', 'equipment', 'body_type', 'color', 'owners_count']
embed_features = ['text_emb', 'img_emb']

In [13]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(loss_function='RMSE', random_seed=42, eval_metric='MAPE', task_type='GPU', devices='0:1', cat_features=cat_features, embedding_features=embed_features, l2_leaf_reg=1, max_depth=10, iterations=1000, learning_rate=0.05)

In [14]:
model.fit(X_train, y_train, verbose=True)

0:	learn: 2.2313138	total: 343ms	remaining: 5m 42s
1:	learn: 2.1485674	total: 440ms	remaining: 3m 39s
2:	learn: 2.0632648	total: 527ms	remaining: 2m 55s
3:	learn: 1.9877638	total: 616ms	remaining: 2m 33s
4:	learn: 1.9173313	total: 702ms	remaining: 2m 19s
5:	learn: 1.8457250	total: 788ms	remaining: 2m 10s
6:	learn: 1.7762177	total: 852ms	remaining: 2m
7:	learn: 1.7089169	total: 944ms	remaining: 1m 57s
8:	learn: 1.6407084	total: 1.01s	remaining: 1m 50s
9:	learn: 1.5857133	total: 1.07s	remaining: 1m 46s
10:	learn: 1.5268382	total: 1.14s	remaining: 1m 42s
11:	learn: 1.4682118	total: 1.22s	remaining: 1m 40s
12:	learn: 1.4166285	total: 1.28s	remaining: 1m 37s
13:	learn: 1.3640734	total: 1.36s	remaining: 1m 36s
14:	learn: 1.3120331	total: 1.42s	remaining: 1m 33s
15:	learn: 1.2638745	total: 1.49s	remaining: 1m 31s
16:	learn: 1.2200248	total: 1.55s	remaining: 1m 29s
17:	learn: 1.1830935	total: 1.63s	remaining: 1m 28s
18:	learn: 1.1432230	total: 1.69s	remaining: 1m 27s
19:	learn: 1.1049101	total

<catboost.core.CatBoostRegressor at 0x18f109d9a50>

In [15]:
test = pd.read_csv('prepared_test.csv')

In [16]:
test['text_emb'] = np.load('test_w2v.npy', allow_pickle=True).tolist()

In [17]:
test['img_emb'] = np.load('test_img_embeddings.npy', allow_pickle=True).tolist()

In [21]:
test['sale_end_date'] = pd.to_datetime(test['sale_end_date'])
test['sale_end_date'] = (test['sale_end_date'] - pd.to_datetime(df['sale_end_date']).min()).dt.days

In [23]:
pd.Series(model.predict(test.drop(['id', 'description', 'latitude', 'longitude'], axis=1))).to_csv('solution6.csv', header=False, index=False)