In [1]:
# Enable IPython's autoreload extension (mode 2) so edits to imported local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [31]:
import numpy as np
import pandas as pd
import catboost as cb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gensim
import optuna

from sklearn.metrics import precision_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.word2vec import Word2Vec


# Suppress every warning for the rest of the session. NOTE(review): this also
# hides pandas' chained-assignment warnings raised by later cells.
warnings.simplefilter('ignore')
# Seed NumPy's global RNG so calls that fall back to it are repeatable.
np.random.seed(10)

Загрузим обучающие данные (`project_train.f`); train, val и test выделим из них ниже.

In [79]:
# Only these columns are read from the feather file.
LOAD_COLUMNS = [
    'brand', 'sale_end_date', 'price', 'actual_price', 'description',
    'model', 'year', 'generation', 'modification', 'color',
    'body_type', 'equipment', 'owners_count', 'mileage',
    'latitude', 'longitude', 'crashes', 'is_taxi', 'is_carsharing',
]

data = pd.read_feather('../data/project_train.f', columns=LOAD_COLUMNS)

# Boolean mask: True for rows whose `actual_price` is present.
DEALER = data['actual_price'].notna()

data.shape

(1424484, 19)


Для бейзлайна будем использовать в обучении только те данные, по которым есть actual_price, то есть цена сделки – наш финальный таргет.

In [80]:
# Keep rows with a known `actual_price` in `data`; the rest go to `users`.
# `data` gets an explicit copy because later cells add/overwrite its columns;
# `users` is only read afterwards, so the filtered result is used as is.
# NOTE: this cell rebinds `data`, so it can only be run once per load
# (DEALER is indexed like the unfiltered frame).
data, users = data[DEALER].copy(), data[~DEALER]

In [81]:
# Earliest and latest sale date among the kept rows.
data['sale_end_date'].min(), data['sale_end_date'].max()

(Timestamp('2022-06-11 00:00:00'), Timestamp('2023-02-05 00:00:00'))

Разобьём выборку на train, val и test. Будем использовать временную валидацию.

In [82]:
# Make sure `sale_end_date` is a datetime column. `assign` returns a new frame,
# so nothing is written into a slice of the originally loaded frame.
data = data.assign(sale_end_date=pd.to_datetime(data['sale_end_date']))

In [83]:
# Print column dtypes and non-null counts for the kept rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81426 entries, 6 to 1424254
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   brand          81426 non-null  object        
 1   sale_end_date  81426 non-null  datetime64[ns]
 2   price          81426 non-null  object        
 3   actual_price   81426 non-null  float64       
 4   description    81393 non-null  object        
 5   model          81426 non-null  object        
 6   year           81426 non-null  int64         
 7   generation     81426 non-null  object        
 8   modification   81426 non-null  object        
 9   color          81426 non-null  object        
 10  body_type      81426 non-null  object        
 11  equipment      42733 non-null  object        
 12  owners_count   81426 non-null  object        
 13  mileage        81426 non-null  int64         
 14  latitude       81426 non-null  float64       
 15  longitude      81

In [84]:
# Window sizes in days, counted back from the latest sale date:
# everything newer than `train_delta` days is held out of training,
# and everything newer than `val_delta` days is the test part of that hold-out.
train_delta = 30
val_delta = 10

# pd.Timedelta is used because `timedelta` is not imported in this notebook.
TRAIN_SPLIT = data.sale_end_date.max() - pd.Timedelta(days=train_delta)
VAL_SPLIT = data.sale_end_date.max() - pd.Timedelta(days=val_delta)

In [85]:
# Time-based split:
#   train: sale_end_date <= TRAIN_SPLIT
#   val:   TRAIN_SPLIT < sale_end_date <= VAL_SPLIT
#   test:  sale_end_date > VAL_SPLIT (taken from the post-TRAIN_SPLIT rows)
# Each part is an explicit copy because later cells add columns to it.
train = data[data['sale_end_date'] <= TRAIN_SPLIT].copy()
holdout = data[data['sale_end_date'] > TRAIN_SPLIT]
test = holdout[holdout['sale_end_date'] > VAL_SPLIT].copy()
val = holdout[holdout['sale_end_date'] <= VAL_SPLIT].copy()

print(f'Train rows: {train.shape[0]}')
print(f'Min train date: {train.sale_end_date.min()}')
print(f'Max train date: {train.sale_end_date.max()}')
print(f'Val rows: {val.shape[0]}')
print(f'Min val date: {val.sale_end_date.min()}')
print(f'Max val date: {val.sale_end_date.max()}')

Train rows: 69809
Min train date: 2022-06-11 00:00:00
Max train date: 2023-01-06 00:00:00
Val rows: 7173
Min val date: 2023-01-07 00:00:00
Max val date: 2023-01-26 00:00:00


### Парсинг колонок

In [86]:
def horsepower(x):
    """Return the leading integer of the first parenthesised group in `x`.

    Parameters
    ----------
    x : str
        Text containing a parenthesised group whose first whitespace-separated
        token is an integer, e.g. ``'1.6 MT (105 hp)'``.

    Returns
    -------
    int
        The first token inside the parentheses, converted to int.

    Raises
    ------
    ValueError
        If there is no ``'('``, the group is empty, or its first token is not
        an integer.
    """
    _, opening, tail = x.partition('(')
    if not opening:
        raise ValueError(f'no "(" found in {x!r}')

    # Take everything up to the closing bracket; if the bracket is missing the
    # whole tail is used, so no trailing character is lost.
    inside = tail.partition(')')[0].split()
    if not inside:
        raise ValueError(f'empty parenthesised group in {x!r}')

    return int(inside[0])

# Add the same derived columns to every split:
#   horsepower - parsed from the `modification` text,
#   month / sale_year - calendar parts of `sale_end_date`.
# The vectorised `.dt` accessor is used for all three frames (train/val
# previously used a much slower row-wise `apply`).
for frame in (train, val, test):
    frame['horsepower'] = frame['modification'].apply(horsepower)
    frame['month'] = frame['sale_end_date'].dt.month.astype(int)
    frame['sale_year'] = frame['sale_end_date'].dt.year.astype(int)

### Эмбеддинги для текста

In [13]:
# Lemmatiser instance shared by all `preprocess_text` calls.
mystem = Mystem()
# Stop words are only used for membership tests, so keep them in a set
# (constant-time lookup per token instead of scanning a list).
russian_stopwords = set(stopwords.words("russian"))

def preprocess_text(text):
    """Lower-case and lemmatise `text`, drop unwanted tokens, re-join with spaces.

    A token produced by `mystem.lemmatize` is dropped when it is in
    `russian_stopwords`, is a single space, or (after stripping whitespace)
    is a substring of `string.punctuation`.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Remaining tokens joined by a single space.
    """
    kept = []
    for lemma in mystem.lemmatize(text.lower()):
        if lemma in russian_stopwords:
            continue
        if lemma == " ":
            continue
        # `punctuation` is a str, so this is a substring test; whitespace-only
        # tokens strip to '' and are therefore dropped here as well.
        if lemma.strip() in punctuation:
            continue
        kept.append(lemma)

    return " ".join(kept)

In [14]:
# Replace missing descriptions with '' and normalise the text in every split.
for frame in (train, val, test):
    frame['description'] = frame['description'].fillna('').apply(preprocess_text)

In [20]:
# Word2Vec training corpus: 10 000 descriptions sampled from `users`,
# preprocessed and split into token lists.
# `random_state` pins the sample so it does not depend on whatever state the
# global NumPy RNG happens to be in when this cell runs.
w2v_corpus = (
    users['description']
    .sample(10000, random_state=10)
    .fillna('')
    .apply(preprocess_text)
    .str.split()
)

In [21]:
class LossLogger(CallbackAny2Vec):
    """Callback that prints a loss figure at the end of every epoch.

    For the first epoch the value of `model.get_latest_training_loss()` is
    printed as is; for later epochs the difference from the value seen at the
    previous epoch is printed.
    """

    def __init__(self):
        # Zero-based index of the epoch that will finish next.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the loss figure for the epoch that just ended."""
        current = model.get_latest_training_loss()
        shown = current if self.epoch == 0 else current - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, shown))

        # Remember the raw value for the next difference.
        self.loss_previous_step = current
        self.epoch += 1
        

class EpochLogger(CallbackAny2Vec):
    """Callback that prints the zero-based index of each finished epoch."""

    def __init__(self):
        # Number of epochs finished so far.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the current epoch index, then advance the counter."""
        print(f'Epoch {self.epoch}')
        self.epoch = self.epoch + 1

In [25]:
# Create the model, build the vocabulary from the sampled corpus, then train.
w2v_model = Word2Vec(
    vector_size=100,
    window=3,
    min_count=10,
    sg=1,
    hs=1,
    negative=15,
)
w2v_model.build_vocab(w2v_corpus)

# `compute_loss=True` is what makes the loss available to LossLogger.
w2v_model.train(
    w2v_corpus,
    epochs=6,
    total_examples=w2v_model.corpus_count,
    callbacks=[LossLogger()],
    compute_loss=True,
)

Loss after epoch 0: 6024833.5
Loss after epoch 1: 5198093.5
Loss after epoch 2: 4526521.0
Loss after epoch 3: 3298104.0
Loss after epoch 4: 2547020.0
Loss after epoch 5: 2860058.0


(2974690, 3583122)

In [43]:
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Turn each text into one vector built from its in-vocabulary word vectors.

    For a text with `n` known tokens the result is
    ``sum(word_vectors) / (alpha * (n + 1))``. The ``+ 1`` comes from the
    counter starting at 1, which also keeps the divisor non-zero.

    Parameters
    ----------
    w2v_model : trained model exposing `.wv` with `vector_size`,
        `key_to_index` and `get_vector`.
    alpha : number, default 2
        Extra factor in the divisor.
    """

    def __init__(self, w2v_model, alpha=2):
        self.w2v_model = w2v_model
        self.alpha = alpha

    def fit(self, X, y=None):
        """Do nothing (the word-vector model is already trained); return self."""
        return self

    def transform(self, X, y=None):
        """Embed every element of `X`.

        Parameters
        ----------
        X : sized iterable of str
            Whitespace-tokenisable texts. Elements without a `.split` method
            (e.g. NaN or None) get an all-zero row.

        Returns
        -------
        numpy.ndarray of shape (len(X), vector_size)
        """
        # Hoisted out of the loop: these do not change per element.
        wv = self.w2v_model.wv
        dim = wv.vector_size
        vocab = wv.key_to_index

        X_transformed = np.zeros((len(X), dim))
        for i, title in enumerate(X):
            try:
                tokens = title.split()
            except AttributeError:
                # Not a string: keep the zero row. Only this error is caught so
                # that KeyboardInterrupt and real bugs still surface.
                continue

            title_vector = np.zeros((dim,))
            counter = 1  # starts at 1 -> divisor is alpha * (known tokens + 1)

            for token in tokens:
                if token in vocab:
                    title_vector += wv.get_vector(token)
                    counter += 1

            X_transformed[i] = title_vector / (self.alpha * counter)

        return X_transformed

In [87]:
# Description -> averaged word vectors -> standardised -> 25 principal components.
# `random_state` keeps the PCA step repeatable should a randomised SVD solver
# be selected for this data size.
desc2vec = Pipeline([
    ('w2v', Word2VecTransformer(w2v_model=w2v_model)),
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=25, random_state=10))
])

In [46]:
# Fit the description pipeline on train only, then apply it to val and test.
train_desc, val_desc, test_desc = (
    frame['description'].values for frame in (train, val, test)
)

train_w2v = desc2vec.fit_transform(train_desc)
val_w2v = desc2vec.transform(val_desc)
test_w2v = desc2vec.transform(test_desc)

In [49]:
# Name the embedding columns pca_1 .. pca_k, where k is the actual width of the
# transformed matrix (so this cell keeps working if the PCA size is changed).
pca_cols = [f'pca_{i}' for i in range(1, train_w2v.shape[1] + 1)]

# Wrap the arrays in DataFrames (default 0..n-1 index) with those column names.
train_w2v = pd.DataFrame(train_w2v, columns=pca_cols)
val_w2v = pd.DataFrame(val_w2v, columns=pca_cols)
test_w2v = pd.DataFrame(test_w2v, columns=pca_cols)

In [119]:
# Columns passed to CatBoost as categorical features.
CATEGORIES = [
    'brand', 'model', 'generation', 'modification',
    'color', 'body_type', 'equipment', 'owners_count',
]

# Numeric columns: tabular ones followed by the description-embedding columns.
NUMERIC = [
    'horsepower', 'year', 'month', 'sale_year', 'mileage',
    'latitude', 'longitude', 'crashes', 'is_taxi', 'is_carsharing',
    *pca_cols,
]

# Model input column order: categorical first, then numeric.
FEATURES = [*CATEGORIES, *NUMERIC]

# Categorical columns whose name starts with 'is_'.
# NOTE(review): with the CATEGORIES above this list is empty, because the
# `is_*` columns are listed under NUMERIC.
IS_COLUMNS = list(filter(lambda name: name.startswith('is_'), CATEGORIES))
NAN_COLS = ['pts', 'equipment', 'crashes', *IS_COLUMNS]

In [88]:
# Per-column replacement for missing values: '' for text-like columns,
# -1 for the `is_*` flags.
mapper = {
    'equipment': '',
    'pts': '',
    'is_taxi': -1,
    'is_pledged': -1,
    'is_restrictions': -1,
    'is_carsharing': -1
}

# Rebind to the filled result instead of filling in place: re-running the cell
# is harmless and no write goes into a frame that was sliced from `data`.
train = train.fillna(mapper)
val = val.fillna(mapper)
test = test.fillna(mapper)

In [None]:
# Cast every column listed in IS_COLUMNS to str in each split.
for frame in (train, val, test):
    for col in IS_COLUMNS:
        frame[col] = frame[col].astype(str)

### regression

In [89]:
def _assemble_features(frame, embeddings):
    """Join a split with its embedding columns row by row and keep FEATURES.

    Both inputs get a fresh 0..n-1 index so that `concat` pairs rows by
    position. A row-count mismatch would otherwise be padded with NaN silently,
    so it is checked up front.
    """
    assert len(frame) == len(embeddings), 'split and embeddings differ in length'
    joined = pd.concat(
        [frame.reset_index(drop=True), embeddings.reset_index(drop=True)],
        axis=1,
    )
    return joined[FEATURES]


X_train = _assemble_features(train, train_w2v)
X_val = _assemble_features(val, val_w2v)
X_test = _assemble_features(test, test_w2v)

# Targets keep the original row order of each split.
y_train, y_val, y_test = train['actual_price'], val['actual_price'], test['actual_price']

Обучим CatBoostRegressor.

In [92]:
def mape(y_true, y_pred):
    """Median absolute relative error ``median(|y_pred - y_true| / |y_true|)``.

    Despite the name, this is a median, not a mean.

    Parameters
    ----------
    y_true, y_pred : pandas.Series or numpy.ndarray
        Actual and predicted values. Two Series are aligned on their index by
        pandas before subtraction.

    Returns
    -------
    float
        The median as a fraction (not multiplied by 100).
    """
    relative_error = (y_pred - y_true) / y_true
    # Wrapping in a Series makes ndarray inputs work too (they have no .abs()).
    return pd.Series(relative_error).abs().median()

class MedianAPE:
    """Custom evaluation metric: median absolute percentage error, in percent.

    Parameters
    ----------
    f : callable, default identity
        Stored on the instance; not used by the methods of this class.
    inv_f : callable, default identity
        Applied to both predictions and targets before the error is computed.
    """

    def __init__(self, f=lambda x: x, inv_f=lambda x: x):
        self.f, self.inv_f = f, inv_f

    def is_max_optimal(self):
        """Return False: smaller metric values are better."""
        return False

    def get_final_error(self, error, weight=1.0):
        """Return `error` unchanged; `weight` is ignored."""
        return error

    def evaluate(self, approxes, target, weight=None):
        """Compute the metric for one prediction vector.

        Parameters
        ----------
        approxes : sequence containing exactly one sequence of predictions.
        target : sequence of true values, same length as ``approxes[0]``.
        weight : ignored.

        Returns
        -------
        tuple
            ``(error, 1.0)`` where error is ``median(|target - pred| / target) * 100``
            after `inv_f` has been applied to both arrays.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        predicted = self.inv_f(np.array(approxes[0]))
        actual = self.inv_f(np.array(target))

        relative = np.abs((actual - predicted) / actual)
        return (np.median(relative) * 100, 1.0)

In [94]:
np.random.seed(10)

# CatBoost settings: trained with MAE loss, monitored with the custom MedianAPE
# metric on the validation pool.
params = dict(
    learning_rate=0.05,
    iterations=3800,
    reg_lambda=0.0005,
    colsample_bylevel=1.,
    max_bin=80,
    bagging_temperature=2,
    loss_function='MAE',
    use_best_model=True,
    verbose=100,
    grow_policy='Depthwise',
    random_seed=42,
    eval_metric=MedianAPE(),  # the comma was missing here -> SyntaxError
    ignored_features=['is_taxi'],
)
model = cb.CatBoostRegressor(
    **params,
)

# Validation pool used for the eval metric and early stopping.
eval_set = cb.Pool(data=X_val, label=y_val, cat_features=CATEGORIES)
model.fit(
    X_train[FEATURES],
    y_train,
    cat_features=CATEGORIES,
    eval_set=eval_set,
    plot=True,
    early_stopping_rounds=100,
)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 42.1049137	test: 39.9216680	best: 39.9216680 (0)	total: 49ms	remaining: 3m 6s
100:	learn: 9.4438203	test: 9.4769438	best: 9.4769438 (100)	total: 3.61s	remaining: 2m 12s
200:	learn: 8.3300487	test: 8.5779632	best: 8.5779632 (200)	total: 6.68s	remaining: 1m 59s
300:	learn: 7.5356883	test: 8.1073201	best: 8.0948519 (293)	total: 9.67s	remaining: 1m 52s
400:	learn: 7.0222316	test: 7.6911276	best: 7.6884703 (395)	total: 11.8s	remaining: 1m 40s
500:	learn: 6.6829411	test: 7.5414953	best: 7.5390816 (499)	total: 14s	remaining: 1m 31s
600:	learn: 6.4276977	test: 7.4239942	best: 7.4239942 (600)	total: 16.1s	remaining: 1m 25s
700:	learn: 6.2134051	test: 7.3229292	best: 7.3229292 (700)	total: 18.2s	remaining: 1m 20s
800:	learn: 6.0507644	test: 7.3026702	best: 7.2860713 (791)	total: 20.2s	remaining: 1m 15s
900:	learn: 5.9016083	test: 7.2467303	best: 7.2356188 (860)	total: 22.3s	remaining: 1m 11s
1000:	learn: 5.7940925	test: 7.2029132	best: 7.1957744 (998)	total: 24.4s	remaining: 1m 8s
1100

<catboost.core.CatBoostRegressor at 0x7f41e82999f0>

### val

Посмотрим на метрики качества обученной модели на валидации. По таблице ниже medianAPE = 0.070493, а медиана сдвига (bias) = 0.003771.

In [95]:
# Predict on validation and store the signed relative error:
# bias = (actual - predicted) / actual.
val['prediction'] = model.predict(X_val)
val['bias'] = val['actual_price'].sub(val['prediction']).div(val['actual_price'])

In [96]:
# Summary statistics of the signed relative error ('bias') and of its
# absolute value (column 'MAPE') on validation.
stats = pd.concat(
    [val['bias'].describe(), val['bias'].abs().describe()],
    axis=1,
    keys=['bias', 'MAPE'],
)

stats

Unnamed: 0,bias,MAPE
count,7173.0,7173.0
mean,-0.017728,0.100056
std,0.181688,0.152684
min,-7.317289,6.5e-05
25%,-0.074223,0.033078
50%,0.003771,0.070493
75%,0.067813,0.126109
max,1.0149,7.317289


### test

In [97]:
# Predict on test and store the signed relative error:
# bias = (actual - predicted) / actual.
test['prediction'] = model.predict(X_test)
test['bias'] = test['actual_price'].sub(test['prediction']).div(test['actual_price'])

In [98]:
# Summary statistics of the signed relative error ('bias') and of its
# absolute value (column 'MAPE') on test.
stats = pd.concat(
    [test['bias'].describe(), test['bias'].abs().describe()],
    axis=1,
    keys=['bias', 'MAPE'],
)

stats

Unnamed: 0,bias,MAPE
count,4444.0,4444.0
mean,-0.001518,0.098131
std,0.174067,0.143769
min,-6.351438,1.4e-05
25%,-0.060557,0.033957
50%,0.015289,0.072013
75%,0.080804,0.123535
max,0.634401,6.351438
