In [None]:
# Enable the IPython autoreload extension in mode 2, so imported modules are
# reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import catboost as cb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gensim
import optuna

from sklearn.metrics import precision_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.word2vec import Word2Vec


# Silence all warnings for the rest of the session.
warnings.simplefilter('ignore')
# Seed NumPy's global random generator.
np.random.seed(10)

Загрузим исходные данные (`project_train.f`); на train, val и test разобьём их ниже.

In [None]:
# Read only the columns this notebook works with.
data = pd.read_feather(
    '../data/project_train.f',
    columns=[
        'brand', 'sale_end_date', 'price', 'actual_price', 'description',
        'model', 'year', 'generation', 'modification', 'color',
        'body_type', 'equipment', 'owners_count', 'mileage',
        'latitude', 'longitude', 'crashes', 'is_taxi', 'is_carsharing',
    ],
)
# Boolean mask: True for rows whose actual_price is present.
DEALER = data['actual_price'].notna()
data.shape


Для бейзлайна будем использовать в обучении только те данные, по которым есть actual_price, то есть цена сделки – наш финальный таргет.

In [None]:
# Rows with a known actual_price stay in `data`; the remaining rows become `users`.
# Explicit copies are taken because later cells assign new columns to these
# frames; without .copy() those assignments would target slices of the
# originally loaded frame.
data, users = data[DEALER].copy(), data[~DEALER].copy()

In [None]:
# Show the earliest and latest sale_end_date as a (min, max) tuple.
data.sale_end_date.min(), data.sale_end_date.max() 

Разобьём выборку на train, val и test. Будем использовать временную валидацию.

In [None]:
# Parse sale_end_date into datetimes. `assign` builds a new frame rather than
# writing a column into `data`, which was produced by boolean indexing.
data = data.assign(sale_end_date=pd.to_datetime(data['sale_end_date']))

In [None]:
# Print the column dtypes and non-null counts of the frame.
data.info()

In [None]:
# Window sizes, in days, counted back from the latest sale date.
train_delta = 30
val_delta = 10

# `timedelta` is never imported in this notebook, so the original expression
# raised NameError; pd.Timedelta needs no extra import.
TRAIN_SPLIT = data.sale_end_date.max() - pd.Timedelta(days=train_delta)
VAL_SPLIT = data.sale_end_date.max() - pd.Timedelta(days=val_delta)

In [None]:
# Time-based split:
#   train: sale_end_date <= TRAIN_SPLIT
#   val:   TRAIN_SPLIT < sale_end_date <= VAL_SPLIT
#   test:  sale_end_date > VAL_SPLIT (taken from the rows after TRAIN_SPLIT)
# Each split is an explicit copy because later cells add columns to it.
train = data[data['sale_end_date'] <= TRAIN_SPLIT].copy()
val = data[data['sale_end_date'] > TRAIN_SPLIT]
test = val[val['sale_end_date'] > VAL_SPLIT].copy()
val = val[val['sale_end_date'] <= VAL_SPLIT].copy()

# Report size and date range for every split, including test.
for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    print(f'{split_name.capitalize()} rows: {split_df.shape[0]}')
    print(f'Min {split_name} date: {split_df.sale_end_date.min()}')
    print(f'Max {split_name} date: {split_df.sale_end_date.max()}')

### Парсинг колонок

In [None]:
def horsepower(x):
    """Extract the integer at the start of the first bracketed group of ``x``.

    The text between the first ``(`` and the next ``)`` is split on
    whitespace and its first token is converted with ``int``.

    Parameters
    ----------
    x : str
        Modification string containing a bracketed group.

    Returns
    -------
    int or float
        The parsed integer, or ``nan`` when ``x`` is not a string, has no
        bracketed group, the group is empty, or its first token is not an
        integer. A column built from this function becomes float when any
        value is ``nan``.
    """
    if not isinstance(x, str):
        # Missing values (None / NaN) cannot be parsed.
        return np.nan

    start = x.find('(')
    # Search for the closing bracket only after the opening one, so a stray
    # ')' earlier in the string does not produce an empty slice.
    end = x.find(')', start + 1) if start != -1 else -1
    if start == -1 or end == -1:
        return np.nan

    tokens = x[start + 1:end].split()
    try:
        return int(tokens[0])
    except (IndexError, ValueError):
        # Empty brackets or a non-integer first token.
        return np.nan

# Derived features, computed the same way for every split.
# The vectorised .dt accessors replace the row-wise apply(axis=1) that train
# and val used before; the test split already used .dt, so all three splits
# now share one code path.
for split_df in (train, val, test):
    split_df['horsepower'] = split_df['modification'].apply(horsepower)
    split_df['month'] = split_df['sale_end_date'].dt.month.astype(int)
    split_df['sale_year'] = split_df['sale_end_date'].dt.year.astype(int)

### Эмбеддинги для текста

In [None]:
mystem = Mystem()
russian_stopwords = stopwords.words("russian")
# Set copies make the per-token membership checks constant-time.
_STOPWORD_SET = frozenset(russian_stopwords)
_PUNCTUATION_SET = frozenset(punctuation)


def preprocess_text(text):
    """Lower-case and lemmatize ``text``, dropping stopwords and punctuation.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The kept lemmas joined with single spaces.

    Notes
    -----
    A token is dropped when it is empty after stripping whitespace, when it
    is a stopword, or when it consists only of punctuation and whitespace
    characters. The previous check (``token.strip() not in punctuation``) was
    a substring test against the punctuation string, so multi-character
    tokens such as ``...`` or ``!!!`` were kept.
    """
    kept = []
    for token in mystem.lemmatize(text.lower()):
        stripped = token.strip()
        if not stripped or token in _STOPWORD_SET:
            continue
        if all(ch in _PUNCTUATION_SET or ch.isspace() for ch in stripped):
            continue
        kept.append(token)

    return " ".join(kept)

In [None]:
# Replace missing descriptions with an empty string, then clean each one
# with preprocess_text.
train['description'] = train['description'].fillna('').map(preprocess_text)
val['description'] = val['description'].fillna('').map(preprocess_text)
test['description'] = test['description'].fillna('').map(preprocess_text)

In [None]:
# Word2Vec training corpus: a sample of descriptions from `users`, cleaned
# and split into tokens.
# - The sample size is capped by the number of available rows, so the cell
#   does not raise when `users` has fewer than 10000 rows.
# - random_state fixes the sample, so re-running this cell gives the same
#   corpus regardless of how far the global NumPy generator has advanced.
w2v_corpus = (
    users['description']
    .fillna('')
    .sample(n=min(10000, len(users)), random_state=10)
    .apply(preprocess_text)
    .str.split()
)

In [None]:
class LossLogger(CallbackAny2Vec):
    """Callback that prints a loss figure at the end of every epoch.

    On the first epoch the value returned by
    ``model.get_latest_training_loss()`` is printed as is; on later epochs
    the difference from the value seen at the previous epoch is printed.
    """

    def __init__(self):
        # Index of the epoch that will finish next.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the loss figure for the finished epoch and remember the reading."""
        latest = model.get_latest_training_loss()
        # No previous reading exists on the first epoch.
        shown = latest if self.epoch == 0 else latest - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, shown))
        self.loss_previous_step = latest
        self.epoch += 1
        

class EpochLogger(CallbackAny2Vec):
    """Callback that prints the index of each finished epoch."""

    def __init__(self):
        # Zero-based counter of finished epochs.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the current epoch index, then advance the counter."""
        current = self.epoch
        print(f'Epoch {current}')
        self.epoch = current + 1

In [None]:
# Configure the Word2Vec model with 100-dimensional vectors. Both hs=1 and
# negative=15 are passed.
# NOTE(review): no seed/workers arguments are given — confirm whether
# run-to-run reproducibility of the embeddings matters here.
w2v_model = Word2Vec(sg=1, min_count=10, window=3, negative=15, hs=1, vector_size=100)
# Build the vocabulary from the tokenised corpus.
w2v_model.build_vocab(w2v_corpus)
# Train for 6 epochs; compute_loss=True is required for LossLogger to read
# the loss through get_latest_training_loss().
w2v_model.train(
    w2v_corpus,
    total_examples=w2v_model.corpus_count,
    epochs=6,
    compute_loss=True,
    callbacks=[LossLogger()]
)

In [None]:
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Turn texts into fixed-size vectors by pooling their word vectors.

    For each text the vectors of all in-vocabulary tokens are summed and the
    sum is divided by ``alpha * (n_known_tokens + 1)``.

    Parameters
    ----------
    w2v_model : object
        Trained model exposing ``wv.vector_size``, ``wv.key_to_index`` and
        ``wv.get_vector``.
    alpha : float, default=2
        Extra divisor applied to every pooled vector.
    """

    def __init__(self, w2v_model, alpha=2):
        self.w2v_model = w2v_model
        self.alpha = alpha

    def fit(self, X, y=None):
        """No-op: the word vectors are already trained. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an array of shape ``(len(X), vector_size)``.

        Entries of ``X`` that are not strings (e.g. ``None`` or ``NaN``) and
        texts without in-vocabulary tokens produce an all-zero row.
        """
        keyed_vectors = self.w2v_model.wv
        X_transformed = np.zeros((len(X), keyed_vectors.vector_size))

        for i, title in enumerate(X):
            # Explicit type check instead of `except BaseException`, which
            # also swallowed KeyboardInterrupt and SystemExit.
            if not isinstance(title, str):
                continue

            title_vector = np.zeros((keyed_vectors.vector_size,))
            # The counter starts at 1, so the divisor is never zero.
            counter = 1
            for token in title.split():
                if token in keyed_vectors.key_to_index:
                    title_vector += keyed_vectors.get_vector(token)
                    counter += 1

            X_transformed[i] = title_vector / (self.alpha * counter)

        return X_transformed

In [None]:
# Description -> pooled word vectors -> standardised -> 25 principal components.
desc2vec = Pipeline(
    steps=[
        ('w2v', Word2VecTransformer(w2v_model=w2v_model)),
        ('scale', StandardScaler()),
        ('pca', PCA(n_components=25)),
    ]
)

In [None]:
# Fit the pipeline on train descriptions only; val and test are transformed
# with the already fitted steps.
train_w2v = desc2vec.fit_transform(train['description'].to_numpy())
val_w2v = desc2vec.transform(val['description'].to_numpy())
test_w2v = desc2vec.transform(test['description'].to_numpy())

In [None]:
# Name the components pca_1 .. pca_N, where N is the actual width of the
# transformed matrix. The labels therefore stay correct if the number of
# PCA components in the pipeline changes.
pca_cols = [f'pca_{i}' for i in range(1, train_w2v.shape[1] + 1)]

train_w2v = pd.DataFrame(train_w2v, columns=pca_cols)
val_w2v = pd.DataFrame(val_w2v, columns=pca_cols)
test_w2v = pd.DataFrame(test_w2v, columns=pca_cols)

In [None]:
# Columns passed to the model as categorical features.
CATEGORIES = [
    'brand',
    'model',
    'generation',
    'modification',
    'color',
    'body_type',
    'equipment',
    'owners_count',
]

# Columns passed to the model as numeric features.
NUMERIC = [
    'horsepower',
    'year',
    'month',
    'sale_year',
    'mileage',
    'latitude',
    'longitude',
    'crashes',
    'is_taxi',
    'is_carsharing'
] 
# Append the text-embedding component columns.
NUMERIC = NUMERIC + pca_cols
# Full model input: categorical columns first, then numeric ones.
FEATURES = CATEGORIES + NUMERIC
# NOTE(review): no entry of CATEGORIES starts with 'is_', so this list is
# empty; the 'is_*' columns are listed in NUMERIC — confirm this is intended.
IS_COLUMNS = [col for col in CATEGORIES if col.startswith('is_')]
# NOTE(review): 'pts' is not among the columns loaded at the top of the
# notebook, and NAN_COLS is not referenced in the cells visible here.
NAN_COLS = ['pts', 'equipment', 'crashes'] + IS_COLUMNS

In [None]:
# Per-column fill values for missing data: empty string for text columns,
# -1 for the 'is_*' flags.
# NOTE(review): 'pts', 'is_pledged' and 'is_restrictions' are not among the
# columns loaded at the top of the notebook.
mapper = {
    'equipment': '',
    'pts': '',
    'is_taxi': -1,
    'is_pledged': -1,
    'is_restrictions': -1,
    'is_carsharing': -1
}
# Rebind instead of fillna(..., inplace=True): the splits were created by
# boolean indexing, and in-place modification of such frames is what pandas
# warns about.
train = train.fillna(mapper)
val = val.fillna(mapper)
test = test.fillna(mapper)

In [None]:
# Cast every column listed in IS_COLUMNS to string in all three splits.
# NOTE(review): IS_COLUMNS is derived from CATEGORIES, which contains no
# 'is_*' names, so this loop has nothing to iterate over — confirm.
for col in IS_COLUMNS:
    train[col] = train[col].astype(str)
    val[col] = val[col].astype(str)
    test[col] = test[col].astype(str)

### regression

In [None]:
# Put each split side by side with its embedding columns (the split index is
# reset first so rows line up by position), then keep only the model features.
X_train = pd.concat([train.reset_index(), train_w2v], axis=1).loc[:, FEATURES]
X_val = pd.concat([val.reset_index(), val_w2v], axis=1).loc[:, FEATURES]
X_test = pd.concat([test.reset_index(), test_w2v], axis=1).loc[:, FEATURES]

# Target: actual_price.
y_train = train['actual_price']
y_val = val['actual_price']
y_test = test['actual_price']

Обучим CatBoostRegressor.

In [None]:
def mape(y_true, y_pred):
    """Median absolute percentage error, as a fraction (not multiplied by 100).

    Parameters
    ----------
    y_true, y_pred : pandas.Series or array-like of numbers
        Actual and predicted values. Inputs that are not Series (lists,
        tuples, NumPy arrays) are converted to float arrays, so the function
        is no longer limited to pandas objects.

    Returns
    -------
    float
        Median of ``|y_pred - y_true| / |y_true|``; NaN entries are skipped.
    """
    if not isinstance(y_true, pd.Series):
        y_true = np.asarray(y_true, dtype=float)
    if not isinstance(y_pred, pd.Series):
        y_pred = np.asarray(y_pred, dtype=float)

    relative_error = np.abs((y_pred - y_true) / y_true)
    # Series.median skips NaN, matching the behaviour for Series inputs.
    return pd.Series(relative_error).median()

class MedianAPE:
    """Custom evaluation metric: median absolute percentage error, in percent.

    Parameters
    ----------
    f : callable, default identity
        Stored on the instance; not used by the methods of this class.
    inv_f : callable, default identity
        Applied to both predictions and targets before the error is computed.
    """

    def __init__(self, f=lambda x: x, inv_f=lambda x: x):
        self.f, self.inv_f = f, inv_f

    def is_max_optimal(self):
        """Return False: smaller metric values are better."""
        return False

    def get_final_error(self, error, weight=1.0):
        """Return ``error`` unchanged; ``weight`` is ignored."""
        return error

    def evaluate(self, approxes, target, weight=None):
        """Return ``(error, 1.0)`` for a single prediction vector.

        ``approxes`` must hold exactly one sequence with the same length as
        ``target``; ``weight`` is ignored.
        """
        assert len(approxes) == 1
        raw_predictions = approxes[0]
        assert len(target) == len(raw_predictions)

        predicted = self.inv_f(np.array(raw_predictions))
        actual = self.inv_f(np.array(target))
        relative_error = np.abs((actual - predicted) / actual)
        return (np.median(relative_error) * 100, 1.0)

In [None]:
np.random.seed(10)

# CatBoost hyper-parameters. The model optimises MAE and is evaluated with
# the custom MedianAPE metric defined in this notebook.
params = dict(
    learning_rate=0.05,
    iterations=3800,
    reg_lambda=0.0005,
    colsample_bylevel=1.,
    max_bin=80,
    bagging_temperature=2,
    loss_function='MAE',
    use_best_model=True,
    verbose=100,
    grow_policy='Depthwise',
    random_seed=42,
    # The comma after this argument was missing, which made the whole cell a
    # SyntaxError.
    eval_metric=MedianAPE(),
    ignored_features=['is_taxi'],
)
model = cb.CatBoostRegressor(**params)

# The validation pool drives use_best_model and early stopping.
eval_set = cb.Pool(data=X_val, label=y_val, cat_features=CATEGORIES)
model.fit(
    X_train[FEATURES],
    y_train,
    cat_features=CATEGORIES,
    eval_set=eval_set,
    plot=True,
    early_stopping_rounds=100,
)



### val

Посмотрим на метрики качества обученной модели. Видим, что medianAPE = 0.077510, а медиана сдвига (bias) = -0.022562.

In [None]:
# Predictions and relative signed error (actual - predicted) / actual on val.
val['prediction'] = model.predict(X_val)
val['bias'] = val['actual_price'].sub(val['prediction']).div(val['actual_price'])

In [None]:
# Summary statistics of the signed relative error ('bias') and of its
# absolute value ('MAPE') on the validation split.
stats = pd.DataFrame({
    'bias': val['bias'].describe(),
    'MAPE': val['bias'].abs().describe(),
})

stats

### test

In [None]:
# Predictions and relative signed error (actual - predicted) / actual on test.
test['prediction'] = model.predict(X_test)
test['bias'] = test['actual_price'].sub(test['prediction']).div(test['actual_price'])

In [None]:
# Summary statistics of the signed relative error ('bias') and of its
# absolute value ('MAPE') on the test split.
stats = pd.DataFrame({
    'bias': test['bias'].describe(),
    'MAPE': test['bias'].abs().describe(),
})

stats