In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import catboost as cb
from datetime import datetime, timedelta
from sklearn.model_selection import TimeSeriesSplit
import optuna
from optuna.samplers import TPESampler
import re
import string
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.word2vec import Word2Vec
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import reverse_geocode
from catboost import CatBoostClassifier

warnings.simplefilter('ignore')

In [None]:
class MedianAPE:
    """Median absolute percentage error, usable as a CatBoost ``eval_metric``.

    Parameters
    ----------
    f : callable
        Forward target transform. It is stored on the instance but not used
        by any method of this class.
    inv_f : callable
        Inverse transform applied to both predictions and targets before the
        error is computed. Defaults to the identity.
    """

    def __init__(self, f=lambda x: x, inv_f=lambda x: x):
        self.f, self.inv_f = f, inv_f

    def is_max_optimal(self):
        """Return False: smaller metric values are better."""
        return False

    def get_final_error(self, error, weight=1.0):
        """Return ``error`` unchanged; ``weight`` is ignored."""
        return error

    def evaluate(self, approxes, target, weight=None):
        """Compute the metric for a single prediction dimension.

        Parameters
        ----------
        approxes : sequence holding exactly one sequence of raw predictions.
        target : sequence of true values, same length as ``approxes[0]``.
        weight : ignored.

        Returns
        -------
        tuple
            ``(median(|target - pred| / target) * 100, 1.0)`` where both
            arrays have been passed through ``inv_f`` first.
        """
        assert len(approxes) == 1
        raw_predictions = approxes[0]
        assert len(target) == len(raw_predictions)

        y_pred = self.inv_f(np.array(raw_predictions))
        y_true = self.inv_f(np.array(target))
        relative_errors = np.abs(np.subtract(y_true, y_pred) / y_true)
        return (np.median(relative_errors) * 100, 1.0)

In [None]:
# Columns read from the raw feather file.
cols = ['actual_price', 'price', 'sale_end_date', 'description',
        'brand', 'model', 'generation', 'modification', 'equipment',
        'body_type', 'drive_type', 'transmission_type', 'engine_type',
        'doors_number', 'color', 'year', 'mileage', 'owners_count',
        'steering_wheel', 'latitude', 'longitude', 'audiosistema', 'diski',
        'electropodemniki', 'fary', 'salon', 'upravlenie_klimatom',
        'usilitel_rul', 'audiosistema_mult', 'shini_i_diski_mult']

# NOTE(review): `df_train` is re-read from 'df_train.csv' in the next cell and
# `train` / `val` / `test` are rebuilt further down from the CSV frames, so most
# of what this cell produces is overwritten later. The CSV files are not written
# anywhere in the visible notebook — confirm where they come from.
df_train = pd.read_feather('project_data_imv_auto/project_train.f')[cols]
df_train['city'] = [
    hit['city']
    for hit in reverse_geocode.search(df_train[['latitude', 'longitude']].values)
]

# Rows without a target are kept only for their description text.
df_description = df_train.loc[df_train['actual_price'].isna(), 'description'].fillna('')

# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the original frame (chained assignment).
df_train = df_train[df_train['actual_price'].notna()].copy()
df_train['sale_end_date'] = pd.to_datetime(df_train['sale_end_date'])
df_train['month'] = df_train['sale_end_date'].dt.month

# Time-based split: the last `n_days` days are test, the `n_days` before that
# are validation, everything earlier is train.
n_days = 30
last_sale_date = df_train['sale_end_date'].max()
val_start = last_sale_date - 2 * timedelta(n_days)
test_start = last_sale_date - timedelta(n_days)

sale_date = df_train['sale_end_date']
train = df_train[sale_date <= val_start].copy()
val = df_train[(sale_date > val_start) & (sale_date <= test_start)].copy()
test = df_train[sale_date > test_start].copy()

train['description'] = train['description'].fillna('')
test['description'] = test['description'].fillna('')
val['description'] = val['description'].fillna('')

df_description_train = train['description']
df_description_test = test['description']
df_description_val = val['description']

options = pd.read_csv('project_data_imv_auto/option_names.csv')
stopwords = pd.read_csv('russian_stopwords.txt', encoding="windows-1251")

In [None]:
# Re-load the three splits from CSV and drop the saved index column
# ('Unnamed: 0'); the test file additionally carries a 'prediction' column
# that is discarded here.
df_train = pd.read_csv('df_train.csv', sep=',').drop(columns=['Unnamed: 0'])
df_val = pd.read_csv('df_val.csv', sep=',').drop(columns=['Unnamed: 0'])
df_test = pd.read_csv('df_test.csv', sep=',').drop(columns=['prediction', 'Unnamed: 0'])

In [None]:
# Option-id lookup table and the Russian stop-word list.
# NOTE(review): the same two files are already read at the end of the raw-data
# cell above; this cell repeats that load.
options = pd.read_csv('project_data_imv_auto/option_names.csv')
stopwords = pd.read_csv('russian_stopwords.txt', encoding="windows-1251")

In [None]:
# Description text of each split, taken from the CSV-loaded frames.
# These rebind the names assigned in the raw-data cell; no fillna is applied here.
df_description_train = df_train['description']
df_description_test = df_test['description']
df_description_val = df_val['description']

# word2vec
Обучим word2vec и получим эмбеддинги слов, из которых сделаем эмбеддинги объявлений.

In [None]:
WORD_PATTERN = '(?u)\\b\\w\\w+\\b'
# Compiled once instead of on every call.
WORD_RE = re.compile(pattern=WORD_PATTERN)
PUNCTUATION_RE = re.compile(r"[{}]".format(string.punctuation))

# NOTE(review): WordNetLemmatizer is NLTK's WordNet-based lemmatizer; confirm it
# does anything useful for the (Russian) description text.
wnl = WordNetLemmatizer()

# Set membership is O(1); the previous `x not in stopwords.c.values` scanned the
# whole numpy array for every token. Re-run this cell if `stopwords` is reloaded.
STOPWORD_SET = frozenset(stopwords.c.values)


def preprocessing(line, token=wnl):
    """Tokenise one description for word2vec training.

    Steps: lower-case, replace ASCII punctuation with spaces, collapse
    newlines, extract words of two or more characters (``WORD_PATTERN``),
    lemmatise each word with ``token.lemmatize`` and drop stop words.

    Parameters
    ----------
    line : str
        Raw description text.
    token : object with a ``lemmatize(word)`` method
        Defaults to the module-level ``wnl``.

    Returns
    -------
    list of str
        Lemmatised tokens that are not in ``STOPWORD_SET``.
    """
    line = PUNCTUATION_RE.sub(" ", line.lower())
    line = line.replace('\n\n', ' ').replace('\n', ' ')
    lemmas = (token.lemmatize(word) for word in WORD_RE.findall(line))
    return [lemma for lemma in lemmas if lemma not in STOPWORD_SET]

In [None]:
# Missing descriptions become empty sentences. Previously str(NaN) injected the
# literal token "nan" into the word2vec training corpus.
sentences = [preprocessing(s) for s in df_description_train.fillna('').astype(str)]

In [None]:
class LossLogger(CallbackAny2Vec):
    """Word2Vec callback that prints a loss figure at the end of every epoch.

    For the first epoch the value from ``get_latest_training_loss()`` is
    printed as-is; for later epochs the difference to the value seen at the
    previous epoch end is printed.
    """

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        is_first_epoch = self.epoch == 0
        shown = loss if is_first_epoch else loss - self.loss_previous_step
        print(f'Loss after epoch {self.epoch}: {shown}')
        # Remember the raw value so the next epoch can report its own delta.
        self.loss_previous_step = loss
        self.epoch += 1
        

class EpochLogger(CallbackAny2Vec):
    """Word2Vec callback that prints the index of each finished epoch."""

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        finished = self.epoch
        self.epoch = finished + 1
        print('Epoch {}'.format(finished))

In [None]:
# Train word2vec on the tokenised training descriptions only.
# Vocabulary is built first, then the model is trained for 5 epochs with the
# per-epoch loss printed by LossLogger (compute_loss=True is required for that).
# NOTE(review): both hs=1 and negative=15 are set — confirm that combining
# hierarchical softmax with negative sampling is intended.
w2v_model = Word2Vec(sg=1, min_count=10, window=3, negative=15, hs=1, vector_size=100)
w2v_model.build_vocab(sentences)
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=5,
    compute_loss=True,
    callbacks=[LossLogger()]
)

In [None]:
class Word2VecTransformer:
    """Turn raw texts into fixed-size vectors by pooling word2vec embeddings.

    Parameters
    ----------
    w2v_model : object exposing ``wv.vector_size``, ``wv.key_to_index`` and
        ``wv.get_vector(token)`` (a trained gensim ``Word2Vec`` model).
    word_pattern : str
        Regular expression used to extract tokens from the lower-cased text.
    alpha : number, default 1
        Extra divisor applied to the pooled vector.
    """

    def __init__(self, w2v_model, word_pattern, alpha=1):
        self.w2v_model = w2v_model
        self.word_pattern = word_pattern
        self.re = re.compile(pattern=self.word_pattern)
        self.alpha = alpha

    def fit(self, X):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Embed every element of ``X``.

        Each text is lower-cased and tokenised; the vectors of in-vocabulary
        tokens are summed and divided by ``alpha * (n_known_tokens + 1)``.
        The ``+ 1`` (counter starts at 1) keeps the division defined when no
        token is known; it is kept as in the original implementation.

        Elements that are not ``str`` (for example ``None`` or NaN for a
        missing description) produce an all-zero row.

        Returns
        -------
        numpy.ndarray of shape ``(len(X), vector_size)``.
        """
        wv = self.w2v_model.wv
        X_transformed = np.zeros((len(X), wv.vector_size))

        for i, title in enumerate(X):
            # Explicit type check instead of `except BaseException: continue`,
            # which also swallowed KeyboardInterrupt and real bugs.
            if not isinstance(title, str):
                continue

            tokens = self.re.findall(title.lower())
            title_vector = np.zeros((wv.vector_size,))
            counter = 1

            for token in tokens:
                if token in wv.key_to_index:
                    title_vector += wv.get_vector(token)
                    counter += 1

            X_transformed[i] = title_vector / (self.alpha * counter)

        return X_transformed

In [None]:
# Embed the descriptions of every split with the word2vec model trained above.
# Each result is an array with one row per description.
w2v_transformer = Word2VecTransformer(w2v_model=w2v_model, word_pattern=WORD_PATTERN)

train_w2v = w2v_transformer.transform(df_train['description'].values)
val_w2v = w2v_transformer.transform(df_val['description'].values)
test_w2v = w2v_transformer.transform(df_test['description'].values)

In [None]:
# Reduce the description embeddings to 25 dimensions with PCA.
# All splits are shifted by the same scalar (the mean over every element of the
# training matrix). PCA additionally centres per feature on the data it is
# fitted on.
train_w2v_mean = train_w2v.mean()
centered_train_w2v = train_w2v - train_w2v_mean
centered_val_w2v = val_w2v - train_w2v_mean
centered_test_w2v = test_w2v - train_w2v_mean

# random_state pins the randomized SVD solver, if PCA selects it.
pca = PCA(n_components=25, random_state=42)
train_w2v_pca_decomp = pca.fit_transform(centered_train_w2v)
# Fit on train only. The previous `pca.fit_transform(centered_val_w2v)` refitted
# the PCA on validation, so val AND test were projected onto a validation-fitted
# basis while train used a different one.
val_w2v_pca_decomp = pca.transform(centered_val_w2v)
test_w2v_pca_decomp = pca.transform(centered_test_w2v)

In [None]:
# Column names follow the width of the PCA output instead of a hard-coded 25,
# so changing `n_components` does not break this cell.
pca_columns = [f"pca_{i}" for i in range(1, train_w2v_pca_decomp.shape[1] + 1)]


def _pca_frame(components):
    """Wrap a 2-D array of PCA components into a DataFrame named by `pca_columns`."""
    return pd.DataFrame(data=components, columns=pca_columns)


train_pca_df = _pca_frame(train_w2v_pca_decomp)
val_pca_df = _pca_frame(val_w2v_pca_decomp)
test_pca_df = _pca_frame(test_w2v_pca_decomp)

# Column-wise concat aligns on the index, so the source frames get a fresh
# RangeIndex to match the freshly built PCA frames.
train = pd.concat((df_train.reset_index(drop=True), train_pca_df), axis=1)
val = pd.concat((df_val.reset_index(drop=True), val_pca_df), axis=1)
test = pd.concat((df_test.reset_index(drop=True), test_pca_df), axis=1)

# A length mismatch would silently introduce NaN rows in the concat above.
assert len(train) == len(df_train)
assert len(val) == len(df_val)
assert len(test) == len(df_test)

# tf-idf

In [None]:
# Lemmatizer used as the default `token` argument of the tf-idf preprocessor below.
# NOTE(review): `wnl` is already created in the word2vec section; this rebinds it.
wnl = WordNetLemmatizer()

In [None]:
def tfidf_preprocessing(line, token=wnl):
    """Normalise one description before tf-idf tokenisation.

    Lower-cases the text, replaces ASCII punctuation and newlines with spaces
    and lemmatises every space-separated word with ``token.lemmatize``.

    Named differently from the word2vec ``preprocessing`` (which returns a
    token list) so that this definition no longer shadows it.

    Returns
    -------
    str
        The normalised text; the vectorizer tokenises it afterwards.
    """
    line = line.lower()
    line = re.sub(r"[{}]".format(string.punctuation), " ", line)
    line = line.replace('\n\n', ' ').replace('\n', ' ')
    return ' '.join(token.lemmatize(word) for word in line.split(' '))


# English stop words plus the Russian list loaded from 'russian_stopwords.txt'.
# The previous `union(["russian"])` only added the literal word "russian".
# A sorted list is used because recent scikit-learn versions accept
# `stop_words` only as 'english', a list, or None.
my_stop_words = sorted(
    text.ENGLISH_STOP_WORDS.union(stopwords.c.dropna().astype(str))
)

tfidf = TfidfVectorizer(
    stop_words=my_stop_words,
    preprocessor=tfidf_preprocessing,
    min_df=5,
)

In [None]:
# Fit the vocabulary and idf weights on the training descriptions only, then
# vectorise every split with that fitted vectorizer.
train_texts = df_description_train.fillna('')
val_texts = df_description_val.fillna('')
test_texts = df_description_test.fillna('')

tfidf.fit(train_texts)

tfidf_train = tfidf.transform(train_texts)
tfidf_val = tfidf.transform(val_texts)
tfidf_test = tfidf.transform(test_texts)

In [None]:
def _tfidf_row_stats(matrix):
    """Per-row sum, max and mean of a sparse tf-idf matrix as 1-D arrays."""
    return {
        'tfidf_sum': np.array(matrix.sum(axis=1).ravel())[0],
        'tfidf_max': np.array(matrix.max(axis=1).toarray().ravel()),
        'tfidf_mean': np.array(matrix.mean(axis=1).ravel())[0],
    }


# Attach the three aggregates to each split (adds columns to the frames in place).
for frame, matrix in ((train, tfidf_train), (val, tfidf_val), (test, tfidf_test)):
    for column, values in _tfidf_row_stats(matrix).items():
        frame[column] = values

# Обработаем остальные фичи

In [None]:
def restyling_extract(gen_list: list) -> int:
    """Derive the restyling number from the split words of ``generation``.

    Four words: the second-to-last word parsed as an integer.
    Three words: 1.
    Any other length: 0.
    """
    n_words = len(gen_list)
    if n_words == 3:
        return 1
    if n_words == 4:
        return int(gen_list[-2])
    return 0

def gb_mode(a):
    """Return the most frequent value of ``a``.

    Ties resolve to the smallest value, because ``np.unique`` returns sorted
    values and ``argmax`` picks the first maximum.
    """
    values, frequencies = np.unique(a, return_counts=True)
    return values[np.argmax(frequencies)]


def equipment_typos_transform(equipment: str) -> str:
    """Fix the known misspellings of an equipment name and lower-case it.

    Names that are not in the correction table are only lower-cased.
    """
    known_typos = {
        "Bussines": "Business",
        "Elegancе": "Elegance",
        "Premuim": "Premium",
        "Standart": "Standard",
        "70-th Anniversary": "70th Anniversary",
        "Exclusive Mem": "Exclusive Mm",
        "Night Eagle\u200b": "Night Eagle",
        "[BLACK] '22": "[BLACK]'22"
    }
    corrected = known_typos[equipment] if equipment in known_typos else equipment
    return corrected.lower()


def add_modification_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive generation / engine / equipment features; modifies ``df`` in place.

    Replaces three copy-pasted versions of the same code (train, val, test).

    Columns written:
    - ``generation``: first word of the original ``generation`` string
    - ``generation_years``: last word of the original ``generation`` string
    - ``restyling``: see ``restyling_extract``
    - ``engine_volume``: first ``d.d`` match in ``modification`` (kept as a
      string), with three hard-coded overrides
    - ``horse_power``: integer taken from the parenthesised part of
      ``modification``; 382 when there is no parenthesised part
    - ``equipment``: typo-fixed, lower-cased; missing values become ``'none'``
    - ``brand_model_generation_restyling``: space-joined key

    NOTE: not idempotent — ``generation`` is overwritten, so calling this twice
    on the same frame changes ``generation_years`` and ``restyling``.

    Returns the same ``df`` object.
    """
    generation_words = df['generation'].str.split()
    df['generation'] = generation_words.apply(lambda words: words[0])
    df['generation_years'] = generation_words.apply(lambda words: words[-1])
    df['restyling'] = generation_words.apply(restyling_extract)

    df['engine_volume'] = df['modification'].str.extract(r'(?P<engine_volume>\d\.\d)')
    # Manual overrides for modifications where the pattern above finds nothing useful.
    df.loc[df['modification'] == 'FX30d 4WD AT (238 л.с.)', 'engine_volume'] = '3.0'
    df.loc[df['modification'] == 'P85', 'engine_volume'] = '0.0'
    df.loc[df['model'] == 'FX30', 'engine_volume'] = '3.0'

    df['horse_power'] = df['modification'].str.extract(r'(?P<horse_power>\(.*\))')
    # str.strip treats its argument as a set of characters to remove from both
    # ends, which leaves only the number from a value like '(238 л.с.)'.
    df['horse_power'] = df['horse_power'].str.strip('( л.с.)')
    # NOTE(review): '382' is the fallback when no '(...)' group exists — confirm
    # which modification it was meant for.
    df['horse_power'] = df['horse_power'].fillna('382')
    df['horse_power'] = df['horse_power'].astype(int)

    df['equipment'] = df['equipment'].fillna('None').apply(equipment_typos_transform)
    df['brand_model_generation_restyling'] = (
        df['brand'] + ' ' + df['model'] + ' ' + df['generation'] + ' ' + df['restyling'].astype(str)
    )
    return df


train = add_modification_features(train)
val = add_modification_features(val)
test = add_modification_features(test)

# Most frequent known equipment per (brand, model, generation), computed on the
# training split only.
EQUIPMENT_MODE_DICT = (
    train[train['equipment'] != 'none']
    .groupby(['brand', 'model', 'generation'])
    .equipment.apply(gb_mode)
)

In [None]:
# Option id -> human-readable label.
OPTIONS_DICT = options.set_index('id').to_dict()['viewItemLabel']


def _multi_option_label(raw):
    """Decode one '*_mult' cell such as '[123.0]' into its option label.

    ``None`` and NaN (the missing-value marker after a CSV round trip) map to
    'Нет данных'. The original check only covered ``None`` and would raise
    AttributeError on NaN.
    """
    if raw is None or (isinstance(raw, float) and np.isnan(raw)):
        return 'Нет данных'
    return OPTIONS_DICT.get(float(raw.strip('[]')), 'Нет данных')


def options_column_transform_inplace(df: pd.DataFrame) -> pd.DataFrame:
    """Replace option ids with their labels; modifies ``df`` in place.

    Single-valued columns (audiosistema, diski, electropodemniki, fary, salon,
    upravlenie_klimatom, usilitel_rul) are looked up directly in
    ``OPTIONS_DICT``. The bracketed columns (audiosistema_mult,
    shini_i_diski_mult) are stripped of '[]' and parsed as a float first.
    Unknown or missing ids become 'Нет данных'.

    NOTE: not idempotent — a second call finds labels instead of ids and maps
    the single-valued columns to 'Нет данных'.

    Returns the same ``df`` object.
    """
    columns = [
        'audiosistema',
        'diski',
        'electropodemniki',
        'fary',
        'salon',
        'upravlenie_klimatom',
        'usilitel_rul'
    ]
    for col in columns:
        df[col] = df[col].apply(lambda x: OPTIONS_DICT.get(x, 'Нет данных'))
    for col in ['audiosistema_mult', 'shini_i_diski_mult']:
        df[col] = df[col].apply(_multi_option_label)
    return df


train = options_column_transform_inplace(train)
val = options_column_transform_inplace(val)
test = options_column_transform_inplace(test)

In [None]:
def equipment_mode_transform(row):
    """Fill a missing equipment value from ``EQUIPMENT_MODE_DICT``.

    Rows whose equipment is the placeholder 'none' get the entry stored for
    their (brand, model, generation), or 'базовая' when there is none.
    Other rows keep their equipment.
    """
    if row['equipment'] != 'none':
        return row['equipment']
    group_key = (row['brand'], row['model'], row['generation'])
    return EQUIPMENT_MODE_DICT.get(group_key, 'базовая')


train['equipment'] = train.apply(equipment_mode_transform, axis=1)
val['equipment'] = val.apply(equipment_mode_transform, axis=1)
test['equipment'] = test.apply(equipment_mode_transform, axis=1)

In [None]:
# Year the car age is measured against (previously a literal 2023 repeated three times).
REFERENCE_YEAR = 2023

# Mileage per year of age. The age is floored at 1 so that cars from
# REFERENCE_YEAR (age 0) no longer divide by zero and produce inf / NaN.
for frame in (train, val, test):
    age_years = (REFERENCE_YEAR - frame['year']).clip(lower=1)
    frame['old_mileage'] = frame['mileage'] / age_years

In [None]:
# Columns passed to CatBoost as categorical features.
cat_features = [
    'brand_model_generation_restyling', 'brand', 'model', 'generation', 'modification',
    'equipment', 'generation_years', 'body_type', 'drive_type', 
    'engine_type', 
]

# Columns treated as numeric features.
# NOTE(review): the pca_* and tfidf_* columns built earlier in the notebook are
# not listed here, so the models below do not use them — confirm this is intended.
num_features = [  
    'year',
    'mileage',
    'horse_power',
    'engine_volume', 
]

features = cat_features + num_features

# Model inputs per split, each with a fresh RangeIndex.
X_train = train[features].reset_index(drop=True)
X_val = val[features].reset_index(drop=True)
X_test = test[features].reset_index(drop=True)

# Target: the actual sale price.
y_train, y_val, y_test = train['actual_price'], val['actual_price'], test['actual_price']

In [None]:
# Baseline regressor: trained on train, with validation as the eval set.
# With use_best_model=True the eval set also decides which iteration is kept.
params = dict(
    learning_rate=0.05,
    iterations=2000,
    reg_lambda=0.0005,
    colsample_bylevel=1.,
    max_bin=80,
    bagging_temperature=2,
    loss_function='MAE',
    use_best_model=True,
    verbose=False,
    grow_policy='Depthwise',
    random_seed=42,
    # Custom metric defined at the top of the notebook.
    eval_metric=MedianAPE(),
)
model = cb.CatBoostRegressor(
    **params,
)

eval_set = cb.Pool(data=X_val, label=y_val, cat_features=cat_features)
model.fit(X_train, y_train, cat_features=cat_features, eval_set=eval_set, plot=True)

In [None]:
# Relative error of the current model on the validation split.
val['prediction'] = model.predict(X_val)
val['bias'] = (val['actual_price'] - val['prediction']) / val['actual_price']

# Summary of the signed relative error ('bias') and of its absolute value
# ('MAPE' column; values are fractions, not percentages).
stats = pd.DataFrame({
    'bias': val['bias'].describe(),
    'MAPE': val['bias'].abs().describe(),
})

stats

In [None]:
# (importance, feature name) pairs of the regressor, most important first.
importances = sorted(
    zip(model.feature_importances_, model.feature_names_),
    reverse=True,
)
importances

In [None]:
# Relative error of the current model on the test split.
test['prediction'] = model.predict(X_test)
test['bias'] = (test['actual_price'] - test['prediction']) / test['actual_price']

# Summary of the signed relative error ('bias') and of its absolute value
# ('MAPE' column; values are fractions, not percentages).
stats = pd.DataFrame({
    'bias': test['bias'].describe(),
    'MAPE': test['bias'].abs().describe(),
})

stats

In [None]:
# Refit on train + validation and evaluate on test.
# use_best_model is now False: with True, CatBoost keeps the iteration that
# scores best on `eval_set`, and the eval set here is the TEST split, so the
# reported test metric was optimistically biased. The eval set is kept only to
# plot the learning curve.
# NOTE(review): learning_rate is 0.5 here versus 0.05 in the first model —
# confirm this is not a typo.
params = dict(
    learning_rate=0.5,
    iterations=1000,
    reg_lambda=0.0005,
    colsample_bylevel=1.,
    max_bin=80,
    bagging_temperature=2,
    loss_function='MAE',
    use_best_model=False,
    verbose=False,
    grow_policy='Depthwise',
    random_seed=42,
    eval_metric=MedianAPE(),
)
model = cb.CatBoostRegressor(
    **params,
)

X_train_val = pd.concat((X_train, X_val), axis=0)
y_train_val = pd.concat((y_train, y_val), axis=0)

eval_set = cb.Pool(data=X_test, label=y_test, cat_features=cat_features)
model.fit(X_train_val, y_train_val, cat_features=cat_features, eval_set=eval_set, plot=True)

In [None]:
# Relative error of the refitted (train + validation) model on the test split.
test['prediction'] = model.predict(X_test)
test['bias'] = (test['actual_price'] - test['prediction']) / test['actual_price']

# Summary of the signed relative error ('bias') and of its absolute value
# ('MAPE' column; values are fractions, not percentages).
stats = pd.DataFrame({
    'bias': test['bias'].describe(),
    'MAPE': test['bias'].abs().describe(),
})

stats

### Обучим находить машины с actual_price == price

In [None]:
# Classifier for the task named in the heading above: predict whether a car
# sold at its listed price (actual_price == price).
# The original cell fitted on `X_class` / `y_class`, which are only defined much
# further down (val-vs-test labels), so it raised NameError on a fresh kernel
# and duplicated that later cell.
# NOTE(review): the target below is reconstructed from the heading — confirm it
# matches what was intended. 'price' is not among `features`, so it does not
# leak into the inputs.
X_same_price = train[features]
y_same_price = (train['actual_price'] == train['price']).astype(int)

params = dict(
    learning_rate=0.5,
    iterations=2000,
    reg_lambda=0.0005,
    colsample_bylevel=1.,
    max_bin=80,
    bagging_temperature=2,
    loss_function='Logloss',
    verbose=False,
    grow_policy='Depthwise',
    random_seed=42
)
classificator_model = cb.CatBoostClassifier(
    **params,
)

classificator_model.fit(X_same_price, y_same_price, cat_features=cat_features, plot=True)

In [None]:
# Validation row(s) with the largest absolute relative error.
abs_bias_val = val['bias'].abs()
val[abs_bias_val == abs_bias_val.max()]

In [None]:
# The original cell was `test[test['brand'] == ]` — a SyntaxError that stops
# Restart & Run All.
# NOTE(review): the brand to inspect was never filled in; the brand of the
# worst validation row (shown in the previous cell) is assumed here — confirm.
worst_brand = val.loc[val['bias'].abs().idxmax(), 'brand']
test[test['brand'] == worst_brand]

In [None]:
# Shape of the training rows whose listed price is below the actual sale price.
df_train[df_train['price'] < df_train['actual_price']].shape

## Поиск аномалий в данных для объяснения просадки метрики на тесте

In [None]:
# DBSCAN parameters shared by both runs below.
eps = 70
min_samples = 10

# Cluster validation and test rows independently on the numeric features;
# fit_predict returns one cluster label per row.
# NOTE(review): the features are not scaled anywhere in the visible notebook, so
# with eps=70 the distance is presumably dominated by `mileage` — confirm. Also
# verify that `engine_volume` is numeric and free of NaN at this point.
val_dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_val[num_features])
test_dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_test[num_features])

In [None]:
# (cluster label, row count) table for the validation clustering.
unique, counts = np.unique(val_dbscan, return_counts=True)
np.column_stack((unique, counts))

In [None]:
# (cluster label, row count) table for the test clustering.
unique, counts = np.unique(test_dbscan, return_counts=True)
np.column_stack((unique, counts))

In [None]:
# Recompute test predictions and relative errors with the current `model`
# (same computation as in the earlier test-stats cells).
test['prediction'] = model.predict(X_test)
test['bias'] = (test['actual_price'] - test['prediction']) / test['actual_price']

# Summary of the signed relative error ('bias') and of its absolute value
# ('MAPE' column; values are fractions, not percentages).
stats = pd.DataFrame({
    'bias': test['bias'].describe(),
    'MAPE': test['bias'].abs().describe(),
})

stats

In [None]:
# Test rows whose absolute relative error is at or above the median absolute error.
high_bias_df = test[test['bias'].abs() >= test['bias'].abs().median()]

In [None]:
# Median absolute relative error on test (the threshold used for `high_bias_df`).
test['bias'].abs().median()

In [None]:
# Production-year distribution among the high-error test rows.
high_bias_df['year'].value_counts()

In [None]:
# Production-year distribution over the whole test split, for comparison.
test['year'].value_counts()

In [None]:
# Build a val-vs-test dataset: validation rows are labelled 0, test rows 1.
# This adds a 'class_target' column to `val` and `test` in place.
val['class_target'] = 0
test['class_target'] = 1
clasification_df = pd.concat((val, test), axis=0)
# Same feature columns as the price model; the label is the split membership.
X_class = clasification_df[features]
y_class = clasification_df['class_target']

In [None]:
# Classifier that tries to tell validation rows (0) from test rows (1) using the
# price-model features.
params = {
    'learning_rate': 0.5,
    'iterations': 2000,
    'reg_lambda': 0.0005,
    'colsample_bylevel': 1.,
    'max_bin': 80,
    'bagging_temperature': 2,
    'loss_function': 'Logloss',
    'verbose': False,
    'grow_policy': 'Depthwise',
    'random_seed': 42,
}
classificator_model = cb.CatBoostClassifier(**params)

classificator_model.fit(X_class, y_class, cat_features=cat_features, plot=True)

In [None]:
# (importance, feature name) pairs of the val-vs-test classifier, most important first.
importances = sorted(
    zip(classificator_model.feature_importances_, classificator_model.feature_names_),
    reverse=True,
)
importances