In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, recall_score
from sklearn.pipeline import Pipeline

from sklearn.decomposition import  PCA
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import PowerTransformer, StandardScaler


from sklearn.compose import ColumnTransformer

import matplotlib.colors as colors
colors_list = list(colors._colors_full_map.values())  # list of colour values from matplotlib's (private) full colour map
from tqdm import tqdm_notebook

In [None]:
# !pip install catboost
import catboost

# Подготовка данных

In [None]:
# Load the training table and discard the uninformative `id` column.
df = pd.read_csv('Train.csv')
df.drop(columns='id', inplace=True)
# Note: the remaining, very sparse columns (positions 12..22) are intentionally kept here:
# df.drop(df.columns[12:23], axis=1, inplace=True)
df.head()

In [None]:
# Fill missing metro distances with the column median.
# The result is assigned back explicitly: a chained `inplace=True` on `df.metro_dist`
# may operate on a temporary object and leave `df` untouched.
df['metro_dist'] = df['metro_dist'].fillna(df['metro_dist'].median())

# Re-express the date as a number of days counted from the earliest row.
df['date'] = pd.to_datetime(df['date'])
start = df['date'].min()  # origin of the time axis; also subtracted from the Test.csv dates further down
# Timedelta -> float days. `convert_dtypes(float)` does not cast to float (its first
# positional argument is `infer_objects`), so the conversion is done explicitly.
# The +100 offset is needed for a correct computation of the boosting sample weights.
df['date'] = (df['date'] - start).dt.total_seconds() / (24 * 60 * 60) + 100
df.head()

## Удаляю пасхалку

In [None]:
# Histogram grid over the columns of the training frame, to eyeball the distributions.
df.hist(figsize=(15, 10))
plt.show()

In [None]:
# Price against area; low alpha so that dense regions stand out.
area_values = df.area.values
price_values = df.price.values
plt.figure(figsize=(20, 10))
plt.grid()
plt.scatter(area_values, price_values, alpha=0.1)
plt.show()

In [None]:
# unique[0] holds the distinct prices, unique[1] how many times each one occurs.
unique = np.unique(df.price.values, return_counts=True)
# Positions of the distinct prices ordered by ascending frequency.
arg = np.argsort(unique[1])
# the "easter egg": a price of exactly 1,000,000 (1kk)
# np.sort(unique[1])

In [None]:
# Occurrence counts of the distinct prices, most frequent first.
k = np.sort(unique[1])[::-1]

In [None]:
# Remove every row priced at exactly 1,000,000 (the "easter egg" spike).
# An explicit copy is taken because the resulting frame is modified in place by
# later cells (row drops, column assignment); this keeps it independent of the
# original frame and avoids SettingWithCopy ambiguity.
df = df.loc[df.price.values != 1000000, :].copy()
df.shape

## Удаляю выброс

In [None]:
from sklearn.preprocessing import PowerTransformer

# Box-Cox transform (standardised) of the price, then a fine-grained histogram of the result.
price_column = df.price.values.reshape(-1, 1)
PT = PowerTransformer(method='box-cox', standardize=True)
test = PT.fit(price_column).transform(price_column)

plt.figure(figsize=(15, 10))
plt.hist(test, bins=400)
plt.grid()
plt.show()

In [None]:
# Разбираю выброс
k = np.histogram(test, bins=400)
arg = np.argmax(k[:][0])  # номер столбика - выброса
p = k[:][0] / np.sum(k[:][0])

In [None]:
# вероятности  оставлять/не оставлять элемент в выборке (прореживаю)
p = [(k[0][arg + 1] + k[0][arg - 1]) / 2 / k[0][arg], 1 - (k[0][arg + 1] + k[0][arg - 1]) / 2 / k[0][arg]]  

In [None]:
np.random.seed(seed=42)
# Rows whose transformed price lies in the outlier bin. np.histogram bins are
# half-open [left, right), so the left edge is included here; with a strict `>`
# a value sitting exactly on the edge would be counted by the histogram but
# missed by the mask.
mask2 = ((test >= k[1][arg]) & (test < k[1][arg + 1])).reshape(-1)
# Random flag for each row of the outlier bin (True is drawn with probability p[1]).
# Its length is taken from mask2 itself so that it always matches the rows it
# is later used to index.
mask1 = np.random.choice([False, True], size=int(mask2.sum()), p=p)

In [None]:
# похоже дом построили)))
plt.figure(figsize=(15, 10))
plt.scatter(df.area[mask2].values, df.price[mask2].values, alpha = 0.3)
plt.grid()
plt.show()

In [None]:
# Same picture restricted to the rows of the outlier bin flagged by mask1.
flagged_area = df.area[mask2].values[mask1]
flagged_price = df.price[mask2].values[mask1]
plt.figure(figsize=(15, 10))
plt.scatter(flagged_area, flagged_price, alpha=0.3)
plt.grid()
plt.show()

In [None]:
# Index labels of the rows inside the outlier bin that mask1 flagged, then remove them.
spike_prices = df.price.iloc[mask2]
index = spike_prices[mask1].index
df.drop(index=index, inplace=True)
df.shape

In [None]:
# вроде красивее 
PT = PowerTransformer(method='box-cox', standardize=True)
PT.fit(df.price.values.reshape(-1, 1))
test = PT.transform(df.price.values.reshape(-1, 1))
plt.figure(figsize=(15, 10))
plt.hist(test, bins=400)
plt.grid()
plt.show()

## Заполняю build_tech (т.к. это довольно сильный признак)

In [None]:
# Median and mean price for each build_tech value (0, 1, 2), then the frame summary.
for tech_value in (0, 1, 2):
    tech_prices = df[df.build_tech.values == tech_value].price
    print(f'{tech_value} - Медиана:', np.median(tech_prices))
    print(f'{tech_value} - Среднее:', np.mean(tech_prices))
df.info()

In [None]:
# Rows with a known build_tech: the training material for the imputation classifier.
df_build = df.iloc[df.build_tech.notna().values, :]
y = 'build_tech'  # target column
# Feature columns. Lists rather than sets: pandas >= 2.0 rejects a set as a
# DataFrame indexer, and a list fixes the column order from run to run.
x = ['street_id', 'date', 'floor', 'area', 'rooms', 'balcon', 'metro_dist', 'n_photos', 'kw1', 'kw2']
x_cat = ['street_id', 'kw1', 'kw2']  # the features passed to CatBoost as categorical

In [None]:
# Hold-out split for checking the build_tech classifier. The seed is fixed so that
# the reported validation scores can be reproduced; `list(x)` keeps the column
# selection valid even when `x` is not a list.
df_train_x, df_valid_x, build_y_train, build_y_valid = train_test_split(
    df_build[list(x)], df_build[y].values, random_state=42
)
# Normalised date values used as sample weights: later listings get priority.
weights_train_build = df_train_x.date.values / np.sum(df_train_x.date.values)


In [None]:
# Weighted pool used to train the classifier.
Pool_build = catboost.Pool(
    data=df_train_x,
    label=build_y_train,
    cat_features=x_cat,
    weight=weights_train_build,
)
# Unlabelled pool used to check prediction quality on the hold-out part.
Pool_test = catboost.Pool(data=df_valid_x, cat_features=x_cat)

In [None]:
# Fit the build_tech classifier and predict on both the training and the hold-out pool.
Cls = catboost.CatBoostClassifier(random_seed=42)
Cls.fit(Pool_build, verbose=False)

pred_train_build = Cls.predict(Pool_build)
pred_test_build = Cls.predict(Pool_test)
# Aliases under which the metric cell reads the predictions.
build_train = pred_train_build
build_test = pred_test_build

In [None]:
# scikit-learn metrics take (y_true, y_pred); the ground truth goes first so that
# recall is computed with respect to the true classes.
# The trailing numbers are scores noted from an earlier run.
print(accuracy_score(build_y_train, build_train))  # 0.952
print(accuracy_score(build_y_valid, build_test))  # 0.949
print(recall_score(build_y_train, build_train, average='weighted'))  # 0.955
print(recall_score(build_y_valid, build_test, average='weighted'))  # 0.949

In [None]:
# Normalised date values as sample weights: later listings get priority.
weights_full_build = df_build.date.values / np.sum(df_build.date.values)
# Rows whose build_tech is missing and has to be predicted.
df_build_isna = df.iloc[df.build_tech.isna().values, :]

# Both pools must expose exactly the same feature columns in the same order;
# building the prediction pool from the whole frame would also feed the model
# the target itself, the price and every other column it was never trained on.
feature_columns = list(x)

# data used to fill the NaNs
Pool_pred = catboost.Pool(df_build_isna[feature_columns], cat_features=x_cat)
# data used to train the final model
Pool_build_all = catboost.Pool(
    df_build[feature_columns],
    label=df_build[y],
    weight=weights_full_build,
    cat_features=x_cat,
)

In [None]:
# Final imputation classifier, trained on every row with a known build_tech.
Cls = catboost.CatBoostClassifier()
Cls.fit(Pool_build_all, verbose=False)
test = Cls.predict(Pool_build_all)  # in-sample predictions, only used for the sanity check below
# Metrics take (y_true, y_pred): ground truth first.
print(recall_score(df_build[y], test, average='weighted'))  # 0.955
print(accuracy_score(df_build[y], test))  # 0.955

In [None]:
# Predict the missing build_tech values and write them into the frame.
pred_build = Cls.predict(Pool_pred)
# The column is addressed by name instead of a hard-coded position, and the
# predictions are flattened because a multiclass predict may return an (n, 1) array.
missing_build_tech = df.build_tech.isna().values
df.loc[missing_build_tech, 'build_tech'] = np.ravel(pred_build)
df.info()

In [None]:
# Median and mean price for each g_lift value.
for lift_value in (0, 1):
    lift_prices = df[df.g_lift.values == lift_value].price
    print(f'{lift_value} - Медиана:', np.median(lift_prices))
    print(f'{lift_value} - Среднее:', np.mean(lift_prices))
# The feature does not seem to be related to the price at all, so it is simply removed.
df.drop(columns='g_lift', inplace=True)
df.head()

In [None]:
# # Useful feature (disabled exploration: per-floor median and mean price)
# for i in np.unique(df.floor.values):
#     print(i, '- Медиана:', np.median(df[df.floor.values == i].price))
#     print(i, '- Среднее:', np.mean(df[df.floor.values == i].price))

## Кластеризация

In [None]:
from sklearn.metrics import adjusted_rand_score
from sklearn.mixture import BayesianGaussianMixture
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

In [None]:
# Columns the clustering is computed on. A list instead of a set: pandas >= 2.0
# refuses a set as a DataFrame indexer, and a list keeps the column order fixed.
claster = ['build_tech', 'area', 'metro_dist', 'rooms', 'floor']
# Standardise the columns, then fit a 7-component Bayesian Gaussian mixture.
classtering = Pipeline([
    ('SS', StandardScaler()),
    # ('Kmeans', KMeans(n_clusters=7))
    ('BayesianGaussianMixture', BayesianGaussianMixture(n_components=7, random_state=42))
])

In [None]:
# Fit the clustering pipeline and get one cluster label per row.
# `list(claster)` keeps the selection valid when `claster` is a set, which
# pandas >= 2.0 no longer accepts as an indexer.
labels = classtering.fit_predict(df[list(claster)])
df.insert(11, value=labels, column='Class')  # add the cluster labels to the frame as column number 11
np.unique(labels, return_counts=True)

## Визуализация кластеризации (очень долго считается, но красиво, похоже на кулич)

In [None]:
# 2-D t-SNE embedding of the standardised clustering columns (slow to compute).
SS = StandardScaler()
# `list(claster)`: pandas >= 2.0 does not accept a set as an indexer.
for_tsne = SS.fit_transform(df[list(claster)])
# Seeded so that the picture is the same on every run.
tsne = TSNE(n_components=2, random_state=42)
for_tsne = tsne.fit_transform(for_tsne)

In [None]:
# One scatter layer per cluster, each drawn in its own colour.
plt.figure(figsize=(15, 10))
for i in np.unique(labels):
    in_cluster = labels == i
    # `color=` sets the marker colour; `cmap=` expects a colormap and only has an
    # effect together with `c=`, so passing a colour value there never coloured the points.
    plt.scatter(for_tsne[in_cluster, 0], for_tsne[in_cluster, 1], color=colors_list[i], alpha=0.02)
plt.show()

## Финальная подготовка данных для обучения и теста

In [None]:
# Column overview right before the final train/validation split.
df.info()

In [None]:
# Categorical feature names handed to CatBoost. A list is used because the
# documented types for `cat_features` are list / numpy.ndarray; a set has no
# defined order and is not among them.
cat_index = ['street_id', 'kw1', 'kw2', 'Class']
# The first 12 columns are the features, `price` is the target.
# shuffle=False: the last rows (in their current order) become the validation part.
df_train, df_valid, y_train, y_valid = train_test_split(
    df.iloc[:, :12], df.price, random_state=42, shuffle=False
)
df_valid.head()

In [None]:
# Observation weights: normalised date values, so they grow with time
# (the model has to predict into the future).
train_dates = df_train.date.values
weights = train_dates / np.sum(train_dates)
D_train = catboost.Pool(data=df_train, label=y_train, cat_features=cat_index, weight=weights)
D_valid = catboost.Pool(data=df_valid, label=y_valid, cat_features=cat_index)

# Same construction on the complete data set, for the final fit.
all_dates = df.date.values
full_weights = all_dates / np.sum(all_dates)
X_train = catboost.Pool(data=df.iloc[:, :12], label=df.price, cat_features=cat_index, weight=full_weights)

# Catboost

## Обучение и валидация на разбитой выборке

In [None]:
# Train n_models regressors on the training split; they differ only in the random seed.
n_models = 5
split_model_params = dict(
    loss_function='MAE',
    learning_rate=0.15,
    max_depth=6,
    n_estimators=1200,
    l2_leaf_reg=50,
    # alternative that was tried: bootstrap_type='Bernoulli', subsample=0.6
    bootstrap_type='Bayesian',
    bagging_temperature=0.5,
    rsm=0.7,
)
models = []
for i in tqdm_notebook(range(n_models)):
    model = catboost.CatBoostRegressor(random_state=i, **split_model_params)
    model.fit(D_train, verbose=False)
    models.append(model)

In [None]:
# Combine the trained models into one, giving each the same weight 1 / len(models).
model_avg = catboost.sum_models(models=models, weights=[1.0 / len(models)] * len(models))

In [None]:
# Predictions of the combined model on the training and on the validation pool.
pred_train = model_avg.predict(D_train)
pred_test = model_avg.predict(D_valid)

In [None]:
# Mean absolute error on the training and on the validation part
# (the trailing numbers are values noted from earlier runs).
train_mae = mean_absolute_error(pred_train, y_train)
valid_mae = mean_absolute_error(pred_test, y_valid)
print(train_mae)  #  364 - 13
print(valid_mae)  #   458

## Обучение на всей выборке

In [None]:
# Same ensemble, this time fitted on the complete data set (X_train pool).
full_model_params = dict(
    loss_function='MAE',
    learning_rate=0.15,
    max_depth=6,
    n_estimators=1200,
    l2_leaf_reg=50,
    # alternative that was tried: bootstrap_type='Bernoulli', subsample=0.6
    bootstrap_type='Bayesian',
    bagging_temperature=0.5,
    rsm=0.7,
)
models_full = []
for i in tqdm_notebook(range(n_models)):
    model = catboost.CatBoostRegressor(random_state=i, **full_model_params)
    model.fit(X_train, verbose=False)
    models_full.append(model)

In [None]:
# Combine the models trained on the full data set, each with weight 1 / len(models_full).
model_avg_full = catboost.sum_models(models=models_full, weights=[1.0 / len(models_full)] * len(models_full))

# Предсказание

In [None]:
# Load the data to predict on and bring it to the same layout as the training frame.
df_pred = pd.read_csv('Test.csv')
df_pred.drop('id', axis=1, inplace=True)  # uninformative column
# Fill the gaps with the median. The result is assigned back explicitly: a chained
# `inplace=True` on `df_pred.metro_dist` may act on a temporary object and leave
# the frame unchanged.
df_pred['metro_dist'] = df_pred['metro_dist'].fillna(df_pred['metro_dist'].median())
df_pred.drop(df_pred.columns[12:23], axis=1, inplace=True)  # the remaining columns are very sparse
df_pred.drop('g_lift', axis=1, inplace=True)  # seems unrelated to the price, simply removed
df_pred.head()

In [None]:
# отформатировал дату
df_pred.date = pd.to_datetime(df_pred.date)
df_pred.date = df_pred.date - start
df_pred.date = df_pred.date.convert_dtypes(float) / 24 / 60 / 60 / 10 ** 9  + 100# дни
df_pred.head()

In [None]:
# Histogram grid over the columns of the prediction frame.
df_pred.hist(figsize=(15, 10))

## Заполняю build_tech по актуальным данным, аналогично тренировочной выборке (но без приоритета по дате)

In [None]:
# Rows of the prediction frame with a known build_tech.
df_build = df_pred.iloc[df_pred.build_tech.notna().values, :]
y = 'build_tech'  # target column
# Lists rather than sets: pandas >= 2.0 rejects a set as a DataFrame indexer,
# and a list fixes the column order.
x = ['street_id', 'date', 'floor', 'area', 'rooms', 'balcon', 'metro_dist', 'n_photos', 'kw1', 'kw2']
x_cat = ['street_id', 'kw1', 'kw2']
# Seeded hold-out split so that the validation scores are reproducible.
df_train_x, df_valid_x, build_y_train, build_y_valid = train_test_split(
    df_build[x], df_build[y].values, random_state=42
)
Pool_build = catboost.Pool(df_train_x, label=build_y_train, cat_features=x_cat)  # for training the classifier
Pool_test = catboost.Pool(df_valid_x, cat_features=x_cat)  # for checking prediction quality

In [None]:
# Fit the build_tech classifier on the split and predict on both pools.
Cls = catboost.CatBoostClassifier()
Cls.fit(Pool_build, verbose=False)

pred_train_build = Cls.predict(Pool_build)
pred_test_build = Cls.predict(Pool_test)
# Aliases under which the metric cell reads the predictions.
build_train = pred_train_build
build_test = pred_test_build

In [None]:
# scikit-learn metrics take (y_true, y_pred); the ground truth goes first.
# The trailing numbers are scores noted from an earlier run.
print(accuracy_score(build_y_train, build_train))  # 0.955
print(accuracy_score(build_y_valid, build_test))  # 0.942
print(recall_score(build_y_train, build_train, average='weighted'))  # 0.955
print(recall_score(build_y_valid, build_test, average='weighted'))  # 0.942

In [None]:
# Rows of the prediction frame whose build_tech is missing.
df_build_isna = df_pred.iloc[df_pred.build_tech.isna().values, :]

# Both pools must expose the same feature columns in the same order; a pool built
# from the whole frame would also contain the (empty) target column and every
# other column the classifier is not trained on.
feature_columns = list(x)

# data used to fill the NaNs
Pool_pred = catboost.Pool(df_build_isna[feature_columns], cat_features=x_cat)
# data used to train the final classifier (no date-based weights here)
Pool_build_all = catboost.Pool(df_build[feature_columns], label=df_build[y], cat_features=x_cat)

In [None]:
# Final imputation classifier for the prediction frame, trained on all its labelled rows.
Cls = catboost.CatBoostClassifier()
Cls.fit(Pool_build_all, verbose=False)
test = Cls.predict(Pool_build_all)  # in-sample predictions, only used for the sanity check below
# Metrics take (y_true, y_pred): ground truth first.
print(recall_score(df_build[y], test, average='weighted'))
print(accuracy_score(df_build[y], test))

In [None]:
# Predict the missing build_tech values of the prediction frame and write them in.
pred_build = Cls.predict(Pool_pred)
# The column is addressed by name instead of a hard-coded position, and the
# predictions are flattened because a multiclass predict may return an (n, 1) array.
missing_build_tech = df_pred.build_tech.isna().values
df_pred.loc[missing_build_tech, 'build_tech'] = np.ravel(pred_build)
df_pred.info()

In [None]:
# Cluster labels for the prediction frame from the already fitted pipeline.
# `list(claster)` keeps the selection valid when `claster` is a set, which
# pandas >= 2.0 no longer accepts as an indexer.
labels_pred = classtering.predict(df_pred[list(claster)])
df_pred.insert(11, value=labels_pred, column='Class')  # add the cluster labels to the frame as column number 11
np.unique(labels_pred, return_counts=True)

In [None]:
# Unlabelled pool over the prepared prediction frame, with the same categorical features as in training.
X_pred = catboost.Pool(df_pred, cat_features=cat_index)

In [None]:
# Price predictions of the combined model that was trained on the full data set.
Pred = model_avg_full.predict(X_pred)

In [None]:
# Put the predictions into the sample submission and save it.
submission = pd.read_csv('SampleSubmission.csv')
# Item assignment always writes a real column; attribute assignment would only
# set a Python attribute (and leave the saved file unchanged) if the sample file
# had no `price` column.
submission['price'] = Pred
submission.to_csv('boost_ensemble.csv', index=False)