In [20]:
import pandas as pd
import numpy as np
import typing
import torch

from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

import phik
from phik.report import plot_correlation_matrix
from phik import report

In [21]:
train_data = pd.read_csv('https://drive.google.com/file/d/1SZWyFSM6bOSjrrO20ZbE6bTOxy2fkdEC/view?usp=sharing', low_memory=False)
test_data = pd.read_csv('https://drive.google.com/file/d/1YONOySodli6pkGyd-Kvn1-A712SM4FQT/view?usp=sharing', low_memory=False)
train_data.shape, test_data.shape

((279792, 77), (2974, 76))

## EDA analisys

Анализ отчет pandas_profiling (! big size file) https://drive.google.com/file/d/1xQl3LvpX9J0G6gJoaBjzRcBFKZi6QZXz/view?usp=sharing

In [22]:
for col in train_data.select_dtypes(include=np.number).columns:
    train_data[col] = pd.to_numeric(train_data[col], downcast = 'unsigned')

In [23]:
train_data.describe()

Unnamed: 0,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,...,reform_count_of_houses_500,reform_house_population_1000,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,total_square,realty_type,price_type
count,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,...,279792.0,265196.0,252558.0,263084.0,249624.0,263553.0,250155.0,279792.0,279792.0,279792.0
mean,54.364078,47.76354,2.709084,40.605146,81.596171,133.285458,0.037442,0.885701,2.046467,3.748163,...,30.110661,2042.541716,644.610557,7.051233,7.360464,1967.532599,1967.98858,507.833604,54.974088,0.016058
std,4.245713,17.044625,4.202451,53.293388,105.193169,172.290136,0.391014,6.858338,14.801566,25.679859,...,27.686234,1359.884747,445.699329,3.542084,4.231369,45.807699,54.110015,1704.251771,47.856417,0.1257
min,42.651897,19.892178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.1,10.0,0.0
25%,53.2266,37.582988,0.0,7.0,16.0,28.0,0.0,0.0,0.0,0.0,...,10.0,932.0,290.0,4.591837,4.619959,1960.07,1959.890097,65.9,10.0,0.0
50%,55.67909,39.702435,1.0,22.0,46.0,77.0,0.0,0.0,0.0,0.0,...,25.0,1949.0,602.0,6.368932,6.395349,1970.890411,1971.647059,128.737034,10.0,0.0
75%,56.306976,55.957523,4.0,51.0,101.0,164.0,0.0,0.0,1.0,2.0,...,43.0,2978.0,936.0,8.698925,9.1,1983.701754,1986.95,336.0,110.0,0.0
max,69.50074,151.777,46.0,468.0,851.0,1392.0,30.0,586.0,949.0,1162.0,...,289.0,18392.0,6105.0,53.717949,221.666667,2019.0,2020.0,40000.0,110.0,1.0


In [24]:
def check_features(df):
    return pd.DataFrame({'unique_values': df.nunique(),'type': df.dtypes,'pct_missing': df.isna().sum()/len(df) * 100}).sort_values(by = 'pct_missing', ascending=False)

In [25]:
check_features(train_data).T

Unnamed: 0,floor,reform_mean_floor_count_500,reform_mean_year_building_500,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_year_building_1000,reform_house_population_1000,street,osm_city_nearest_population,region,...,osm_finance_points_in_0.005,osm_finance_points_in_0.001,osm_culture_points_in_0.01,osm_culture_points_in_0.0075,osm_culture_points_in_0.005,osm_culture_points_in_0.001,osm_crossing_points_in_0.01,osm_crossing_points_in_0.0075,osm_crossing_points_in_0.005,price_type
unique_values,206,17121,43143,2366,49017,76044,6206,28841,169,49,...,29,7,216,159,111,16,268,191,108,2
type,object,float64,float64,float64,float64,float64,float64,object,float64,object,...,uint8,uint8,uint16,uint16,uint8,uint8,uint16,uint8,uint8,uint8
pct_missing,62.9886,10.7823,10.5925,9.73366,5.97158,5.80395,5.21673,0.573998,0.0196575,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
train_data['floor'].unique()

array([nan, 3.0, 4.0, -1.0, 1.0, 14.0, 2.0, 8.0, -2.0, 6.0, 10.0, 5.0,
       16.0, 19.0, 7.0, 9.0, 58.0, 24.0, 18.0, 26.0, 17.0, 48.0, 11.0,
       -3.0, 15.0, 22.0, 60.0, 12.0, 21.0, 35.0, 28.0, 38.0, 39.0, 13.0,
       81.0, 44.0, 82.0, 25.0, 45.0, 47.0, 23.0, 37.0, 29.0, 113.0, 78.0,
       42.0, 69.0, 27.0, 46.0, 53.0, 80.0, 70.0, 76.0, 64.0, 30.0, 73.0,
       77.0, 52.0, 67.0, 65.0, 20.0, 40.0, 49.0, 75.0, 93.0, 94.0, 91.0,
       72.0, 79.0, 84.0, 92.0, 33.0, 66.0, 90.0, 31.0, 36.0, 61.0, 71.0,
       68.0, 51.0, 97.0, 43.0, 95.0, 85.0, 50.0, 0.0, 62.0, 54.0, 74.0,
       57.0, 41.0, 34.0, 59.0, 56.0, 123.0, 55.0, 83.0, '27.0', '1.0',
       '5.0', '-1.0', '67.0', '2.0', '0.0', '4.0', '6.0', '3.0', '15.0',
       '10.0', '11.0', '30.0', '12.0', '-2.0', '14.0', '36.0', '8.0',
       '50.0', '17.0', '19.0', '37.0', '68.0', '7.0', '42.0', '9.0',
       '16.0', '20.0', '53.0', '91.0', '84.0', '38.0', '21.0', '48.0',
       '22.0', '23.0', '1', '18.0', 'подвал, 1', '2', 'подвал',
  

In [28]:
#Вручную обработали столбец с этажами
for df in [train_data, test_data]:
    df.replace('1', 1, inplace=True)
    df.replace('1.0', 1, inplace=True)

train_data['floor'] = train_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)
test_data['floor'] = test_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)

In [27]:
# вручную удалила повторяющиеся признаки с разным метражом, оставив только значение "in 0.01" для osm и "1000" для reform, чтобы матрица корреляции помещалась на экране
columns_for_pearson = [
                       'per_square_meter_price', 
                       'city', 'floor',
                       'osm_amenity_points_in_0.01', 
                       'osm_building_points_in_0.01', 
                       'osm_catering_points_in_0.01', 
                       'osm_city_closest_dist', 
                       'osm_city_nearest_name', 
                       'osm_city_nearest_population',
                       'osm_crossing_closest_dist', 
                       'osm_crossing_points_in_0.01', 
                       'osm_culture_points_in_0.01',
                       'osm_healthcare_points_in_0.01', 
                       'osm_historic_points_in_0.01', 
                       'osm_hotels_points_in_0.01',
                       'osm_leisure_points_in_0.01', 
                       'osm_offices_points_in_0.01', 
                       'osm_shops_points_in_0.01', 
                       'osm_subway_closest_dist',
                       'osm_train_stop_closest_dist', 
                       'osm_train_stop_points_in_0.01', 
                       'osm_transport_stop_closest_dist',
                       'osm_transport_stop_points_in_0.01', 
                       'reform_count_of_houses_1000', 
                       'reform_house_population_1000',
                       'reform_mean_floor_count_1000', 
                       'reform_mean_year_building_1000', 
                       'region', 
                       'total_square', 
                       'street', 
                       'date', 
                       'realty_type', 
                       'price_type']

In [29]:
# Находим признаки, имеющие максимальное значение коэффициента корреляции Пирсона с целевой переменной. 
# В качестве отсечки выбрано значение 0.4 по модулю
pearson = train_data[columns_for_pearson].corr().round(2)
pearson_max_corr = (
    pearson['per_square_meter_price'].to_frame().reset_index()
    .rename(columns={'per_square_meter_price':'pearson', 'index':'feature'})
    .sort_values(by='pearson', ascending=False)
    .query('pearson > 0.4 or pearson < -0.4')
    )
pearson_max_corr

Unnamed: 0,feature,pearson
0,per_square_meter_price,1.0
6,osm_city_nearest_population,0.55
2,osm_amenity_points_in_0.01,0.48
10,osm_healthcare_points_in_0.01,0.46
4,osm_catering_points_in_0.01,0.46
13,osm_leisure_points_in_0.01,0.46
15,osm_shops_points_in_0.01,0.44
20,osm_transport_stop_points_in_0.01,0.43
8,osm_crossing_points_in_0.01,0.43
14,osm_offices_points_in_0.01,0.42


In [30]:
# fig, ax = plt.subplots(figsize=(20, 20))
# sns.heatmap(train_data[columns_for_pearson].corr().round(2), annot=True, square=True, cmap='mako')
# ax.set_title(label = 'МАТРИЦА КОРРЕЛЯЦИИ ПРИЗНАКОВ $r$', fontdict={'fontsize': 15, 'fontweight': 'bold'})
# plt.show();

Высокая взаимная корреляция (> 0.8) у следующих переменных:
- amenity и catering, healthcare, office, shop
- catering и shop
- office и shop, catering
- healthcare и catering, office, shop,
- transport и crossing

In [31]:
# убрали city и street(закодирована)

columns_for_phik = [
  'per_square_meter_price',
 'floor',
 'osm_amenity_points_in_0.01',
 'osm_building_points_in_0.01',
 'osm_catering_points_in_0.01',
 'osm_city_closest_dist',
 'osm_city_nearest_name',
 'osm_city_nearest_population',
 'osm_crossing_closest_dist',
 'osm_crossing_points_in_0.01',
 'osm_culture_points_in_0.01',
 'osm_healthcare_points_in_0.01',
 'osm_historic_points_in_0.01',
 'osm_hotels_points_in_0.01',
 'osm_leisure_points_in_0.01',
 'osm_offices_points_in_0.01',
 'osm_shops_points_in_0.01',
 'osm_subway_closest_dist',
 'osm_train_stop_closest_dist',
 'osm_train_stop_points_in_0.01',
 'osm_transport_stop_closest_dist',
 'osm_transport_stop_points_in_0.01',
 'reform_count_of_houses_1000',
 'reform_house_population_1000',
 'reform_mean_floor_count_1000',
 'reform_mean_year_building_1000',
 'region',
 'total_square',
 'date',
 'realty_type',
 'price_type']

In [32]:
# считаем корреляцию phik - она позволяет находитб взаимосвязи в том числе между категориальными переменными. 
# направление взаимосвязи не видно, только абсолютное значение. Чем ближе к единице, тем лучше

# выделяем интервальные переменные
interval_cols = ['osm_amenity_points_in_0.01', 'osm_building_points_in_0.01',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_population','osm_crossing_closest_dist', 'osm_crossing_points_in_0.001', 'osm_culture_points_in_0.01',
       'osm_healthcare_points_in_0.01', 'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.01',
       'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.01',
       'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
       'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.01',
       'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.01', 'per_square_meter_price',
       'reform_count_of_houses_1000', 'reform_house_population_1000', 
       'reform_mean_floor_count_1000', 'reform_mean_year_building_1000',
      'total_square', 'realty_type', 'price_type', 'many_floors',
      'city', 'street']

# строим матрицу корреляции
phik_overview = train_data[columns_for_phik].phik_matrix(interval_cols=interval_cols)

phik_overview.round(2)

# визуализируем с помощью тепловой карты
# fig, ax = plt.subplots(figsize=(20, 20))
# sns.heatmap(phik_overview.round(2), annot=True, square=True, cmap='mako')
# ax.set_title(label = 'МАТРИЦА КОРРЕЛЯЦИИ ПРИЗНАКОВ $\phi_K$', fontdict={'fontsize': 15, 'fontweight': 'bold'})

# plt.tight_layout()
# plt.show();

Unnamed: 0,per_square_meter_price,floor,osm_amenity_points_in_0.01,osm_building_points_in_0.01,osm_catering_points_in_0.01,osm_city_closest_dist,osm_city_nearest_name,osm_city_nearest_population,osm_crossing_closest_dist,osm_crossing_points_in_0.01,...,osm_transport_stop_points_in_0.01,reform_count_of_houses_1000,reform_house_population_1000,reform_mean_floor_count_1000,reform_mean_year_building_1000,region,total_square,date,realty_type,price_type
per_square_meter_price,1.0,0.18,0.48,0.02,0.48,0.05,0.57,0.55,0.0,0.57,...,0.46,0.16,0.19,0.23,0.03,0.52,0.02,0.11,0.18,0.05
floor,0.18,1.0,0.07,0.01,0.06,0.08,0.28,0.15,0.01,0.14,...,0.12,0.03,0.1,0.15,0.0,0.23,0.01,0.22,0.04,0.0
osm_amenity_points_in_0.01,0.48,0.07,1.0,0.02,0.96,0.07,0.56,0.49,0.0,0.82,...,0.69,0.63,0.62,0.25,0.04,0.45,0.05,0.07,0.18,0.22
osm_building_points_in_0.01,0.02,0.01,0.02,1.0,0.02,0.0,0.28,0.17,0.0,0.11,...,0.04,0.03,0.03,0.03,0.0,0.22,0.0,0.01,0.03,0.01
osm_catering_points_in_0.01,0.48,0.06,0.96,0.02,1.0,0.05,0.51,0.47,0.0,0.8,...,0.63,0.57,0.6,0.23,0.03,0.4,0.05,0.07,0.19,0.2
osm_city_closest_dist,0.05,0.08,0.07,0.0,0.05,1.0,0.8,0.11,0.73,0.22,...,0.15,0.09,0.11,0.13,0.0,0.4,0.0,0.02,0.06,0.01
osm_city_nearest_name,0.57,0.28,0.56,0.28,0.51,0.8,1.0,1.0,0.42,0.64,...,0.62,0.55,0.48,0.59,0.44,1.0,0.12,0.22,0.37,0.3
osm_city_nearest_population,0.55,0.15,0.49,0.17,0.47,0.11,1.0,1.0,0.0,0.6,...,0.49,0.2,0.24,0.31,0.05,0.97,0.07,0.14,0.14,0.07
osm_crossing_closest_dist,0.0,0.01,0.0,0.0,0.0,0.73,0.42,0.0,1.0,0.0,...,0.02,0.01,0.0,0.01,0.0,0.07,0.0,0.0,0.01,0.0
osm_crossing_points_in_0.01,0.57,0.14,0.82,0.11,0.8,0.22,0.64,0.6,0.0,1.0,...,0.77,0.67,0.59,0.41,0.17,0.55,0.18,0.17,0.31,0.09


In [33]:
# Формируем список переменных с максимальным коэффициентов корреляции phik с целевой переменной 
phik_max_corr = (
    phik_overview['per_square_meter_price'].to_frame().reset_index()
    .rename(columns={'per_square_meter_price':'phik', 'index':'feature'})
    .sort_values(by='phik', ascending=False)
    .query('phik > 0.4')
    .round(2)
    )
phik_max_corr


Unnamed: 0,feature,phik
0,per_square_meter_price,1.0
9,osm_crossing_points_in_0.01,0.57
6,osm_city_nearest_name,0.57
7,osm_city_nearest_population,0.55
26,region,0.52
2,osm_amenity_points_in_0.01,0.48
4,osm_catering_points_in_0.01,0.48
14,osm_leisure_points_in_0.01,0.47
15,osm_offices_points_in_0.01,0.47
16,osm_shops_points_in_0.01,0.47


Матрица корреляции phik показала максимальную взаимосвязь целевой переменной с пременными:
- количество пешеходных переходов в радиусе 1 км
- название ближайшего города
- население ближайшего города
- регион
- количество в радиусе 1 км точек кейтеринга, досуга, офисов, магазинов, медучреждений, остановок общественного транспорта и объектов связаннных с удобством

Также обнаружены новые взаимосвязи переменных между собой, которые не были видны на матрице корреляции Пирсона:
- этаж и price_type
- название близлежащего города и население ближайшего города, расстояние до ближайшего метро, остановки общественного транспорта, регион
- регион и расстояние до ближайшего метро

## Modelling

In [35]:
# add features
city_population = pd.read_csv('city_population.csv')
zarplaty = pd.read_excel('zarplaty.xlsx')

def city_type(row):
    if row >=1000000:
        return "1Million"
    elif  (row<1000000)&(row >200000):
        return "Medium"
    elif  (row <=200000):
        return "Small"
    
def floor_type(row):
    if ('1' in str(row))&(row!=-1):
        return 1
    else:
        return 0
    
def add_features(df):
    df['age'] = round(2021 - df['reform_mean_year_building_500'])
    df.city = df.city.apply(lambda x: x.lower())
    
    city_population_clean = city_population.groupby('settlement').agg({'population':'sum'}).reset_index()
    city_population_clean.columns = ['city', 'city_population']
    city_population_clean['city_population']
    city_population_clean.city = city_population_clean.city.apply(lambda x: x.lower())
    df = df.merge(city_population_clean, on = 'city', how='left')
    
    for col in df.select_dtypes(include=np.number).columns:
        df[col] = pd.to_numeric(df[col], downcast = 'unsigned')
    
    df['city_type'] = df['city_population'].apply(lambda x: city_type(x))
    df.loc[df.city  == 'москва', 'city_type'] = "Capital"
    df.loc[df.city  == 'санкт-Петербург', 'city_type'] = "Capital"
    
    df = df.merge(zarplaty, on = 'region', how='left')
    df['zarplata'] = pd.to_numeric(df['zarplata'], downcast = 'unsigned')
    df['floor_type'] = df['floor'].apply(lambda x: floor_type(x))
    
    return df

In [36]:
train_data = add_features(train_data)
test_data = add_features(test_data)

In [37]:
train_data.shape, test_data.shape

((279967, 82), (2974, 81))

In [38]:
train_data.head()

Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,total_square,street,date,realty_type,price_type,age,city_population,city_type,zarplata,floor_type
0,пермь,0,COL_0,57.998207,56.292797,4,19,35,52,0,...,32.0,S27289,2020-01-05,10,0,60.0,1048011.0,1Million,41958.0,0
1,шатура,0,COL_1,55.574284,39.543835,3,24,37,59,0,...,280.0,S17052,2020-01-05,10,0,64.0,32885.0,Small,58066.0,0
2,ярославль,0,COL_2,57.61914,39.850525,1,30,67,128,0,...,297.4,S16913,2020-01-05,110,0,48.0,604128.0,Medium,,0
3,новокузнецк,0,COL_3,53.897083,87.108604,0,0,5,21,0,...,190.0,S10148,2020-01-05,110,0,7.0,551919.0,Medium,43429.0,0
4,москва,0,COL_4,55.80259,37.48711,1,23,64,153,0,...,60.2,S1338,2020-01-05,10,0,60.0,12380691.0,Capital,100070.0,0


In [39]:
train_data = train_data.query('price_type == 1')

In [40]:
train_data.replace('1', 1, inplace=True)
train_data.replace('1.0', 1, inplace=True)
test_data.replace('1', 1, inplace=True)
test_data.replace('1.0', 1, inplace=True)

train_data['floor'] = train_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)
test_data['floor'] = test_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)

In [41]:
N_THREADS = 4 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
#TEST_SIZE = 0.1 # Test size for metric check
TIMEOUT = 100 # Time in seconds for automl run USE TIMEOUT = 1700 for perfect score

np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [42]:
# train_df, test_df = train_test_split(train_data, 
#                                      test_size=TEST_SIZE, 
#                                      random_state=RANDOM_STATE)

In [43]:
THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1

def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Реализация кастомной метрики для хакатона.

    :param y_true: float, реальная цена
    :param y_pred: float, предсказанная цена
    :return: float, значение метрики
    """
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    if np.abs(deviation) <= THRESHOLD:
        return 0
    elif deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    elif deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2
    elif deviation < 4 * THRESHOLD:
        return ((deviation / THRESHOLD) - 1) ** 2
    else:
        return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    return np.array([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in range(len(y_true))]).mean()

In [44]:
task = Task('reg', loss = 'rmsle', metric = deviation_metric)

roles = {
    'target': 'per_square_meter_price',
    'drop': 'id'
}

In [45]:
automl = TabularUtilizedAutoML(task = task,
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               reader_params = {'n_jobs': N_THREADS,
                                                'cv': N_FOLDS,
                                                'random_state': RANDOM_STATE})

oof_pred = automl.fit_predict(train_data, roles = roles)

Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found reader_params in kwargs, need to combine
Merged variant for reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 42}
Start automl preset with listed constraints:
- time: 100.0 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (4493, 82)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 89.13144850730896 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = [-3.7152774]
Linear model: C = 5e-05 score = [-3.7152774]
Linear model: C = 0.0001 score = [-3.7152774]

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = [-3.3165922]
Linear model: C = 5e-05 score = [-3.3165922]
Linear model: C = 0.0001 score = [-3.3165922]

===== Start working with fold 2 for Lvl_0_Pipe_0

Time limit exceeded after calculating fold 0


Early stopping, best iteration is:
[1336]	valid's l2: 0.117866	valid's Opt metric: 1.6654
Lvl_0_Pipe_1_Mod_0_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_2_CatBoost ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_2_CatBoost =====

0:	learn: 0.5690497	test: 0.5902307	best: 0.5902307 (0)	total: 9.89ms	remaining: 19.8s
100:	learn: 0.3220134	test: 0.3603304	best: 0.3603304 (100)	total: 241ms	remaining: 4.53s
200:	learn: 0.2853510	test: 0.3517094	best: 0.3517094 (200)	total: 498ms	remaining: 4.46s
300:	learn: 0.2576051	test: 0.3490464	best: 0.3490393 (299)	total: 737ms	remaining: 4.16s
400:	learn: 0.2379688	test: 0.3474727	best: 0.3474727 (400)	total: 1s	remaining: 4s
500:	learn: 0.2222526	test: 0.3449390	best: 0.3449390 (500)	total: 1.23s	remaining: 3.67s
600:	learn: 0.2078858	test: 0.3447732	best: 0.3443742 (549)	total: 1.49s	remaining: 3.46s
700:	learn: 0.1960459	test: 0.3442549	best: 0.3441099 (652)	total: 1.75s	remaining: 3.24s
800:	learn: 0.185711

1900:	learn: 0.1579787	test: 0.3420298	best: 0.3418931 (1815)	total: 3.71s	remaining: 193ms
1999:	learn: 0.1538067	test: 0.3423697	best: 0.3418931 (1815)	total: 3.89s	remaining: 0us

bestTest = 0.3418931473
bestIteration = 1815

Shrink model to first 1816 iterations.
Lvl_0_Pipe_1_Mod_3_CatBoost fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_3_CatBoost ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_3_CatBoost =====

0:	learn: 0.5749238	test: 0.5962756	best: 0.5962756 (0)	total: 3.23ms	remaining: 9.69s
100:	learn: 0.3538624	test: 0.3750769	best: 0.3750769 (100)	total: 205ms	remaining: 5.9s
200:	learn: 0.3247688	test: 0.3579003	best: 0.3579003 (200)	total: 397ms	remaining: 5.53s
300:	learn: 0.3067646	test: 0.3529264	best: 0.3529264 (300)	total: 596ms	remaining: 5.34s
400:	learn: 0.2906869	test: 0.3494385	best: 0.3494385 (400)	total: 773ms	remaining: 5.01s
500:	learn: 0.2787513	test: 0.3480935	best: 0.3480935 (500)	total: 940ms	remaining: 4.69s
600:	learn: 0.268

1200:	learn: 0.2311849	test: 0.2972003	best: 0.2971525 (1196)	total: 2.42s	remaining: 3.63s
1300:	learn: 0.2256356	test: 0.2963918	best: 0.2963918 (1300)	total: 2.62s	remaining: 3.42s
1400:	learn: 0.2204745	test: 0.2958023	best: 0.2958023 (1400)	total: 2.81s	remaining: 3.2s
1500:	learn: 0.2158370	test: 0.2953656	best: 0.2953011 (1496)	total: 2.98s	remaining: 2.97s
1600:	learn: 0.2115284	test: 0.2947484	best: 0.2947203 (1567)	total: 3.14s	remaining: 2.75s
1700:	learn: 0.2073122	test: 0.2944945	best: 0.2944290 (1697)	total: 3.37s	remaining: 2.57s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2944289789
bestIteration = 1697

Shrink model to first 1698 iterations.
Lvl_0_Pipe_1_Mod_3_CatBoost fitting and predicting completed
Time left 37.47469425201416


Time limit exceeded in one of the tasks. AutoML will blend level 1 models.


Blending: Optimization starts with equal weights and score -1.7954214419609544
Blending, iter 0: score = -1.4724501895733335, weights = [0.         0.86662203 0.05572809 0.07764989]
Blending, iter 1: score = -1.4661120519816215, weights = [0.         0.7553367  0.10243508 0.14222825]
Blending, iter 2: score = -1.4661120519816215, weights = [0.         0.7553367  0.10243508 0.14222825]
No score update. Terminated

Automl preset training completed in 66.05 seconds.


In [46]:
#deviation_metric(np.array(train_df['per_square_meter_price']), oof_pred.data[:, 0])

In [49]:
output['per_square_meter_price'] = output['per_square_meter_price'] * 0.9

output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] \
    = output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] * 0.9

In [50]:
output = pd.DataFrame({'id': test_data['id'],
                       'per_square_meter_price': automl.predict(test_data).data[:, 0]})
output['per_square_meter_price'] = output['per_square_meter_price'] * 0.9

output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] \
    = output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] * 0.9

output.to_csv('raifHack_ki7.csv', index=False)

In [51]:
output.shape

(2974, 2)