In [1]:
import pandas as pd
import numpy as np
import typing
import torch

from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

import phik
from phik.report import plot_correlation_matrix
from phik import report

In [2]:
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')
train_data.shape, test_data.shape

((279792, 77), (2974, 76))

## EDA analisys

Анализ отчет pandas_profiling (! big size file) https://drive.google.com/file/d/1xQl3LvpX9J0G6gJoaBjzRcBFKZi6QZXz/view?usp=sharing

In [3]:
for col in train_data.select_dtypes(include=np.number).columns:
    train_data[col] = pd.to_numeric(train_data[col], downcast = 'unsigned')

In [4]:
train_data.describe()

Unnamed: 0,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,...,reform_count_of_houses_500,reform_house_population_1000,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,total_square,realty_type,price_type
count,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,...,279792.0,265196.0,252558.0,263084.0,249624.0,263553.0,250155.0,279792.0,279792.0,279792.0
mean,54.364078,47.76354,2.709084,40.605146,81.596171,133.285458,0.037442,0.885701,2.046467,3.748163,...,30.110661,2042.541716,644.610557,7.051233,7.360464,1967.532599,1967.98858,507.833604,54.974088,0.016058
std,4.245713,17.044625,4.202451,53.293388,105.193169,172.290136,0.391014,6.858338,14.801566,25.679859,...,27.686234,1359.884747,445.699329,3.542084,4.231369,45.807699,54.110015,1704.251771,47.856417,0.1257
min,42.651897,19.892178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.1,10.0,0.0
25%,53.2266,37.582988,0.0,7.0,16.0,28.0,0.0,0.0,0.0,0.0,...,10.0,932.0,290.0,4.591837,4.619959,1960.07,1959.890097,65.9,10.0,0.0
50%,55.67909,39.702435,1.0,22.0,46.0,77.0,0.0,0.0,0.0,0.0,...,25.0,1949.0,602.0,6.368932,6.395349,1970.890411,1971.647059,128.737034,10.0,0.0
75%,56.306976,55.957523,4.0,51.0,101.0,164.0,0.0,0.0,1.0,2.0,...,43.0,2978.0,936.0,8.698925,9.1,1983.701754,1986.95,336.0,110.0,0.0
max,69.50074,151.777,46.0,468.0,851.0,1392.0,30.0,586.0,949.0,1162.0,...,289.0,18392.0,6105.0,53.717949,221.666667,2019.0,2020.0,40000.0,110.0,1.0


In [5]:
def check_features(df):
    return pd.DataFrame({'unique_values': df.nunique(),'type': df.dtypes,'pct_missing': df.isna().sum()/len(df) * 100}).sort_values(by = 'pct_missing', ascending=False)

In [6]:
check_features(train_data).T

Unnamed: 0,floor,reform_mean_floor_count_500,reform_mean_year_building_500,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_year_building_1000,reform_house_population_1000,street,osm_city_nearest_population,region,...,osm_finance_points_in_0.005,osm_finance_points_in_0.001,osm_culture_points_in_0.01,osm_culture_points_in_0.0075,osm_culture_points_in_0.005,osm_culture_points_in_0.001,osm_crossing_points_in_0.01,osm_crossing_points_in_0.0075,osm_crossing_points_in_0.005,price_type
unique_values,206,17121,43143,2366,49017,76044,6206,28841,169,49,...,29,7,216,159,111,16,268,191,108,2
type,object,float64,float64,float64,float64,float64,float64,object,float64,object,...,uint8,uint8,uint16,uint16,uint8,uint8,uint16,uint8,uint8,uint8
pct_missing,62.9886,10.7823,10.5925,9.73366,5.97158,5.80395,5.21673,0.573998,0.0196575,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
train_data['floor'].unique()

array([nan, 3.0, 4.0, -1.0, 1.0, 14.0, 2.0, 8.0, -2.0, 6.0, 10.0, 5.0,
       16.0, 19.0, 7.0, 9.0, 58.0, 24.0, 18.0, 26.0, 17.0, 48.0, 11.0,
       -3.0, 15.0, 22.0, 60.0, 12.0, 21.0, 35.0, 28.0, 38.0, 39.0, 13.0,
       81.0, 44.0, 82.0, 25.0, 45.0, 47.0, 23.0, 37.0, 29.0, 113.0, 78.0,
       42.0, 69.0, 27.0, 46.0, 53.0, 80.0, 70.0, 76.0, 64.0, 30.0, 73.0,
       77.0, 52.0, 67.0, 65.0, 20.0, 40.0, 49.0, 75.0, 93.0, 94.0, 91.0,
       72.0, 79.0, 84.0, 92.0, 33.0, 66.0, 90.0, 31.0, 36.0, 61.0, 71.0,
       68.0, 51.0, 97.0, 43.0, 95.0, 85.0, 50.0, 0.0, 62.0, 54.0, 74.0,
       57.0, 41.0, 34.0, 59.0, 56.0, 123.0, 55.0, 83.0, '27.0', '1.0',
       '5.0', '-1.0', '67.0', '2.0', '0.0', '4.0', '6.0', '3.0', '15.0',
       '10.0', '11.0', '30.0', '12.0', '-2.0', '14.0', '36.0', '8.0',
       '50.0', '17.0', '19.0', '37.0', '68.0', '7.0', '42.0', '9.0',
       '16.0', '20.0', '53.0', '91.0', '84.0', '38.0', '21.0', '48.0',
       '22.0', '23.0', '1', '18.0', 'подвал, 1', '2', 'подвал',
  

In [8]:
#Вручную обработали столбец с этажами
for df in [train_data, test_data]:
    df.replace('1', 1, inplace=True)
    df.replace('1.0', 1, inplace=True)

train_data['floor'] = train_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)
test_data['floor'] = test_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)

In [9]:
# вручную удалила повторяющиеся признаки с разным метражом, оставив только значение "in 0.01" для osm и "1000" для reform, чтобы матрица корреляции помещалась на экране
columns_for_pearson = [
                       'per_square_meter_price', 
                       'city', 'floor',
                       'osm_amenity_points_in_0.01', 
                       'osm_building_points_in_0.01', 
                       'osm_catering_points_in_0.01', 
                       'osm_city_closest_dist', 
                       'osm_city_nearest_name', 
                       'osm_city_nearest_population',
                       'osm_crossing_closest_dist', 
                       'osm_crossing_points_in_0.01', 
                       'osm_culture_points_in_0.01',
                       'osm_healthcare_points_in_0.01', 
                       'osm_historic_points_in_0.01', 
                       'osm_hotels_points_in_0.01',
                       'osm_leisure_points_in_0.01', 
                       'osm_offices_points_in_0.01', 
                       'osm_shops_points_in_0.01', 
                       'osm_subway_closest_dist',
                       'osm_train_stop_closest_dist', 
                       'osm_train_stop_points_in_0.01', 
                       'osm_transport_stop_closest_dist',
                       'osm_transport_stop_points_in_0.01', 
                       'reform_count_of_houses_1000', 
                       'reform_house_population_1000',
                       'reform_mean_floor_count_1000', 
                       'reform_mean_year_building_1000', 
                       'region', 
                       'total_square', 
                       'street', 
                       'date', 
                       'realty_type', 
                       'price_type']

In [10]:
# Находим признаки, имеющие максимальное значение коэффициента корреляции Пирсона с целевой переменной. 
# В качестве отсечки выбрано значение 0.4 по модулю
pearson = train_data[columns_for_pearson].corr().round(2)
pearson_max_corr = (
    pearson['per_square_meter_price'].to_frame().reset_index()
    .rename(columns={'per_square_meter_price':'pearson', 'index':'feature'})
    .sort_values(by='pearson', ascending=False)
    .query('pearson > 0.4 or pearson < -0.4')
    )
pearson_max_corr

Unnamed: 0,feature,pearson
0,per_square_meter_price,1.0
6,osm_city_nearest_population,0.55
2,osm_amenity_points_in_0.01,0.48
10,osm_healthcare_points_in_0.01,0.46
4,osm_catering_points_in_0.01,0.46
13,osm_leisure_points_in_0.01,0.46
15,osm_shops_points_in_0.01,0.44
20,osm_transport_stop_points_in_0.01,0.43
8,osm_crossing_points_in_0.01,0.43
14,osm_offices_points_in_0.01,0.42


In [11]:
# fig, ax = plt.subplots(figsize=(20, 20))
# sns.heatmap(train_data[columns_for_pearson].corr().round(2), annot=True, square=True, cmap='mako')
# ax.set_title(label = 'МАТРИЦА КОРРЕЛЯЦИИ ПРИЗНАКОВ $r$', fontdict={'fontsize': 15, 'fontweight': 'bold'})
# plt.show();

Высокая взаимная корреляция (> 0.8) у следующих переменных:
- amenity и catering, healthcare, office, shop
- catering и shop
- office и shop, catering
- healthcare и catering, office, shop,
- transport и crossing

In [12]:
# убрали city и street(закодирована)

columns_for_phik = [
  'per_square_meter_price',
 'floor',
 'osm_amenity_points_in_0.01',
 'osm_building_points_in_0.01',
 'osm_catering_points_in_0.01',
 'osm_city_closest_dist',
 'osm_city_nearest_name',
 'osm_city_nearest_population',
 'osm_crossing_closest_dist',
 'osm_crossing_points_in_0.01',
 'osm_culture_points_in_0.01',
 'osm_healthcare_points_in_0.01',
 'osm_historic_points_in_0.01',
 'osm_hotels_points_in_0.01',
 'osm_leisure_points_in_0.01',
 'osm_offices_points_in_0.01',
 'osm_shops_points_in_0.01',
 'osm_subway_closest_dist',
 'osm_train_stop_closest_dist',
 'osm_train_stop_points_in_0.01',
 'osm_transport_stop_closest_dist',
 'osm_transport_stop_points_in_0.01',
 'reform_count_of_houses_1000',
 'reform_house_population_1000',
 'reform_mean_floor_count_1000',
 'reform_mean_year_building_1000',
 'region',
 'total_square',
 'date',
 'realty_type',
 'price_type']

In [13]:
# считаем корреляцию phik - она позволяет находитб взаимосвязи в том числе между категориальными переменными. 
# направление взаимосвязи не видно, только абсолютное значение. Чем ближе к единице, тем лучше

# выделяем интервальные переменные
interval_cols = ['osm_amenity_points_in_0.01', 'osm_building_points_in_0.01',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_population','osm_crossing_closest_dist', 'osm_crossing_points_in_0.001', 'osm_culture_points_in_0.01',
       'osm_healthcare_points_in_0.01', 'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.01',
       'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.01',
       'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
       'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.01',
       'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.01', 'per_square_meter_price',
       'reform_count_of_houses_1000', 'reform_house_population_1000', 
       'reform_mean_floor_count_1000', 'reform_mean_year_building_1000',
      'total_square', 'realty_type', 'price_type', 'many_floors',
      'city', 'street']

# строим матрицу корреляции
phik_overview = train_data[columns_for_phik].phik_matrix(interval_cols=interval_cols)

phik_overview.round(2)

# визуализируем с помощью тепловой карты
# fig, ax = plt.subplots(figsize=(20, 20))
# sns.heatmap(phik_overview.round(2), annot=True, square=True, cmap='mako')
# ax.set_title(label = 'МАТРИЦА КОРРЕЛЯЦИИ ПРИЗНАКОВ $\phi_K$', fontdict={'fontsize': 15, 'fontweight': 'bold'})

# plt.tight_layout()
# plt.show();

Unnamed: 0,per_square_meter_price,floor,osm_amenity_points_in_0.01,osm_building_points_in_0.01,osm_catering_points_in_0.01,osm_city_closest_dist,osm_city_nearest_name,osm_city_nearest_population,osm_crossing_closest_dist,osm_crossing_points_in_0.01,...,osm_transport_stop_points_in_0.01,reform_count_of_houses_1000,reform_house_population_1000,reform_mean_floor_count_1000,reform_mean_year_building_1000,region,total_square,date,realty_type,price_type
per_square_meter_price,1.0,0.18,0.48,0.02,0.48,0.05,0.57,0.55,0.0,0.57,...,0.46,0.16,0.19,0.23,0.03,0.52,0.02,0.11,0.18,0.05
floor,0.18,1.0,0.07,0.01,0.06,0.08,0.28,0.15,0.01,0.14,...,0.12,0.03,0.1,0.15,0.0,0.23,0.01,0.22,0.04,0.0
osm_amenity_points_in_0.01,0.48,0.07,1.0,0.02,0.96,0.07,0.56,0.49,0.0,0.82,...,0.69,0.63,0.62,0.25,0.04,0.45,0.05,0.07,0.18,0.22
osm_building_points_in_0.01,0.02,0.01,0.02,1.0,0.02,0.0,0.28,0.17,0.0,0.11,...,0.04,0.03,0.03,0.03,0.0,0.22,0.0,0.01,0.03,0.01
osm_catering_points_in_0.01,0.48,0.06,0.96,0.02,1.0,0.05,0.51,0.47,0.0,0.8,...,0.63,0.57,0.6,0.23,0.03,0.4,0.05,0.07,0.19,0.2
osm_city_closest_dist,0.05,0.08,0.07,0.0,0.05,1.0,0.8,0.11,0.73,0.22,...,0.15,0.09,0.11,0.13,0.0,0.4,0.0,0.02,0.06,0.01
osm_city_nearest_name,0.57,0.28,0.56,0.28,0.51,0.8,1.0,1.0,0.42,0.64,...,0.62,0.55,0.48,0.59,0.44,1.0,0.12,0.22,0.37,0.3
osm_city_nearest_population,0.55,0.15,0.49,0.17,0.47,0.11,1.0,1.0,0.0,0.6,...,0.49,0.2,0.24,0.31,0.05,0.97,0.07,0.14,0.14,0.07
osm_crossing_closest_dist,0.0,0.01,0.0,0.0,0.0,0.73,0.42,0.0,1.0,0.0,...,0.02,0.01,0.0,0.01,0.0,0.07,0.0,0.0,0.01,0.0
osm_crossing_points_in_0.01,0.57,0.14,0.82,0.11,0.8,0.22,0.64,0.6,0.0,1.0,...,0.77,0.67,0.59,0.41,0.17,0.55,0.18,0.17,0.31,0.09


In [14]:
# Формируем список переменных с максимальным коэффициентов корреляции phik с целевой переменной 
phik_max_corr = (
    phik_overview['per_square_meter_price'].to_frame().reset_index()
    .rename(columns={'per_square_meter_price':'phik', 'index':'feature'})
    .sort_values(by='phik', ascending=False)
    .query('phik > 0.4')
    .round(2)
    )
phik_max_corr


Unnamed: 0,feature,phik
0,per_square_meter_price,1.0
9,osm_crossing_points_in_0.01,0.57
6,osm_city_nearest_name,0.57
7,osm_city_nearest_population,0.55
26,region,0.52
2,osm_amenity_points_in_0.01,0.48
4,osm_catering_points_in_0.01,0.48
14,osm_leisure_points_in_0.01,0.47
15,osm_offices_points_in_0.01,0.47
16,osm_shops_points_in_0.01,0.47


Матрица корреляции phik показала максимальную взаимосвязь целевой переменной с пременными:
- количество пешеходных переходов в радиусе 1 км
- название ближайшего города
- население ближайшего города
- регион
- количество в радиусе 1 км точек кейтеринга, досуга, офисов, магазинов, медучреждений, остановок общественного транспорта и объектов связаннных с удобством

Также обнаружены новые взаимосвязи переменных между собой, которые не были видны на матрице корреляции Пирсона:
- этаж и price_type
- название близлежащего города и население ближайшего города, расстояние до ближайшего метро, остановки общественного транспорта, регион
- регион и расстояние до ближайшего метро

## Modelling

In [15]:
# add features
city_population = pd.read_csv('city_population.csv')
zarplaty = pd.read_csv('zarplaty.csv', sep = ';')

def city_type(row):
    if row >=1000000:
        return "1Million"
    elif  (row<1000000)&(row >200000):
        return "Medium"
    elif  (row <=200000):
        return "Small"
    
def floor_type(row):
    if ('1' in str(row))&(row!=-1):
        return 1
    else:
        return 0
    
def add_features(df):
    df['age'] = round(2021 - df['reform_mean_year_building_500'])
    df.city = df.city.apply(lambda x: x.lower())
    
    city_population_clean = city_population.groupby('settlement').agg({'population':'sum'}).reset_index()
    city_population_clean.columns = ['city', 'city_population']
    city_population_clean['city_population']
    city_population_clean.city = city_population_clean.city.apply(lambda x: x.lower())
    df = df.merge(city_population_clean, on = 'city', how='left')
    
    for col in df.select_dtypes(include=np.number).columns:
        df[col] = pd.to_numeric(df[col], downcast = 'unsigned')
    
    df['city_type'] = df['city_population'].apply(lambda x: city_type(x))
    df.loc[df.city  == 'москва', 'city_type'] = "Capital"
    df.loc[df.city  == 'санкт-Петербург', 'city_type'] = "Capital"
    
    df = df.merge(zarplaty, on = 'region', how='left')
    df['zarplata'] = pd.to_numeric(df['zarplata'], downcast = 'unsigned')
    df['floor_type'] = df['floor'].apply(lambda x: floor_type(x))
    
    return df

In [16]:
train_data = add_features(train_data)
test_data = add_features(test_data)

In [17]:
train_data.shape, test_data.shape

((279967, 82), (2974, 81))

In [18]:
def convert_to_float(x):
    try:
        return float(x)
    except:
        return np.nan
    
for data in [train_data, test_data]:
    data['floor'] = data['floor'].map(convert_to_float).astype(float)
    i = 2
    data['lat_'+str(i)] = data['lat'].round(i)
    data['lng_'+str(i)] = data['lng'].round(i)
    data['square_ll'+str(i)] = data['lat_'+str(i)].astype(str) + '_' + data['lng_'+str(i)].astype(str)

In [19]:
train_data.head()

Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,realty_type,price_type,age,city_population,city_type,zarplata,floor_type,lat_2,lng_2,square_ll2
0,пермь,0.0,COL_0,57.998207,56.292797,4,19,35,52,0,...,10,0,60.0,1048011.0,1Million,41958.0,0,58.0,56.29,58.0_56.29
1,шатура,0.0,COL_1,55.574284,39.543835,3,24,37,59,0,...,10,0,64.0,32885.0,Small,58066.0,0,55.57,39.54,55.57_39.54
2,ярославль,0.0,COL_2,57.61914,39.850525,1,30,67,128,0,...,110,0,48.0,604128.0,Medium,,0,57.62,39.85,57.62_39.85
3,новокузнецк,0.0,COL_3,53.897083,87.108604,0,0,5,21,0,...,110,0,7.0,551919.0,Medium,43429.0,0,53.9,87.11,53.9_87.11
4,москва,0.0,COL_4,55.80259,37.48711,1,23,64,153,0,...,10,0,60.0,12380691.0,Capital,100070.0,0,55.8,37.49,55.8_37.49


In [20]:
train_data = train_data.query('price_type == 1')

In [21]:
train_data.replace('1', 1, inplace=True)
train_data.replace('1.0', 1, inplace=True)
test_data.replace('1', 1, inplace=True)
test_data.replace('1.0', 1, inplace=True)

train_data['floor2'] = train_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)
test_data['floor2'] = test_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)

In [21]:
N_THREADS = 64 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
#TEST_SIZE = 0.1 # Test size for metric check
TIMEOUT = 5 * 3600 # Time in seconds for automl run USE TIMEOUT = 1700 for perfect score

np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [42]:
# train_df, test_df = train_test_split(train_data, 
#                                      test_size=TEST_SIZE, 
#                                      random_state=RANDOM_STATE)

In [22]:
THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1

def deviation_metric_vec(y_true: np.array, y_pred: np.array) -> float:
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    
    metr = deviation * 0.0 + 9
    
    metr[np.abs(deviation) <= THRESHOLD] = 0
    
    metr[deviation <= - 4 * THRESHOLD] = 9 * NEGATIVE_WEIGHT
    
    mask = (-4 * THRESHOLD < deviation) & (deviation < -THRESHOLD)
    metr[mask] = NEGATIVE_WEIGHT * ((deviation[mask] / THRESHOLD) + 1) ** 2
    
    mask = (deviation < 4 * THRESHOLD) & (deviation > THRESHOLD)
    metr[mask] = ((deviation[mask] / THRESHOLD) - 1) ** 2
    
    return metr.mean()



In [24]:
# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.dataset.roles import DatetimeRole

In [26]:
task = Task('reg', loss = 'rmsle', metric = deviation_metric_vec)

roles = {
    'target': 'per_square_meter_price',
    DatetimeRole(base_date=False, base_feats=True, seasonality=('d', 'wd')): 'date',
    'drop': 'id'
}

In [28]:
automl = TabularAutoML(task = task,
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               reader_params = {'n_jobs': N_THREADS,
                                                'cv': N_FOLDS,
                                                'random_state': RANDOM_STATE},
                      general_params = {'use_algos': [['lgb_tuned']]},
                      verbose = 3)

oof_pred = automl.fit_predict(train_data, roles = roles)

[[0;30;41mlog_lvl_1[0m] Start automl preset with listed constraints:
[[0;30;41mlog_lvl_1[0m] - time: 18000.00 seconds
[[0;30;41mlog_lvl_1[0m] - cpus: 64 cores
[[0;30;41mlog_lvl_1[0m] - memory: 16 gb

[[0;30;41mlog_lvl_1[0m] [1mTrain data shape: (4493, 85)[0m

[[0;30;42mlog_lvl_3[0m] Feats was rejected during automatic roles guess: []
[[0;30;41mlog_lvl_1[0m] [1mLayer 1[0m ...

[[0;30;41mlog_lvl_1[0m] Layer 1 train process start. Time left 17983.04 secs
[[0;30;41mlog_lvl_1[0m] Start fitting [1mSelector_LightGBM[0m ...
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 16, 'feature_fraction': 0.9, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 1, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't i

[ 2021-09-25 23:02:39,319][0m A new study created in memory with name: no-name-491fe56f-9a4a-430c-aaec-eadc3be61e52[0m


[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 108, 'feature_fraction': 0.6872700594236812, 'bagging_fraction': 0.5917173949330818, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.0023531598052637494, 'reg_lambda': 0.00010291881465670109, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 1.3145103232150122, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.162854	valid's Opt metric: 2.24574
[200]	valid's l2: 0.127591	valid's Opt metric: 1.82796
[300]	valid's l2: 0.120201	valid's Opt metric: 1.7072
[400]	valid's l2: 0.1173	valid's Opt metric: 1.6605
[500]	valid's l2: 0.115966	valid's Opt metric: 1.62863
[600]	valid's l2: 0.114828	valid's Opt metric: 1.60211
[700]	valid's l2: 0.114284	valid's Opt metric: 1.58609
[800]	valid's l2: 0.113804

[ 2021-09-25 23:02:59,682][0m Trial 0 finished with value: -1.5351299972137153 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108, 'bagging_fraction': 0.5917173949330818, 'min_sum_hessian_in_leaf': 1.3145103232150122, 'reg_alpha': 0.0023531598052637494, 'reg_lambda': 0.00010291881465670109}. Best is trial 0 with value: -1.5351299972137153.[0m


Early stopping, best iteration is:
[1336]	valid's l2: 0.11268	valid's Opt metric: 1.53513
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 218, 'feature_fraction': 0.5499874579090014, 'bagging_fraction': 0.9330880728874675, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.023585940584142682, 'reg_lambda': 1.5320059381854043e-08, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.2537815508265665, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.163648	valid's Opt metric: 2.23815
[200]	valid's l2: 0.12927	valid's Opt metric: 1.81432
[300]	valid's l2: 0.121446	valid's Opt metric: 1.68845
[400]	valid's l2: 0.118971	valid's Opt metric: 1.6411
[500]	valid's l2: 0.117818	valid's Opt metric: 1.6108
[600]	valid's l2: 0.117057	valid's Opt metric:

[ 2021-09-25 23:03:37,918][0m Trial 1 finished with value: -1.542478685458183 and parameters: {'feature_fraction': 0.5499874579090014, 'num_leaves': 218, 'bagging_fraction': 0.9330880728874675, 'min_sum_hessian_in_leaf': 0.2537815508265665, 'reg_alpha': 0.023585940584142682, 'reg_lambda': 1.5320059381854043e-08}. Best is trial 0 with value: -1.5351299972137153.[0m


Early stopping, best iteration is:
[1482]	valid's l2: 0.115379	valid's Opt metric: 1.54248
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 251, 'feature_fraction': 0.9849549260809971, 'bagging_fraction': 0.9692763545078751, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 8.509499823666633, 'reg_lambda': 0.0036085571407386235, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.0010071984838809194, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.180307	valid's Opt metric: 2.47589
[200]	valid's l2: 0.1428	valid's Opt metric: 2.13576
[300]	valid's l2: 0.131565	valid's Opt metric: 1.99568
[400]	valid's l2: 0.12744	valid's Opt metric: 1.93974
[500]	valid's l2: 0.125154	valid's Opt metric: 1.89559
[600]	valid's l2: 0.123981	valid's Opt metric:

[ 2021-09-25 23:03:45,613][0m Trial 2 finished with value: -1.818529064983803 and parameters: {'feature_fraction': 0.9849549260809971, 'num_leaves': 251, 'bagging_fraction': 0.9692763545078751, 'min_sum_hessian_in_leaf': 0.0010071984838809194, 'reg_alpha': 8.509499823666633, 'reg_lambda': 0.0036085571407386235}. Best is trial 0 with value: -1.5351299972137153.[0m


[1200]	valid's l2: 0.121744	valid's Opt metric: 1.81848
[1300]	valid's l2: 0.121744	valid's Opt metric: 1.81848
Early stopping, best iteration is:
[1186]	valid's l2: 0.121724	valid's Opt metric: 1.81853
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 251, 'feature_fraction': 0.8058265802441404, 'bagging_fraction': 0.5115312125207079, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 3.9696182670988566e-05, 'reg_lambda': 2.630213296503227e-08, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.12563152773938666, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.162896	valid's Opt metric: 2.24048
[200]	valid's l2: 0.12745	valid's Opt metric: 1.82706
[300]	valid's l2: 0.119257	valid's Opt metric: 1.69999
[400]	valid's l2: 0.116384	valid's Opt 

[ 2021-09-25 23:04:07,686][0m Trial 3 finished with value: -1.4946353519432116 and parameters: {'feature_fraction': 0.8058265802441404, 'num_leaves': 251, 'bagging_fraction': 0.5115312125207079, 'min_sum_hessian_in_leaf': 0.12563152773938666, 'reg_alpha': 3.9696182670988566e-05, 'reg_lambda': 2.630213296503227e-08}. Best is trial 3 with value: -1.4946353519432116.[0m


Early stopping, best iteration is:
[1742]	valid's l2: 0.111874	valid's Opt metric: 1.49464
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 30, 'feature_fraction': 0.9868777594207296, 'bagging_fraction': 0.728034992108518, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 6.267062696005991e-07, 'reg_lambda': 0.00042472707398058225, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 1.382623217936987, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.166819	valid's Opt metric: 2.28911
[200]	valid's l2: 0.131327	valid's Opt metric: 1.86174
[300]	valid's l2: 0.122352	valid's Opt metric: 1.73321
[400]	valid's l2: 0.119304	valid's Opt metric: 1.68693
[500]	valid's l2: 0.117897	valid's Opt metric: 1.65845
[600]	valid's l2: 0.117095	valid's Opt metri

[ 2021-09-25 23:04:17,036][0m Trial 4 finished with value: -1.5665177881588712 and parameters: {'feature_fraction': 0.9868777594207296, 'num_leaves': 30, 'bagging_fraction': 0.728034992108518, 'min_sum_hessian_in_leaf': 1.382623217936987, 'reg_alpha': 6.267062696005991e-07, 'reg_lambda': 0.00042472707398058225}. Best is trial 3 with value: -1.4946353519432116.[0m


Early stopping, best iteration is:
[1366]	valid's l2: 0.115288	valid's Opt metric: 1.56652
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 146, 'feature_fraction': 0.7962072844310213, 'bagging_fraction': 0.9299702033681603, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.00011336872639641431, 'reg_lambda': 1.316390230170444e-08, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.5262961031076743, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.161881	valid's Opt metric: 2.21538
[200]	valid's l2: 0.128273	valid's Opt metric: 1.79247
[300]	valid's l2: 0.120262	valid's Opt metric: 1.66391
[400]	valid's l2: 0.118241	valid's Opt metric: 1.60817
[500]	valid's l2: 0.117052	valid's Opt metric: 1.57729
[600]	valid's l2: 0.116632	valid's Opt me

[ 2021-09-25 23:04:52,076][0m Trial 5 finished with value: -1.5283787465826353 and parameters: {'feature_fraction': 0.7962072844310213, 'num_leaves': 146, 'bagging_fraction': 0.9299702033681603, 'min_sum_hessian_in_leaf': 0.5262961031076743, 'reg_alpha': 0.00011336872639641431, 'reg_lambda': 1.316390230170444e-08}. Best is trial 3 with value: -1.4946353519432116.[0m


Early stopping, best iteration is:
[1356]	valid's l2: 0.115361	valid's Opt metric: 1.52838
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 29, 'feature_fraction': 0.9711008778424264, 'bagging_fraction': 0.9041986740582306, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 7.569183361880229e-08, 'reg_lambda': 0.014391207615728067, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.01653693718282442, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.168223	valid's Opt metric: 2.27977
[200]	valid's l2: 0.13216	valid's Opt metric: 1.85074
[300]	valid's l2: 0.12309	valid's Opt metric: 1.7159
[400]	valid's l2: 0.120605	valid's Opt metric: 1.67095
[500]	valid's l2: 0.119477	valid's Opt metric: 1.64613
[600]	valid's l2: 0.119098	valid's Opt metric:

[ 2021-09-25 23:04:59,740][0m Trial 6 finished with value: -1.5914926961707605 and parameters: {'feature_fraction': 0.9711008778424264, 'num_leaves': 29, 'bagging_fraction': 0.9041986740582306, 'min_sum_hessian_in_leaf': 0.01653693718282442, 'reg_alpha': 7.569183361880229e-08, 'reg_lambda': 0.014391207615728067}. Best is trial 3 with value: -1.4946353519432116.[0m


Early stopping, best iteration is:
[1059]	valid's l2: 0.117775	valid's Opt metric: 1.59149
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 214, 'feature_fraction': 0.7200762468698007, 'bagging_fraction': 0.8049983288913105, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 3.6331378936352306e-07, 'reg_lambda': 3.307847415252541e-05, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 2.1516897298083326, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.161973	valid's Opt metric: 2.22572
[200]	valid's l2: 0.126939	valid's Opt metric: 1.78181
[300]	valid's l2: 0.119439	valid's Opt metric: 1.65608
[400]	valid's l2: 0.116852	valid's Opt metric: 1.602
[500]	valid's l2: 0.115699	valid's Opt metric: 1.57013
[600]	valid's l2: 0.115129	valid's Opt metr

[ 2021-09-25 23:05:29,347][0m Trial 7 finished with value: -1.509099962278922 and parameters: {'feature_fraction': 0.7200762468698007, 'num_leaves': 214, 'bagging_fraction': 0.8049983288913105, 'min_sum_hessian_in_leaf': 2.1516897298083326, 'reg_alpha': 3.6331378936352306e-07, 'reg_lambda': 3.307847415252541e-05}. Best is trial 3 with value: -1.4946353519432116.[0m


Early stopping, best iteration is:
[1388]	valid's l2: 0.113841	valid's Opt metric: 1.5091
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 147, 'feature_fraction': 0.5911180438940311, 'bagging_fraction': 0.6558555380447055, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.0008325158565947976, 'reg_lambda': 4.609885087947832e-07, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.12030178871154672, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.163384	valid's Opt metric: 2.23642
[200]	valid's l2: 0.127446	valid's Opt metric: 1.80543
[300]	valid's l2: 0.11985	valid's Opt metric: 1.67963
[400]	valid's l2: 0.11677	valid's Opt metric: 1.62401
[500]	valid's l2: 0.115218	valid's Opt metric: 1.58581
[600]	valid's l2: 0.114257	valid's Opt metri

[ 2021-09-25 23:05:54,931][0m Trial 8 finished with value: -1.4866729596336794 and parameters: {'feature_fraction': 0.5911180438940311, 'num_leaves': 147, 'bagging_fraction': 0.6558555380447055, 'min_sum_hessian_in_leaf': 0.12030178871154672, 'reg_alpha': 0.0008325158565947976, 'reg_lambda': 4.609885087947832e-07}. Best is trial 8 with value: -1.4866729596336794.[0m


Early stopping, best iteration is:
[1518]	valid's l2: 0.111628	valid's Opt metric: 1.48667
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 233, 'feature_fraction': 0.9847923138822793, 'bagging_fraction': 0.7248770666848828, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 2.1874079799487576, 'reg_lambda': 0.0351113851431067, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.03807158379249393, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.170949	valid's Opt metric: 2.36246
[200]	valid's l2: 0.132826	valid's Opt metric: 1.9346
[300]	valid's l2: 0.122571	valid's Opt metric: 1.76686
[400]	valid's l2: 0.118853	valid's Opt metric: 1.72401
[500]	valid's l2: 0.117205	valid's Opt metric: 1.70204
[600]	valid's l2: 0.116356	valid's Opt metric: 1

[ 2021-09-25 23:06:14,431][0m Trial 9 finished with value: -1.633406900843187 and parameters: {'feature_fraction': 0.9847923138822793, 'num_leaves': 233, 'bagging_fraction': 0.7248770666848828, 'min_sum_hessian_in_leaf': 0.03807158379249393, 'reg_alpha': 2.1874079799487576, 'reg_lambda': 0.0351113851431067}. Best is trial 8 with value: -1.4866729596336794.[0m


Early stopping, best iteration is:
[1543]	valid's l2: 0.115437	valid's Opt metric: 1.63341
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 142, 'feature_fraction': 0.518532892035075, 'bagging_fraction': 0.6179086794065818, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.0652009782814182, 'reg_lambda': 1.171381382375702e-06, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.0043607965511154, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.16539	valid's Opt metric: 2.27076
[200]	valid's l2: 0.129005	valid's Opt metric: 1.87455
[300]	valid's l2: 0.120646	valid's Opt metric: 1.73416
[400]	valid's l2: 0.11678	valid's Opt metric: 1.66667
[500]	valid's l2: 0.115542	valid's Opt metric: 1.6308
[600]	valid's l2: 0.114462	valid's Opt metric: 1.

[ 2021-09-25 23:06:38,329][0m Trial 10 finished with value: -1.5346650863519684 and parameters: {'feature_fraction': 0.518532892035075, 'num_leaves': 142, 'bagging_fraction': 0.6179086794065818, 'min_sum_hessian_in_leaf': 0.0043607965511154, 'reg_alpha': 0.0652009782814182, 'reg_lambda': 1.171381382375702e-06}. Best is trial 8 with value: -1.4866729596336794.[0m


Early stopping, best iteration is:
[1536]	valid's l2: 0.112108	valid's Opt metric: 1.53467
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 93, 'feature_fraction': 0.8373377891692363, 'bagging_fraction': 0.502188743610675, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.00010251807433007782, 'reg_lambda': 5.62560812725547e-07, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.07581975333807078, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.16256	valid's Opt metric: 2.25268
[200]	valid's l2: 0.126626	valid's Opt metric: 1.8255
[300]	valid's l2: 0.118325	valid's Opt metric: 1.69344
[400]	valid's l2: 0.115713	valid's Opt metric: 1.63804
[500]	valid's l2: 0.114719	valid's Opt metric: 1.61061
[600]	valid's l2: 0.113913	valid's Opt metric

[ 2021-09-25 23:07:01,861][0m Trial 11 finished with value: -1.5165997637997868 and parameters: {'feature_fraction': 0.8373377891692363, 'num_leaves': 93, 'bagging_fraction': 0.502188743610675, 'min_sum_hessian_in_leaf': 0.07581975333807078, 'reg_alpha': 0.00010251807433007782, 'reg_lambda': 5.62560812725547e-07}. Best is trial 8 with value: -1.4866729596336794.[0m


Early stopping, best iteration is:
[1498]	valid's l2: 0.11217	valid's Opt metric: 1.5166
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 180, 'feature_fraction': 0.606504606468049, 'bagging_fraction': 0.5025241330190229, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 6.9237532850440935e-06, 'reg_lambda': 5.178366122578138e-07, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.19281972985977133, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.163284	valid's Opt metric: 2.26016
[200]	valid's l2: 0.127038	valid's Opt metric: 1.84995
[300]	valid's l2: 0.118317	valid's Opt metric: 1.70864
[400]	valid's l2: 0.115731	valid's Opt metric: 1.65791
[500]	valid's l2: 0.11414	valid's Opt metric: 1.62389
[600]	valid's l2: 0.112989	valid's Opt metri

[ 2021-09-25 23:07:15,584][0m Trial 12 finished with value: -1.5673337806788625 and parameters: {'feature_fraction': 0.606504606468049, 'num_leaves': 180, 'bagging_fraction': 0.5025241330190229, 'min_sum_hessian_in_leaf': 0.19281972985977133, 'reg_alpha': 6.9237532850440935e-06, 'reg_lambda': 5.178366122578138e-07}. Best is trial 8 with value: -1.4866729596336794.[0m


Early stopping, best iteration is:
[888]	valid's l2: 0.11236	valid's Opt metric: 1.56733
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 68, 'feature_fraction': 0.8591579297143492, 'bagging_fraction': 0.6090051583827969, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 1.3869919785090633e-05, 'reg_lambda': 4.588492805253924, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 8.753495888230532, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.169748	valid's Opt metric: 2.33956
[200]	valid's l2: 0.130323	valid's Opt metric: 1.89153
[300]	valid's l2: 0.12071	valid's Opt metric: 1.72911
[400]	valid's l2: 0.117076	valid's Opt metric: 1.67228
[500]	valid's l2: 0.115534	valid's Opt metric: 1.63846
[600]	valid's l2: 0.114878	valid's Opt metric: 1.6

[ 2021-09-25 23:07:39,148][0m Trial 13 finished with value: -1.559970907215497 and parameters: {'feature_fraction': 0.8591579297143492, 'num_leaves': 68, 'bagging_fraction': 0.6090051583827969, 'min_sum_hessian_in_leaf': 8.753495888230532, 'reg_alpha': 1.3869919785090633e-05, 'reg_lambda': 4.588492805253924}. Best is trial 8 with value: -1.4866729596336794.[0m


Early stopping, best iteration is:
[1421]	valid's l2: 0.113157	valid's Opt metric: 1.55997
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 190, 'feature_fraction': 0.6285831593958296, 'bagging_fraction': 0.563275612978938, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.0031432474570082538, 'reg_lambda': 1.3126327080961932e-08, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_sum_hessian_in_leaf': 0.0109072902598878, 'objective': 'mse', 'metric': None, 'num_class': 1} 3000 200 100
Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.163636	valid's Opt metric: 2.25006
[200]	valid's l2: 0.127593	valid's Opt metric: 1.82938
[300]	valid's l2: 0.119536	valid's Opt metric: 1.69258
[400]	valid's l2: 0.116685	valid's Opt metric: 1.64316
[500]	valid's l2: 0.115595	valid's Opt metric: 1.61736
[600]	valid's l2: 0.114595	valid's Opt met

[ 2021-09-25 23:08:04,673][0m Trial 14 finished with value: -1.5305128741147611 and parameters: {'feature_fraction': 0.6285831593958296, 'num_leaves': 190, 'bagging_fraction': 0.563275612978938, 'min_sum_hessian_in_leaf': 0.0109072902598878, 'reg_alpha': 0.0031432474570082538, 'reg_lambda': 1.3126327080961932e-08}. Best is trial 8 with value: -1.4866729596336794.[0m


Early stopping, best iteration is:
[1541]	valid's l2: 0.113033	valid's Opt metric: 1.53051
[[0;30;43mlog_lvl_2[0m] Selected params by Optuna: [1m{'feature_fraction': 0.5911180438940311, 'num_leaves': 147, 'bagging_fraction': 0.6558555380447055, 'min_sum_hessian_in_leaf': 0.12030178871154672, 'reg_alpha': 0.0008325158565947976, 'reg_lambda': 4.609885087947832e-07}[0m
[[0;30;41mlog_lvl_1[0m] Start fitting [1mLvl_0_Pipe_0_Mod_0_Tuned_LightGBM[0m ...
[[0;30;43mlog_lvl_2[0m] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_Tuned_LightGBM[0m =====
[[0;30;42mlog_lvl_3[0m] {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 147, 'feature_fraction': 0.5911180438940311, 'bagging_fraction': 0.6558555380447055, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.0008325158565947976, 'reg_lambda': 4.609885087947832e-07, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 64, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'min_

In [46]:
#deviation_metric(np.array(train_df['per_square_meter_price']), oof_pred.data[:, 0])

In [49]:
#output['per_square_meter_price'] = output['per_square_meter_price'] * 0.9

#output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] \
#    = output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] * 0.9

In [29]:
output = pd.DataFrame({'id': test_data['id'],
                       'per_square_meter_price': automl.predict(test_data).data[:, 0]})
output['per_square_meter_price'] = output['per_square_meter_price'] * 0.9

output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] \
    = output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] * 0.9

output.to_csv('raifHack_ki7_upd.csv', index=False)

In [30]:
output.shape

(2974, 2)

In [31]:
output

Unnamed: 0,id,per_square_meter_price
0,COL_289284,40313.496094
1,COL_289305,38261.144531
2,COL_289318,45483.343750
3,COL_289354,84782.210938
4,COL_289399,47442.007812
...,...,...
2969,COL_455089,24740.089844
2970,COL_455212,42763.472656
2971,COL_455261,42031.343750
2972,COL_455381,41221.875000
