In [85]:
import numpy as np
import pandas as pd
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [75]:
df = pd.read_csv('fixed_floats.csv')

In [76]:
df.sample(5)

Unnamed: 0,cntroom,floor,maxfloor,livingarea,totalarea,kitchenarea,year,ceiling,finishing,garbage,...,typeofhouse,district,width,long,pas,gruz,combined,separate,lodge,balkone
2611,1,12,16,22.7,42.3,11.7,2023.0,2.7,,Да,...,,ЮВАО,55.692019,37.754592,1,1,1,0,0,1
641,4,23,41,,99.0,,,,,,...,Монолитный,ЗАО,55.672925,37.346587,3,2,3,0,0,0
2331,1,6,8,19.9,32.17,5.0,,,,,...,Монолитный,ЮВАО,55.647493,37.713054,1,1,1,0,0,0
1904,2,1,12,48.0,60.0,7.0,2005.0,3.1,,Да,...,,САО,55.786708,37.523249,1,1,1,1,0,0
707,5,31,41,,169.0,32.5,2021.0,3.5,,,...,Монолитный,ЗАО,55.755819,37.617644,1,3,3,0,0,0


In [77]:
cat_features = ['cnt_room', 'floor', 'maxfloor', 'year', 'garage', 'heating', 'parking', 'repair', 'window', 'credit', 'deal', 'typeofhousing', 
                'typeofhouse', 'district', 'pas', 'gruz', 'combined', 'separate', 'lodge', 'balkone']

In [78]:
RANDOM_STATE = 42

In [79]:
np.random.seed(RANDOM_STATE)

In [81]:
TARGET_NAME = 'price2'

In [82]:
df[TARGET_NAME] = np.log(df[TARGET_NAME])

In [83]:
from sklearn.model_selection import train_test_split

tr_data, te_data = train_test_split(
    df, test_size=0.2, random_state=RANDOM_STATE
)
print(f'Data splitted. PArts size: tr_data = {tr_data.shape}, te_data = {te_data.shape}')
tr_data.head()

Data splitted. PArts size: tr_data = (2827, 28), te_data = (707, 28)


Unnamed: 0,cntroom,floor,maxfloor,livingarea,totalarea,kitchenarea,year,ceiling,finishing,garbage,...,typeofhouse,district,width,long,pas,gruz,combined,separate,lodge,balkone
1885,2,4,19,22.8,43.5,12.2,2018.0,,,,...,Монолитный,СЗАО,55.82937,37.451555,3,0,1,0,0,0
2325,1,9,14,19.9,37.9,7.8,1995.0,2.64,,Да,...,Панельный,ЮВАО,55.652832,37.762749,1,1,1,0,0,1
1653,2,12,31,27.4,60.1,16.5,2019.0,3.1,,,...,Монолитный,СЗАО,55.769135,37.494521,3,2,1,0,1,0
1132,1,23,44,15.0,38.0,8.9,,,,Нет,...,Монолитный,СЗАО,55.835806,37.422818,1,1,1,0,1,0
1814,2,2,16,11.9,55.4,25.4,,3.1,Предчистовая,,...,Монолитный,САО,55.791769,37.505382,2,1,1,0,1,0


In [95]:
task = Task(
    'reg',
    loss='mae',
    metric='mae'
)

In [96]:
roles = {
    'target': TARGET_NAME,
}

In [97]:
N_FOLDS = 3

In [98]:
automl = TabularAutoML(
    task=task,
    timeout=60*5,
    reader_params={'cv':N_FOLDS, 'random_state':RANDOM_STATE}
)

In [99]:
%%time
oof_preds = automl.fit_predict(tr_data, roles=roles, verbose=1)

[20:09:19] Stdout logging level is INFO.
[20:09:19] Task: reg

[20:09:19] Start automl preset with listed constraints:
[20:09:19] - time: 300.00 seconds
[20:09:19] - CPU: 4 cores
[20:09:19] - memory: 16 GB

[20:09:19] [1mTrain data shape: (2827, 28)[0m
[20:09:23] Layer [1m1[0m train process start. Time left 295.74 secs
[20:09:23] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[20:09:26] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m-0.1622695726145889[0m
[20:09:26] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[20:09:26] Time left 293.11 secs
[20:09:27] [1mSelector_LightGBM[0m fitting and predicting completed
[20:09:27] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[20:09:32] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m-0.1219029870628251[0m
[20:09:32] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[20:09:32] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_L

In [93]:
%%time
te_preds = automl.predict(te_data)
print(f'Prediction for te_data:\n{te_preds}\nShape = {te_preds.shape}')

Prediction for te_data:
array([[13.197899 ],
       [13.821918 ],
       [12.509406 ],
       [13.142314 ],
       [14.079858 ],
       [12.8593   ],
       [12.514633 ],
       [13.032272 ],
       [13.238721 ],
       [13.133209 ],
       [12.770773 ],
       [12.696545 ],
       [13.153449 ],
       [12.748725 ],
       [13.057858 ],
       [12.787284 ],
       [13.029669 ],
       [12.736449 ],
       [13.335024 ],
       [13.031741 ],
       [12.86158  ],
       [13.04916  ],
       [13.210066 ],
       [12.5259   ],
       [13.161865 ],
       [13.178996 ],
       [13.013454 ],
       [12.806147 ],
       [13.021831 ],
       [13.0893755],
       [12.795087 ],
       [13.100224 ],
       [12.89562  ],
       [13.105422 ],
       [12.835537 ],
       [13.517277 ],
       [12.737491 ],
       [12.486616 ],
       [13.166773 ],
       [12.78266  ],
       [13.035093 ],
       [13.117752 ],
       [12.998508 ],
       [13.57905  ],
       [14.284048 ],
       [12.710657 ],
       [13

In [94]:
from sklearn.metrics import mean_squared_error
print(f'OOF score: {mean_squared_error(tr_data[TARGET_NAME].values, oof_preds.data[:, 0])}')
print(f'HOLDOUT score: {mean_squared_error(te_data[TARGET_NAME].values, te_preds.data[:, 0])}')

OOF score: 0.02644581854238253
HOLDOUT score: 0.02559491704917272


array([[ 430576.78],
       [ 374213.9 ],
       [ 330590.94],
       [ 334909.9 ],
       [ 444016.56],
       [ 496534.7 ],
       [ 292914.1 ],
       [ 612886.7 ],
       [ 551022.1 ],
       [ 421292.2 ],
       [ 267950.88],
       [ 268114.75],
       [ 564869.9 ],
       [ 431416.97],
       [ 268744.06],
       [ 523961.22],
       [ 522256.47],
       [ 381963.72],
       [ 270017.72],
       [ 313404.88],
       [ 563483.5 ],
       [ 523735.94],
       [ 480149.38],
       [ 490625.3 ],
       [ 518399.94],
       [ 325762.84],
       [ 986472.44],
       [ 476167.56],
       [ 257806.06],
       [ 390708.2 ],
       [ 479147.62],
       [ 339019.06],
       [ 375358.06],
       [ 336300.34],
       [ 352097.06],
       [ 600276.3 ],
       [ 435199.7 ],
       [ 338982.44],
       [ 506254.72],
       [ 295212.12],
       [ 287010.75],
       [ 304204.34],
       [ 578133.75],
       [ 354247.25],
       [ 283762.56],
       [ 379008.16],
       [ 505113.84],
       [ 3153