In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
from lightgbm import LGBMRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error
import time
import math
from sklearn.model_selection import GridSearchCV

# Download data

In [0]:
train = pd.read_csv('./train.csv', index_col='id')
test = pd.read_csv('./test.csv', index_col='id')
train['timestamp'] = train['timestamp'].apply(pd.to_datetime)
test['timestamp'] = test['timestamp'].apply(pd.to_datetime)

In [0]:
def num_houses_with_bigger_column_value (column_name, value):
    return len(train[train[column_name] > value].index)

def num_houses_with_less_column_value (column_name, value):
    return len(train[train[column_name] < value].index)

In [0]:
def plot_feature_kde(column_name, min_value, max_value):
    data = train[column_name]
    data = data.where(data.notnull(), axis=0)
    data = data[(min_value <= data) & (data <= max_value)]
    ax = sns.kdeplot(data=data, shade=True)
    ax.set(xlabel= column_name, ylabel='density')

def plot_regplot(column_name1, column_name2, col1_min, col1_max, col2_min, col2_max, alpha=0.1):
    reg_plot_data = train[[column_name1, column_name2]]
    reg_plot_data = reg_plot_data[(col1_min <= train[column_name1]) & (train[column_name1] <= col1_max) \
                                & (col2_min <= train[column_name2]) & (train[column_name2] <= col2_max)]
    sns.regplot(x=reg_plot_data[column_name1], y=reg_plot_data[column_name2], \
              scatter_kws={'alpha':alpha})

def plot_features_jointplot(column_name1, column_name2, col1_min, col1_max, \
                            col2_min, col2_max, alpha=0.1):
    data = train.loc[:, [column_name1, column_name2]]
    data = data[data.notnull().all(axis=1)]
    data = data[(col1_min <= data[column_name1]) & (data[column_name1] <= col1_max) \
              & (col2_min <= data[column_name2]) & (data[column_name2] <= col2_max)]
    sns.jointplot(column_name1, column_name2, data=data, kind="kde", space=0, color="b", \
                scatter_kws={'alpha':alpha})

def plot_lmplot(column_name1, column_name2, hue, col1_min, col1_max, \
                            col2_min, col2_max, alpha=0.1):
    data = train.loc[:, [column_name1, column_name2, hue]]
    data=data[data.notnull().all(axis=1)]
    data = data[(col1_min <= data[column_name1]) & (data[column_name1] <= col1_max) \
              & (col2_min <= data[column_name2]) & (data[column_name2] <= col2_max)]
    sns.lmplot(x=column_name1, y=column_name2, hue=hue, data=data, scatter_kws={'alpha':alpha})

def plot_distplot(column_name):
    sns.distplot(a=train[column_name], kde=False)

# Understanding missing values

In [5]:
numeric_nan_info = train._get_numeric_data().isna().sum()
print(len(numeric_nan_info[numeric_nan_info > 0]), 'numeric columns have missing values.\n')
numeric_nan_info[numeric_nan_info > 0]

51 numeric columns have missing values.



life_sq                                   6383
floor                                      167
max_floor                                 9572
material                                  9572
build_year                               13605
num_room                                  9572
kitch_sq                                  9572
state                                    13559
preschool_quota                           6688
school_quota                              6685
hospital_beds_raion                      14441
raion_build_count_with_material_info      4991
build_count_block                         4991
build_count_wood                          4991
build_count_frame                         4991
build_count_brick                         4991
build_count_monolith                      4991
build_count_panel                         4991
build_count_foam                          4991
build_count_slag                          4991
build_count_mix                           4991
raion_build_c

In [6]:
numeric_features = train._get_numeric_data().columns
categorical_features = set(train.columns) - set(numeric_features)
categorical_features_nan_info = train[categorical_features].isna().sum()
print(len(categorical_features_nan_info[categorical_features_nan_info > 0]), \
      'categorical columns have missing values.\n')
categorical_features_nan_info[categorical_features_nan_info > 0]

0 categorical columns have missing values.



Series([], dtype: int64)

In [7]:
print(categorical_features)

{'detention_facility_raion', 'product_type', 'radiation_raion', 'big_road1_1line', 'railroad_terminal_raion', 'nuclear_reactor_raion', 'culture_objects_top_25', 'sub_area', 'thermal_power_plant_raion', 'timestamp', 'big_market_raion', 'oil_chemistry_raion', 'ecology', 'incineration_raion', 'water_1line', 'railroad_1line'}


In [0]:
categorical_features.remove('timestamp')

In [9]:
train.describe()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,area_m,raion_popul,green_zone_part,indust_part,children_preschool,preschool_quota,preschool_education_centers_raion,children_school,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,hospital_beds_raion,healthcare_centers_raion,university_top_20_raion,sport_objects_raion,additional_education_raion,culture_objects_top_25_raion,shopping_centers_raion,office_raion,full_all,male_f,female_f,young_all,young_male,young_female,work_all,work_male,work_female,ekder_all,ekder_male,ekder_female,...,cafe_sum_3000_min_price_avg,cafe_sum_3000_max_price_avg,cafe_avg_price_3000,cafe_count_3000_na_price,cafe_count_3000_price_500,cafe_count_3000_price_1000,cafe_count_3000_price_1500,cafe_count_3000_price_2500,cafe_count_3000_price_4000,cafe_count_3000_price_high,big_church_count_3000,church_count_3000,mosque_count_3000,leisure_count_3000,sport_count_3000,market_count_3000,green_part_5000,prom_part_5000,office_count_5000,office_sqm_5000,trc_count_5000,trc_sqm_5000,cafe_count_5000,cafe_sum_5000_min_price_avg,cafe_sum_5000_max_price_avg,cafe_avg_price_5000,cafe_count_5000_na_price,cafe_count_5000_price_500,cafe_count_5000_price_1000,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
count,30471.0,24088.0,30304.0,20899.0,20899.0,16866.0,20899.0,20899.0,16912.0,30471.0,30471.0,30471.0,30471.0,30471.0,23783.0,30471.0,30471.0,23786.0,30471.0,30471.0,16030.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,...,29480.0,29480.0,29480.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30293.0,30471.0,30471.0,30471.0,30471.0,30471.0,30174.0,30174.0,30174.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0
mean,54.214269,34.403271,7.670803,12.558974,1.827121,3068.057,1.909804,6.399301,2.107025,17657050.0,84056.425552,0.218922,0.118871,5140.026156,3271.272464,4.065111,5354.269699,8324.970739,4.705031,0.109678,1190.738677,1.320895,0.138295,6.63503,2.896393,0.286732,4.200978,8.253454,146306.2,67207.789603,79098.662105,11178.809491,5723.853106,5455.013948,53667.908897,27253.585803,26414.414821,19209.707164,5811.618162,13398.167438,...,765.900619,1283.334756,1024.617808,7.274622,27.782055,30.450297,26.685078,13.320108,4.646516,0.707131,6.122674,12.287979,0.199075,3.87644,20.23931,2.319944,22.769557,10.348669,71.35624,1401057.0,30.131863,1173871.0,265.52847,765.098467,1278.280635,1021.689513,17.806898,66.19553,73.442421,63.46966,32.058318,10.78386,1.771783,15.045552,30.251518,0.442421,8.648814,52.796593,5.98707,7123035.0
std,38.031487,52.285733,5.319989,6.75655,1.481154,154387.8,0.851805,28.265979,0.880148,20649610.0,57871.285899,0.17509,0.118688,3816.62514,2169.759592,2.993795,3989.640917,4289.734174,3.445105,0.333328,1057.015001,1.492903,0.443796,6.571982,3.290058,1.510491,4.741812,23.536953,283025.1,129444.557322,153630.894568,8287.957623,4275.438364,4020.546283,37483.559701,18939.154637,18643.132758,13174.721197,4073.828409,9144.326481,...,224.006719,353.299663,288.163451,18.2424,71.826492,69.528682,70.855732,41.282,16.884524,2.775499,15.413697,25.785241,0.447815,13.188748,18.843867,2.271539,11.208471,5.668748,155.341362,2303052.0,23.924051,1004810.0,514.538671,152.408951,236.207686,194.117696,33.269057,125.934584,126.167671,124.076662,73.465611,28.385679,5.418807,29.118668,47.347938,0.609269,20.580741,46.29266,4.889219,4780111.0
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2081628.0,2546.0,0.001879,0.0,175.0,0.0,0.0,168.0,1012.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2546.0,1208.0,1341.0,365.0,189.0,177.0,1633.0,863.0,771.0,548.0,156.0,393.0,...,300.0,500.0,400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.52,0.21,0.0,0.0,0.0,0.0,0.0,300.0,500.0,400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100000.0
25%,38.0,20.0,3.0,9.0,1.0,1967.0,1.0,1.0,1.0,7307411.0,21819.0,0.063755,0.019509,1706.0,1874.0,2.0,1564.0,5782.0,2.0,0.0,520.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,28179.0,13522.0,15031.0,3459.0,1782.0,1677.0,13996.0,7394.0,6661.0,4695.0,1331.0,3365.0,...,650.0,1102.27,875.7975,0.0,1.0,2.0,2.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,5.0,0.0,14.78,6.05,2.0,85159.0,6.0,262000.0,20.0,670.87,1144.23,909.38,1.0,4.0,8.0,6.0,2.0,1.0,0.0,2.0,9.0,0.0,0.0,11.0,1.0,4740002.0
50%,49.0,30.0,6.5,12.0,1.0,1979.0,2.0,6.0,2.0,10508030.0,83502.0,0.167526,0.072158,4857.0,2854.0,4.0,5261.0,7377.0,5.0,0.0,990.0,1.0,0.0,5.0,2.0,0.0,3.0,2.0,85219.0,39261.0,45729.0,10988.0,5470.0,5333.0,52030.0,26382.0,26092.0,20036.0,6180.0,13540.0,...,711.11,1211.54,961.11,3.0,9.0,14.0,10.0,3.0,1.0,0.0,2.0,6.0,0.0,0.0,18.0,2.0,19.76,8.98,15.0,432438.0,31.0,1075495.0,108.0,721.74,1211.945,966.67,8.0,28.0,36.0,24.0,8.0,2.0,0.0,7.0,16.0,0.0,2.0,48.0,5.0,6274411.0
75%,63.0,43.0,11.0,17.0,2.0,2005.0,2.0,9.0,3.0,18036440.0,122862.0,0.336177,0.195781,7103.0,4050.0,6.0,7227.0,9891.0,7.0,0.0,1786.0,2.0,0.0,10.0,4.0,0.0,6.0,5.0,125111.0,58226.0,67872.0,14906.0,7597.0,7617.0,77612.0,38841.0,37942.0,29172.0,8563.0,20165.0,...,815.63,1333.33,1083.33,6.0,22.0,26.0,17.0,6.0,2.0,0.0,5.0,10.0,0.0,2.0,29.0,4.0,31.405,14.0,53.0,1433847.0,43.0,1683836.0,222.0,816.6575,1346.09,1091.67,15.0,59.0,69.0,51.0,21.0,5.0,1.0,12.0,28.0,1.0,7.0,76.0,10.0,8300000.0
max,5326.0,7478.0,77.0,117.0,6.0,20052010.0,19.0,2014.0,33.0,206071800.0,247469.0,0.852923,0.521867,19223.0,11926.0,13.0,19083.0,24750.0,14.0,2.0,4849.0,6.0,3.0,29.0,16.0,10.0,23.0,141.0,1716730.0,774585.0,942145.0,40692.0,20977.0,19715.0,161290.0,79622.0,81668.0,57086.0,19275.0,37811.0,...,1833.33,3000.0,2416.67,119.0,449.0,441.0,446.0,266.0,113.0,23.0,102.0,164.0,2.0,85.0,100.0,10.0,75.46,28.56,789.0,12702110.0,120.0,4585477.0,2645.0,1875.0,3000.0,2437.5,174.0,650.0,648.0,641.0,377.0,147.0,30.0,151.0,250.0,2.0,106.0,218.0,21.0,111111100.0


In [10]:
heatmap_df = train.iloc[:, 1:9].copy()
heatmap_df.dropna(inplace=True)
heatmap_df.corr()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq
full_sq,1.0,0.254972,0.160772,0.208166,0.050752,-0.006065,0.713927,0.042118
life_sq,0.254972,1.0,0.034216,0.043237,0.010481,-0.002401,0.187473,0.002555
floor,0.160772,0.034216,1.0,0.535355,0.028114,0.001426,0.003288,0.00822
max_floor,0.208166,0.043237,0.535355,1.0,0.071513,-0.000283,-0.006762,0.040285
material,0.050752,0.010481,0.028114,0.071513,1.0,-0.004606,-0.040406,0.033845
build_year,-0.006065,-0.002401,0.001426,-0.000283,-0.004606,1.0,-0.008501,0.00049
num_room,0.713927,0.187473,0.003288,-0.006762,-0.040406,-0.008501,1.0,0.016216
kitch_sq,0.042118,0.002555,0.00822,0.040285,0.033845,0.00049,0.016216,1.0


## &emsp; Product type NaN filling in test data

In [11]:
test[pd.isnull(test['product_type'])]

Unnamed: 0_level_0,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,sub_area,area_m,raion_popul,green_zone_part,indust_part,children_preschool,preschool_quota,preschool_education_centers_raion,children_school,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,hospital_beds_raion,healthcare_centers_raion,university_top_20_raion,sport_objects_raion,additional_education_raion,culture_objects_top_25,culture_objects_top_25_raion,shopping_centers_raion,office_raion,thermal_power_plant_raion,incineration_raion,oil_chemistry_raion,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,...,cafe_count_3000,cafe_sum_3000_min_price_avg,cafe_sum_3000_max_price_avg,cafe_avg_price_3000,cafe_count_3000_na_price,cafe_count_3000_price_500,cafe_count_3000_price_1000,cafe_count_3000_price_1500,cafe_count_3000_price_2500,cafe_count_3000_price_4000,cafe_count_3000_price_high,big_church_count_3000,church_count_3000,mosque_count_3000,leisure_count_3000,sport_count_3000,market_count_3000,green_part_5000,prom_part_5000,office_count_5000,office_sqm_5000,trc_count_5000,trc_sqm_5000,cafe_count_5000,cafe_sum_5000_min_price_avg,cafe_sum_5000_max_price_avg,cafe_avg_price_5000,cafe_count_5000_na_price,cafe_count_5000_price_500,cafe_count_5000_price_1000,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
35136,2016-02-03,63.9,33.7,8,17,1,2013.0,2,12.9,3.0,,Nekrasovka,11391680.0,19940,0.055644,0.243205,1706,2395.0,5,1564,7377.0,5,0,540.0,0,0,0,4,no,0,0,0,no,yes,no,no,no,no,no,no,...,2,750.0,1250.0,1000.0,0,0,1,1,0,0,0,0,1,0,0,1,1,14.37,7.74,0,0,3,43437,11,709.09,1227.27,968.18,0,1,6,3,1,0,0,4,5,0,0,4,1
35155,2016-02-03,39.3,22.7,10,12,5,1968.0,2,6.8,2.0,,Novogireevo,4395333.0,94561,0.063755,0.038693,6120,2215.0,4,6533,5824.0,4,0,1015.0,2,0,7,1,no,0,5,1,no,no,no,yes,no,no,no,no,...,49,602.08,1062.5,832.29,1,13,25,7,2,1,0,4,9,0,7,25,4,24.97,10.24,15,351244,20,629676,92,670.79,1140.45,905.62,3,24,36,22,5,2,0,7,17,0,9,53,10
35227,2016-02-04,77.2,77.2,12,0,1,0.0,2,0.0,,,Poselenie Sosenskoe,66772450.0,9553,0.336177,0.072158,656,,0,629,,0,0,,0,0,1,0,no,0,0,1,no,no,no,no,no,yes,no,no,...,5,1600.0,2600.0,2100.0,0,0,1,1,1,2,0,0,4,0,0,1,0,32.9,11.06,2,91159,4,335876,25,750.0,1250.0,1000.0,1,10,6,4,2,2,0,2,12,0,0,6,3
35258,2016-02-05,35.21,13.66,4,1,1,,1,1.0,1.0,,Poselenie Vnukovskoe,25536300.0,4001,0.496315,0.007122,275,,0,264,,0,0,,0,0,0,0,no,0,1,0,no,no,no,no,no,no,no,no,...,2,650.0,1000.0,825.0,0,1,0,1,0,0,0,0,3,0,0,0,0,33.18,1.63,1,525,1,17000,6,716.67,1250.0,983.33,0,1,3,1,1,0,0,2,7,0,0,0,0
35265,2016-02-05,59.0,42.0,6,9,1,1979.0,3,5.9,3.0,,Strogino,16751120.0,155427,0.338151,0.041125,9254,4606.0,8,9515,11032.0,9,0,,1,0,6,2,no,0,10,5,no,no,no,no,no,no,no,yes,...,39,1102.7,1797.3,1450.0,2,1,11,14,8,2,1,3,6,0,0,15,1,22.2,7.51,20,671769,30,1162221,120,900.91,1495.45,1198.18,10,17,37,32,18,5,1,5,12,0,1,57,4
35274,2016-02-05,38.9,18.9,9,17,6,2015.0,1,9.2,1.0,,Poselenie Vnukovskoe,25536300.0,4001,0.496315,0.007122,275,,0,264,,0,0,,0,0,0,0,no,0,1,0,no,no,no,no,no,no,no,no,...,6,716.67,1250.0,983.33,0,1,3,1,1,0,0,1,4,0,0,0,0,39.61,3.19,1,117300,3,139300,15,793.33,1333.33,1063.33,0,3,6,4,1,1,0,2,12,0,0,7,0
35521,2016-02-11,76.0,45.4,12,17,1,1991.0,3,10.2,,,Novo-Peredelkino,8591335.0,111023,0.058543,0.025609,6354,4249.0,7,5261,9373.0,7,0,240.0,3,0,6,4,no,0,1,0,no,no,no,no,no,no,no,no,...,12,827.27,1409.09,1118.18,1,2,5,2,1,1,0,2,6,0,0,9,0,32.68,6.1,1,60000,7,137350,21,757.89,1289.47,1023.68,2,3,9,5,1,1,0,2,9,0,1,10,2
35573,2016-02-12,46.8,28.6,4,13,1,1971.0,2,6.1,3.0,,Fili Davydkovo,6982964.0,111374,0.348986,0.0,7116,3422.0,5,7425,6974.0,7,1,,2,0,6,0,no,0,5,1,no,no,no,yes,no,no,no,no,...,66,993.44,1647.54,1320.49,5,12,17,13,12,7,0,2,10,1,2,35,1,19.35,12.34,50,3444993,31,1297734,181,984.52,1625.0,1304.76,13,33,48,40,29,16,2,10,25,1,5,82,1
35585,2016-02-12,37.2,16.9,6,18,1,1972.0,1,10.4,3.0,,Obruchevskoe,6050065.0,78616,0.167526,0.093443,4215,2372.0,6,4635,6083.0,8,0,3300.0,2,1,11,1,no,0,4,5,no,no,no,yes,no,no,no,no,...,203,702.05,1182.05,942.05,8,48,74,55,16,1,1,1,13,1,4,52,2,24.33,3.27,53,1444054,47,1145385,361,751.3,1256.52,1003.91,16,79,119,102,40,4,1,4,36,1,7,81,10
35667,2016-02-15,42.3,28.2,3,9,2,1971.0,2,4.9,2.0,,Tekstil'shhiki,4808270.0,101708,0.11256,0.118537,5879,1463.0,4,6207,5580.0,7,0,1183.0,1,0,5,1,no,0,0,1,no,no,no,yes,no,no,no,no,...,43,625.0,1075.0,850.0,3,10,18,10,2,0,0,3,13,0,0,24,6,13.18,21.33,27,1211408,38,1527590,130,703.31,1185.95,944.63,9,37,41,28,12,3,0,9,25,0,4,65,13


In [12]:
train.loc[train['product_type'] == 'Investment','build_year'].median()

1976.0

In [13]:
train.loc[train['product_type'] == 'OwnerOccupier','build_year'].median()

2014.0

## &emsp; Data processing



In [0]:
my_imputer = SimpleImputer(strategy="median")
my_scaler = StandardScaler()

In [0]:
missed_features = np.array(['preschool_quota', 'school_quota',
       'hospital_beds_raion', 'raion_build_count_with_material_info',
       'build_count_block', 'build_count_wood', 'build_count_frame',
       'build_count_brick', 'build_count_monolith', 'build_count_panel',
       'build_count_foam', 'build_count_slag', 'build_count_mix',
       'raion_build_count_with_builddate_info', 'build_count_before_1920',
       'build_count_1921-1945', 'build_count_1946-1970',
       'build_count_1971-1995', 'build_count_after_1995', 'metro_min_walk',
       'metro_km_walk', 'railroad_station_walk_km',
       'railroad_station_walk_min', 'ID_railroad_station_walk',
       'cafe_sum_500_min_price_avg', 'cafe_sum_500_max_price_avg',
       'cafe_avg_price_500', 'cafe_sum_1000_min_price_avg',
       'cafe_sum_1000_max_price_avg', 'cafe_avg_price_1000',
       'cafe_sum_1500_min_price_avg', 'cafe_sum_1500_max_price_avg',
       'cafe_avg_price_1500', 'cafe_sum_2000_min_price_avg',
       'cafe_sum_2000_max_price_avg', 'cafe_avg_price_2000',
       'cafe_sum_3000_min_price_avg', 'cafe_sum_3000_max_price_avg',
       'cafe_avg_price_3000', 'prom_part_5000', 'cafe_sum_5000_min_price_avg',
       'cafe_sum_5000_max_price_avg', 'cafe_avg_price_5000'])

In [0]:
def change_life_sq (row):
    if row['life_sq'] > 100 and row['life_sq/full_sq'] > 1 / 0.67:
        return row['full_sq']
    elif row['life_sq/full_sq'] > 1 / 0.67:
        return row['life_sq'] - row['full_sq']
    elif row['full_sq'] < row['life_sq']:
        return row['full_sq']
    return row['life_sq']

def change_full_sq (row):
    if row['life_sq'] > 100 and row['life_sq/full_sq'] > 1 / 0.67:
        return row['full_sq']
    elif row['life_sq/full_sq'] > 1.3:
        return row['life_sq']
    elif row['full_sq'] < row['life_sq']:
        return row['life_sq']
    return row['full_sq']

def account_kitch_sq (row):
    if row['kitch_sq'] >= 0 and row['kitch_sq'] < row['full_sq_help']:
        return row['full_sq_help'] - row['kitch_sq']
    return row['life_sq_help']

def fill_max_floor (row):
    if not pd.isnull(row['build_year']) and row['build_year'] < 1930:
        return 2
    if not pd.isnull(row['max_floor']):
        return row['max_floor']
    if not pd.isnull(row['build_year']) and row['build_year'] > 0:
        dict_year = (row['build_year'] // 10) * 10
        if dict_year < 1930:
            return 2
        else:
            return d[dict_year]
    if not pd.isna(row['floor']):
        if row['floor'] > 16:
            return row['floor']
        if row['floor'] > 12:
            return 16
        if row['floor'] > 8:
            return 12
    return 8

def custom_pipeline(data_recieved, is_train=True, numeric_features=numeric_features):
    data = data_recieved.copy()
    
    if is_train:
        data.drop(data[data['full_sq'] > 1000].index, inplace=True)
        data.drop(data[data['build_year'] > 2018].index, inplace=True)
        data.drop(data[(data['full_sq'] == 0) & (data['life_sq'] == 0) & (data['kitch_sq'] == 0)].index, \
                  inplace=True)

    mean_division_value = 0.67
    
    data['life_sq/full_sq'] = data['life_sq'] / (data['full_sq'] + 1)
    data['life_sq/full_sq'].mask(np.isinf(data['life_sq/full_sq']), inplace=True)
    data['life_sq/full_sq'].fillna(mean_division_value, inplace=True)

    data.loc[data['life_sq'] > 200, 'life_sq'] = \
                                    data[data['life_sq'] > 200].apply(lambda x: \
                                    x['full_sq'] - x['kitch_sq'] if x['kitch_sq'] >= 0 else x['full_sq'], axis=1)

    mean_value = data['life_sq/full_sq'].mean()
    data['life_sq'] = data.apply(lambda x: x['life_sq'] if not pd.isnull(x['life_sq']) \
                                   else x['full_sq'] * mean_value, axis=1)
    
    data_help = data[['full_sq', 'life_sq', 'kitch_sq', 'life_sq/full_sq']].copy()
    data_help['life_sq_help'] = data_help.apply(change_life_sq, axis=1)
    data_help['full_sq_help'] = data_help.apply(change_full_sq, axis=1)
    data_help['life_sq_help'] = data_help.apply(account_kitch_sq, axis=1)
    data_help['life_sq_help'] = data_help.apply(lambda x: x['life_sq_help'] \
                                if not pd.isnull(x['life_sq_help']) else x['full_sq_help'] * mean_value, axis=1)
    data[['full_sq', 'life_sq']] = data_help[['full_sq_help', 'life_sq_help']]
    data.loc[data['life_sq/full_sq'] > 0.9, 'life_sq'] = \
                    data.loc[data['life_sq/full_sq'] > 0.9].apply(lambda x: x['full_sq'] * mean_value, axis=1)
    
    data.loc[:, 'life_sq'] = data.apply(lambda x: math.ceil(x['life_sq']), axis=1)
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['kitch_sq'] = data.apply(lambda x: x['full_sq'] - x['life_sq'] , axis = 1)
    data['life_sq/kitch_sq'] = (data['life_sq']) / (data['kitch_sq'] + 1)
    
    data_help = data[['kitch_sq', 'life_sq', 'full_sq', 'life_sq/kitch_sq', 'num_room']].copy()
    data_help['life_sq_help'] = data_help.apply(lambda x: x['kitch_sq'] \
                                        if x['life_sq/kitch_sq'] < x['num_room'] * 1.3 else x['life_sq'], axis=1)
    data_help['kitch_sq_help'] = data_help.apply(lambda x: x['life_sq'] \
                                        if x['life_sq/kitch_sq'] < x['num_room'] * 1.3 else x['kitch_sq'], axis=1)
    data[['life_sq', 'kitch_sq']]= data_help[['life_sq_help', 'kitch_sq_help']]
    
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['life_sq/kitch_sq'] = (data['life_sq'] + 1) / (data['kitch_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['kitch_sq/life_sq'] = 1 / data['life_sq/kitch_sq']
    
    data.rename(columns={'kitch_sq' : 'other_sq'}, inplace=True)
    
    numeric_features = [feature if feature != 'kitch_sq' else 'other_sq' for feature in numeric_features]
    
    data.loc[:, 'full_sq'] = data.apply(lambda x: round(x['full_sq']), axis=1)
    data.loc[:, 'life_sq'] = data.apply(lambda x: round(x['life_sq']), axis=1)
    data.loc[:, 'other_sq'] = data.apply(lambda x: round(x['other_sq']), axis=1)
    
    data.drop(columns=['life_sq/kitch_sq', 'kitch_sq/life_sq'], inplace=True)
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['life_sq/other_sq'] = (data['life_sq'] + 1) / (data['other_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['other_sq/life_sq'] = 1 / data['life_sq/other_sq']
    
    sample_data = data[['life_sq', 'other_sq', 'life_sq/other_sq']].copy()
    data.loc[:, 'life_sq'] = sample_data.apply(lambda x: x['life_sq'] if \
                                        x['life_sq/other_sq'] > 0.8 else x['other_sq'], axis=1)
    data.loc[:, 'other_sq'] = sample_data.apply(lambda x: x['other_sq'] if \
                                        x['life_sq/other_sq'] > 0.8 else x['life_sq'], axis=1)
    
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['life_sq/other_sq'] = (data['life_sq'] + 1) / (data['other_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['other_sq/life_sq'] = 1 / data['life_sq/other_sq'] 
    
    if is_train:
        data.drop(data[data['full_sq'] > 290].index, inplace=True)
        data.drop(1030, inplace=True)
    
    data.loc[:, 'max_floor'] = data.apply(lambda x: x['floor'] \
                                     if x['floor'] > x['max_floor'] else x['max_floor'], axis=1)
    data.loc[:, 'max_floor'] = data.apply(fill_max_floor, axis=1)
    data.loc[:, 'floor'] = data.apply(lambda x: x['max_floor'] // 2 if pd.isnull(x['floor']) \
                             else x['floor'], axis=1)
    data.loc[data['build_year'] < 1860, 'build_year'] = np.nan
    
    data.loc[:, 'num_room was missing'] = data['num_room'].isnull()
    data.loc[data['num_room'].isnull(), 'num_room'] = np.round(data.loc[data['num_room'].isnull(), \
                                                                       'life_sq'] / 23)
    data.loc[:, 'material'].fillna(7, inplace=True)
    
    for feature in missed_features:
        data[feature + ' was missing'] = data[feature].isnull()
        for area in set(data['sub_area'].values):
            if area in set(train['sub_area'].values):
                data.loc[(data['sub_area'] == area) & (pd.isnull(data[feature])), feature] = \
                train[(train['sub_area'] == area) & (~pd.isnull(train[feature]))][feature].median()

    data.loc[pd.isnull(data['product_type']), 'product_type'] = 'Investment'
                
    for column_name in data.columns:
        data[column_name + ' was missing'] = data[column_name].isnull()
    
    if is_train:
        my_imputer.fit(data.loc[:, numeric_features[: -1]])
    data.loc[:, numeric_features[: -1]] = my_imputer.transform(data.loc[:, numeric_features[: -1]])
    
    if is_train:
        my_scaler.fit(data.loc[:, numeric_features[: -1]])
    data.loc[:, numeric_features[:-1]] = my_scaler.transform(data.loc[:, numeric_features[: -1]])
    
    return data

In [0]:
new_train = custom_pipeline(train)
new_test = custom_pipeline(test, is_train=False)

In [18]:
new_train.isna().sum().sum()

0

In [19]:
new_test.isna().sum().sum()

0

In [0]:
new_train['month'] = new_train.apply(lambda x: x['timestamp'].month, axis=1)
new_train['year'] = new_train.apply(lambda x: x['timestamp'].year, axis=1)
new_test['month'] = new_test.apply(lambda x: x['timestamp'].month, axis=1)
new_test['year'] = new_test.apply(lambda x: x['timestamp'].year, axis=1)

In [21]:
numeric_features = new_train._get_numeric_data().columns
categorical_features = list(set(new_train.columns) - set(numeric_features))

categorical_features.remove('timestamp')

new_train[categorical_features].nunique()

detention_facility_raion       2
product_type                   2
big_road1_1line                2
radiation_raion                2
railroad_terminal_raion        2
nuclear_reactor_raion          2
sub_area                     146
culture_objects_top_25         2
thermal_power_plant_raion      2
big_market_raion               2
oil_chemistry_raion            2
ecology                        5
incineration_raion             2
water_1line                    2
railroad_1line                 2
dtype: int64

In [22]:
new_test[categorical_features].nunique()

detention_facility_raion       2
product_type                   2
big_road1_1line                2
radiation_raion                2
railroad_terminal_raion        2
nuclear_reactor_raion          2
sub_area                     145
culture_objects_top_25         2
thermal_power_plant_raion      2
big_market_raion               2
oil_chemistry_raion            2
ecology                        5
incineration_raion             2
water_1line                    2
railroad_1line                 2
dtype: int64

In [0]:
for column in categorical_features:
    if len(set(new_test[column]) - set(new_train[column])) > 0:
        print(column, list(set(new_test[column]) - set(new_train[column])))

In [0]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()

new_train.loc[:, 'sub_area'] = label_encoder.fit_transform(new_train['sub_area'])
new_test.loc[:, 'sub_area'] = label_encoder.transform(new_test['sub_area'])

In [0]:
label_features = ['sub_area']
one_hot_features = categorical_features.copy()
for feature in label_features:
    one_hot_features.remove(feature)

y = new_train['price_doc']
new_train.drop(columns=['price_doc'], inplace=True)

In [0]:
data_conc = pd.get_dummies(pd.concat([new_train, new_test], sort=False)[one_hot_features])

In [0]:
new_train.drop(columns=one_hot_features, inplace=True)
new_test.drop(columns=one_hot_features, inplace=True)

In [0]:
new_train_one_hot = data_conc.loc[: 30473, :]
new_test_one_hot = data_conc.loc[30474: , :]

In [0]:
new_train = pd.concat([new_train, new_train_one_hot], axis=1)
new_test = pd.concat([new_test, new_test_one_hot], axis=1)

In [0]:
numeric_features = new_train._get_numeric_data().columns
categorical_features = list(set(new_train.columns) - set (numeric_features))

In [31]:
for column in categorical_features:
    if not 'was missing' in column:
        print(column)

timestamp


In [0]:
new_train.drop(columns=['timestamp'], inplace=True)
new_test.drop(columns=['timestamp'], inplace=True)

In [33]:
set(new_train.columns) - set(new_test.columns)

{'price_doc was missing'}

In [0]:
new_train.drop(columns=['price_doc was missing'], inplace=True)

In [0]:
new_train_investment = pd.concat([new_train, y], axis=1)
new_train_investment = new_train_investment[new_train_investment['product_type_Investment'] == 1]
y_investment = new_train_investment['price_doc']
new_train_investment.drop(columns=['price_doc'], inplace=True)
new_train_owner_occupier = pd.concat([new_train, y], axis=1)
new_train_owner_occupier = new_train_owner_occupier[new_train_owner_occupier['product_type_OwnerOccupier'] == 1]
y_owner_occupier = new_train_owner_occupier['price_doc']
new_train_owner_occupier.drop(columns=['price_doc'], inplace=True)

In [0]:
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

In [0]:
grid_params = {
    'num_leaves' : [int(d) for d in np.linspace(20, 50, 5)],
    'learning_rate' : np.linspace(0.005, 0.15, 10),
    'n_estimators' : [int(d) for d in np.linspace(80, 170, 10)],
    'boosting_type' : ['gbdt', 'dart']
}

model = LGBMRegressor()
grid = GridSearchCV(model, grid_params, cv = 4, verbose=5, n_jobs=-1)

In [51]:
grid.fit(new_train_investment, y_investment, eval_metric=rmsle)

Fitting 4 folds for each of 1000 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed: 37.9min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 61.5min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 89.2min
[Parallel(n_jobs=-1)]: Done 878 tasks      | elapsed: 120.8min
[Parallel(n_jobs=-1)]: Done 1148 tasks      | elapsed: 157.1min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 197.1min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 242.1min
[Parallel(n_jobs=-1)]: Done 2174 tasks      | elapsed: 292.8min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 355.9min
[Parallel(n_jobs=-1)]: Done 3038 tasks      | elapsed: 417.3min
[Parallel(n_jobs=-1)]: Done 3524 tasks      | elapsed: 481.9min
[Parallel(n_jobs=-1)]: Done 4000 out of 4

GridSearchCV(cv=4, error_score=nan,
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective=None, random_state=None,
                                     reg_alpha=0.0, reg_lambda=0.0, silent=True...
             iid='deprecated', n_jobs=-1,
             param_grid={'boosting_type': ['gbdt', 'dart'],
                         'learning_rate': array([0.005     , 0.02111111, 0.03722222, 0.05333333, 0.06944444,
       0.08555556, 0.10166667, 0.11777778, 0.13388889, 0.15      ]),
                         'n_estimators': [80, 90, 100, 110, 120, 130, 140, 1

In [52]:
grid.best_estimator_

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05333333333333333,
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=170, n_jobs=-1, num_leaves=20,
              objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

I think threse hyperparameters are suitable not only for Investment but also for OwnerOccupier properties.
If the results would fall on OwnerOccupier houses, I'll make sure to find better hyperparameters.