In [135]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from IPython.display import display, HTML

# Load data

In [136]:
# Read the train/test tables, using the 'id' column as the index.
train = pd.read_csv('./train.csv', index_col='id')
test = pd.read_csv('./test.csv', index_col='id')
# Convert the whole column at once instead of calling pd.to_datetime per element.
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [137]:
def num_houses_with_bigger_column_value (column_name, value, data=None):
    """Count rows whose `column_name` is strictly greater than `value`.

    Parameters
    ----------
    column_name : label of the column to compare.
    value : threshold; rows with NaN in the column never match.
    data : DataFrame to inspect; defaults to the notebook-level `train` frame.

    Returns
    -------
    int : number of matching rows.
    """
    if data is None:
        data = train
    return int((data[column_name] > value).sum())

def num_houses_with_less_column_value (column_name, value, data=None):
    """Count rows whose `column_name` is strictly less than `value`.

    Parameters
    ----------
    column_name : label of the column to compare.
    value : threshold; rows with NaN in the column never match.
    data : DataFrame to inspect; defaults to the notebook-level `train` frame.

    Returns
    -------
    int : number of matching rows.
    """
    if data is None:
        data = train
    return int((data[column_name] < value).sum())

In [138]:
def plot_feature_kde(column_name, min_value, max_value):
    """Draw a shaded KDE of `train[column_name]` restricted to [min_value, max_value]."""
    values = train[column_name]
    # Comparisons against NaN are False, so missing values fall out of the range filter.
    in_range = values[(values >= min_value) & (values <= max_value)]
    axes = sns.kdeplot(data=in_range, shade=True)
    axes.set(xlabel=column_name, ylabel='density')

def plot_regplot(column_name1, column_name2, col1_min, col1_max, col2_min, col2_max, alpha=0.1):
    """Scatter plus regression line of two `train` columns, each limited to an inclusive range."""
    pair = train[[column_name1, column_name2]]
    # `between` is inclusive on both ends and False for NaN, like the explicit comparisons.
    keep = pair[column_name1].between(col1_min, col1_max) \
         & pair[column_name2].between(col2_min, col2_max)
    selected = pair[keep]
    sns.regplot(x=selected[column_name1], y=selected[column_name2],
                scatter_kws={'alpha': alpha})

def plot_features_jointplot(column_name1, column_name2, col1_min, col1_max, \
                            col2_min, col2_max, alpha=0.1):
    """Draw a KDE joint plot of two `train` columns, each limited to an inclusive range.

    Rows with a missing value in either column are discarded before filtering.
    `alpha` is kept only for signature compatibility: the original forwarded it
    as `scatter_kws`, which a KDE joint plot has no use for.
    """
    data = train.loc[:, [column_name1, column_name2]]
    data = data[data.notnull().all(axis=1)]
    data = data[(col1_min <= data[column_name1]) & (data[column_name1] <= col1_max) \
              & (col2_min <= data[column_name2]) & (data[column_name2] <= col2_max)]
    # x/y are passed by keyword: recent seaborn releases reject them positionally.
    sns.jointplot(x=column_name1, y=column_name2, data=data, kind="kde", space=0, color="b")

def plot_lmplot(column_name1, column_name2, hue, col1_min, col1_max, \
                            col2_min, col2_max, alpha=0.1):
    """Per-`hue` regression plot of two `train` columns, each limited to an inclusive range."""
    subset = train.loc[:, [column_name1, column_name2, hue]]
    # Keep only rows where all three columns are present.
    subset = subset[subset.notnull().all(axis=1)]
    within_bounds = subset[column_name1].between(col1_min, col1_max) \
                  & subset[column_name2].between(col2_min, col2_max)
    subset = subset[within_bounds]
    sns.lmplot(x=column_name1, y=column_name2, hue=hue, data=subset,
               scatter_kws={'alpha': alpha})

def plot_distplot(column_name):
    """Plot a histogram (no KDE overlay) of `train[column_name]`."""
    column_values = train[column_name]
    sns.distplot(a=column_values, kde=False)

# Understanding missing values

In [139]:
# Number of missing values per numeric column of the raw training frame.
numeric_nan_info = train._get_numeric_data().isna().sum()
# Keep only the columns that actually contain missing values.
numeric_columns_with_nan = numeric_nan_info[numeric_nan_info > 0]
print(len(numeric_columns_with_nan), 'numeric columns have missing values.\n')
numeric_columns_with_nan

51 numeric columns have missing values.



life_sq                                   6383
floor                                      167
max_floor                                 9572
material                                  9572
build_year                               13605
num_room                                  9572
kitch_sq                                  9572
state                                    13559
preschool_quota                           6688
school_quota                              6685
hospital_beds_raion                      14441
raion_build_count_with_material_info      4991
build_count_block                         4991
build_count_wood                          4991
build_count_frame                         4991
build_count_brick                         4991
build_count_monolith                      4991
build_count_panel                         4991
build_count_foam                          4991
build_count_slag                          4991
build_count_mix                           4991
raion_build_c

In [140]:
numeric_features = train._get_numeric_data().columns
# Use an ordered list rather than a set: recent pandas versions refuse a set as
# an indexer, and a list keeps the column order reproducible.
categorical_features = [column for column in train.columns if column not in numeric_features]
categorical_features_nan_info = train[categorical_features].isna().sum()
print(len(categorical_features_nan_info[categorical_features_nan_info > 0]), \
      'categorical columns have missing values.\n')
categorical_features_nan_info[categorical_features_nan_info > 0]

0 categorical columns have missing values.



Series([], dtype: int64)

In [141]:
# Summary statistics of the raw training frame (count, mean, std, quantiles, extremes).
train.describe()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,area_m,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
count,30471.0,24088.0,30304.0,20899.0,20899.0,16866.0,20899.0,20899.0,16912.0,30471.0,...,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0
mean,54.214269,34.403271,7.670803,12.558974,1.827121,3068.057,1.909804,6.399301,2.107025,17657050.0,...,32.058318,10.78386,1.771783,15.045552,30.251518,0.442421,8.648814,52.796593,5.98707,7123035.0
std,38.031487,52.285733,5.319989,6.75655,1.481154,154387.8,0.851805,28.265979,0.880148,20649610.0,...,73.465611,28.385679,5.418807,29.118668,47.347938,0.609269,20.580741,46.29266,4.889219,4780111.0
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2081628.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100000.0
25%,38.0,20.0,3.0,9.0,1.0,1967.0,1.0,1.0,1.0,7307411.0,...,2.0,1.0,0.0,2.0,9.0,0.0,0.0,11.0,1.0,4740002.0
50%,49.0,30.0,6.5,12.0,1.0,1979.0,2.0,6.0,2.0,10508030.0,...,8.0,2.0,0.0,7.0,16.0,0.0,2.0,48.0,5.0,6274411.0
75%,63.0,43.0,11.0,17.0,2.0,2005.0,2.0,9.0,3.0,18036440.0,...,21.0,5.0,1.0,12.0,28.0,1.0,7.0,76.0,10.0,8300000.0
max,5326.0,7478.0,77.0,117.0,6.0,20052010.0,19.0,2014.0,33.0,206071800.0,...,377.0,147.0,30.0,151.0,250.0,2.0,106.0,218.0,21.0,111111100.0


In [142]:
# Pairwise correlations of columns 1..8 by position, using only rows where all
# eight values are present.
heatmap_df = train.iloc[:, 1:9].dropna()
heatmap_df.corr()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq
full_sq,1.0,0.254972,0.160772,0.208166,0.050752,-0.006065,0.713927,0.042118
life_sq,0.254972,1.0,0.034216,0.043237,0.010481,-0.002401,0.187473,0.002555
floor,0.160772,0.034216,1.0,0.535355,0.028114,0.001426,0.003288,0.00822
max_floor,0.208166,0.043237,0.535355,1.0,0.071513,-0.000283,-0.006762,0.040285
material,0.050752,0.010481,0.028114,0.071513,1.0,-0.004606,-0.040406,0.033845
build_year,-0.006065,-0.002401,0.001426,-0.000283,-0.004606,1.0,-0.008501,0.00049
num_room,0.713927,0.187473,0.003288,-0.006762,-0.040406,-0.008501,1.0,0.016216
kitch_sq,0.042118,0.002555,0.00822,0.040285,0.033845,0.00049,0.016216,1.0


## &emsp; Data processing



In [143]:
# Shared median imputer: custom_pipeline fits it when is_train=True and reuses it
# to transform when is_train=False.
my_imputer = SimpleImputer(strategy="median")

In [144]:
import math

def change_life_sq (row):
    """Return a corrected living area for one row.

    `row` must provide 'life_sq', 'full_sq' and 'life_sq/full_sq'.
    When the ratio exceeds 1 / 0.67 the result is 'full_sq' for living areas
    above 100, otherwise 'life_sq' - 'full_sq'. If instead the living area is
    larger than the full area, 'full_sq' is returned. In every other case
    'life_sq' is returned unchanged.
    """
    living, total, ratio = row['life_sq'], row['full_sq'], row['life_sq/full_sq']
    if ratio > 1 / 0.67:
        return total if living > 100 else living - total
    if total < living:
        return total
    return living

def change_full_sq (row):
    """Return a corrected full area for one row.

    `row` must provide 'life_sq', 'full_sq' and 'life_sq/full_sq'.
    'full_sq' is kept when the living area is above 100 and the ratio exceeds
    1 / 0.67. Otherwise 'life_sq' is returned when the ratio exceeds 1.3 or the
    living area is larger than the full area. In every other case 'full_sq' is
    returned unchanged.
    """
    living, total, ratio = row['life_sq'], row['full_sq'], row['life_sq/full_sq']
    if living > 100 and ratio > 1 / 0.67:
        return total
    if ratio > 1.3 or total < living:
        return living
    return total

def account_kitch_sq (row):
    """Return 'full_sq_help' minus the kitchen area when that area is usable.

    The kitchen area is usable when it is non-negative and strictly smaller
    than 'full_sq_help'. Otherwise, including when it is NaN, the existing
    'life_sq_help' is returned.
    """
    kitchen = row['kitch_sq']
    total = row['full_sq_help']
    if 0 <= kitchen < total:
        return total - kitchen
    return row['life_sq_help']

def fill_max_floor (row, decade_max_floor=None):
    """Choose a value for 'max_floor' for one row.

    Rules, in order:
      1. A known build year before 1930 gives 2, even if 'max_floor' is present.
      2. A present 'max_floor' is returned unchanged.
      3. A known build year (necessarily >= 1930 at this point) is floored to
         its decade and looked up in the decade table.
      4. Otherwise the value is derived from 'floor': the floor itself above 16,
         16 above 12, 12 above 8, and 8 in every other case (including NaN).

    Parameters
    ----------
    row : mapping with 'build_year', 'max_floor' and 'floor'.
    decade_max_floor : optional mapping from decade to max floor. When omitted
        the notebook-level global `d` is used.
        NOTE(review): `d` is not defined in the visible notebook cells; it must
        exist before this function reaches rule 3 - confirm where it comes from.

    Raises
    ------
    KeyError if the decade is absent from the lookup table.
    """
    build_year = row['build_year']
    year_known = not pd.isnull(build_year)
    if year_known and build_year < 1930:
        return 2
    if not pd.isnull(row['max_floor']):
        return row['max_floor']
    if year_known:
        # build_year >= 1930 here, so the decade is >= 1930 as well; the former
        # `dict_year < 1930` branch could never run and has been removed.
        lookup = d if decade_max_floor is None else decade_max_floor
        return lookup[(build_year // 10) * 10]
    floor = row['floor']
    if not pd.isna(floor):
        if floor > 16:
            return floor
        if floor > 12:
            return 16
        if floor > 8:
            return 12
    return 8

def custom_pipeline(data_recieved, is_train=True, numeric_features=numeric_features):
    """Clean the area and floor columns, add ratio features and impute numeric NaNs.

    Parameters
    ----------
    data_recieved : DataFrame to process; it is copied, never modified.
    is_train : when True, outlier rows are dropped and the module-level
        `my_imputer` is (re)fitted; when False the already fitted imputer is
        only applied, so the training frame must be processed first.
    numeric_features : column labels to impute. The default is bound when this
        function is defined, so later rebinding of the global has no effect.
        The last entry is excluded from imputation - presumably the target
        'price_doc'; verify against the column order of the input.

    Returns
    -------
    DataFrame with 'kitch_sq' renamed to 'other_sq', extra ratio columns, one
    '<column> was missing' boolean flag per column, and imputed numeric columns.
    """
    data = data_recieved.copy()
    
    if is_train:
        # Drop implausible rows from the training data only.
        data.drop(data[data['full_sq'] > 1000].index, inplace=True)
        data.drop(data[data['build_year'] > 2018].index, inplace=True)
        data.drop(data[(data['full_sq'] == 0) & (data['life_sq'] == 0) & (data['kitch_sq'] == 0)].index, \
                  inplace=True)

    mean_division_value = 0.67
    
    # Build the ratio and clean it in one assignment. Calling mask/fillna with
    # inplace=True on `data[col]` acts on a temporary under pandas
    # Copy-on-Write and would leave the frame unchanged.
    ratio = data['life_sq'] / (data['full_sq'] + 1)
    data['life_sq/full_sq'] = ratio.mask(np.isinf(ratio)).fillna(mean_division_value)

    # Living areas above 200 are replaced by full area minus kitchen (or full area).
    data.loc[data['life_sq'] > 200, 'life_sq'] = \
                                    data[data['life_sq'] > 200].apply(lambda x: \
                                    x['full_sq'] - x['kitch_sq'] if x['kitch_sq'] >= 0 else x['full_sq'], axis=1)

    # Missing living areas are estimated from the full area and the mean ratio.
    mean_value = data['life_sq/full_sq'].mean()
    data['life_sq'] = data.apply(lambda x: x['life_sq'] if not pd.isnull(x['life_sq']) \
                                   else x['full_sq'] * mean_value, axis=1)
    
    # Reconcile full/living areas row by row with the helper functions.
    data_help = data[['full_sq', 'life_sq', 'kitch_sq', 'life_sq/full_sq']].copy()
    data_help['life_sq_help'] = data_help.apply(change_life_sq, axis=1)
    data_help['full_sq_help'] = data_help.apply(change_full_sq, axis=1)
    data_help['life_sq_help'] = data_help.apply(account_kitch_sq, axis=1)
    data_help['life_sq_help'] = data_help.apply(lambda x: x['life_sq_help'] \
                                if not pd.isnull(x['life_sq_help']) else x['full_sq_help'] * mean_value, axis=1)
    data[['full_sq', 'life_sq']] = data_help[['full_sq_help', 'life_sq_help']]
    # The ratio column still holds the pre-correction values at this point.
    data.loc[data['life_sq/full_sq'] > 0.9, 'life_sq'] = \
                    data.loc[data['life_sq/full_sq'] > 0.9].apply(lambda x: x['full_sq'] * mean_value, axis=1)
    
    data.loc[:, 'life_sq'] = data.apply(lambda x: math.ceil(x['life_sq']), axis=1)
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    # The kitchen column is overwritten with the non-living remainder of the full area.
    data['kitch_sq'] = data.apply(lambda x: x['full_sq'] - x['life_sq'] , axis = 1)
    data['life_sq/kitch_sq'] = (data['life_sq']) / (data['kitch_sq'] + 1)
    
    # Swap living and remaining areas where the living/remaining ratio is below 1.3 * num_room.
    data_help = data[['kitch_sq', 'life_sq', 'full_sq', 'life_sq/kitch_sq', 'num_room']].copy()
    data_help['life_sq_help'] = data_help.apply(lambda x: x['kitch_sq'] \
                                        if x['life_sq/kitch_sq'] < x['num_room'] * 1.3 else x['life_sq'], axis=1)
    data_help['kitch_sq_help'] = data_help.apply(lambda x: x['life_sq'] \
                                        if x['life_sq/kitch_sq'] < x['num_room'] * 1.3 else x['kitch_sq'], axis=1)
    data[['life_sq', 'kitch_sq']]= data_help[['life_sq_help', 'kitch_sq_help']]
    
    # Recompute all ratio features from the final areas.
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['life_sq/kitch_sq'] = (data['life_sq'] + 1) / (data['kitch_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['kitch_sq/life_sq'] = 1 / data['life_sq/kitch_sq']
    
    data.rename(columns={'kitch_sq' : 'other_sq'}, inplace=True)
    
    # Keep the feature list in step with the rename above.
    numeric_features = [feature if feature != 'kitch_sq' else 'other_sq' for feature in numeric_features]
    
    data.loc[:, 'full_sq'] = data.apply(lambda x: round(x['full_sq']), axis=1)
    data.loc[:, 'life_sq'] = data.apply(lambda x: round(x['life_sq']), axis=1)
    data.loc[:, 'other_sq'] = data.apply(lambda x: round(x['other_sq']), axis=1)
    
    # max_floor can never be lower than the flat's own floor.
    data.loc[:, 'max_floor'] = data.apply(lambda x: x['floor'] \
                                     if x['floor'] > x['max_floor'] else x['max_floor'], axis=1)
    data.loc[:, 'max_floor'] = data.apply(fill_max_floor, axis=1)
    data.loc[data['build_year'] < 1860, 'build_year'] = np.nan
    
    # Record which values were missing before imputation, one flag per column.
    for column_name in data.columns:
        data[column_name + ' was missing'] = data[column_name].isnull()
    if is_train:
        my_imputer.fit(data.loc[:, numeric_features[: -1]])
    data.loc[:, numeric_features[: -1]] = my_imputer.transform(data.loc[:, numeric_features[: -1]])
    
    
    return data

In [145]:
# Order matters: the training call fits the shared imputer that the test call reuses.
new_train = custom_pipeline(train)
new_test = custom_pipeline(test, is_train=False)

In [146]:
# Total number of missing cells left in the processed training frame.
new_train.isna().sum().sum()

0

In [147]:
# Total number of missing cells left in the processed test frame.
new_test.isna().sum().sum()

33

In [148]:
# Per-column missing counts (first 12 columns) among test rows that still contain a NaN.
new_test[new_test.isna().any(axis=1)].isna().sum().head(12)

timestamp        0
full_sq          0
life_sq          0
floor            0
max_floor        0
material         0
build_year       0
num_room         0
other_sq         0
state            0
product_type    33
sub_area         0
dtype: int64

In [149]:
# Mean build year of 'Investment' rows in the processed training frame.
new_train.loc[new_train['product_type'] == 'Investment','build_year'].mean()

1980.286125681374

In [150]:
# Mean build year of 'OwnerOccupier' rows in the processed training frame.
new_train.loc[new_train['product_type'] == 'OwnerOccupier','build_year'].mean()

1988.5880591597859

In [151]:
# Test rows whose product_type was missing, showing the columns up to and including 'product_type'.
new_test.loc[new_test['product_type was missing'],: 'product_type']

Unnamed: 0_level_0,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,other_sq,state,product_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
35136,2016-02-03,64.0,51.0,8.0,17.0,1.0,2013.0,2.0,13.0,3.0,
35155,2016-02-03,39.0,33.0,10.0,12.0,5.0,1968.0,2.0,6.0,2.0,
35227,2016-02-04,77.0,29.0,12.0,2.0,1.0,1981.0,2.0,48.0,2.0,
35258,2016-02-05,35.0,35.0,4.0,4.0,1.0,1981.0,1.0,0.0,1.0,
35265,2016-02-05,59.0,54.0,6.0,9.0,1.0,1979.0,3.0,5.0,3.0,
35274,2016-02-05,39.0,30.0,9.0,17.0,6.0,2015.0,1.0,9.0,1.0,
35521,2016-02-11,76.0,66.0,12.0,17.0,1.0,1991.0,3.0,10.0,2.0,
35573,2016-02-12,47.0,41.0,4.0,13.0,1.0,1971.0,2.0,6.0,3.0,
35585,2016-02-12,37.0,27.0,6.0,18.0,1.0,1972.0,1.0,10.0,3.0,
35667,2016-02-15,42.0,38.0,3.0,9.0,2.0,1971.0,2.0,4.0,2.0,


In [152]:
# Fill missing product_type from the build year: 1988 and later becomes
# 'OwnerOccupier', anything else (including NaN) becomes 'Investment'.
# Vectorised, so it also behaves when no row is selected.
missing_product_type = new_test['product_type was missing']
new_test.loc[missing_product_type, 'product_type'] = np.where(
    new_test.loc[missing_product_type, 'build_year'] >= 1988, 'OwnerOccupier', 'Investment')

In [153]:
# Combined count of missing cells in both processed frames.
new_test.isna().sum().sum() + new_train.isna().sum().sum()

0

In [154]:
# Calendar features from the transaction timestamp. The .dt accessor replaces
# a row-wise apply over the whole frame.
new_train['month'] = new_train['timestamp'].dt.month
new_train['year'] = new_train['timestamp'].dt.year
new_test['month'] = new_test['timestamp'].dt.month
new_test['year'] = new_test['timestamp'].dt.year

In [155]:
numeric_features = new_train._get_numeric_data().columns
# Build the list in frame column order, skipping 'timestamp'. A set-based list
# has a hash-dependent order, which would make the downstream dummy-column
# order differ between kernel sessions.
categorical_features = [column for column in new_train.columns
                        if column not in numeric_features and column != 'timestamp']

new_train[categorical_features].nunique()

water_1line                    2
incineration_raion             2
nuclear_reactor_raion          2
culture_objects_top_25         2
detention_facility_raion       2
oil_chemistry_raion            2
railroad_1line                 2
product_type                   2
thermal_power_plant_raion      2
big_road1_1line                2
sub_area                     146
railroad_terminal_raion        2
ecology                        5
radiation_raion                2
big_market_raion               2
dtype: int64

In [156]:
# Number of distinct values per categorical column in the processed test frame.
new_test[categorical_features].nunique()

water_1line                    2
incineration_raion             2
nuclear_reactor_raion          2
culture_objects_top_25         2
detention_facility_raion       2
oil_chemistry_raion            2
railroad_1line                 2
product_type                   2
thermal_power_plant_raion      2
big_road1_1line                2
sub_area                     145
railroad_terminal_raion        2
ecology                        5
radiation_raion                2
big_market_raion               2
dtype: int64

In [157]:
# Report categorical values that occur in the test frame but never in train.
for column in categorical_features:
    unseen_in_train = set(new_test[column]) - set(new_train[column])
    if unseen_in_train:
        print(column, list(unseen_in_train))

In [158]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()

# Assign the column directly: `df.loc[:, col] = values` writes into the existing
# object column on recent pandas, which would leave the integer codes stored
# with object dtype.
new_train['sub_area'] = label_encoder.fit_transform(new_train['sub_area'])
# The encoder is fitted on train only; transform raises on labels it has not seen.
new_test['sub_area'] = label_encoder.transform(new_test['sub_area'])

In [159]:
# Categorical columns that were label-encoded; the rest get one-hot encoded.
label_features = ['sub_area']
one_hot_features = list(categorical_features)
for label_feature in label_features:
    one_hot_features.remove(label_feature)

# Take the target out of the feature frame (removes the column in place).
y = new_train.pop('price_doc')

In [160]:
# One-hot encode train and test together so both receive the same dummy columns.
one_hot_source = pd.concat([new_train, new_test], sort=False)[one_hot_features]
data_conc = pd.get_dummies(one_hot_source)

In [161]:
# Remove the original categorical columns from both frames, in place.
# Re-running this cell raises because the columns are already gone.
new_train.drop(columns=one_hot_features, inplace=True)
new_test.drop(columns=one_hot_features, inplace=True)

In [162]:
# data_conc stacks the train rows first and the test rows after them, so split
# by position instead of relying on a hard-coded id boundary.
n_train_rows = len(new_train)
new_train_one_hot = data_conc.iloc[:n_train_rows, :]
new_test_one_hot = data_conc.iloc[n_train_rows:, :]

In [163]:
# Append the dummy columns to each frame; rows are aligned on the index.
new_train = pd.concat([new_train, new_train_one_hot], axis=1)
new_test = pd.concat([new_test, new_test_one_hot], axis=1)

In [164]:
# Recompute the numeric / non-numeric column split after encoding.
numeric_features = new_train._get_numeric_data().columns
categorical_features = list(set(new_train.columns) - set (numeric_features))

In [165]:
# List the remaining non-numeric columns, ignoring the '... was missing' flags.
for column in categorical_features:
    if 'was missing' in column:
        continue
    print(column)

timestamp


In [166]:
# Remove the raw timestamp column from both frames, in place.
new_train.drop(columns=['timestamp'], inplace=True)
new_test.drop(columns=['timestamp'], inplace=True)

In [167]:
# Columns present in train but absent from test.
set(new_train.columns) - set(new_test.columns)

{'price_doc was missing'}

In [168]:
# Remove the target's missing-flag column, which exists only in train.
new_train.drop(columns=['price_doc was missing'], inplace=True)

In [174]:
# Split features and target by product type. `y` shares new_train's index, so
# the same boolean mask selects matching rows from both.
investment_rows = new_train['product_type_Investment'] == 1
new_train_investment = new_train[investment_rows]
y_investment = y[investment_rows]

owner_occupier_rows = new_train['product_type_OwnerOccupier'] == 1
new_train_owner_occupier = new_train[owner_occupier_rows]
y_owner_occupier = y[owner_occupier_rows]

In [179]:
from lightgbm import LGBMRegressor

In [196]:
# Split new_train into 8 consecutive batches by row position. `.loc` would
# interpret the offsets as 'id' labels, giving uneven batches because ids do
# not start at 0 and some rows were dropped. The last batch takes the remainder.
n_batches = 8
batch_size = new_train.shape[0] // n_batches
batches = []
for i in range(n_batches):
    start = batch_size * i
    stop = None if i == n_batches - 1 else batch_size * (i + 1)
    batches.append(new_train.iloc[start:stop, :])

In [183]:
from sklearn.metrics import mean_squared_log_error

# Hyper-parameter grid. num_leaves and n_estimators must be integers for
# LightGBM, so the linspace values are truncated to int.
num_leaves_arr = np.linspace(20, 70, 10).astype(int)
leaning_rate_arr = np.linspace(0.005, 0.15, 20)
n_estimators_arr = np.linspace(50, 200, 15).astype(int)
# (best score, num_leaves, learning_rate, n_estimators)
best_tuple = (np.inf, 0, 0, 0)
for pos1, num_leaves in enumerate(num_leaves_arr):
    for pos2, learning_rate in enumerate(leaning_rate_arr):
        for pos3, n_estimators in enumerate(n_estimators_arr):
            results = []
            # NOTE(review): each model is fitted on ONE batch and scored on the
            # other seven, the reverse of the usual k-fold split - confirm intent.
            for i in range(8):
                reg = LGBMRegressor(num_leaves=int(num_leaves), learning_rate=learning_rate,
                                    n_estimators=int(n_estimators))
                # Train on the batch's own targets, selected by index.
                reg.fit(batches[i], y.loc[batches[i].index])
                for j in range(8):
                    if j != i:
                        # RMSLE is undefined for negative predictions, so clip at zero.
                        predictions = np.clip(reg.predict(batches[j]), 0, None)
                        results.append(np.sqrt(mean_squared_log_error(y.loc[batches[j].index], predictions)))
            cur_val_res = np.mean(results)
            if cur_val_res < best_tuple[0]:
                best_tuple = (cur_val_res, num_leaves, learning_rate, n_estimators)
            print('Phase', pos1, pos2, pos3, 'finished, best tuple is', best_tuple)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=200, n_jobs=-1, num_leaves=100, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)