In [129]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error
import time
import math
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Load data

In [130]:
# Read both frames from the working directory, indexed by the 'id' column.
train = pd.read_csv('./train.csv', index_col='id')
test = pd.read_csv('./test.csv', index_col='id')
# Parse the whole column in one vectorised call instead of once per row.
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [131]:
# Names of the columns pandas treats as numeric, kept as an Index in file order.
numeric_features = train._get_numeric_data().columns
# Every other column except the timestamp, in file order. A comprehension is used
# instead of a set difference because the iteration order of a set of strings
# changes between interpreter runs, which would reshuffle the one-hot columns
# that custom_pipeline builds from this list.
_numeric_names = set(numeric_features)
categorical_features = [column for column in train.columns
                        if column not in _numeric_names and column != 'timestamp']

## &emsp; Data processing



In [132]:
# Shared transformer instances. custom_pipeline fits them when is_train=True
# and only calls transform() on them otherwise.

# Encoders for the categorical columns.
my_label_encoder = LabelEncoder()
my_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# Median imputation followed by standardisation for the numeric columns.
my_imputer = SimpleImputer(strategy="median")
my_scaler = StandardScaler()

In [133]:
# Columns handled by the per-feature loop in custom_pipeline: each one gets a
# '<name> was missing' indicator and its gaps are filled with the median of the
# same column among raw training rows that share the row's sub_area.
missed_features = np.array(['preschool_quota', 'school_quota',
       'hospital_beds_raion', 'raion_build_count_with_material_info',
       'build_count_block', 'build_count_wood', 'build_count_frame',
       'build_count_brick', 'build_count_monolith', 'build_count_panel',
       'build_count_foam', 'build_count_slag', 'build_count_mix',
       'raion_build_count_with_builddate_info', 'build_count_before_1920',
       'build_count_1921-1945', 'build_count_1946-1970',
       'build_count_1971-1995', 'build_count_after_1995', 'metro_min_walk',
       'metro_km_walk', 'railroad_station_walk_km',
       'railroad_station_walk_min', 'ID_railroad_station_walk',
       'cafe_sum_500_min_price_avg', 'cafe_sum_500_max_price_avg',
       'cafe_avg_price_500', 'cafe_sum_1000_min_price_avg',
       'cafe_sum_1000_max_price_avg', 'cafe_avg_price_1000',
       'cafe_sum_1500_min_price_avg', 'cafe_sum_1500_max_price_avg',
       'cafe_avg_price_1500', 'cafe_sum_2000_min_price_avg',
       'cafe_sum_2000_max_price_avg', 'cafe_avg_price_2000',
       'cafe_sum_3000_min_price_avg', 'cafe_sum_3000_max_price_avg',
       'cafe_avg_price_3000', 'prom_part_5000', 'cafe_sum_5000_min_price_avg',
       'cafe_sum_5000_max_price_avg', 'cafe_avg_price_5000'])

In [134]:
def change_life_sq(row):
    """Return a corrected living area for one row.

    `row` must provide 'life_sq', 'full_sq' and 'life_sq/full_sq'.

    * ratio above 1 / 0.67 and life_sq above 100 -> full_sq
    * ratio above 1 / 0.67 otherwise             -> life_sq - full_sq
    * full_sq smaller than life_sq               -> full_sq
    * anything else                              -> life_sq unchanged
    """
    living, total = row['life_sq'], row['full_sq']

    if row['life_sq/full_sq'] > 1 / 0.67:
        # Living area is far larger than the total area.
        return total if living > 100 else living - total

    # Living area may not exceed the total area.
    return total if total < living else living

def change_full_sq(row):
    """Return a corrected total area for one row.

    `row` must provide 'life_sq', 'full_sq' and 'life_sq/full_sq'.

    * life_sq above 100 and ratio above 1 / 0.67    -> full_sq unchanged
    * ratio above 1.3, or full_sq below life_sq     -> life_sq
    * anything else                                 -> full_sq unchanged
    """
    living = row['life_sq']
    total = row['full_sq']
    ratio = row['life_sq/full_sq']

    if living > 100 and ratio > 1 / 0.67:
        return total
    if ratio > 1.3 or total < living:
        # The living area is taken as the total area.
        return living
    return total

def account_kitch_sq(row):
    """Return the living area after taking the kitchen area into account.

    `row` must provide 'kitch_sq', 'full_sq_help' and 'life_sq_help'. When
    kitch_sq is non-negative and smaller than full_sq_help the result is
    full_sq_help - kitch_sq; otherwise life_sq_help is returned as is.
    """
    kitchen = row['kitch_sq']
    total = row['full_sq_help']
    if 0 <= kitchen < total:
        return total - kitchen
    return row['life_sq_help']

def fill_max_floor(row):
    """Return the number of storeys to use for one row.

    `row` must provide 'build_year', 'max_floor' and 'floor'. Rules, in order:

    1. a known build_year below 1930 gives 2, even when max_floor is present;
    2. a known max_floor is returned unchanged;
    3. a known build_year (1930 or later at this point) is looked up by decade
       in the module-level mapping `d`;
    4. otherwise the value is derived from 'floor': the floor itself above 16,
       16 above 12, 12 above 8, and 8 in every other case (including a
       missing floor).

    NOTE(review): `d` is not defined in the visible cells. It is presumably a
    decade -> max_floor mapping; confirm it is defined before this function
    is called. A decade missing from `d` raises KeyError.
    """
    build_year = row['build_year']
    year_known = not pd.isnull(build_year)

    if year_known and build_year < 1930:
        return 2
    if not pd.isnull(row['max_floor']):
        return row['max_floor']
    if year_known:
        # build_year is 1930 or later here, so the decade is too; the former
        # "decade < 1930" fallback could never run and has been removed.
        return d[(build_year // 10) * 10]

    floor = row['floor']
    if not pd.isna(floor):
        if floor > 16:
            return floor
        if floor > 12:
            return 16
        if floor > 8:
            return 12
    return 8

In [135]:
def custom_pipeline(data_recieved, is_train=True, numeric_features=numeric_features, \
                    categorical_features=categorical_features):
    """Turn a raw frame into the feature frame used for modelling.

    Steps, in order:
      1. drop outlier rows (training frame only);
      2. repair the full_sq / life_sq / kitch_sq triple, derive ratio columns
         and rename kitch_sq to other_sq;
      3. fill max_floor, floor, num_room, material, product_type and the
         columns listed in `missed_features`;
      4. add '<column> was missing' indicator columns;
      5. label-encode sub_area, median-impute and standardise the numeric
         columns, one-hot encode the remaining categorical columns;
      6. replace timestamp with 'month' and 'year' columns.

    Module-level state used: the transformers my_label_encoder, my_imputer,
    my_scaler and my_hot_encoder (fitted only when is_train is True, so the
    training frame has to be processed before any other frame), the raw
    `train` frame (source of the per-sub_area medians) and `missed_features`.

    Parameters
    ----------
    data_recieved : DataFrame
        Raw frame; it is copied and never modified.
    is_train : bool
        When True, outlier rows are dropped and the transformers are fitted.
    numeric_features : sequence of str
        Numeric column names. The last entry is left out of imputation and
        scaling (NOTE(review): presumably the target column - confirm).
    categorical_features : list of str
        Categorical column names; must contain 'sub_area'.

    Returns
    -------
    DataFrame
        The processed copy.
    """
    data = data_recieved.copy()

    if is_train:
        # Outlier rows are removed from the training frame only.
        data.drop(data[data['full_sq'] > 1000].index, inplace=True)
        data.drop(data[data['build_year'] > 2018].index, inplace=True)
        data.drop(data[(data['full_sq'] == 0) & (data['life_sq'] == 0) & (data['kitch_sq'] == 0)].index, \
                  inplace=True)

    # Fallback life/full ratio used where the ratio cannot be computed.
    mean_division_value = 0.67

    # Infinite and missing ratios become the fallback value. The result is
    # assigned back explicitly: an in-place call on `data[col]` is chained
    # assignment and is not guaranteed to reach `data`.
    life_to_full = data['life_sq'] / (data['full_sq'] + 1)
    data['life_sq/full_sq'] = life_to_full.mask(np.isinf(life_to_full)).fillna(mean_division_value)

    # Living areas above 200 are replaced by full_sq minus a valid kitch_sq.
    data.loc[data['life_sq'] > 200, 'life_sq'] = \
                                    data[data['life_sq'] > 200].apply(lambda x: \
                                    x['full_sq'] - x['kitch_sq'] if x['kitch_sq'] >= 0 else x['full_sq'], axis=1)

    # Missing living areas are estimated from full_sq and the mean ratio.
    mean_value = data['life_sq/full_sq'].mean()
    data['life_sq'] = data.apply(lambda x: x['life_sq'] if not pd.isnull(x['life_sq']) \
                                   else x['full_sq'] * mean_value, axis=1)

    # Row-wise repair of full_sq / life_sq. The order matters:
    # account_kitch_sq reads the full_sq_help column written just before it.
    data_help = data[['full_sq', 'life_sq', 'kitch_sq', 'life_sq/full_sq']].copy()
    data_help['life_sq_help'] = data_help.apply(change_life_sq, axis=1)
    data_help['full_sq_help'] = data_help.apply(change_full_sq, axis=1)
    data_help['life_sq_help'] = data_help.apply(account_kitch_sq, axis=1)
    data_help['life_sq_help'] = data_help.apply(lambda x: x['life_sq_help'] \
                                if not pd.isnull(x['life_sq_help']) else x['full_sq_help'] * mean_value, axis=1)
    data[['full_sq', 'life_sq']] = data_help[['full_sq_help', 'life_sq_help']]
    # The ratio column still holds the pre-repair values here.
    data.loc[data['life_sq/full_sq'] > 0.9, 'life_sq'] = \
                    data.loc[data['life_sq/full_sq'] > 0.9].apply(lambda x: x['full_sq'] * mean_value, axis=1)

    data.loc[:, 'life_sq'] = data.apply(lambda x: math.ceil(x['life_sq']), axis=1)
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    # From here on kitch_sq holds everything that is not living area.
    data['kitch_sq'] = data.apply(lambda x: x['full_sq'] - x['life_sq'] , axis = 1)
    data['life_sq/kitch_sq'] = (data['life_sq']) / (data['kitch_sq'] + 1)

    # Swap life_sq and kitch_sq where the living area is small relative to
    # the number of rooms.
    data_help = data[['kitch_sq', 'life_sq', 'full_sq', 'life_sq/kitch_sq', 'num_room']].copy()
    data_help['life_sq_help'] = data_help.apply(lambda x: x['kitch_sq'] \
                                        if x['life_sq/kitch_sq'] < x['num_room'] * 1.3 else x['life_sq'], axis=1)
    data_help['kitch_sq_help'] = data_help.apply(lambda x: x['life_sq'] \
                                        if x['life_sq/kitch_sq'] < x['num_room'] * 1.3 else x['kitch_sq'], axis=1)
    data[['life_sq', 'kitch_sq']]= data_help[['life_sq_help', 'kitch_sq_help']]

    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['life_sq/kitch_sq'] = (data['life_sq'] + 1) / (data['kitch_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['kitch_sq/life_sq'] = 1 / data['life_sq/kitch_sq']

    data.rename(columns={'kitch_sq' : 'other_sq'}, inplace=True)

    # Keep the numeric column list in step with the rename (local copy only).
    numeric_features = [feature if feature != 'kitch_sq' else 'other_sq' for feature in numeric_features]

    data.loc[:, 'full_sq'] = data.apply(lambda x: round(x['full_sq']), axis=1)
    data.loc[:, 'life_sq'] = data.apply(lambda x: round(x['life_sq']), axis=1)
    data.loc[:, 'other_sq'] = data.apply(lambda x: round(x['other_sq']), axis=1)

    data.drop(columns=['life_sq/kitch_sq', 'kitch_sq/life_sq'], inplace=True)
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['life_sq/other_sq'] = (data['life_sq'] + 1) / (data['other_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['other_sq/life_sq'] = 1 / data['life_sq/other_sq']

    # Swap life_sq and other_sq where their ratio is 0.8 or lower. Both
    # assignments read from the snapshot so the swap is consistent.
    sample_data = data[['life_sq', 'other_sq', 'life_sq/other_sq']].copy()
    data.loc[:, 'life_sq'] = sample_data.apply(lambda x: x['life_sq'] if \
                                        x['life_sq/other_sq'] > 0.8 else x['other_sq'], axis=1)
    data.loc[:, 'other_sq'] = sample_data.apply(lambda x: x['other_sq'] if \
                                        x['life_sq/other_sq'] > 0.8 else x['life_sq'], axis=1)

    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['life_sq/other_sq'] = (data['life_sq'] + 1) / (data['other_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['other_sq/life_sq'] = 1 / data['life_sq/other_sq']

    if is_train:
        data.drop(data[data['full_sq'] > 290].index, inplace=True)
        # Row id 1030 is removed by hand. errors='ignore' keeps this from
        # raising when an earlier filter has already removed that row.
        data.drop(index=1030, errors='ignore', inplace=True)

    # max_floor can never be lower than the flat's own floor.
    data.loc[:, 'max_floor'] = data.apply(lambda x: x['floor'] \
                                     if x['floor'] > x['max_floor'] else x['max_floor'], axis=1)
    data.loc[:, 'max_floor'] = data.apply(fill_max_floor, axis=1)
    data.loc[:, 'floor'] = data.apply(lambda x: x['max_floor'] // 2 if pd.isnull(x['floor']) \
                             else x['floor'], axis=1)
    # Build years before 1860 are treated as missing (after fill_max_floor has used them).
    data.loc[data['build_year'] < 1860, 'build_year'] = np.nan

    # Record which rows lacked num_room before estimating it as life_sq / 23.
    data.loc[:, 'num_room was missing'] = data['num_room'].isnull()
    data.loc[data['num_room'].isnull(), 'num_room'] = np.round(data.loc[data['num_room'].isnull(), \
                                                                       'life_sq'] / 23)
    # Assigned back explicitly: fillna(inplace=True) on a .loc selection is
    # not guaranteed to change `data`.
    data['material'] = data['material'].fillna(7)

    # Fill each listed feature with the median of the raw training rows that
    # share the row's sub_area; areas absent from `train` are left untouched.
    train_areas = set(train['sub_area'].values)
    data_areas = set(data['sub_area'].values)
    for feature in missed_features:
        data[feature + ' was missing'] = data[feature].isnull()
        for area in data_areas:
            if area in train_areas:
                data.loc[(data['sub_area'] == area) & (pd.isnull(data[feature])), feature] = \
                train[(train['sub_area'] == area) & (~pd.isnull(train[feature]))][feature].median()

    data.loc[pd.isnull(data['product_type']), 'product_type'] = 'Investment'

    # Add an indicator for every column that does not have one yet. Indicator
    # columns themselves are skipped, and the indicators recorded before
    # imputation (num_room and the missed_features columns) are kept rather
    # than overwritten with post-imputation values.
    indicator_suffix = ' was missing'
    for column_name in list(data.columns):
        indicator_name = column_name + indicator_suffix
        if column_name.endswith(indicator_suffix) or indicator_name in data.columns:
            continue
        data[indicator_name] = data[column_name].isnull()

    if is_train:
        my_label_encoder.fit(data['sub_area'])
    data.loc[:, 'sub_area'] = my_label_encoder.transform(data['sub_area'])

    # numeric_features[: -1] leaves the last numeric column out of imputation and scaling.
    if is_train:
        my_imputer.fit(data.loc[:, numeric_features[: -1]])
    data.loc[:, numeric_features[: -1]] = my_imputer.transform(data.loc[:, numeric_features[: -1]])

    if is_train:
        my_scaler.fit(data.loc[:, numeric_features[: -1]])
    data.loc[:, numeric_features[:-1]] = my_scaler.transform(data.loc[:, numeric_features[: -1]])

    # sub_area is label-encoded above; every other categorical column is one-hot encoded.
    label_features = ['sub_area']
    one_hot_features = categorical_features.copy()

    for feature in label_features:
        one_hot_features.remove(feature)

    if is_train:
        my_hot_encoder.fit(data.loc[:, one_hot_features])

    new_hot_features = pd.DataFrame(my_hot_encoder.transform(data.loc[:, one_hot_features]).toarray())
    for column in new_hot_features.columns:
        new_hot_features.rename(columns={column : 'One_hot_' + str(column)}, inplace=True)
    # Re-index so the encoded rows line up with `data` on assignment.
    data[new_hot_features.columns] = new_hot_features.set_index(data.index)

    data.drop(columns=one_hot_features, inplace=True)

    data['month'] = data.apply(lambda x: x['timestamp'].month, axis=1)
    data['year'] = data.apply(lambda x: x['timestamp'].year, axis=1)

    data.drop(columns=['timestamp'], inplace=True)

    return data

In [136]:
# Process the training frame first: with is_train=True (the default)
# custom_pipeline fits the shared encoders, imputer and scaler.
new_train = custom_pipeline(train)

In [138]:
# is_train=False skips the row drops and only applies the transformers that
# were fitted while processing the training frame.
new_test = custom_pipeline(test, is_train=False)

In [149]:
# Split the processed frames by product type so each segment gets its own model.
# A missing product_type counts as Investment in both frames, matching
# custom_pipeline, which fills missing values with 'Investment'.

# --- training frame ---
train_is_owner_occupier = train['product_type'] == 'OwnerOccupier'
# custom_pipeline drops some training rows, so keep only ids that survived.
investment_indexes = [index for index in train[~train_is_owner_occupier].index
                      if index in new_train.index]
owner_occupier_indexes = [index for index in train[train_is_owner_occupier].index
                          if index in new_train.index]

# Take the target first, then build the feature frame without it. drop()
# returns a new frame, which avoids mutating a slice of new_train in place.
new_train_investment = new_train.loc[investment_indexes, :]
new_train_investment_y = new_train_investment['price_doc']
new_train_investment = new_train_investment.drop(columns=['price_doc'])

new_train_owner_occupier = new_train.loc[owner_occupier_indexes, :]
new_train_owner_occupier_y = new_train_owner_occupier['price_doc']
new_train_owner_occupier = new_train_owner_occupier.drop(columns=['price_doc'])

# --- test frame --- (the index variables are reused and end up holding test ids)
owner_occupier_indexes = test[test['product_type'] == 'OwnerOccupier'].index
investment_indexes = test[test['product_type'] != 'OwnerOccupier'].index
new_test_investment = new_test.loc[investment_indexes, :]
new_test_owner_occupier = new_test.loc[owner_occupier_indexes, :]

In [150]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error as a three-item tuple.

    Returns ('RMSLE', score, False), where score is the square root of the
    mean squared difference between log1p(y_pred) and log1p(y_true).
    """
    log_gap = np.log1p(y_pred) - np.log1p(y_true)
    score = np.sqrt(np.mean(np.power(log_gap, 2)))
    return 'RMSLE', score, False

In [151]:
# Both segment models are built from the same hyperparameters; keeping them in
# one place means the two models cannot drift apart by accident.
lgbm_params = dict(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                   importance_type='split', learning_rate=0.05333333333333333, max_depth=-1,
                   min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                   n_estimators=170, n_jobs=-1, num_leaves=20, objective=None, random_state=None,
                   reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
                   subsample_for_bin=200000, subsample_freq=0)

# One independent estimator per product-type segment.
investment_model = LGBMRegressor(**lgbm_params)
owner_occupier_model = LGBMRegressor(**lgbm_params)