In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error
import time
import math
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Download data

In [2]:
train = pd.read_csv('./train.csv', index_col='id')
test = pd.read_csv('./test.csv', index_col='id')
train['timestamp'] = train['timestamp'].apply(pd.to_datetime)
test['timestamp'] = test['timestamp'].apply(pd.to_datetime)

In [3]:
numeric_features = train._get_numeric_data().columns
categorical_features = list(set(train.columns) - set(numeric_features))
categorical_features.remove('timestamp')

## &emsp; Data processing



In [4]:
my_imputer = SimpleImputer(strategy="median")
my_scaler = StandardScaler()
my_hot_encoder = OneHotEncoder(handle_unknown='ignore')
my_label_encoder = LabelEncoder()

In [5]:
missed_features = np.array(['preschool_quota', 'school_quota',
       'hospital_beds_raion', 'raion_build_count_with_material_info',
       'build_count_block', 'build_count_wood', 'build_count_frame',
       'build_count_brick', 'build_count_monolith', 'build_count_panel',
       'build_count_foam', 'build_count_slag', 'build_count_mix',
       'raion_build_count_with_builddate_info', 'build_count_before_1920',
       'build_count_1921-1945', 'build_count_1946-1970',
       'build_count_1971-1995', 'build_count_after_1995', 'metro_min_walk',
       'metro_km_walk', 'railroad_station_walk_km',
       'railroad_station_walk_min', 'ID_railroad_station_walk',
       'cafe_sum_500_min_price_avg', 'cafe_sum_500_max_price_avg',
       'cafe_avg_price_500', 'cafe_sum_1000_min_price_avg',
       'cafe_sum_1000_max_price_avg', 'cafe_avg_price_1000',
       'cafe_sum_1500_min_price_avg', 'cafe_sum_1500_max_price_avg',
       'cafe_avg_price_1500', 'cafe_sum_2000_min_price_avg',
       'cafe_sum_2000_max_price_avg', 'cafe_avg_price_2000',
       'cafe_sum_3000_min_price_avg', 'cafe_sum_3000_max_price_avg',
       'cafe_avg_price_3000', 'prom_part_5000', 'cafe_sum_5000_min_price_avg',
       'cafe_sum_5000_max_price_avg', 'cafe_avg_price_5000'])

In [6]:
def change_life_sq (row):
    if row['life_sq'] > 100 and row['life_sq/full_sq'] > 1 / 0.67:
        return row['full_sq']
    elif row['life_sq/full_sq'] > 1 / 0.67:
        return row['life_sq'] - row['full_sq']
    elif row['full_sq'] < row['life_sq']:
        return row['full_sq']
    return row['life_sq']

def change_full_sq (row):
    if row['life_sq'] > 100 and row['life_sq/full_sq'] > 1 / 0.67:
        return row['full_sq']
    elif row['life_sq/full_sq'] > 1.3:
        return row['life_sq']
    elif row['full_sq'] < row['life_sq']:
        return row['life_sq']
    return row['full_sq']

def account_kitch_sq (row):
    if row['kitch_sq'] >= 0 and row['kitch_sq'] < row['full_sq_help']:
        return row['full_sq_help'] - row['kitch_sq']
    return row['life_sq_help']
    
def fill_max_floor (row):
    if not pd.isnull(row['build_year']) and row['build_year'] < 1930:
        return 2
    if not pd.isnull(row['max_floor']):
        return row['max_floor']
    if not pd.isnull(row['build_year']) and row['build_year'] > 0:
        dict_year = (row['build_year'] // 10) * 10
        if dict_year < 1930:
            return 2
        else:
            return d[dict_year]
    if not pd.isna(row['floor']):
        if row['floor'] > 16:
            return row['floor']
        if row['floor'] > 12:
            return 16
        if row['floor'] > 8:
            return 12
    return 8

In [7]:
def custom_pipeline(data_recieved, is_train=True, numeric_features=numeric_features, \
                    categorical_features=categorical_features):
    data = data_recieved.copy()
    
    if is_train:
        data.drop(data[data['full_sq'] > 1000].index, inplace=True)
        data.drop(data[data['build_year'] > 2018].index, inplace=True)
        data.drop(data[(data['full_sq'] == 0) & (data['life_sq'] == 0) & (data['kitch_sq'] == 0)].index, \
                  inplace=True)

    mean_division_value = 0.67
    
    data['life_sq/full_sq'] = data['life_sq'] / (data['full_sq'] + 1)
    data['life_sq/full_sq'].mask(np.isinf(data['life_sq/full_sq']), inplace=True)
    data['life_sq/full_sq'].fillna(mean_division_value, inplace=True)

    data.loc[data['life_sq'] > 200, 'life_sq'] = \
                                data[data['life_sq'] > 200].apply(lambda x: \
                                x['full_sq'] - x['kitch_sq'] if x['kitch_sq'] >= 0 else x['full_sq'], axis=1)

    mean_value = data['life_sq/full_sq'].mean()
    data['life_sq'] = data.apply(lambda x: x['life_sq'] if not pd.isnull(x['life_sq']) \
                                   else x['full_sq'] * mean_value, axis=1)
    
    data_help = data[['full_sq', 'life_sq', 'kitch_sq', 'life_sq/full_sq']].copy()
    data_help['life_sq_help'] = data_help.apply(change_life_sq, axis=1)
    data_help['full_sq_help'] = data_help.apply(change_full_sq, axis=1)
    data_help['life_sq_help'] = data_help.apply(account_kitch_sq, axis=1)
    data_help['life_sq_help'] = data_help.apply(lambda x: x['life_sq_help'] \
                            if not pd.isnull(x['life_sq_help']) else x['full_sq_help'] * mean_value, axis=1)
    data[['full_sq', 'life_sq']] = data_help[['full_sq_help', 'life_sq_help']]
    data.loc[data['life_sq/full_sq'] > 0.9, 'life_sq'] = \
                    data.loc[data['life_sq/full_sq'] > 0.9].apply(lambda x: x['full_sq'] * mean_value, axis=1)
    
    data.loc[:, 'life_sq'] = data.apply(lambda x: math.ceil(x['life_sq']), axis=1)
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['kitch_sq'] = data.apply(lambda x: x['full_sq'] - x['life_sq'] , axis = 1)
    data['life_sq/kitch_sq'] = (data['life_sq']) / (data['kitch_sq'] + 1)
    
    data_help = data[['kitch_sq', 'life_sq', 'full_sq', 'life_sq/kitch_sq', 'num_room']].copy()
    data_help['life_sq_help'] = data_help.apply(lambda x: x['kitch_sq'] \
                                    if x['life_sq/kitch_sq'] < x['num_room'] * 1.3 else x['life_sq'], axis=1)
    data_help['kitch_sq_help'] = data_help.apply(lambda x: x['life_sq'] \
                                    if x['life_sq/kitch_sq'] < x['num_room'] * 1.3 else x['kitch_sq'], axis=1)
    data[['life_sq', 'kitch_sq']]= data_help[['life_sq_help', 'kitch_sq_help']]
    
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['life_sq/kitch_sq'] = (data['life_sq'] + 1) / (data['kitch_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['kitch_sq/life_sq'] = 1 / data['life_sq/kitch_sq']
    
    data.rename(columns={'kitch_sq' : 'other_sq'}, inplace=True)
    
    numeric_features = [feature if feature != 'kitch_sq' else 'other_sq' for feature in numeric_features]
    
    data.loc[:, 'full_sq'] = data.apply(lambda x: round(x['full_sq']), axis=1)
    data.loc[:, 'life_sq'] = data.apply(lambda x: round(x['life_sq']), axis=1)
    data.loc[:, 'other_sq'] = data.apply(lambda x: round(x['other_sq']), axis=1)
    
    data.drop(columns=['life_sq/kitch_sq', 'kitch_sq/life_sq'], inplace=True)
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['life_sq/other_sq'] = (data['life_sq'] + 1) / (data['other_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['other_sq/life_sq'] = 1 / data['life_sq/other_sq']
    
    sample_data = data[['life_sq', 'other_sq', 'life_sq/other_sq']].copy()
    data.loc[:, 'life_sq'] = sample_data.apply(lambda x: x['life_sq'] if \
                                        x['life_sq/other_sq'] > 0.8 else x['other_sq'], axis=1)
    data.loc[:, 'other_sq'] = sample_data.apply(lambda x: x['other_sq'] if \
                                        x['life_sq/other_sq'] > 0.8 else x['life_sq'], axis=1)
    
    data['life_sq/full_sq'] = (data['life_sq'] + 1) / (data['full_sq'] + 1)
    data['life_sq/other_sq'] = (data['life_sq'] + 1) / (data['other_sq'] + 1)
    data['full_sq/life_sq'] = 1 / data['life_sq/full_sq']
    data['other_sq/life_sq'] = 1 / data['life_sq/other_sq'] 
    
    if is_train:
        data.drop(data[data['full_sq'] > 290].index, inplace=True)
        data.drop(1030, inplace=True)
    
    data.loc[:, 'max_floor'] = data.apply(lambda x: x['floor'] \
                                     if x['floor'] > x['max_floor'] else x['max_floor'], axis=1)
    data.loc[:, 'max_floor'] = data.apply(fill_max_floor, axis=1)
    data.loc[:, 'floor'] = data.apply(lambda x: x['max_floor'] // 2 if pd.isnull(x['floor']) \
                             else x['floor'], axis=1)
    data.loc[data['build_year'] < 1860, 'build_year'] = np.nan
    
    data.loc[:, 'num_room was missing'] = data['num_room'].isnull()
    data.loc[data['num_room'].isnull(), 'num_room'] = np.round(data.loc[data['num_room'].isnull(), \
                                                                       'life_sq'] / 23)
    data.loc[:, 'material'].fillna(7, inplace=True)
    
    for feature in missed_features:
        data[feature + ' was missing'] = data[feature].isnull()
        for area in set(data['sub_area'].values):
            if area in set(train['sub_area'].values):
                data.loc[(data['sub_area'] == area) & (pd.isnull(data[feature])), feature] = \
                train[(train['sub_area'] == area) & (~pd.isnull(train[feature]))][feature].median()

    data.loc[pd.isnull(data['product_type']), 'product_type'] = 'Investment'
                
    for column_name in data.columns:
        data[column_name + ' was missing'] = data[column_name].isnull()
    
    if is_train:
        my_label_encoder.fit(data['sub_area'])
    data.loc[:, 'sub_area'] = my_label_encoder.transform(data['sub_area'])
    
    if is_train:
        my_imputer.fit(data.loc[:, numeric_features[: -1]])
    data.loc[:, numeric_features[: -1]] = my_imputer.transform(data.loc[:, numeric_features[: -1]])
    
    if is_train:
        my_scaler.fit(data.loc[:, numeric_features[: -1]])
    data.loc[:, numeric_features[:-1]] = my_scaler.transform(data.loc[:, numeric_features[: -1]])
    
    label_features = ['sub_area']
    one_hot_features = categorical_features.copy()
    
    for feature in label_features:
        one_hot_features.remove(feature)
    
    if is_train:
        my_hot_encoder.fit(data.loc[:, one_hot_features])
    
    new_hot_features = pd.DataFrame(my_hot_encoder.transform(data.loc[:, one_hot_features]).toarray())
    for column in new_hot_features.columns:
        new_hot_features.rename(columns={column : 'One_hot_' + str(column)}, inplace=True)
    data[new_hot_features.columns] = new_hot_features.set_index(data.index)
    
    data.drop(columns=one_hot_features, inplace=True)
    
    data['month'] = data.apply(lambda x: x['timestamp'].month, axis=1)
    data['year'] = data.apply(lambda x: x['timestamp'].year, axis=1)
    
    data.drop(columns=['timestamp'], inplace=True)
    
    return data

In [8]:
new_train = custom_pipeline(train)

In [9]:
new_test = custom_pipeline(test, is_train=False)

In [130]:
investment_indexes = train[train['product_type'] == 'Investment'].index
investment_indexes = [index for index in investment_indexes if index in new_train.index]
owner_occupier_indexes = train[train['product_type'] != 'Investment'].index
owner_occupier_indexes = [index for index in owner_occupier_indexes if index in new_train.index]
new_train_investment = new_train.loc[investment_indexes, :]
    new_train_investment_y = new_train_investment['price_doc']
new_train_investment.drop(columns=['price_doc'], inplace=True)
new_train_owner_occupier = new_train.loc[owner_occupier_indexes, :]
new_train_owner_occupier_y = new_train_owner_occupier['price_doc']
new_train_owner_occupier.drop(columns=['price_doc'], inplace=True)
owner_occupier_indexes = test[test['product_type'] == 'OwnerOccupier'].index
investment_indexes = test[test['product_type'] != 'OwnerOccupier'].index
new_test_investment = new_test.loc[investment_indexes, :]
new_test_owner_occupier = new_test.loc[owner_occupier_indexes, :]

In [131]:
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

In [132]:
investment_model = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                                  importance_type='split', learning_rate=0.0958080808080808,
                                  max_depth=-1, min_child_samples=20, min_child_weight=0.001,
                                  min_split_gain=0.0, n_estimators=114, n_jobs=-1, num_leaves=21,
                                  objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
                                  silent=True, subsample=1.0, subsample_for_bin=200000,
                                  subsample_freq=0)
owner_occupier_model = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                                  importance_type='split', learning_rate=0.0958080808080808,
                                  max_depth=-1, min_child_samples=20, min_child_weight=0.001,
                                  min_split_gain=0.0, n_estimators=114, n_jobs=-1, num_leaves=21,
                                  objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
                                  silent=True, subsample=1.0, subsample_for_bin=200000,
                                  subsample_freq=0)

In [133]:
def scorer_RMSLE(y_true, y_pred):
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true))** 2))

scorer = make_scorer(scorer_RMSLE, greater_is_better=False)

In [None]:
curr_feature_list = []

prev_score = np.inf

for i in range(2):
    print('Phase', i + 1, 'started:')
    print('---------------------')
    for idx, feature in enumerate(new_train_investment.columns):
        if feature in curr_feature_list:
            continue
        curr_feature_list.append(feature)
        cur_score = np.mean(cross_val_score(investment_model, new_train_investment.loc[:, curr_feature_list], \
                            new_train_investment_y, scoring=scorer, cv=5, fit_params = {'eval_metric' : rmsle}))
        if abs(cur_score) - prev_score < 0:
            prev_score = abs(cur_score)
        else:
            curr_feature_list.pop()
        if idx % 100 == 0:
            print(idx, ' features considered out of', len(new_train_investment.columns))
    print('Phase', i + 1, 'done!')
    print()

We have 2 epochs to understand which features we take because some features can heavily rely on others and we wouldn't have taken them if we had only 1 epoch.

In [135]:
curr_feature_list = ['full_sq', 'life_sq', 'floor', 'max_floor', 'material', 'build_year', 'num_room',\
                     'state', 'sub_area', 'area_m', 'raion_popul', 'green_zone_part', 'indust_part',\
                     'children_preschool', 'preschool_quota', 'school_quota', 'hospital_beds_raion',\
                     'sport_objects_raion', 'office_raion', 'full_all', 'female_f', 'young_all',\
                     'raion_build_count_with_material_info', 'build_count_block', 'build_count_panel',\
                     'build_count_foam', 'build_count_1921-1945', 'ID_metro', 'metro_min_avto',\
                     'metro_min_walk', 'kindergarten_km', 'ID_railroad_station_avto', 'mkad_km',\
                     'ttk_km', 'kremlin_km', 'big_road2_km', 'railroad_km', 'zd_vokzaly_avto_km',\
                     'ID_railroad_terminal', 'bus_terminal_avto_km', 'oil_chemistry_km', 'ts_km',\
                     'detention_facility_km', 'mosque_km', 'office_count_500', 'cafe_count_500',\
                     'cafe_count_1000_price_high', 'sport_count_1000', 'trc_count_2000', 'trc_sqm_2000',\
                     'cafe_sum_2000_max_price_avg', 'mosque_count_3000', 'market_count_5000',\
                     'life_sq/other_sq', 'other_sq/life_sq', 'One_hot_0', 'One_hot_18', 'year',\
                     'school_education_centers_top_20_raion', 'build_count_mix', 'One_hot_14']

In [136]:
len(curr_feature_list)

61

In [137]:
curr_feature_list

['full_sq',
 'life_sq',
 'floor',
 'max_floor',
 'material',
 'build_year',
 'num_room',
 'state',
 'sub_area',
 'area_m',
 'raion_popul',
 'green_zone_part',
 'indust_part',
 'children_preschool',
 'preschool_quota',
 'school_quota',
 'hospital_beds_raion',
 'sport_objects_raion',
 'office_raion',
 'full_all',
 'female_f',
 'young_all',
 'raion_build_count_with_material_info',
 'build_count_block',
 'build_count_panel',
 'build_count_foam',
 'build_count_1921-1945',
 'ID_metro',
 'metro_min_avto',
 'metro_min_walk',
 'kindergarten_km',
 'ID_railroad_station_avto',
 'mkad_km',
 'ttk_km',
 'kremlin_km',
 'big_road2_km',
 'railroad_km',
 'zd_vokzaly_avto_km',
 'ID_railroad_terminal',
 'bus_terminal_avto_km',
 'oil_chemistry_km',
 'ts_km',
 'detention_facility_km',
 'mosque_km',
 'office_count_500',
 'cafe_count_500',
 'cafe_count_1000_price_high',
 'sport_count_1000',
 'trc_count_2000',
 'trc_sqm_2000',
 'cafe_sum_2000_max_price_avg',
 'mosque_count_3000',
 'market_count_5000',
 'life_sq

We should remove year because we'll adjust prices later on and now the task is to evaluate the house as asset without looking at the date of the row

In [138]:
curr_feature_list.remove('year')

Intuitively all the important features were taken, let's take a look at some of the other important features which were not taken. (from data dictionary and overall housing market knowledge we can say that the most important features are among 50 first, other features are mostely less important)

In [139]:
set(new_train_investment.iloc[:, :50].columns) - set(curr_feature_list)

{'0_17_all',
 '0_17_female',
 '0_17_male',
 '0_6_all',
 '0_6_female',
 '0_6_male',
 '7_14_all',
 '7_14_female',
 '7_14_male',
 'additional_education_raion',
 'children_school',
 'culture_objects_top_25_raion',
 'ekder_all',
 'ekder_female',
 'ekder_male',
 'healthcare_centers_raion',
 'male_f',
 'other_sq',
 'preschool_education_centers_raion',
 'school_education_centers_raion',
 'shopping_centers_raion',
 'university_top_20_raion',
 'work_all',
 'work_female',
 'work_male',
 'young_female',
 'young_male'}

It's strange that other_sq, university_top_20_raion, shopping_centers_raion, additional_education_raion,
children_school, culture_objects_top_25_raion and healthcare_centers_raion were not taken! I'll add them manually

In [140]:
curr_feature_list.append('other_sq')
curr_feature_list.append('university_top_20_raion')
curr_feature_list.append('shopping_centers_raion')
curr_feature_list.append('additional_education_raion')
curr_feature_list.append('children_school')
curr_feature_list.append('culture_objects_top_25_raion')
curr_feature_list.append('healthcare_centers_raion')

Personally I want to add One_hot_features because the won't really affect learning time but can benefit our model because these features seem important judging by data dictionary

In [141]:
for feature in new_train_investment.columns:
    if 'One_hot' in feature and feature not in curr_feature_list:
        curr_feature_list.append(feature)

In [142]:
len(curr_feature_list)

95

Let's examine remaining features

In [143]:
set([a for a in new_train_investment.columns if 'was missing' not in a]) - set(curr_feature_list)

{'0_13_all',
 '0_13_female',
 '0_13_male',
 '0_17_all',
 '0_17_female',
 '0_17_male',
 '0_6_all',
 '0_6_female',
 '0_6_male',
 '16_29_all',
 '16_29_female',
 '16_29_male',
 '7_14_all',
 '7_14_female',
 '7_14_male',
 'ID_big_road1',
 'ID_big_road2',
 'ID_bus_terminal',
 'ID_railroad_station_walk',
 'additional_education_km',
 'basketball_km',
 'big_church_count_1000',
 'big_church_count_1500',
 'big_church_count_2000',
 'big_church_count_3000',
 'big_church_count_500',
 'big_church_count_5000',
 'big_church_km',
 'big_market_km',
 'big_road1_km',
 'build_count_1946-1970',
 'build_count_1971-1995',
 'build_count_after_1995',
 'build_count_before_1920',
 'build_count_brick',
 'build_count_frame',
 'build_count_monolith',
 'build_count_slag',
 'build_count_wood',
 'bulvar_ring_km',
 'cafe_avg_price_1000',
 'cafe_avg_price_1500',
 'cafe_avg_price_2000',
 'cafe_avg_price_3000',
 'cafe_avg_price_500',
 'cafe_avg_price_5000',
 'cafe_count_1000',
 'cafe_count_1000_na_price',
 'cafe_count_1000_p

These are the features I either found helpful or which complement features taken.

In [144]:
curr_feature_list.append('additional_education_km')
curr_feature_list.append('big_church_km')
curr_feature_list.append('big_market_km')
curr_feature_list.append('big_road1_km')
curr_feature_list.append('bulvar_ring_km')
curr_feature_list.append('church_synagogue_km')
curr_feature_list.append('fitness_km')
curr_feature_list.append('full_sq/life_sq')
curr_feature_list.append('green_zone_km')
curr_feature_list.append('ice_rink_km')
curr_feature_list.append('park_km')
curr_feature_list.append('public_healthcare_km')
curr_feature_list.append('public_transport_station_min_walk')
curr_feature_list.append('railroad_station_walk_min')
curr_feature_list.append('sadovoe_km')
curr_feature_list.append('school_km')
curr_feature_list.append('shopping_centers_km')
curr_feature_list.append('university_km')

Now let's drop unnecessary features

In [None]:
prev_score = np.inf

for i in range(2):
    total_phases = len(curr_feature_list )
    print('Phase', i + 1, 'started:')
    print('---------------------')
    total_phases = len(curr_feature_list )
    for idx, feature in enumerate(curr_feature_list ):
        if idx % 100 == 0:
            print(idx, ' features considered out of', total_phases)
        curr_feature_list.remove(feature)
        cur_score = np.mean(cross_val_score(investment_model, new_train_investment.loc[:, curr_feature_list], \
                            new_train_investment_y, scoring=scorer, cv=5, fit_params = {'eval_metric' : rmsle}))
        if abs(cur_score) - prev_score < 0:
            prev_score = abs(cur_score)
            print(feature, 'dropped out')
        else:
            curr_feature_list.insert(idx, feature)
    print('Phase', i + 1, 'done!')
    print()

In [145]:
curr_feature_list.remove('max_floor')
curr_feature_list.remove('area_m')
curr_feature_list.remove('preschool_quota')
curr_feature_list.remove('raion_build_count_with_material_info')
curr_feature_list.remove('build_count_mix')

In [147]:
len(curr_feature_list)

108

I believe that preschool_quota is an important feature a thus I want to understand whether this feature is bad or it was dropped due to conjuncture.

In [149]:
curr_feature_list.append('preschool_quota')

In [150]:
prev_score = np.array([])
for i in range (10):
    prev_score = np.append(prev_score, np.mean(cross_val_score(investment_model, \
                           new_train_investment.loc[:, curr_feature_list], new_train_investment_y, \
                           scoring=scorer, cv=5, fit_params = {'eval_metric' : rmsle})))
prev_score = np.mean(prev_score)
curr_feature_list.pop()
curr_score = np.array([])
for i in range (10):
    curr_score = np.append(prev_score, np.mean(cross_val_score(investment_model, \
                           new_train_investment.loc[:, curr_feature_list], new_train_investment_y, \
                           scoring=scorer, cv=5, fit_params = {'eval_metric' : rmsle})))
cur_score = np.mean(cur_score)
print('Prev_score', prev_score)
print('Cur_score', cur_score)

Prev_score -0.5787638004133642
Cur_score -0.5783484139975001


In [156]:
abs(cur_score) - abs(prev_score)

-0.00041538641586402747

Turns out that this feature is decreasing our performance

Let's make new features from the ones we didn't take with the assistance of TSNE and PCA

In [152]:
not_taken_features = list(set(new_train_investment.columns) - set(curr_feature_list))

In [153]:
full_new_train = pd.concat([new_train_investment, new_train_owner_occupier])

In [167]:
my_PCA = PCA(n_components=10, random_state=1)
new_coord_PCA = my_PCA.fit_transform(full_new_train.loc[:, not_taken_features])
my_PCA_scaler = StandardScaler()
new_coord_PCA = my_PCA_scaler.fit_transform(new_coord_PCA)
new_coord_PCA = pd.DataFrame(new_coord_PCA)
for column in new_coord_PCA.columns:
    new_coord_PCA.rename(columns={column : 'PCA_' + str(column)}, inplace=True)

In [176]:
full_new_train[new_coord_PCA.columns] = new_coord_PCA.set_index(full_new_train.index)

In [184]:
prev_score = np.inf

curr_PCA_feature_list = list(new_coord_PCA.columns)
for i in range(2):
    print('Phase', i + 1, 'started:')
    print('---------------------')
    for idx, feature in enumerate(curr_PCA_feature_list):
        curr_PCA_feature_list.remove(feature)
        cur_score = np.mean(cross_val_score(investment_model, full_new_train.loc[:, \
                            curr_feature_list + curr_PCA_feature_list], \
                            pd.concat([new_train_investment_y, new_train_owner_occupier_y]), 
                            scoring=scorer, cv=5, fit_params = {'eval_metric' : rmsle}))
        if abs(cur_score) - prev_score < 0:
            prev_score = abs(cur_score)
            print(feature, 'dropped out')
        else:
            curr_PCA_feature_list.insert(idx, feature)
    print('Phase', i + 1, 'done!')
    print()

Phase 1 started:
---------------------
PCA_0 dropped out
PCA_6 dropped out
Phase 1 done!

Phase 2 started:
---------------------
PCA_5 dropped out
Phase 2 done!



In [186]:
curr_feature_list += curr_PCA_feature_list

In [187]:
curr_feature_list

['life_sq',
 'floor',
 'material',
 'build_year',
 'num_room',
 'state',
 'sub_area',
 'raion_popul',
 'green_zone_part',
 'indust_part',
 'children_preschool',
 'school_quota',
 'hospital_beds_raion',
 'sport_objects_raion',
 'office_raion',
 'full_all',
 'female_f',
 'young_all',
 'build_count_block',
 'build_count_panel',
 'build_count_foam',
 'build_count_1921-1945',
 'ID_metro',
 'metro_min_avto',
 'metro_min_walk',
 'kindergarten_km',
 'ID_railroad_station_avto',
 'mkad_km',
 'ttk_km',
 'kremlin_km',
 'big_road2_km',
 'railroad_km',
 'zd_vokzaly_avto_km',
 'ID_railroad_terminal',
 'bus_terminal_avto_km',
 'oil_chemistry_km',
 'ts_km',
 'detention_facility_km',
 'mosque_km',
 'office_count_500',
 'cafe_count_500',
 'cafe_count_1000_price_high',
 'sport_count_1000',
 'trc_count_2000',
 'trc_sqm_2000',
 'cafe_sum_2000_max_price_avg',
 'mosque_count_3000',
 'market_count_5000',
 'life_sq/other_sq',
 'other_sq/life_sq',
 'One_hot_0',
 'One_hot_18',
 'school_education_centers_top_20_ra