In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing
import xgboost as xgb
import datetime
import math
from numpy.random import uniform 
import statsmodels.api as sm

# Load Files

In [5]:
train = pd.read_csv('train.csv', parse_dates=['timestamp'])
test = pd.read_csv('test.csv', parse_dates=['timestamp'])
macro = pd.read_csv('macro.csv', parse_dates=['timestamp'])
fx = pd.read_excel('BAD_ADDRESS_FIX.xlsx').drop_duplicates('id').set_index('id')

### Update files

In [6]:
train.update(fx)
test.update(fx)
print('Fix in train: ', train.index.intersection(fx.index).shape[0])
print('Fix in test : ', test.index.intersection(fx.index).shape[0])

train['id'] = train.index
train = train.reset_index(drop=True)
test['id'] = test.index
test = test.reset_index(drop=True)

id_test = test.id

Fix in train:  550
Fix in test :  115


In [7]:
#Make an inner join for train dataset and macro dataset
train_macro = pd.merge(train,macro)
test_macro = pd.merge(test,macro)

In [8]:
train = train_macro
test = test_macro

## Remove Outliers

In [17]:
#multiplier = 0.969
#clean data
bad_index = train[train.life_sq > train.full_sq].index
train.ix[bad_index, "life_sq"] = np.NaN
equal_index = [601,1896,2791]
test.ix[equal_index, "life_sq"] = test.ix[equal_index, "full_sq"]
bad_index = test[test.life_sq > test.full_sq].index
test.ix[bad_index, "life_sq"] = np.NaN
bad_index = train[train.life_sq < 5].index
train.ix[bad_index, "life_sq"] = np.NaN
bad_index = test[test.life_sq < 5].index
test.ix[bad_index, "life_sq"] = np.NaN
bad_index = train[train.full_sq < 5].index
train.ix[bad_index, "full_sq"] = np.NaN
bad_index = test[test.full_sq < 5].index
test.ix[bad_index, "full_sq"] = np.NaN
kitch_is_build_year = [13117]
train.ix[kitch_is_build_year, "build_year"] = train.ix[kitch_is_build_year, "kitch_sq"]
bad_index = train[train.kitch_sq >= train.life_sq].index
train.ix[bad_index, "kitch_sq"] = np.NaN
bad_index = test[test.kitch_sq >= test.life_sq].index
test.ix[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.kitch_sq == 0).values + (train.kitch_sq == 1).values].index
train.ix[bad_index, "kitch_sq"] = np.NaN
bad_index = test[(test.kitch_sq == 0).values + (test.kitch_sq == 1).values].index
test.ix[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3)].index
train.ix[bad_index, "full_sq"] = np.NaN
bad_index = test[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3)].index
test.ix[bad_index, "full_sq"] = np.NaN
bad_index = train[train.life_sq > 300].index
train.ix[bad_index, ["life_sq", "full_sq"]] = np.NaN
bad_index = test[test.life_sq > 200].index
test.ix[bad_index, ["life_sq", "full_sq"]] = np.NaN
train.product_type.value_counts(normalize= True)
test.product_type.value_counts(normalize= True)
bad_index = train[train.build_year < 1500].index
train.ix[bad_index, "build_year"] = np.NaN
bad_index = test[test.build_year < 1500].index
test.ix[bad_index, "build_year"] = np.NaN
bad_index = train[train.num_room == 0].index 
train.ix[bad_index, "num_room"] = np.NaN
bad_index = test[test.num_room == 0].index 
test.ix[bad_index, "num_room"] = np.NaN
bad_index = [10076, 11621, 17764, 19390, 24007, 26713, 29172]
train.ix[bad_index, "num_room"] = np.NaN
bad_index = [3174, 7313]
test.ix[bad_index, "num_room"] = np.NaN
bad_index = train[(train.floor == 0).values * (train.max_floor == 0).values].index
train.ix[bad_index, ["max_floor", "floor"]] = np.NaN
bad_index = train[train.floor == 0].index
train.ix[bad_index, "floor"] = np.NaN
bad_index = train[train.max_floor == 0].index
train.ix[bad_index, "max_floor"] = np.NaN
bad_index = test[test.max_floor == 0].index
test.ix[bad_index, "max_floor"] = np.NaN
bad_index = train[train.floor > train.max_floor].index
train.ix[bad_index, "max_floor"] = np.NaN
bad_index = test[test.floor > test.max_floor].index
test.ix[bad_index, "max_floor"] = np.NaN
train.floor.describe(percentiles= [0.9999])
bad_index = [23584]
train.ix[bad_index, "floor"] = np.NaN
train.material.value_counts()
test.material.value_counts()
train.state.value_counts()
bad_index = train[train.state == 33].index
train.ix[bad_index, "state"] = np.NaN
test.state.value_counts()



2.0    2662
1.0    2266
3.0    1913
4.0     127
Name: state, dtype: int64

In [18]:
# brings error down a lot by removing extreme price per sqm
train.loc[train.full_sq == 0, 'full_sq'] = 50
train = train[train.price_doc/train.full_sq <= 600000]
train = train[train.price_doc/train.full_sq >= 10000]

## Feature Creation

In [20]:
train["timestamp"] = pd.to_datetime(train["timestamp"])
test["timestamp"] = pd.to_datetime(test["timestamp"])

# Add month-year
month_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
train['month_year_cnt'] = month_year.map(month_year_cnt_map)

month_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
test['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
train['week_year_cnt'] = week_year.map(week_year_cnt_map)

week_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
test['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
train['month'] = train.timestamp.dt.month
train['dow'] = train.timestamp.dt.dayofweek

test['month'] = test.timestamp.dt.month
test['dow'] = test.timestamp.dt.dayofweek

# Add how many years passed
#print("duration creating")
train['duration'] = train.timestamp.dt.year - train["build_year"]
test['duration'] = test.timestamp.dt.year - test["build_year"]

In [21]:
train['area_per_room'] = train['life_sq'] / train['num_room'].astype(float) #rough area per room
train['livArea_ratio'] = train['life_sq'] / train['full_sq'].astype(float) #rough living area
train['yrs_old'] = 2017 - train['build_year'].astype(float) #years old from 2017
train['avgfloor_sq'] = train['life_sq']/train['max_floor'].astype(float) #living area per floor
train['pts_floor_ratio'] = train['public_transport_station_km']/train['max_floor'].astype(float)
# looking for significance of apartment buildings near public t 
#train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
# doubled a var by accident
train['gender_ratio'] = train['male_f']/train['female_f'].astype(float)
train['kg_park_ratio'] = train['kindergarten_km']/train['park_km'].astype(float) #significance of children?
train['high_ed_extent'] = train['school_km'] / train['kindergarten_km'] #schooling
train['pts_x_state'] = train['public_transport_station_km'] * train['state'].astype(float) #public trans * state of listing
train['lifesq_x_state'] = train['life_sq'] * train['state'].astype(float) #life_sq times the state of the place
train['floor_x_state'] = train['floor'] * train['state'].astype(float) #relative floor * the state of the place

test['area_per_room'] = test['life_sq'] / test['num_room'].astype(float)
test['livArea_ratio'] = test['life_sq'] / test['full_sq'].astype(float)
test['yrs_old'] = 2017 - test['build_year'].astype(float)
test['avgfloor_sq'] = test['life_sq']/test['max_floor'].astype(float) #living area per floor
test['pts_floor_ratio'] = test['public_transport_station_km']/test['max_floor'].astype(float) #apartments near public t?
#test['room_size'] = test['life_sq'] / test['num_room'].astype(float)
test['gender_ratio'] = test['male_f']/test['female_f'].astype(float)
test['kg_park_ratio'] = test['kindergarten_km']/test['park_km'].astype(float)
test['high_ed_extent'] = test['school_km'] / test['kindergarten_km']
test['pts_x_state'] = test['public_transport_station_km'] * test['state'].astype(float) #public trans * state of listing
test['lifesq_x_state'] = test['life_sq'] * test['state'].astype(float)
test['floor_x_state'] = test['floor'] * test['state'].astype(float)

In [22]:
# Feature Engineering based on house itself
train['rel_floor'] = train['floor'] / train['max_floor'].astype(float)
train['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)
train['rel_kitchlife_sq'] = train['kitch_sq'] / train['life_sq'].astype(float)

test['rel_floor'] = test['floor'] / test['max_floor'].astype(float)
test['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)
test['rel_kitchlife_sq'] = test['kitch_sq'] / test['life_sq'].astype(float)

train.apartment_name=train.sub_area + train['metro_km_avto'].astype(str)
test.apartment_name=test.sub_area + train['metro_km_avto'].astype(str)

train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)

train['avg_room_size'] = train['full_sq'] / train['num_room'].astype(float)
test['avg_room_size'] = test['full_sq'] / test['num_room'].astype(float)

In [23]:
# Feature Engineering based on neighbor
train['green_zone'] = train["green_zone_part"]*train["area_m"]
test['green_zone'] = test["green_zone_part"]*test["area_m"]

train['neighbor_density'] = train['raion_popul']/train["area_m"]
test['neighbor_density'] = test['raion_popul']/test["area_m"]

train['industry_zone'] = train["indust_part"]*train["area_m"]
test['industry_zone'] = test["indust_part"]*test["area_m"]

#preschool
train['rel_preschoolseat'] = train['children_preschool']/train['preschool_quota']
test['rel_preschoolseat'] = test['children_preschool']/test['preschool_quota']

train['rel_preschoolcenter'] = train['children_preschool']/train['preschool_education_centers_raion']
test['rel_preschoolcenter'] = test['children_preschool']/test['preschool_education_centers_raion']

#school
train['rel_schoolseat'] = train['children_school']/train['school_quota']
test['rel_schoolseat'] = test['children_school']/test['school_quota']

train['rel_schoolcenter'] = train['children_school']/train['school_education_centers_raion']
test['rel_schoolcenter'] = test['children_school']/test['school_education_centers_raion']

train['rel_schooltopcenter'] = train['children_school']/train['school_education_centers_top_20_raion']
test['rel_schooltopcenter'] = test['children_school']/test['school_education_centers_top_20_raion']

In [24]:
# Female percent and male percent
train['male_percent'] = train['male_f']/train['full_all']
test['male_percent'] = test['male_f']/test['full_all']

train['female_percent'] = train['female_f']/train['full_all']
test['female_percent'] = test['female_f']/test['full_all']

train['young_percent'] = train['young_all']/train['full_all']
test['young_percent'] = test['young_all']/test['full_all']

train['work_percent'] = train['work_all']/train['full_all']
test['work_percent'] = test['work_all']/test['full_all']

train['elder_percent'] = train['ekder_all']/train['full_all']
test['elder_percent'] = test['ekder_all']/test['full_all']

In [25]:
# hospitals
train['hospitals_people'] = train['full_all']/train['healthcare_centers_raion']
test['hospitals_people'] = test['full_all']/test['healthcare_centers_raion']

In [26]:
# Weight distance based on population
train["weight_kindergarten_km"] = train["kindergarten_km"]*train['0_6_all']/train['full_all']
test["weight_kindergarten_km"] = test["kindergarten_km"]*test['0_6_all']/test['full_all']

train["weight_school_km"] = train["school_km"]*train['7_14_all']/train['full_all']
test["weight_school_km"] = test["school_km"]*test['7_14_all']/test['full_all']

train["weight_school_km"] = train["school_km"]*train['7_14_all']/train['full_all']
test["weight_school_km"] = test["school_km"]*test['7_14_all']/test['full_all']

train["weight_school_km"] = train["school_km"]*train['7_14_all']/train['full_all']
test["weight_school_km"] = test["school_km"]*test['7_14_all']/test['full_all']

In [27]:
# Creating Apartment Name Feature
train['apartment_name'] = train.sub_area + train['metro_km_avto'].astype(str)
test['apartment_name'] = test.sub_area + test['metro_km_avto'].astype(str)

In [28]:
# Creating transport distance related feature
train['min_basic_transport_km']  = train['metro_km_walk'] * train['railroad_station_walk_km'] * train['public_transport_station_km']
test['min_basic_transport_km']  = test['metro_km_walk'] * test['railroad_station_walk_km'] * test['public_transport_station_km']

train['inner_access_roads_km']  = train['sadovoe_km'] * train['ttk_km']
test['inner_access_roads_km']  =  test['sadovoe_km'] * test['ttk_km']

train['outer_access_roads_km']  = train['ttk_km'] * train['mkad_km']
test['outer_access_roads_km']  =  test['ttk_km'] * test['mkad_km']

In [30]:
rate_2015_q2 = 1
rate_2015_q1 = rate_2015_q2 / 0.9932
rate_2014_q4 = rate_2015_q1 / 1.0112
rate_2014_q3 = rate_2014_q4 / 1.0169
rate_2014_q2 = rate_2014_q3 / 1.0086
rate_2014_q1 = rate_2014_q2 / 1.0126
rate_2013_q4 = rate_2014_q1 / 0.9902
rate_2013_q3 = rate_2013_q4 / 1.0041
rate_2013_q2 = rate_2013_q3 / 1.0044
rate_2013_q1 = rate_2013_q2 / 1.0104  
rate_2012_q4 = rate_2013_q1 / 0.9832 
rate_2012_q3 = rate_2012_q4 / 1.0277
rate_2012_q2 = rate_2012_q3 / 1.0279
rate_2012_q1 = rate_2012_q2 / 1.0279
rate_2011_q4 = rate_2012_q1 / 1.076
rate_2011_q3 = rate_2011_q4 / 1.0236
rate_2011_q2 = rate_2011_q3 / 1
rate_2011_q1 = rate_2011_q2 / 1.011

In [32]:
# train 2015
train['average_q_price'] = 1

train_2015_q2_index = train.loc[train['timestamp'].dt.year == 2015].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2015_q2_index, 'average_q_price'] = rate_2015_q2

train_2015_q1_index = train.loc[train['timestamp'].dt.year == 2015].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2015_q1_index, 'average_q_price'] = rate_2015_q1


# train 2014
train_2014_q4_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2014_q4_index, 'average_q_price'] = rate_2014_q4

train_2014_q3_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2014_q3_index, 'average_q_price'] = rate_2014_q3

train_2014_q2_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2014_q2_index, 'average_q_price'] = rate_2014_q2

train_2014_q1_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2014_q1_index, 'average_q_price'] = rate_2014_q1


# train 2013
train_2013_q4_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2013_q4_index, 'average_q_price'] = rate_2013_q4

train_2013_q3_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2013_q3_index, 'average_q_price'] = rate_2013_q3

train_2013_q2_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2013_q2_index, 'average_q_price'] = rate_2013_q2

train_2013_q1_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2013_q1_index, 'average_q_price'] = rate_2013_q1


# train 2012
train_2012_q4_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2012_q4_index, 'average_q_price'] = rate_2012_q4

train_2012_q3_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2012_q3_index, 'average_q_price'] = rate_2012_q3

train_2012_q2_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2012_q2_index, 'average_q_price'] = rate_2012_q2

train_2012_q1_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2012_q1_index, 'average_q_price'] = rate_2012_q1


# train 2011
train_2011_q4_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2011_q4_index, 'average_q_price'] = rate_2011_q4

train_2011_q3_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2011_q3_index, 'average_q_price'] = rate_2011_q3

train_2011_q2_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2011_q2_index, 'average_q_price'] = rate_2011_q2

train_2011_q1_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2011_q1_index, 'average_q_price'] = rate_2011_q1

train['price_doc'] = train['price_doc'] * train['average_q_price']


In [33]:
col_name = "build_year"
new_col_name = col_name + "_encode"
# create a dictionary of original categorical value:average y for that value
row_val_dict = {}
for level in train[col_name].unique():
    if level is None:
        row_val_dict[level] = nan
    else:
        row_val_dict[level] = train[train[col_name] == level]['price_doc'].mean()
# apply the transform from the dictionary on all rows in the column
train[new_col_name] = train[col_name].apply(lambda i: i if math.isnan(i) else row_val_dict[i]+uniform(low=-5, high=5) ) 

In [34]:
tmp = []
for i in test[col_name].values:
    #print(i)
    if i not in row_val_dict.keys():
        tmp.append(np.nan)
    else:
        tmp.append(row_val_dict[i])
test[new_col_name] = tmp

In [35]:
col_name = "sub_area"
new_col_name = col_name + "_encode"
# create a dictionary of original categorical value:average y for that value
row_val_dict = {}
for level in train[col_name].unique():
    if level is None:
        row_val_dict[level] = nan
    else:
        row_val_dict[level] = train[train[col_name] == level]['price_doc'].mean()
# apply the transform from the dictionary on all rows in the column
train[new_col_name] = train[col_name].apply(lambda i: row_val_dict[i]+uniform(low=-5, high=5) ) 

In [36]:
tmp = []
for i in test[col_name].values:
    #print(i)
    if i not in row_val_dict.keys():
        tmp.append(np.nan)
    else:
        tmp.append(row_val_dict[i])
test[new_col_name] = tmp

In [37]:
y_train = train["price_doc"]
x_train = train.drop(["id", "timestamp", "price_doc","average_q_price"], axis=1)
x_test = test.drop(["id", "timestamp","average_q_price"], axis=1)

num_train = len(x_train)
x_all = pd.concat([x_train, x_test])

for c in x_all.columns:
    if x_all[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_all[c].values))
        x_all[c] = lbl.transform(list(x_all[c].values))
        #x_train.drop(c,axis=1,inplace=True)

x_train = x_all[:num_train]
x_test = x_all[num_train:]

In [38]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.6,
    'colsample_bytree': 1,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

In [39]:
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1500, early_stopping_rounds=200,
    verbose_eval=50, show_stdv=False)
#cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()

[0]	train-rmse:7.70484e+06	test-rmse:7.71146e+06
[50]	train-rmse:2.11254e+06	test-rmse:2.54267e+06
[100]	train-rmse:1.81746e+06	test-rmse:2.3919e+06
[150]	train-rmse:1.6923e+06	test-rmse:2.35961e+06
[200]	train-rmse:1.59506e+06	test-rmse:2.34359e+06
[250]	train-rmse:1.50755e+06	test-rmse:2.33473e+06
[300]	train-rmse:1.43184e+06	test-rmse:2.32884e+06
[350]	train-rmse:1.36541e+06	test-rmse:2.32489e+06
[400]	train-rmse:1.30494e+06	test-rmse:2.32499e+06
[450]	train-rmse:1.2449e+06	test-rmse:2.32414e+06
[500]	train-rmse:1.19102e+06	test-rmse:2.32389e+06
[550]	train-rmse:1.14582e+06	test-rmse:2.32377e+06
[600]	train-rmse:1.09919e+06	test-rmse:2.32432e+06
[650]	train-rmse:1.06086e+06	test-rmse:2.325e+06
[700]	train-rmse:1.02249e+06	test-rmse:2.32739e+06


In [40]:
num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

#fig, ax = plt.subplots(1, 1, figsize=(8, 13))
#xgb.plot_importance(model, max_num_features=50, height=0.5, ax=ax)

y_predict = model.predict(dtest)

In [41]:
#y_predict = np.round(y_predict)#np.round(y_predict * 0.99)
output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})
output.head()

Unnamed: 0,id,price_doc
0,0,5381716.0
1,1,7950050.5
2,2,5260168.5
3,3,5815625.0
4,4,4880659.0


In [42]:
output.to_csv('single_update_macro.csv', index=False)