In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import r2_score



In [2]:
# Macro columns to load, taken from:
# https://www.kaggle.com/robertoruiz/sberbank-russian-housing-market/dealing-with-multicollinearity/notebook
macro_cols = [
    "balance_trade",
    "balance_trade_growth",
    "eurrub",
    "average_provision_of_build_contract",
    "micex_rgbi_tr",
    "micex_cbi_tr",
    "deposits_rate",
    "mortgage_value",
    "mortgage_rate",
    "income_per_cap",
    "rent_price_4+room_bus",
    "museum_visitis_per_100_cap",
    "apartment_build",
]


In [3]:
# Directory holding train.csv / test.csv / macro.csv (trailing separator included,
# because file names are appended with plain string concatenation).
# Backslashes are escaped explicitly: the previous literal relied on unrecognised
# escapes such as "\s" and "\A" being passed through, which newer Python versions
# warn about. The resulting string is unchanged.
# NOTE(review): this is a machine-specific absolute path; adjust for your environment.
location = "D:\\s_chernov\\AnacondaProjects\\SberHouse\\Data\\"

# Load the three input tables, parsing 'timestamp' as a datetime in each.
df_train = pd.read_csv(location + "train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv(location + "test.csv", parse_dates=['timestamp'])
# Only the selected macro columns (plus the join key) are read from macro.csv.
df_macro = pd.read_csv(location + "macro.csv", parse_dates=['timestamp'],
                       usecols=['timestamp'] + macro_cols)


In [4]:
# Training targets on three scales: log(1+y), raw, and square root.
# log(1+y) is suggested by https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130
ylog_train_all = np.log1p(df_train['price_doc'].values)
y_train_all = df_train['price_doc'].values
ysqrt_train_all = np.sqrt(df_train['price_doc'].values)
id_test = df_test['id']

# Drop the identifier/target columns in place so only feature columns remain.
df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

# df_all = train rows followed by test rows, left-joined with the macro table on timestamp.
num_train = len(df_train)
df_all = pd.merge_ordered(pd.concat([df_train, df_test]), df_macro, on='timestamp', how='left')
print(df_all.shape)

# Number of rows that fall in the same calendar month (key = year * 100 + month).
month_year = df_all.timestamp.dt.year * 100 + df_all.timestamp.dt.month
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Number of rows that fall in the same week (key = year * 100 + week of year).
week_year = df_all.timestamp.dt.year * 100 + df_all.timestamp.dt.weekofyear
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Plain calendar features: month and day of week.
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek

# Ratio features: floor relative to building height, kitchen area relative to full area.
df_all['rel_floor'] = df_all['floor'].div(df_all['max_floor'].astype(float))
df_all['rel_kitch_sq'] = df_all['kitch_sq'].div(df_all['full_sq'].astype(float))

# The raw timestamp is removed (may overfit the model in train).
del df_all['timestamp']

(38133, 303)


In [5]:
# Categorical handling: split columns by dtype, then replace every object-dtype
# column with the integer codes returned by pd.factorize.
df_numeric = df_all.select_dtypes(exclude=['object'])
df_obj = df_all.select_dtypes(include=['object']).copy()

for col_name in df_obj.columns:
    codes, uniques = pd.factorize(df_obj[col_name])
    df_obj[col_name] = codes

# Numeric columns first, encoded object columns after them.
df_values = pd.concat([df_numeric, df_obj], axis=1)

In [6]:
# Convert to numpy values
X_all = df_values.values
print(X_all.shape)

# Create a validation set, with last 20% of the training rows.
num_val = int(num_train * 0.2)
# Position of the first validation row. Features and targets are both sliced with
# this single positive index; negative slices such as [:-num_val] / [-num_val:]
# would silently flip meaning (empty train / everything in validation) if num_val were 0.
num_fit = num_train - num_val

# Feature matrices: rows [0, num_train) are train, the remainder is test.
X_train_all = X_all[:num_train]
X_train = X_all[:num_fit]
X_val = X_all[num_fit:num_train]
X_test = X_all[num_train:]

# Targets for the fitting part, on each of the three scales.
ylog_train = ylog_train_all[:num_fit]
y_train = y_train_all[:num_fit]
ysqrt_train = ysqrt_train_all[:num_fit]

# Targets for the validation part.
ylog_val = ylog_train_all[num_fit:]
y_val = y_train_all[num_fit:]
ysqrt_val = ysqrt_train_all[num_fit:]

# Column labels, kept so they can be passed on as feature names.
df_columns = df_values.columns

print('X_train_all shape is', X_train_all.shape)
print('X_train shape is', X_train.shape)
print('y_train shape is', ylog_train.shape)
print('X_val shape is', X_val.shape)
print('y_val shape is', ylog_val.shape)
print('X_test shape is', X_test.shape)

(38133L, 308L)
('X_train_all shape is', (30471L, 308L))
('X_train shape is', (24377L, 308L))
('y_train shape is', (24377L,))
('X_val shape is', (6094L, 308L))
('y_val shape is', (6094L,))
('X_test shape is', (7662L, 308L))


In [7]:
# XGBoost matrices labelled with the log1p-transformed price.
# Note: the cell that follows rebinds these same names with sqrt-transformed labels,
# so whichever of the two cells ran last decides what the model is trained on.
dtrain_all = xgb.DMatrix(X_train_all, label=ylog_train_all, feature_names=df_columns)
dtrain = xgb.DMatrix(X_train, label=ylog_train, feature_names=df_columns)
dval = xgb.DMatrix(X_val, label=ylog_val, feature_names=df_columns)
# The test matrix carries no labels.
dtest = xgb.DMatrix(X_test, feature_names=df_columns)

In [8]:
# XGBoost matrices labelled with the square root of the price.
dtrain_all = xgb.DMatrix(X_train_all, label=ysqrt_train_all, feature_names=df_columns)
dtrain = xgb.DMatrix(X_train, label=ysqrt_train, feature_names=df_columns)
dval = xgb.DMatrix(X_val, label=ysqrt_val, feature_names=df_columns)
# The test matrix carries no labels.
dtest = xgb.DMatrix(X_test, feature_names=df_columns)

In [9]:
# Booster configuration.
# The labels in dtrain/dval are a transformed price (values far outside [0, 1]),
# so this is a regression problem and needs a squared-error objective.
# 'binary:logistic' expects labels in [0, 1] and cannot produce the rmse values
# and predictions recorded below this cell.
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',  # newer XGBoost releases name this 'reg:squarederror'
    'eval_metric': 'rmse',
    'silent': 1
}

# Tune the number of boosting rounds: train on the fitting split and stop once
# val-rmse has not improved for 20 rounds (progress is logged every 20 rounds).
partial_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dval, 'val')],
                          early_stopping_rounds=20, verbose_eval=20)

# best_iteration is a 0-based round index (the log starts at [0]), so the number
# of rounds needed to reach it is best_iteration + 1.
num_boost_round = partial_model.best_iteration + 1

[0]	val-rmse:2685.37
Will train until val-rmse hasn't improved in 20 rounds.
[20]	val-rmse:1149.02
[40]	val-rmse:663.835
[60]	val-rmse:532.622
[80]	val-rmse:496.88
[100]	val-rmse:484.047
[120]	val-rmse:476.756
[140]	val-rmse:472.721
[160]	val-rmse:469.956
[180]	val-rmse:467.716
[200]	val-rmse:466.322
[220]	val-rmse:465.251
[240]	val-rmse:464.757
[260]	val-rmse:463.88
[280]	val-rmse:463.383
Stopping. Best iteration:
[271]	val-rmse:463.363



In [10]:
# Score the early-stopped model on the validation split.
pred = partial_model.predict(dval)
print('________________')
# With sqrt(price) labels in dval, squaring the predictions maps them back to the
# price scale before comparing against the raw validation prices.
# (For log1p labels the inverse transform would be np.exp(pred) - 1 instead.)
print(r2_score(y_val, pred**2))
# Show the raw (untransformed-back) predictions as the cell output.
pred

________________
0.724200068615


array([ 2795.14428711,  2988.04492188,  2214.23974609, ...,  2214.26879883,
        3125.76513672,  2277.97143555], dtype=float32)

In [11]:
#partial_model.get_score()

In [None]:
# best_iteration is a 0-based round index, so the matching number of boosting
# rounds is one more than that.
num_boost_round = partial_model.best_iteration + 1


In [None]:
# Refit on the full training set for the tuned number of rounds, using the same
# parameters with 'silent' switched to 0.
final_params = dict(xgb_params)
final_params['silent'] = 0
model = xgb.train(final_params, dtrain_all, num_boost_round=num_boost_round)

In [None]:
# Predict on the test matrix. Predictions are treated as square roots of the
# price and squared to get back to the price scale.
# (For a model trained on log1p labels the inverse would be np.exp(pred) - 1.)
ysqrt_pred = model.predict(dtest)
y_pred = np.square(ysqrt_pred)

# Assemble the submission (test ids + predicted price) and write it next to the input data.
df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})
df_sub.to_csv(location + 'sub.csv', index=False)

In [None]:
# Histogram (100 bins) of the submitted prices on the square-root scale.
plt.hist(np.sqrt(df_sub['price_doc']), bins=100)
plt.show()