In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing
from sklearn import ensemble   
from sklearn import datasets   
from sklearn.utils import shuffle   
import xgboost as xgb
color = sns.color_palette()

%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500)

In [9]:
def rmse(predictions, targets):
    """Return the root-mean-square error between two numeric arrays.

    Both arguments are combined element-wise, so they must have the same
    shape (or be broadcastable against each other).
    """
    return np.sqrt(((predictions - targets) ** 2).mean())


# One-shot flag: the "not positive" warning below is printed at most once.
warn_log = True


def logrmse(predictions, targets):
    """Return the RMSE between log(predictions) and log(targets).

    Entries whose prediction is not strictly positive are excluded from both
    arrays before the logarithm is taken. The first time such entries are
    seen, a warning is printed; later occurrences are silent.

    Both arguments must support boolean-mask indexing (e.g. numpy arrays).
    """
    # `warn_log` is assigned in this function, so without the `global`
    # declaration Python treats it as a local and the read in the condition
    # raises UnboundLocalError.
    global warn_log
    index = predictions > 0
    if not index.all() and warn_log:
        warn_log = False
        # Parenthesised form prints the same text under Python 2 and 3.
        print('warning: not positive')
    return rmse(np.log(predictions[index]), np.log(targets[index]))
def xgblogrmse(predictions, dtrain):
    """Custom evaluation callback: score predictions against ``dtrain``'s labels.

    Returns a ``(name, value)`` pair where the name is ``'logrmse'`` and the
    value is ``logrmse(predictions, labels)``.
    """
    labels = dtrain.get_label()
    score = logrmse(predictions, labels)
    return ('logrmse', score)

In [33]:
def _add_date_features(df):
    """Add calendar columns derived from ``df['timestamp']`` to ``df`` in place.

    Columns are added in a fixed order so that train and test frames end up
    with the same column layout.
    """
    ts = df["timestamp"].dt
    # NOTE: `.dt.weekofyear` is deprecated in newer pandas releases; switch to
    # `.dt.isocalendar().week` if the environment is upgraded.
    df["yearmonth"] = ts.year * 100 + ts.month
    df["yearweek"] = ts.year * 100 + ts.weekofyear
    df["year"] = ts.year
    df["month_of_year"] = ts.month
    df["week_of_year"] = ts.weekofyear
    df["day_of_week"] = ts.weekday


def resolve():
    """Load the competition CSVs and build model-ready arrays.

    Steps: read train/test/macro, left-join macro onto train and test by
    ``timestamp``, label-encode every object-typed column (encoder fitted on
    train and test values together), add calendar features, and drop the
    id/timestamp/target columns from the feature frames.

    Returns:
        A 4-tuple ``(train_ids, train_X, train_y, test_X)`` where
        ``train_ids`` is the ``id`` column of the training file,
        ``train_X``/``test_X`` are the feature matrices as numpy arrays and
        ``train_y`` is the raw (untransformed) ``price_doc`` column.
    """
    ###  read the train, test and macro files
    train_df = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
    test_df = pd.read_csv("../input/test.csv", parse_dates=['timestamp'])
    macro_df = pd.read_csv("../input/macro.csv", parse_dates=['timestamp'])
    print(train_df.shape, test_df.shape)

    # combine macro information with train and test
    train_df = pd.merge(train_df, macro_df, how='left', on='timestamp')
    test_df = pd.merge(test_df, macro_df, how='left', on='timestamp')
    print(train_df.shape, test_df.shape)

    # undersampling by magic numbers (disabled experiment, kept for reference)
    # trainsub = train_df[train_df.timestamp < '2015-01-01']
    # trainsub = trainsub[trainsub.product_type=="Investment"]
    # ind_1m = trainsub[trainsub.price_doc <= 1000000].index
    # ind_2m = trainsub[trainsub.price_doc == 2000000].index
    # ind_3m = trainsub[trainsub.price_doc == 3000000].index
    # train_index = set(train_df.index.copy())
    # for ind, gap in zip([ind_1m, ind_2m, ind_3m], [10, 3, 2]):
    #     ind_set = set(ind)
    #     ind_set_cut = ind.difference(set(ind[::gap]))
    #     train_index = train_index.difference(ind_set_cut)

    ###  convert categorical variables into numerical variables by label encoding
    for f in train_df.columns:
        if train_df[f].dtype == 'object':
            train_vals = list(train_df[f].values.astype('str'))
            test_vals = list(test_df[f].values.astype('str'))
            # Fit on the union so labels seen only in test can be transformed.
            lbl = preprocessing.LabelEncoder()
            lbl.fit(train_vals + test_vals)
            train_df[f] = lbl.transform(train_vals)
            test_df[f] = lbl.transform(test_vals)

    # calendar features (same columns, same order, for both frames)
    _add_date_features(train_df)
    _add_date_features(test_df)

    ### Drop the variables which are not needed in model building.
    train_X = train_df.drop(["id", "timestamp", "price_doc"], axis=1)
    test_X = test_df.drop(["id", "timestamp"], axis=1)
    # The target is the raw price; the log1p variant below is disabled.
    # train_y = np.log1p(train_df.price_doc.values)
    train_y = train_df.price_doc.values

    # Use the local frame for the ids. The original read the notebook-level
    # global `df_train`, which only exists after a later cell has been run.
    return train_df['id'].values, train_X.values, train_y, test_X.values

## 3 baseline models

In [73]:
# Import and reload the `bruno` module, then build the `bruno_xgb` model.
import bruno
reload(bruno)  # re-execute the module so on-disk edits are picked up
# Parameter dict handed to XgbModel below.
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0
}
# NOTE(review): num_boost_rounds is assigned but not passed to XgbModel below.
num_boost_rounds = 420  # From Bruno's original CV, I think
# NOTE(review): XgbModel is neither defined nor imported in this cell;
# presumably it comes from `from stacking import *` -- confirm. The meaning
# of the second argument (1000) is not visible from here.
bruno_xgb = XgbModel(xgb_params, 1000)

In [74]:
# Import and reload the `gunja` module, then build the `gunja_xgb` model.
import gunja
reload(gunja)  # re-execute the module so on-disk edits are picked up
# Parameter dict handed to XgbModel below (rebinds the shared name xgb_params).
xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.6,
    'colsample_bytree': 1,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0
}
# NOTE(review): num_boost_rounds is assigned but not passed to XgbModel below.
num_boost_rounds = 422
# NOTE(review): XgbModel is neither defined nor imported in this cell;
# presumably it comes from `from stacking import *` -- confirm. The meaning
# of the second argument (1000) is not visible from here.
gunja_xgb = XgbModel(xgb_params, 1000)

In [6]:
# Import and reload the `louis` module, then build the `louis_xgb` model.
import louis
reload(louis)  # re-execute the module so on-disk edits are picked up
# Parameter dict handed to XgbModel below (rebinds the shared name xgb_params).
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0
}
# NOTE(review): num_boost_rounds is assigned but not passed to XgbModel below.
num_boost_rounds = 385  # This was the CV output, as earlier version shows
# NOTE(review): the second argument here is 1, unlike the 1000 used for the
# other two baseline models; its meaning is not visible from here -- confirm
# this is intentional.
louis_xgb = XgbModel(xgb_params, 1)

# rfr

In [52]:
import rfr
reload(rfr)
# Random forest (70 trees, max depth 12) wrapped in rfr.RfrModel.
my_rfr = rfr.RfrModel(
    ensemble.RandomForestRegressor(n_estimators=70, max_depth=12)
)

## my_xgb

In [35]:
# Parameter dict for the `my_xgb` base model (rebinds the shared name xgb_params).
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0
}
# NOTE(review): XgbModel is neither defined nor imported in this cell;
# presumably it comes from `from stacking import *` -- confirm. The meaning
# of the second argument (500) is not visible from here.
my_xgb = XgbModel(xgb_params, 500)

In [66]:
# Scratch frame: the training CSV indexed by its `id` column.
temp = pd.read_csv('../input/train.csv', index_col='id')

In [67]:
# Display the `id` index of the scratch frame.
temp.index

Int64Index([    1,     2,     3,     4,     5,     6,     7,     8,     9,
               10,
            ...
            30464, 30465, 30466, 30467, 30468, 30469, 30470, 30471, 30472,
            30473],
           dtype='int64', name=u'id', length=30471)

In [68]:
# Display the training ids.
# NOTE(review): `id_train` is only assigned further down, in the "stacking"
# section, so this cell raises NameError on a fresh top-to-bottom run.
id_train

array([    1,     2,     3, ..., 30471, 30472, 30473], dtype=int64)

In [69]:
# Load the address-fix spreadsheet, keep one row per `id`, and index by `id`
# so it lines up with `temp`.
fx = pd.read_excel('../input/BAD_ADDRESS_FIX.xlsx').drop_duplicates('id').set_index('id')

In [70]:
# Overwrite values in `temp` (in place) with the corrections from `fx`.
# NOTE(review): `temp` is not referenced again in the visible cells, and
# resolve() re-reads train.csv itself, so these fixes do not appear to reach
# the models -- confirm.
temp.update(fx)

# stacking

In [3]:
# Raw training file; the ids and target are pulled out as numpy arrays.
df_train = pd.read_csv('../input/train.csv')
id_train, y = df_train['id'].values, df_train['price_doc'].values

In [43]:
# Reload first, then star-import. `reload` does not rebind names that were
# copied into this namespace by an earlier `from stacking import *`, so doing
# the star import before the reload leaves those names pointing at the
# pre-reload objects.
import stacking
reload(stacking)
from stacking import *

<module 'stacking' from 'stacking.pyc'>

In [44]:
class DataResolver(object):
    """Hands out datasets in sequence, one per call to ``next()``.

    The first call returns ``rfr.resolve()``; every later call returns the
    notebook-level ``resolve()``.
    """

    def __init__(self):
        # Number of completed calls minus one; -1 means "not called yet".
        self.__time = -1

    def next(self):
        """Advance the call counter and return the dataset for this call."""
        self.__time += 1
        if self.__time < 1:
            return rfr.resolve()
        return resolve()

In [53]:
# Base models passed to Stacking; the three baseline XGB models are disabled.
base_models = [
    my_rfr,
    my_xgb,
    # bruno_xgb,
    # gunja_xgb,
    # louis_xgb,
]

In [54]:
# Build the stacker from the base models, a fresh DataResolver and the
# logrmse metric. The leading 3 is presumably the fold count (the training
# log shows folds 0-2) -- confirm against Stacking's signature.
# NOTE(review): this rebinds the name `stacking`, which the import cell bound
# to the module via `import stacking`; the module is shadowed from here on.
stacking = Stacking(3, base_models, DataResolver(), logrmse)

In [55]:
# Fit the stacker and collect its train/test outputs.
# NOTE(review): 7662 matches the test-set row count printed by resolve()
# ((7662, 291)); presumably it is the number of test rows -- confirm.
s_train, s_test = stacking.fit(id_train, 7662)#38132 30471

((30471, 292), (7662, 291))
((30471, 391), (7662, 390))
model 0 fold 0
train: 0.387185662396 val: 0.46317713979
model 0 fold 1
train: 0.385543231084 val: 0.482501056009
model 0 fold 2
train: 0.385960219561 val: 0.472786824998
((30471, 292), (7662, 291))
((30471, 391), (7662, 390))
model 1 fold 0
[0]	train-rmse:8.24322e+06	val-rmse:8.1257e+06
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 20 rounds.
[50]	train-rmse:2.49225e+06	val-rmse:2.81501e+06
[100]	train-rmse:2.15807e+06	val-rmse:2.6445e+06
[150]	train-rmse:2.03816e+06	val-rmse:2.60676e+06
[200]	train-rmse:1.94939e+06	val-rmse:2.59428e+06
[250]	train-rmse:1.87826e+06	val-rmse:2.58575e+06
[300]	train-rmse:1.81294e+06	val-rmse:2.57893e+06
[350]	train-rmse:1.74812e+06	val-rmse:2.57286e+06
[400]	train-rmse:1.69398e+06	val-rmse:2.57037e+06
[450]	train-rmse:1.6413e+06	val-rmse:2.56727e+06
[499]	train-rmse:1.59814e+06	val-rmse:2.56574e+06
train: 0.404648279

In [58]:
# Persist the stacker's train output (one column per base model) without the row index.
pd.DataFrame(s_train).to_csv('../stacking/rfr_xgb/3_train.csv', index=False)

In [59]:
# Persist the stacker's test output (one column per base model) without the row index.
pd.DataFrame(s_test).to_csv('../stacking/rfr_xgb/3_test.csv', index=False)

In [60]:
# Display the target array (raw price_doc values).
y

array([ 5850000,  6000000,  5700000, ...,  6970959, 13500000,  5600000], dtype=int64)

In [61]:
# Display the stacker's train output.
s_train

array([[  4762740.70477522,   4609924.5       ],
       [  4998582.42053592,   5416123.        ],
       [  5204945.16485092,   4926099.        ],
       ..., 
       [  5437575.43828289,   5570410.        ],
       [ 10072001.69092676,  10696752.        ],
       [  6189967.06682039,   5869192.5       ]])

In [62]:
# Display the stacker's test output.
s_test

array([[ 5463892.23383271,  5537997.        ],
       [ 8476058.39433344,  8675585.33333333],
       [ 5915374.29501199,  5610116.5       ],
       ..., 
       [ 4553765.13804639,  4516005.83333333],
       [ 5584019.3606476 ,  5397009.33333333],
       [ 8616919.26055608,  8457699.66666667]])

In [63]:
# Second-level model: XGBoost trained on the stacked outputs (s_train) against
# the raw target y, then used to predict from s_test.
xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.6,
    'colsample_bytree': 1,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

dtrain = xgb.DMatrix(s_train, y)
#dtrain = xgb.DMatrix(s_train[:25000], y[:25000])
# NOTE(review): dval is only used by the commented-out early-stopping variant
# below. With the full-data dtrain above, its rows (25000:) are also training
# rows, so it would not be an independent validation set.
dval = xgb.DMatrix(s_train[25000:], y[25000:])
dtest = xgb.DMatrix(s_test)


# Only referenced by the commented-out early-stopping variant below.
num_boost_rounds = 1000
# `evals` is documented as a list of (DMatrix, name) pairs; the original
# passed a set literal, which only worked because it held a single pair.
# dict(xgb_params, silent=0) overrides 'silent' for this run only.
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=400, evals=[(dtrain, 'train')], verbose_eval=20)
# model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds, evals=[(dtrain,'train'), (dval,'val')], early_stopping_rounds=20, verbose_eval=20)

y_predict = model.predict(dtest)

[0]	train-rmse:8.19094e+06
[20]	train-rmse:3.77798e+06
[40]	train-rmse:2.6509e+06
[60]	train-rmse:2.41604e+06
[80]	train-rmse:2.35392e+06
[100]	train-rmse:2.31773e+06
[120]	train-rmse:2.28764e+06
[140]	train-rmse:2.26144e+06
[160]	train-rmse:2.236e+06
[180]	train-rmse:2.21765e+06
[200]	train-rmse:2.1995e+06
[220]	train-rmse:2.18339e+06
[240]	train-rmse:2.16741e+06
[260]	train-rmse:2.15331e+06
[280]	train-rmse:2.13959e+06
[300]	train-rmse:2.12722e+06
[320]	train-rmse:2.11218e+06
[340]	train-rmse:2.09904e+06
[360]	train-rmse:2.08584e+06
[380]	train-rmse:2.0745e+06
[399]	train-rmse:2.06176e+06


In [30]:
# Raw test file; its `id` column labels the submission rows.
df_test = pd.read_csv('../input/test.csv')
id_test = df_test['id']

In [64]:
# Submission frame: one row per test id with the meta-model's prediction.
stacking_output = pd.DataFrame({'id': id_test, 'price_doc': y_predict })

In [65]:
# Write the submission file without the row index.
stacking_output.to_csv('../stacking.csv', index=False)