In [24]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,cross_val_predict, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from itertools import product
from sklearn.linear_model import Ridge
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("datasets/train.csv", index_col='Id')
test = pd.read_csv("datasets/test.csv", index_col='Id')

In [3]:
train.shape

(1460, 80)

In [4]:
X, y = train.drop('SalePrice', axis=1), train.SalePrice.copy()

In [5]:
X.shape, y.shape

((1460, 79), (1460,))

In [6]:
def rmsle(estimator, X, y):
    p = estimator.predict(X)
    return np.sqrt(mean_squared_error(np.log1p(y), np.log1p(p)))

def rmsle_log_y(estimator, X, y):
    p = estimator.predict(X)
    return np.sqrt(mean_squared_error(y, p))

def rmsle_sqrt_y(estimator, X, y):
    p = estimator.predict(X)
    y = np.power(y, 2)
    p = np.power(p, 2)
    return np.sqrt(mean_squared_error(np.log1p(y), np.log1p(p)))

kf = KFold(n_splits=5, shuffle=True, random_state=1)

In [7]:
X1 = X.select_dtypes(include=[np.number]).fillna(-1)

In [8]:
print("Dims", X1.shape)

Dims (1460, 36)


In [9]:
model = RandomForestRegressor(n_estimators=1000, random_state=0)
error = cross_val_score(model, X1, y, cv=kf, scoring=rmsle).mean()
print('RMSLE:', error)

RMSLE: 0.14582352617986846


In [10]:
X2 = X.copy()

for col in X2.columns:
    if X2[col].dtype == object:
        enc = LabelEncoder()
        X2[col] = enc.fit_transform(X[col].fillna('Missing'))
print('Dims', X2.shape)

X2.fillna(-1, inplace=True)
model = RandomForestRegressor(n_estimators=1000, random_state=0)
error = cross_val_score(model, X2, y, cv=kf, scoring=rmsle).mean()

print('RMSLE:', error)

Dims (1460, 79)
RMSLE: 0.14383736485915208


In [14]:
model = RandomForestRegressor(n_estimators=1000, random_state=0)
error = cross_val_score(model, X1, np.log1p(y), cv=kf, scoring=rmsle_log_y).mean()
print('RF, X1, log-target RMSLE:', error)

model = RandomForestRegressor(n_estimators=1000, random_state=0)
error = cross_val_score(model, X2, np.log1p(y), cv=kf, scoring=rmsle_log_y).mean()
print('RF, X2, log-target RMSLE:', error)

RF, X1, log-target RMSLE: 0.14516215148522235
RF, X2, log-target RMSLE: 0.14213588563678017


In [17]:
model = RandomForestRegressor(n_estimators=1000, random_state=0)
error = cross_val_score(model, X1, np.sqrt(y), cv=kf,scoring=rmsle_sqrt_y).mean()
print('RF, X1, sqrt-target RMSLE:', error)

model = RandomForestRegressor(n_estimators=1000, random_state=0)
error = cross_val_score(model, X2, np.sqrt(y), cv=kf, scoring=rmsle_sqrt_y).mean()
print('RF, X2, sqrt-target RMSLE:', error)

RF, X1, sqrt-target RMSLE: 0.14565293448427202
RF, X2, sqrt-target RMSLE: 0.14300460013198157


In [19]:
model = GradientBoostingRegressor(random_state=0)
error = cross_val_score(model, X1, np.log1p(y), cv=kf, scoring=rmsle_log_y).mean()
print('GBM, X1, log-target RMSLE:', error)

model = GradientBoostingRegressor(random_state=0)
error = cross_val_score(model, X2, np.log1p(y), cv=kf, scoring=rmsle_log_y).mean()
print('GBM, X2, log-target RMSLE:', error)

GBM, X1, log-target RMSLE: 0.1334924549135666
GBM, X2, log-target RMSLE: 0.12980689048155078


In [20]:
model = GradientBoostingRegressor(random_state=0)
error = cross_val_score(model, X1, np.sqrt(y), cv=kf, scoring=rmsle_sqrt_y).mean()
print('GBM, X1, sqrt-target RMSLE:', error)

model = GradientBoostingRegressor(random_state=0)
error = cross_val_score(model, X2, np.sqrt(y), cv=kf, scoring=rmsle_sqrt_y).mean()
print('GBM, X2, sqrt-target RMSLE:', error)

GBM, X1, sqrt-target RMSLE: 0.13425897281342522
GBM, X2, sqrt-target RMSLE: 0.1309192356821107


In [22]:
kf_out = KFold(n_splits=5, shuffle=True, random_state=1)
kf_in = KFold(n_splits=5, shuffle=True, random_state=2)

In [35]:
cv_mean = []
for fold, (tr, ts) in enumerate(kf_out.split(X, y)):
    X1_train, X1_test= X1.iloc[tr], X1.iloc[ts]
    X2_train, X2_test= X2.iloc[tr], X2.iloc[ts]
    y_train, y_test = y.iloc[tr], y.iloc[ts]

    modelos = [GradientBoostingRegressor(random_state=0),RandomForestRegressor(random_state=0)]
    targets = [np.log1p, np.sqrt]
    feature_sets = [(X1_train, X1_test), (X2_train, X2_test)]
    predictions_cv = []
    predictions_test = []
    
    for model, target, feature_set in product(modelos, targets, feature_sets):
        predictions_cv.append(cross_val_predict(model, feature_set[0], target(y_train), cv=kf_in).reshape(-1,1))
        model.fit(feature_set[0], target(y_train))
        ptest = model.predict(feature_set[1])
        predictions_test.append(ptest.reshape(-1,1))

    predictions_cv = np.concatenate(predictions_cv, axis=1)
    predictions_test = np.concatenate(predictions_test, axis=1)
    stacker = Ridge()
    stacker.fit(predictions_cv, np.log1p(y_train))
    
    
    
    error = rmsle_log_y(stacker, predictions_test, np.log1p(y_test))
    cv_mean.append(error)
    print('RMSLE Fold %d - RMSLE %.4f' % (fold, error))
    
print('RMSLE CV5 %.4f' % np.mean(cv_mean))

RMSLE Fold 0 - RMSLE 0.1250
RMSLE Fold 1 - RMSLE 0.1446
RMSLE Fold 2 - RMSLE 0.1253
RMSLE Fold 3 - RMSLE 0.1387
RMSLE Fold 4 - RMSLE 0.1086
RMSLE CV5 0.1284


In [26]:
model.fit(X1,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [32]:
y_test = test.select_dtypes(include=[np.number]).fillna(-1)

In [33]:
X1.shape, y_test.shape

((1460, 36), (1459, 36))

In [37]:
price = model.predict(y)

ValueError: Expected 2D array, got 1D array instead:
array=[208500. 181500. 223500. ... 266500. 142125. 147500.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [41]:
tree_sub = pd.Series(price, index=test.index, name='SalePrice')
tree_sub.to_csv("sample_submission.csv", header=True)

In [42]:
! head sample_submission.csv

Id,SalePrice
1461,127833.632
1462,156178.575
1463,182478.937
1464,182777.685
1465,199098.534
1466,183509.66
1467,169790.983
1468,175690.897
1469,182564.192
