In [16]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,cross_val_predict, KFold
from sklearn.ensemble import RandomForestRegressor

In [31]:
# Read the train and test CSVs, using the `Id` column as the row index.
train = pd.read_csv(
    "datasets/train.csv",
    index_col="Id",
)
test = pd.read_csv(
    "datasets/test.csv",
    index_col="Id",
)

In [12]:
# Sanity check: (rows, columns) of the raw training frame, target column included.
train.shape

(1460, 80)

In [17]:
# Split the training frame into predictors (everything except `SalePrice`)
# and the target; the target is copied so it is independent of `train`.
X = train.drop(columns='SalePrice')
y = train['SalePrice'].copy()

In [18]:
# Confirm the split: X should have one column fewer than `train`, y is 1-D.
X.shape, y.shape

((1460, 79), (1460,))

In [19]:
def rmsle(estimator, X, y):
    """Root mean squared logarithmic error of ``estimator`` on ``(X, y)``.

    Both the true targets and the predictions are passed through ``log1p``
    before the RMSE is taken, so ``y`` is expected on its original scale.

    Parameters
    ----------
    estimator : fitted object exposing ``predict(X)``.
    X : feature matrix handed to ``estimator.predict``.
    y : true target values.

    Returns
    -------
    float
        The error as a non-negative number (lower is better).

    NOTE(review): the value is NOT negated; tools that maximise a scorer
    (e.g. hyper-parameter search) would pick the worst model with it.
    """
    predicted = estimator.predict(X)
    log_true = np.log1p(y)
    log_pred = np.log1p(predicted)
    mse = mean_squared_error(log_true, log_pred)
    return np.sqrt(mse)

def rmsle_log_y(estimator, X, y):
    """Plain RMSE between ``y`` and the estimator's predictions.

    No transform is applied here; going by the name, this is presumably
    meant for models trained on an already log-transformed target, in
    which case the plain RMSE equals the RMSLE -- verify against the caller.

    Parameters
    ----------
    estimator : fitted object exposing ``predict(X)``.
    X : feature matrix handed to ``estimator.predict``.
    y : true target values, on the same scale the estimator predicts.

    Returns
    -------
    float
        The error as a non-negative number (lower is better).
    """
    predicted = estimator.predict(X)
    mse = mean_squared_error(y, predicted)
    return np.sqrt(mse)

def rmsle_sqrt_y(estimator, X, y):
    """RMSLE computed after squaring both targets and predictions.

    ``y`` and the predictions are each raised to the power 2 and then
    compared on the ``log1p`` scale. Presumably intended for models trained
    on a square-root-transformed target -- verify against the caller.

    Parameters
    ----------
    estimator : fitted object exposing ``predict(X)``.
    X : feature matrix handed to ``estimator.predict``.
    y : true target values on the scale the estimator predicts.

    Returns
    -------
    float
        The error as a non-negative number (lower is better).
    """
    # Undo the (assumed) square-root transform on both sides.
    true_squared = np.power(y, 2)
    pred_squared = np.power(estimator.predict(X), 2)
    mse = mean_squared_error(np.log1p(true_squared), np.log1p(pred_squared))
    return np.sqrt(mse)

# Shared 5-fold splitter; shuffling with a fixed seed gives the same folds
# on every run, so scores from different models are comparable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

In [22]:
# Baseline feature set: numeric columns only, with missing values
# replaced by the sentinel -1.
X1 = (
    X
    .select_dtypes(include=[np.number])
    .fillna(-1)
)

In [24]:
# Report the size of the numeric-only feature matrix.
print("Dims", X1.shape)

Dims (1460, 36)


In [25]:
# Baseline: a 1000-tree random forest (seeded), scored with the custom
# `rmsle` scorer on the shared `kf` folds; the per-fold scores are averaged.
model = RandomForestRegressor(
    n_estimators=1000,
    random_state=0,
)
error = np.mean(cross_val_score(model, X1, y, scoring=rmsle, cv=kf))
print('RMSLE:', error)

RMSLE: 0.14582352617986846


In [26]:
# cross_val_score fits clones, leaving `model` itself unfitted; train it on
# the full training set before predicting.
model.fit(X1, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [32]:
# Build the test feature matrix with exactly the columns the model was
# trained on, in the same order. Selecting by dtype independently on `test`
# gives no such guarantee: a column whose inferred dtype differs between the
# two CSVs would shift or swap features, and the model consumes them
# positionally. Indexing with `X1.columns` raises KeyError if a training
# column is missing instead of silently predicting on the wrong features.
# Missing values get the same -1 sentinel used for training.
# NOTE: despite its name, `y_test` holds test *features*, not targets; the
# name is kept because later cells reference it.
y_test = test[X1.columns].fillna(-1)

In [33]:
# Train and test feature matrices must have the same number of columns.
X1.shape, y_test.shape

((1460, 36), (1459, 36))

In [34]:
# Predict with the fitted forest on the test features
# (`y_test` is the test feature matrix, not a target vector).
price = model.predict(y_test)

In [41]:
# Wrap the predictions in a Series indexed by the test set's index and named
# 'SalePrice', then write it out as CSV with a header row.
tree_sub = pd.Series(
    data=price,
    index=test.index,
    name='SalePrice',
)
tree_sub.to_csv("sample_submission.csv", header=True)

In [42]:
# Peek at the first lines of the written file (shell escape; needs a `head` binary).
! head sample_submission.csv

Id,SalePrice
1461,127833.632
1462,156178.575
1463,182478.937
1464,182777.685
1465,199098.534
1466,183509.66
1467,169790.983
1468,175690.897
1469,182564.192
