In [203]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import numpy as np
import time

In [204]:
train_df = pd.read_csv('data/train.csv')

In [205]:
#train_df.info()
#train_df[['MSSubClass', 'MSZoning', 'LotArea', 'Utilities', 'Neighborhood',
#'Condition1', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'Exterior1st',
#'ExterCond', 'Foundation', 'HeatingQC', 'Electrical', '1stFlrSF', 'Functional',
#'GarageQual', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal']].isnull().sum()

In [206]:
X = train_df[[
    'MSSubClass', 'MSZoning', 'LotArea', 'Utilities', 'Neighborhood',
    'Condition1', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'Exterior1st',
    'ExterCond', 'Foundation', 'HeatingQC', 'Electrical', '1stFlrSF', 'Functional',
    'GarageQual', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'
]].copy()

imputer = SimpleImputer(strategy='constant', fill_value='NA')
X[[
    'Electrical', 'GarageQual', 'PoolQC', 'Fence', 'MiscFeature'
]] = imputer.fit_transform(X[[
    'Electrical', 'GarageQual', 'PoolQC', 'Fence', 'MiscFeature'
]])

X['MSSubClass'] = X['MSSubClass'].astype(str)
X = pd.get_dummies(X, drop_first=True)

print("X table size:", X.shape)
print("NA:", X.isnull().sum().sum())

X table size: (1460, 111)
NA: 0


In [207]:
Y = train_df[['SalePrice']].copy()

In [208]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [209]:
housePriceModel = RandomForestRegressor(
    n_estimators=5000,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

start = time.time()
housePriceModel.fit(X_train, Y_train)
end = time.time()

print(f"\nTraining finished in: {end - start:.3f} seconds")

predictions = housePriceModel.predict(X_test)

mae = mean_absolute_error(Y_test, predictions)

print(f"MedinAverage error {mae:,.2f} dollars") 

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': housePriceModel.feature_importances_
})
print("\nImportance of features:")
print(feature_importance.sort_values('importance', ascending=False).head(10))

  return fit_method(estimator, *args, **kwargs)



Training finished in: 4.387 seconds
MedinAverage error 21,624.77 dollars

Importance of features:
                 feature  importance
1            OverallQual    0.195137
4               1stFlrSF    0.157645
0                LotArea    0.099688
3           YearRemodAdd    0.088516
76      Foundation_PConc    0.051704
14         MSSubClass_60    0.040036
40  Neighborhood_NridgHt    0.036746
39  Neighborhood_NoRidge    0.022624
75     Foundation_CBlock    0.020812
2            OverallCond    0.018858


In [210]:
print("Model mistaken by:",round((mae / np.mean(Y_train)) * 100, 2), "%")
#print("Model mistaken by:",((mae / np.mean(Y_train)) * 100), "%")

Model mistaken by: 11.92 %
