In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import numpy as np
import time

In [423]:
train_df = pd.read_csv('data/train.csv')

In [424]:
#train_df.info()
#train_df[['MSSubClass', 'MSZoning', 'LotArea', 'Utilities', 'Neighborhood',
#'Condition1', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'Exterior1st',
#'ExterCond', 'Foundation', 'HeatingQC', 'Electrical', '1stFlrSF', 'Functional',
#'GarageQual', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'SaleCondition',
#'YrSold']].isnull().sum()

In [425]:
X = train_df[[
    'MSSubClass',
    'MSZoning',
    'LotArea', 
    #'Utilities', 
    'Neighborhood',
    'Condition1', 
    'OverallQual', 
    'OverallCond', 
    'YearRemodAdd', 
    'Exterior1st',
    'ExterCond', 
    'Foundation', 
    'HeatingQC', 
    'Electrical', 
    #'1stFlrSF',
    'GrLivArea',
    'Functional',
    'GarageQual', 
    'PoolQC', 
    'Fence', 
    'MiscFeature', 
    'MiscVal'
    #'SaleCondition'
    #'YrSold'
]].copy()

imputer = SimpleImputer(strategy='constant', fill_value='NA')
X[[
    'Electrical', 
    'GarageQual', 
    'PoolQC', 
    'Fence', 
    'MiscFeature'
]] = imputer.fit_transform(X[[
    'Electrical', 
    'GarageQual', 
    'PoolQC', 
    'Fence', 
    'MiscFeature'
]])

X['MSSubClass'] = X['MSSubClass'].astype(str)
X = pd.get_dummies(X, drop_first=True)

print("X table size:", X.shape)
print("NA:", X.isnull().sum().sum())

X table size: (1460, 110)
NA: 0


In [426]:
Y = train_df[['SalePrice']].copy()

In [427]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [428]:
housePriceModel = RandomForestRegressor(
    n_estimators=5500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

start = time.time()
housePriceModel.fit(X_train, Y_train)
end = time.time()

print(f"\nTraining finished in: {end - start:.3f} seconds")

predictions = housePriceModel.predict(X_test)

mae = mean_absolute_error(Y_test, predictions)

print(f"MedinAverage error {mae:,.2f} dollars") 

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': housePriceModel.feature_importances_
})

feature_importance['base_feature'] = feature_importance['feature'].apply(lambda x: x.split('_')[0])

aggregated_importance = feature_importance.groupby('base_feature')['importance'].sum().sort_values(ascending=False)

print("\nImportance:")
print(aggregated_importance)

  return fit_method(estimator, *args, **kwargs)



Training finished in: 5.020 seconds
MedinAverage error 20,628.55 dollars

Importance:
base_feature
OverallQual     0.196600
GrLivArea       0.182282
Neighborhood    0.115569
LotArea         0.096518
YearRemodAdd    0.089928
Foundation      0.071727
MSSubClass      0.057885
Exterior1st     0.045428
MSZoning        0.024404
HeatingQC       0.021536
GarageQual      0.020632
OverallCond     0.018429
Fence           0.012807
Condition1      0.010965
ExterCond       0.010187
Functional      0.008139
PoolQC          0.007423
Electrical      0.006658
MiscFeature     0.001687
MiscVal         0.001195
Name: importance, dtype: float64


In [429]:
print("Model mistaken by:",round((mae / np.mean(Y_train)) * 100, 2), "%")
#print("Model mistaken by:",((mae / np.mean(Y_train)) * 100), "%")

Model mistaken by: 11.37 %
