In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import numpy as np
import time
import json
import os

In [None]:
# Load the training set. The primary location is relative to the current
# working directory; if it is not there, fall back to a path two levels up.
# rootCheck records which location succeeded and is read again when the
# results are reported at the end of the notebook.
try:
    trainDf = pd.read_csv('data/train.csv')
except FileNotFoundError:
    # Primary path missing: use the fallback location.
    trainDf = pd.read_csv('../../data/train.csv')
    rootCheck = False
else:
    rootCheck = True

In [None]:
#trainDf.info()
#trainDf[['MSSubClass', 'MSZoning', 'LotArea', 'Utilities', 'Neighborhood',
#'Condition1', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'Exterior1st',
#'ExterCond', 'Foundation', 'HeatingQC', 'Electrical', '1stFlrSF', 'Functional',
#'GarageQual', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'SaleCondition',
#'YrSold']].isnull().sum()

In [None]:
# Columns taken from the training frame as model inputs.
# Candidates currently left out: 'Utilities', '1stFlrSF', 'SaleCondition',
# 'YrSold'.
feature_cols = [
    'MSSubClass',
    'MSZoning',
    'LotArea',
    'Neighborhood',
    'Condition1',
    'OverallQual',
    'OverallCond',
    'YearRemodAdd',
    'Exterior1st',
    'ExterCond',
    'Foundation',
    'HeatingQC',
    'Electrical',
    'GrLivArea',
    'Functional',
    'GarageQual',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'MiscVal',
]
X = trainDf[feature_cols].copy()

# Columns whose missing values are replaced by the literal string 'NA', so a
# missing entry becomes its own category when the frame is one-hot encoded.
na_fill_cols = ['Electrical', 'GarageQual', 'PoolQC', 'Fence', 'MiscFeature']
imputer = SimpleImputer(strategy='constant', fill_value='NA')
X[na_fill_cols] = imputer.fit_transform(X[na_fill_cols])

# MSSubClass is cast to string so get_dummies encodes it like the other
# categorical columns (presumably it is a class code, not a quantity - confirm).
# drop_first=True drops one dummy level per encoded column.
X = pd.get_dummies(X.astype({'MSSubClass': str}), drop_first=True)

# Quick sanity output: encoded shape and total count of remaining nulls.
print("X table size:", X.shape)
print("NA:", X.isnull().sum().sum())

In [None]:
# Regression target, kept as an independent single-column DataFrame.
Y = trainDf.loc[:, ['SalePrice']].copy()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest regressor: unlimited depth, sqrt(n_features) candidates per
# split, bootstrapped samples, all CPU cores, fixed seed for reproducibility.
housePriceModelRF = RandomForestRegressor(
    n_estimators=5500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

# fit() expects a 1-D target. Y_train is a single-column frame, so flatten it
# instead of relying on sklearn's implicit conversion (which emits a
# DataConversionWarning for column-vector targets).
y_train_1d = np.ravel(Y_train)

# perf_counter is a monotonic clock intended for measuring elapsed intervals.
start = time.perf_counter()
housePriceModelRF.fit(X_train, y_train_1d)
end = time.perf_counter()

print(f"\nTraining finished in: {end - start:.3f} seconds")

# Evaluate on the held-out rows.
predictions = housePriceModelRF.predict(X_test)
mae = mean_absolute_error(Y_test, predictions)

print(f"Mean absolute error {mae:,.2f} dollars")

# Per-column importances, labelled with the columns the model was fitted on.
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': housePriceModelRF.feature_importances_
})

# Dummy columns are named '<original>_<level>'; the text before the first
# underscore is used to group dummies back under their source feature.
feature_importance['base_feature'] = (
    feature_importance['feature'].str.split('_', n=1).str[0]
)

aggregated_importance = (
    feature_importance
    .groupby('base_feature')['importance']
    .sum()
    .sort_values(ascending=False)
)

# Reduce the target to a plain float explicitly: depending on the pandas
# version, np.mean() applied directly to a DataFrame can return a per-column
# Series instead of a scalar, which would make `mape` non-scalar and not
# JSON-serialisable.
mean_train_price = float(np.mean(np.asarray(Y_train)))

# NOTE(review): despite the name, this is the MAE expressed as a percentage of
# the mean training price, not the textbook mean absolute percentage error.
# The formula is kept unchanged here - confirm whether a true MAPE is intended.
mape = round(float(mae) / mean_train_price * 100, 2)

In [None]:
# Report the evaluation metrics. rootCheck is True when 'data/train.csv' was
# found relative to the working directory; in that case the metrics are
# written to a JSON file, otherwise a debug summary is printed instead.
if rootCheck:
    resultData = {
        "modelName": "RandomForest",
        "mae": int(mae),  # truncated to whole dollars
        # Coerce to a built-in float so json.dump always receives a plain
        # scalar, even if `mape` arrives as a NumPy scalar or a one-element
        # array-like.
        "mape": float(np.ravel(mape)[0]),
    }

    resultsDir = 'modelsInfo/collectedData'
    os.makedirs(resultsDir, exist_ok=True)

    file_path = os.path.join(resultsDir, 'resultsRF.json')
    # Explicit encoding keeps the output independent of the platform default.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(resultData, f)
else:  # Debug
    print(f"\nTraining finished in: {end - start:.3f} seconds")
    print(f"\nMean absolute error {mae:,.2f} dollars")
    print("\nModel mistaken by:", mape, "%")

    print("\nImportance:")
    print(aggregated_importance)