In [None]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import numpy as np
import time
import json
import os

In [None]:
# Work out the directory to resolve data paths against. `__file__` is not
# defined when this runs inside a notebook kernel, in which case the current
# working directory is used instead.
try:
    script_location = __file__
except NameError:
    current_dir = os.getcwd()
else:
    current_dir = os.path.dirname(os.path.abspath(script_location))

# Look for the training CSV one level up first, then next to this code.
parent_candidate = os.path.join(current_dir, '..', 'data', 'train.csv')
local_candidate = os.path.join(current_dir, 'data', 'train.csv')
data_path = parent_candidate if os.path.exists(parent_candidate) else local_candidate

train_df = pd.read_csv(data_path)

In [None]:
#train_df.info()
#train_df[['MSSubClass', 'MSZoning', 'LotArea', 'Utilities', 'Neighborhood',
#'Condition1', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'Exterior1st',
#'ExterCond', 'Foundation', 'HeatingQC', 'Electrical', '1stFlrSF', 'Functional',
#'GarageQual', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'SaleCondition',
#'YrSold']].isnull().sum()

In [None]:
# Predictor columns taken from the raw training frame.
# 'Utilities', '1stFlrSF', 'SaleCondition' and 'YrSold' are intentionally
# left out of the selection.
feature_columns = [
    'MSSubClass',
    'MSZoning',
    'LotArea',
    'Neighborhood',
    'Condition1',
    'OverallQual',
    'OverallCond',
    'YearRemodAdd',
    'Exterior1st',
    'ExterCond',
    'Foundation',
    'HeatingQC',
    'Electrical',
    'GrLivArea',
    'Functional',
    'GarageQual',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'MiscVal',
]
X = train_df[feature_columns].copy()

# Columns whose missing entries are replaced by the literal string 'NA'.
na_filled_columns = ['Electrical', 'GarageQual', 'PoolQC', 'Fence', 'MiscFeature']
imputer = SimpleImputer(strategy='constant', fill_value='NA')
X[na_filled_columns] = imputer.fit_transform(X[na_filled_columns])

# Cast MSSubClass to string so get_dummies encodes it as a category instead
# of leaving it as a number, then one-hot encode every non-numeric column.
X['MSSubClass'] = X['MSSubClass'].astype(str)
X = pd.get_dummies(X, drop_first=True)

# Quick sanity report: encoded shape and the count of remaining nulls.
print("X table size:", X.shape)
print("NA:", X.isnull().sum().sum())

In [None]:
# Regression target, kept as a one-column DataFrame copied from the raw frame.
Y = train_df.loc[:, ['SalePrice']].copy()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Gradient-boosted tree regressor: a large number of depth-5 trees combined
# with a very small learning rate; each tree sees 80% of the rows and 60% of
# the columns.
housePriceModelXGB = XGBRegressor(
    n_estimators=26000,
    learning_rate=0.001,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.6,
    n_jobs=-1,
    random_state=42,
    # NOTE(review): presumably requires a CUDA-enabled XGBoost build and a
    # visible GPU - confirm before running on a CPU-only machine.
    device='cuda',
    #early_stopping_rounds=100
)

# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() can.
start = time.perf_counter()
housePriceModelXGB.fit(X_train, Y_train)
end = time.perf_counter()

print(f"\nTraining finished in: {end - start:.3f} seconds")

# Score the model on the held-out rows.
predictions = housePriceModelXGB.predict(X_test)

mae = mean_absolute_error(Y_test, predictions)

# The reported value is the mean absolute error, so label it as such.
print(f"Mean absolute error: {mae:,.2f} dollars")

# One importance value per encoded column.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': housePriceModelXGB.feature_importances_
})

# get_dummies names encoded columns '<column>_<value>'; the text before the
# first underscore recovers the source column, so importances can be summed
# per original feature. Numeric columns have no underscore and map to
# themselves.
feature_importance['base_feature'] = (
    feature_importance['feature'].str.split('_', n=1).str[0]
)

aggregated_importance = (
    feature_importance
    .groupby('base_feature')['importance']
    .sum()
    .sort_values(ascending=False)
)

print("\nImportance:")
print(aggregated_importance)

In [None]:
# Express the MAE as a percentage of the mean training-set sale price.
# This is a normalised MAE, not a per-row mean absolute percentage error.
#
# Y_train is flattened to a plain float array before averaging: np.mean() on
# a DataFrame defers to DataFrame.mean(), which returns a Series or a scalar
# depending on the pandas version, and a Series-valued `mape` cannot be
# written out with json.dump() afterwards.
mean_train_price = float(np.asarray(Y_train, dtype=float).mean())
mape = round(float(mae) / mean_train_price * 100, 2)
print("Model mistaken by:", mape, "%")

In [None]:
# Result record that is persisted as JSON.
resultData = {
    "modelName": "XGBoost",
    # Reduce to a builtin float so the value is always JSON-serialisable,
    # even if `mape` arrives as a numpy scalar or a one-element pandas object.
    "mape": float(np.ravel(mape)[0]),
}

# `__file__` is undefined inside a notebook kernel; fall back to the current
# working directory in that case.
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()

# Pick the output folder by testing for the DIRECTORY, not the result file:
# on a first run the file does not exist yet, and testing the file would
# wrongly discard a perfectly valid '../collectedData' folder.
collectedDataDir = os.path.join(current_dir, '..', 'collectedData')
if not os.path.isdir(collectedDataDir):
    collectedDataDir = os.path.join(current_dir, 'collectedData')
    # Create the fallback folder so the write below cannot fail on a
    # missing directory.
    os.makedirs(collectedDataDir, exist_ok=True)

collectedDataPath = os.path.join(collectedDataDir, 'resultsXGB.json')

with open(collectedDataPath, 'w', encoding='utf-8') as f:
    json.dump(resultData, f)