In [33]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import numpy as np
import time

In [34]:
# Load the training table from a path relative to the notebook's working directory.
train_csv_path = 'data/train.csv'
train_df = pd.read_csv(train_csv_path)

In [35]:
#train_df.info()
#train_df[['MSSubClass', 'MSZoning', 'LotArea', 'Utilities', 'Neighborhood',
#'Condition1', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'Exterior1st',
#'ExterCond', 'Foundation', 'HeatingQC', 'Electrical', '1stFlrSF', 'Functional',
#'GarageQual', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'SaleCondition',
#'YrSold']].isnull().sum()

In [36]:
# Columns taken from the raw training table as model inputs.
# Candidates currently excluded (commented out in earlier iterations):
# 'Utilities', '1stFlrSF', 'SaleCondition', 'YrSold'.
feature_columns = [
    'MSSubClass',
    'MSZoning',
    'LotArea',
    'Neighborhood',
    'Condition1',
    'OverallQual',
    'OverallCond',
    'YearRemodAdd',
    'Exterior1st',
    'ExterCond',
    'Foundation',
    'HeatingQC',
    'Electrical',
    'GrLivArea',
    'Functional',
    'GarageQual',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'MiscVal',
]
X = train_df[feature_columns].copy()

# Columns whose missing entries are replaced by the literal string 'NA',
# so that "missing" becomes its own category when one-hot encoded below.
na_category_columns = [
    'Electrical',
    'GarageQual',
    'PoolQC',
    'Fence',
    'MiscFeature',
]
imputer = SimpleImputer(strategy='constant', fill_value='NA')
X[na_category_columns] = imputer.fit_transform(X[na_category_columns])

# Cast MSSubClass to str so that get_dummies one-hot encodes it together with
# the other string columns (presumably it is stored as a numeric code — TODO confirm).
X = pd.get_dummies(X.astype({'MSSubClass': str}), drop_first=True)

# Sanity check: resulting shape and the total count of remaining missing values.
print("X table size:", X.shape)
print("NA:", X.isnull().sum().sum())

X table size: (1460, 110)
NA: 0


In [37]:
# Target: SalePrice, kept as a one-column DataFrame (not a Series).
Y = train_df.loc[:, ['SalePrice']].copy()

In [38]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Hyperparameters for the gradient-boosted regressor, gathered in one mapping.
xgb_params = dict(
    n_estimators=2000,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
    # NOTE(review): requests a CUDA device — confirm one is available wherever this runs.
    device='cuda',
)
housePriceModelXGB = XGBRegressor(**xgb_params)

# Wall-clock timing of the fit call only.
start = time.time()
housePriceModelXGB.fit(X_train, Y_train)
end = time.time()
print(f"\nTraining finished in: {end - start:.3f} seconds")

# Evaluate on the held-out split with mean absolute error.
predictions = housePriceModelXGB.predict(X_test)
mae = mean_absolute_error(Y_test, predictions)
# NOTE(review): the label below reads "MedinAverage", but the value printed is the
# mean absolute error; the text is kept as-is so it matches the recorded output.
print(f"MedinAverage error {mae:,.2f} dollars")

# One importance value per column of X (i.e. per one-hot column).
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': housePriceModelXGB.feature_importances_,
})

# get_dummies names encoded columns '<base>_<category>', so the text before the
# first underscore identifies the originating column. This assumes no base
# column name itself contains an underscore.
feature_importance['base_feature'] = [
    column_name.split('_')[0] for column_name in feature_importance['feature']
]

# Sum the per-column importances back to one value per original feature.
aggregated_importance = (
    feature_importance
    .groupby('base_feature')['importance']
    .sum()
    .sort_values(ascending=False)
)

print("\nImportance:")
print(aggregated_importance)


Training finished in: 1.820 seconds
MedinAverage error 17,691.62 dollars

Importance:
base_feature
Neighborhood    0.192530
OverallQual     0.181827
PoolQC          0.098534
Functional      0.077514
GarageQual      0.070967
Foundation      0.068215
MSSubClass      0.065008
MSZoning        0.064726
Exterior1st     0.059044
Condition1      0.042334
GrLivArea       0.021947
ExterCond       0.010707
YearRemodAdd    0.008395
Fence           0.007648
MiscFeature     0.007018
OverallCond     0.006377
HeatingQC       0.005819
Electrical      0.004962
LotArea         0.004912
MiscVal         0.001516
Name: importance, dtype: float32


In [40]:
# Express the MAE as a percentage of the mean sale price in the training split.
#
# Y_train is a one-column DataFrame. np.mean() on a DataFrame delegates to
# DataFrame.mean(axis=None), which yields a per-column Series on older pandas
# and a scalar only on newer releases. Selecting the column explicitly gives a
# scalar on every version, so the printed line keeps the same shape.
#
# NOTE(review): `mae` is defined in the training cell; run that cell first so
# this percentage corresponds to the MAE printed there.
mean_train_price = Y_train['SalePrice'].mean()
print("Model mistaken by:", round((mae / mean_train_price) * 100, 2), "%")
#print("Model mistaken by:",((mae / np.mean(Y_train)) * 100), "%")

Model mistaken by: 10.87 %
