In [1]:
import numpy as np 
import pandas as pd 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

In [3]:
df = pd.read_csv('train.csv')
X_test = pd.read_csv('test.csv')
X_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [4]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
X = df.drop('SalePrice', axis = 1).copy()
y = df['SalePrice'].copy()
len(X) == len(y)

True

In [6]:
y.isna().sum()

0

In [7]:
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = list(set(X.columns) - set(categorical_cols))

In [8]:
X_encoded = pd.get_dummies(X, columns = categorical_cols)
df_test_encoded = pd.get_dummies(X_test, columns=categorical_cols)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, random_state=42, test_size=0.25)
len(X_train), len(y_train), len(X_val), len(y_val)

(1095, 1095, 365, 365)

In [10]:
X_train, X_val = X_train.align(X_val, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [11]:
model = XGBRegressor(
    objective = 'reg:squarederror',
    n_estimators = 400,
    learning_rate = 0.1,
    early_stopping_rounds = 5,
    eval_metric = 'mae',
    random_state=42,
    max_depth=5
)
model.fit(
    X_train,
    y_train,
    verbose=True,
    eval_set = [(X_val, y_val)]
)

[0]	validation_0-mae:55039.82380
[1]	validation_0-mae:50452.13476
[2]	validation_0-mae:46466.35527
[3]	validation_0-mae:42804.33613
[4]	validation_0-mae:39518.64495
[5]	validation_0-mae:36735.37252
[6]	validation_0-mae:34248.29565
[7]	validation_0-mae:32136.53286
[8]	validation_0-mae:30302.56190
[9]	validation_0-mae:28727.93290
[10]	validation_0-mae:27202.18983
[11]	validation_0-mae:25819.08198
[12]	validation_0-mae:24734.14334
[13]	validation_0-mae:23852.12245
[14]	validation_0-mae:23058.10788
[15]	validation_0-mae:22297.28643
[16]	validation_0-mae:21637.44349
[17]	validation_0-mae:21079.19756
[18]	validation_0-mae:20571.81597
[19]	validation_0-mae:20034.02866
[20]	validation_0-mae:19559.97789
[21]	validation_0-mae:19230.65058
[22]	validation_0-mae:18942.15195
[23]	validation_0-mae:18648.63191
[24]	validation_0-mae:18465.04341
[25]	validation_0-mae:18269.17451
[26]	validation_0-mae:18092.67840
[27]	validation_0-mae:17959.84668
[28]	validation_0-mae:17813.91800
[29]	validation_0-mae:17

In [12]:
def eval_model(my_model, val_X, val_y):
    y_preds = model.predict(val_X)
    print(mean_absolute_error(val_y, y_preds))

In [13]:
def save_model_prediction(my_model): 
    y_predicted = my_model.predict(X_test)
    my_submission = pd.DataFrame(
        {
            'Id' : X_test['Id'],
            'SalePrice' : y_predicted
        }
    )
    my_submission.to_csv("Final Submission.csv", index = False)

In [14]:
grid = {
    'learning_rate' : [0.1, 0.05, 0.01],
    'max_depth' : [3, 4, 5, 10],
    'gamma' : [0, 0.25, 1],#encourages pruing
    'reg_lambda' : [0, 1, 10] #penalty added to prevent overfitting
}

gs_model = GridSearchCV(
    estimator = XGBRegressor(objective='reg:squarederror',
                             early_stopping_rounds=10,
                             random_state=42,
                             subsample=0.9,
                             colsample_bytree=0.5),
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    verbose=1,
    n_jobs=10,
    cv=3,
)


In [15]:
# gs_model.fit(
#     X_train,
#     y_train, 
#     eval_set=[(X_val, y_val)]
# )

In [16]:
# gs_model.best_params_

In [17]:
model = XGBRegressor(
    objective = 'reg:squarederror',
    n_estimators = 400,
    learning_rate = 0.1,
    early_stopping_rounds = 5,
    eval_metric = 'mae',
    random_state=42,
    max_depth=4,
    reg_lambda=1,
    gamma=0
)
model.fit(
    X_train,
    y_train,
    verbose=True,
    eval_set = [(X_val, y_val)]
)

[0]	validation_0-mae:55306.03305
[1]	validation_0-mae:50999.63660
[2]	validation_0-mae:47069.35291
[3]	validation_0-mae:43498.15004
[4]	validation_0-mae:40380.66366
[5]	validation_0-mae:37754.06635
[6]	validation_0-mae:35060.84889
[7]	validation_0-mae:33051.39110
[8]	validation_0-mae:31188.56100
[9]	validation_0-mae:29424.13574
[10]	validation_0-mae:27927.02031
[11]	validation_0-mae:26625.19803
[12]	validation_0-mae:25345.77920
[13]	validation_0-mae:24321.27922
[14]	validation_0-mae:23312.26582
[15]	validation_0-mae:22499.62470
[16]	validation_0-mae:21719.48061
[17]	validation_0-mae:21090.26312
[18]	validation_0-mae:20536.58780
[19]	validation_0-mae:19983.28808
[20]	validation_0-mae:19616.26378
[21]	validation_0-mae:19282.36419
[22]	validation_0-mae:18945.46094
[23]	validation_0-mae:18618.71800
[24]	validation_0-mae:18390.56956
[25]	validation_0-mae:18241.46706
[26]	validation_0-mae:18075.76554
[27]	validation_0-mae:17886.31220
[28]	validation_0-mae:17768.10533
[29]	validation_0-mae:17

In [18]:
eval_model(model, X_val, y_val)

15652.799743150685


In [19]:
save_model_prediction(model)

### Observation

When Submitted, the test_data mean_absolute_error was approx. 30,000.

### Conclusion

The model performed very well on the training_data as compared to the test_data.
The model is `Overfitting`