### Importing libraries and dataset 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv('/practice/house_price/dataset/final_df.csv')
data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,1.0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1,2.0,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,3.0,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,4.0,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,...,SBrkr,Gd,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,5.0,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal


In [4]:
data.shape

(2919, 75)

In [5]:
#encoading the categorical columns

data = pd.get_dummies(data, drop_first=True)

In [6]:
data.shape

(2919, 231)

In [7]:
train_df = pd.read_csv('/practice/house_price/dataset/train_df.csv')
train_df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,1.0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,...,Gd,Typ,Gd,Attchd,RFn,TA,TA,Y,WD,Normal
1,2.0,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,...,TA,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
2,3.0,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
3,4.0,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,...,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,5.0,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal


In [8]:
#separating train and test data

X_train = data.iloc[:1460,:]
X_test = data.iloc[1460:,:]
y_train = train_df['SalePrice']

In [9]:
print('shape of X_train:', X_train.shape)
print('shape of X_test:', X_test.shape)
print('shape of y_train:', y_train.shape)

shape of X_train: (1460, 231)
shape of X_test: (1459, 231)
shape of y_train: (1460,)


### Training the model

In [10]:
from xgboost import XGBRegressor

In [11]:
regressor = XGBRegressor()
regressor.fit(X_train, y_train)

In [12]:
y_pred = regressor.predict(X_test)

In [13]:
y_pred

array([35922.27 , 36449.285, 36577.07 , ..., 35357.973, 36583.844,
       35693.934], dtype=float32)

In [14]:
#read the sample file

sam_data = pd.read_csv('/practice/house_price/dataset/sample_submission.csv')
sam_data.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [15]:
y_test = sam_data.drop('Id', axis=1)
y_test.shape

(1459, 1)

In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [17]:
MAE = mean_absolute_error(y_pred, y_test)
MAE

142381.14308362812

### hyperperameter tunning

In [18]:
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.15,0.20]
max_depth = [2, 3, 5, 10, 15]
min_child_weight=[1,2,3,4]
n_estimators = [100, 500, 900, 1100, 1500]
base_score=[0.25,0.5,0.75,1]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'booster':booster,
    'learning_rate':learning_rate,
    'max_depth':max_depth,
    'min_child_weight':min_child_weight,
    'n_estimators': n_estimators,
    'base_score':base_score
    }

In [19]:
from sklearn.model_selection import RandomizedSearchCV

In [20]:
# random search with 4-fold cross validation
random_cv = RandomizedSearchCV(estimator=regressor,
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=50,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

In [21]:
random_cv.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [22]:
random_cv.best_estimator_

In [23]:
random_cv.best_params_

{'n_estimators': 900,
 'min_child_weight': 1,
 'max_depth': 15,
 'learning_rate': 0.1,
 'booster': 'gbtree',
 'base_score': 1}

## final model

In [33]:
regressor_f = XGBRegressor(base_score=1, booster='gbtree', colsample_bylevel=None, 
             colsample_bynode=None, colsample_bytree=None, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=15, min_child_weight=1, missing=1, n_estimators=900,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             silent=None, subsample=1, verbosity=1
    
)

regressor_f.fit(X_train,y_train)



In [34]:
y_pred_f = regressor_f.predict(X_test)
y_pred_f

array([36052.387, 36115.344, 36034.74 , ..., 35183.98 , 36149.055,
       35094.504], dtype=float32)

In [35]:
MAE = mean_absolute_error(y_pred_f, y_test)

In [36]:
MAE

143179.92750896225