In [1]:
import pandas as pd

In [2]:
# Load the housing dataset (CSV in the working directory) into a DataFrame.
data = pd.read_csv('housing.csv')

In [3]:
# Preview the first five rows to sanity-check how the columns were parsed.
data.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# Count missing values per column before deciding on any imputation.
data.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [6]:
# Separate the regression target (price) from the predictor columns.
Y = data['price']
X = data.drop(columns=['price'])

In [8]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from xgboost import XGBRegressor

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Partition the feature names by dtype so that each group can be routed to
# its own preprocessing branch.
cat_cols = list(X.select_dtypes(include=['object']).columns)
num_cols = list(X.select_dtypes(include=['float64', 'int64']).columns)

In [23]:
# Numeric branch: a single StandardScaler step applied to the numeric columns.
num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

In [24]:
# Categorical branch: one-hot encode into a dense array. Categories that were
# not seen during fit are ignored at transform time instead of raising.
cat_pipeline = Pipeline(
    steps=[
        ("OHE", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

In [27]:
# Route numeric and categorical columns to their respective pipelines; any
# column listed in neither group is passed through untouched.
prep = ColumnTransformer(
    transformers=[
        ("num_pipe", num_pipeline, num_cols),
        ("cat_pipe", cat_pipeline, cat_cols),
    ],
    remainder='passthrough',
)

In [28]:
# Baseline model: shared preprocessing followed by an XGBoost regressor with
# hand-picked hyperparameters.
pipe = Pipeline(
    steps=[
        ("prep", prep),
        (
            "model",
            XGBRegressor(
                objective='reg:squarederror',
                n_estimators=300,
                learning_rate=0.05,
                max_depth=4,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ]
)


In [29]:
# Fit preprocessing and regressor together on the training split only.
pipe.fit(X_train, y=Y_train)

In [30]:
# Predict prices for the held-out rows with the baseline pipeline.
Y_pred = pipe.predict(X_test)

In [31]:
# Report held-out regression metrics for the baseline pipeline.
print("r2 score is", r2_score(y_true=Y_test, y_pred=Y_pred))
print("Mean Squared Error is", mean_squared_error(y_true=Y_test, y_pred=Y_pred))
print("Mean Absolute Error is", mean_absolute_error(y_true=Y_test, y_pred=Y_pred))

r2 score is 0.6023869514465332
Mean Squared Error is 1712277291008.0
Mean Absolute Error is 954088.375


In [32]:
# For a regression pipeline, score() reports R^2, not classification accuracy
# (the test value equals the r2_score printed for the same predictions), so
# the labels say R^2. Comparing train vs. test R^2 shows how much the
# baseline model overfits.
print("Training R^2 is", pipe.score(X_train, Y_train))
print("Testing R^2 is", pipe.score(X_test, Y_test))

Training Accuracy is 0.9538209438323975
Testing Accuracy is 0.6023869514465332


In [33]:
# Search template: same preprocessing, but the regressor leaves the tunable
# hyperparameters at their defaults so a parameter grid can supply them.
pipe1 = Pipeline(
    steps=[
        ("prep", prep),
        (
            "model",
            XGBRegressor(
                objective="reg:squarederror",
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ]
)

In [34]:
# Hyperparameter grid for the "model" pipeline step. Six parameters with three
# values each give 3**6 = 729 candidate combinations.
param_grid = {
    "model__n_estimators":     [200, 300, 500],    # number of boosting rounds
    "model__learning_rate":    [0.01, 0.05, 0.1],  # shrinkage per round
    "model__max_depth":        [3, 4, 6],          # maximum tree depth
    "model__min_child_weight": [1, 3, 5],          # minimum child weight
    "model__subsample":        [0.7, 0.8, 1.0],    # row sampling ratio
    "model__colsample_bytree": [0.7, 0.8, 1.0],    # column sampling ratio per tree
}

In [35]:
# Exhaustive 5-fold cross-validated search over param_grid, selecting the
# candidate with the best mean R^2.
# The search is built on pipe1 (the pipeline defined for tuning) rather than
# on pipe, the baseline that was already fitted with fixed hyperparameters:
# this keeps the baseline and the tuned model independent and makes the
# otherwise unused pipe1 the actual search template.
grid = GridSearchCV(
    estimator=pipe1,
    param_grid=param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,   # run candidate fits in parallel across all cores
    verbose=2
)


In [36]:
# Run the cross-validated search on the training split only; the test split
# stays untouched until final evaluation.
grid.fit(X_train, y=Y_train)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


In [37]:
# Predict prices for the held-out rows with the fitted grid search object.
Y_pred1 = grid.predict(X_test)

In [38]:
# Report held-out regression metrics for the tuned model.
print("r2 score is", r2_score(y_true=Y_test, y_pred=Y_pred1))
print("Mean Squared Error is", mean_squared_error(y_true=Y_test, y_pred=Y_pred1))
print("Mean Absolute Error is", mean_absolute_error(y_true=Y_test, y_pred=Y_pred1))

r2 score is 0.6297394037246704
Mean Squared Error is 1594487209984.0
Mean Absolute Error is 939723.6875


In [39]:
# grid.score() evaluates with the search's scoring metric ("r2"), so these
# values are R^2 rather than classification accuracy; the labels say so.
# Comparing train vs. test R^2 shows how much the tuned model overfits.
print("Training R^2 is", grid.score(X_train, Y_train))
print("Testing R^2 is", grid.score(X_test, Y_test))

Training accuracy is 0.8143401145935059
Testing accuracy is 0.6297394037246704
