In [1]:
import pandas as pd

In [2]:
# Load the housing dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='housing.csv')

In [3]:
# Preview the first five rows (five is also head()'s default) to sanity-check the load.
data.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Print the per-column dtype and non-null count; a non-null count below the
# row total flags a column with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
# Count the missing entries in every column (isna is the same check as isnull).
data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
# Feature matrix: every column except the regression target.
X = data.drop(columns=['median_house_value'])

In [7]:
# Target vector: the single column the model is trained to predict.
Y = data.loc[:, 'median_house_value']

In [8]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [9]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [10]:
# Show the first five training rows; the shuffled index confirms the split was random.
X_train.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
7061,-118.02,33.93,35.0,2400.0,398.0,1218.0,408.0,4.1312,<1H OCEAN
14689,-117.09,32.79,20.0,2183.0,534.0,999.0,496.0,2.8631,NEAR OCEAN
17323,-120.14,34.59,24.0,1601.0,282.0,731.0,285.0,4.2026,NEAR OCEAN
10056,-121.0,39.26,14.0,810.0,151.0,302.0,138.0,3.1094,INLAND
15750,-122.45,37.77,52.0,3188.0,708.0,1526.0,664.0,3.3068,NEAR BAY


In [11]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# 'number' matches every numeric width (int32, float32, ...) instead of only
# int64/float64, so a numeric column stored in a narrower dtype is not silently
# discarded by the ColumnTransformer's remainder='drop'.
num_cols = X.select_dtypes(include='number').columns.tolist()
# Categorical features: plain string (object) columns plus pandas 'category' columns.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [12]:
# Numeric preprocessing: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("Imputer", SimpleImputer(strategy='mean')),
        ("scaler", StandardScaler()),
    ]
)

In [13]:
# Categorical preprocessing: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform time
# instead of raising.
cat_pipeline = Pipeline(
    steps=[
        ("Imputer", SimpleImputer(strategy='most_frequent')),
        ("OHE", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

In [14]:
# Route each column group through its own pipeline; any column that belongs to
# neither group is dropped from the model input.
prep = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ],
    remainder='drop',
)

In [15]:
# End-to-end estimator: preprocessing followed by a gradient-boosting regressor.
# Keeping both in one Pipeline means the preprocessing is re-fit on whatever data
# the pipeline is fit on.
pipe = Pipeline(
    steps=[
        ("prep", prep),
        ('model', GradientBoostingRegressor(random_state=42)),
    ]
)

In [16]:
# Fit the preprocessing steps and the baseline model on the training split only.
pipe.fit(X_train, y=Y_train)

In [17]:
# Baseline predictions for the held-out test split.
Y_pred = pipe.predict(X=X_test)

In [18]:
# Report each hold-out metric of the untuned pipeline, one per line.
baseline_reports = (
    ("r2 score is", r2_score),
    ("Mean Squared Error is", mean_squared_error),
    ("Mean Absolute Error is", mean_absolute_error),
)
for label, metric in baseline_reports:
    print(label, metric(Y_test, Y_pred))

r2 score is 0.766732807091322
Mean Squared Error is 3061725465.297935
Mean Absolute Error is 38378.51483166189


In [19]:
# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the labels say R2. The gap between the
# two values indicates how much the baseline overfits the training split.
print("Training R2 score is", pipe.score(X_train, Y_train))
print("Testing R2 score is", pipe.score(X_test, Y_test))

Training Accuracy is 0.791078614496086
Testing Accuracy is 0.766732807091322


In [20]:
# Hyperparameter search space. The "model__" prefix routes each setting to the
# pipeline step named 'model'. The grid is exhaustive:
# 3 * 3 * 3 * 3 * 3 * 2 * 3 = 1458 combinations.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__learning_rate=[0.05, 0.1, 0.2],
    model__max_depth=[2, 3, 4],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
    model__subsample=[0.8, 1.0],
    model__max_features=["sqrt", "log2", None],
)

In [21]:
# Exhaustive 5-fold cross-validated search over param_grid, ranked by R^2.
# n_jobs=-1 uses every available core; verbose=2 logs progress for each fit.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [22]:
# Run the search on the training split only, so the test split stays unseen.
# This is the expensive cell: every candidate is fit once per fold.
grid.fit(X_train, y=Y_train)

Fitting 5 folds for each of 1458 candidates, totalling 7290 fits


In [23]:
# Pull out the winning pipeline, its mean cross-validated score, and its settings.
best_model, best_score, best_param = (
    grid.best_estimator_,
    grid.best_score_,
    grid.best_params_,
)

In [24]:
# Show the best cross-validated score and the parameter combination behind it.
for caption, value in (
    ("Best Score is", best_score),
    ("Best Parameters are", best_param),
):
    print(caption, value)

Best Score is 0.8233221021966102
Best Parameters are {'model__learning_rate': 0.2, 'model__max_depth': 4, 'model__max_features': None, 'model__min_samples_leaf': 4, 'model__min_samples_split': 10, 'model__n_estimators': 300, 'model__subsample': 1.0}


In [25]:
# Predictions of the tuned pipeline on the held-out test split.
Y_pred1 = best_model.predict(X=X_test)

In [26]:
# Report each hold-out metric of the tuned pipeline, one per line.
tuned_reports = (
    ("r2 score is", r2_score),
    ("Mean absolute error is", mean_absolute_error),
    ("Mean squared error is", mean_squared_error),
)
for caption, scorer in tuned_reports:
    print(caption, scorer(Y_test, Y_pred1))

r2 score is 0.8305030345984492
Mean absolute error is 31813.854151140167
Mean squared error is 2224715652.422742


In [27]:
# grid.score() evaluates the refit best estimator with the search's scoring
# metric, which was set to 'r2' -- so these are R^2 values, not accuracies.
# The gap between the two values indicates how much the tuned model overfits.
print("Training R2 score is", grid.score(X_train, Y_train))
print("Testing R2 score is", grid.score(X_test, Y_test))

Training Accuracy is 0.9107266062245089
Testing Accuracy is 0.8305030345984492
