In [1]:
import pandas as pd

In [2]:
# Load the housing dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="housing.csv")

In [3]:
# Preview the first five rows as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Count missing values per column (isna is the canonical spelling of isnull).
data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [5]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [49]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error


In [7]:
# Feature matrix: every column except the regression target.
X = data.drop(columns=['median_house_value'])

In [8]:
# Regression target: the median house value column.
Y = data.loc[:, 'median_house_value']

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [27]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# 'number' matches every numeric dtype (not only int64/float64), and everything
# that is not numeric is treated as categorical. This way columns stored as
# int32/float32, category or string dtypes still land in one of the two groups
# instead of being left out of both index lists.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

# Positional indices of each group within X, used as column selectors downstream.
num_idx = [X.columns.get_loc(c) for c in num_cols]
cat_idx = [X.columns.get_loc(c) for c in cat_cols]


In [12]:
# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("Impute", SimpleImputer(strategy='median')),
        ("Scaler", StandardScaler()),
    ]
)

In [15]:
# Categorical branch: dense one-hot encoding; categories unseen during fit are
# ignored rather than raising.
cat_pipeline = Pipeline(
    steps=[
        ("OHE", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

In [28]:
# Route numeric and categorical columns (selected by position) to their
# respective preprocessing pipelines.
prep = ColumnTransformer(
    transformers=[
        ("NUM", num_pipeline, num_idx),
        ("CAT", cat_pipeline, cat_idx),
    ]
)

In [29]:
# Build one preprocessing + model pipeline per base learner. All three reuse the
# same `prep` transformer object and differ only in the final regressor.
lr_pipe, knn_pipe, dcs_pipe = (
    Pipeline(steps=[('prep', prep), ('model', regressor)])
    for regressor in (
        LinearRegression(),
        KNeighborsRegressor(),
        DecisionTreeRegressor(random_state=42),
    )
)


In [50]:
# Stacked ensemble: the three base pipelines feed a Ridge final estimator.
# The names 'lr', 'knn' and 'dc' are the prefixes used when addressing
# hyper-parameters of the individual base learners.
stacking = StackingRegressor(
    estimators=[('lr', lr_pipe), ('knn', knn_pipe), ('dc', dcs_pipe)],
    final_estimator=Ridge(),
    n_jobs=-1,
    cv=5,
)
    

In [51]:
# Fit the untuned stacking ensemble on the training split.
stacking.fit(X=X_train, y=Y_train)

In [52]:
# Baseline predictions on the held-out test split.
Y_pred = stacking.predict(X=X_test)

In [53]:
# Report test-set metrics for the untuned ensemble.
print("r2 score is", r2_score(y_true=Y_test, y_pred=Y_pred))
print("Mean Squared Error is", mean_squared_error(y_true=Y_test, y_pred=Y_pred))
print("Mean Absolute Error is", mean_absolute_error(y_true=Y_test, y_pred=Y_pred))

r2 score is 0.76141989639286
Mean Squared Error is 3131459549.107582
Mean Absolute Error is 37775.075688128214


In [55]:
# Search space for the stacked model. Keys follow the
# <estimator name>__<pipeline step>__<parameter> convention.
param_grid = dict(
    # KNN base learner (addressed through the 'knn' estimator name)
    knn__model__n_neighbors=[3, 5, 7],
    knn__model__weights=['uniform', 'distance'],
    # Decision tree base learner (addressed through the 'dc' estimator name)
    dc__model__max_depth=[5, 10, 15],
    dc__model__min_samples_split=[2, 5],
    dc__model__min_samples_leaf=[1, 2],
    # Final estimator (Ridge)
    final_estimator__alpha=[0.1, 1, 10],
)


In [56]:
# Exhaustive 5-fold grid search over param_grid, scored by R^2 and run in
# parallel with progress logging enabled.
grid = GridSearchCV(
    stacking,
    param_grid,
    scoring='r2',
    cv=5,
    verbose=2,
    n_jobs=-1,
)

In [57]:
# Run the grid search on the training split only; the test split stays untouched.
grid.fit(X=X_train, y=Y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [58]:
# Pull the winning estimator, its cross-validated score and its parameters
# out of the finished search.
best_model, best_score, best_params = (
    grid.best_estimator_,
    grid.best_score_,
    grid.best_params_,
)

In [59]:
# Show the best cross-validated score (R^2, per the search's scoring setting)
# and the parameter combination that achieved it.
print("Best Score is",best_score)
print("Best Parameters are",best_params)

Best Score is 0.7669502686833706
Best Parameters are {'dc__model__max_depth': 15, 'dc__model__min_samples_leaf': 2, 'dc__model__min_samples_split': 5, 'final_estimator__alpha': 1, 'knn__model__n_neighbors': 7, 'knn__model__weights': 'distance'}


In [61]:
# Predictions of the tuned ensemble on the held-out test split.
Y_pred1 = best_model.predict(X=X_test)

In [62]:
# Report test-set metrics for the tuned ensemble.
print("r2 score is", r2_score(y_true=Y_test, y_pred=Y_pred1))
print("Mean Squared error is", mean_squared_error(y_true=Y_test, y_pred=Y_pred1))
print("Mean Absolute error is", mean_absolute_error(y_true=Y_test, y_pred=Y_pred1))

r2 score is 0.7684895262433389
Mean Squared error is 3038667821.846065
Mean Absolute error is 36885.39248672505
