In [1]:
import pandas as pd

In [4]:
data=pd.read_csv("housing.csv")

In [5]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
19866,-119.32,36.33,18.0,2603.0,478.0,1158.0,423.0,4.5938,150500.0,INLAND
14603,-117.16,32.8,22.0,2259.0,634.0,1213.0,601.0,2.5,177800.0,NEAR OCEAN
2051,-119.72,36.71,7.0,2456.0,463.0,1350.0,424.0,3.0179,91600.0,INLAND
9738,-121.75,36.77,25.0,1851.0,418.0,1678.0,390.0,3.2937,135300.0,<1H OCEAN
16634,-120.84,35.31,23.0,3100.0,603.0,1515.0,609.0,2.8493,196100.0,NEAR OCEAN


In [7]:
data.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [8]:
X=data.drop(columns='median_house_value')

In [9]:
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY


In [10]:
Y=data['median_house_value']

In [13]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [14]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=42)

In [24]:
num_cols=X.select_dtypes(include=['int64','float64']).columns
cat_cols=X.select_dtypes(include=['object']).columns

In [33]:
num_pipeline=Pipeline([
    ("Imputer",SimpleImputer(strategy='median')),
    ("Scaler",StandardScaler())
])

In [40]:
cat_pipeline=Pipeline([
    ("Imputer",SimpleImputer(strategy='most_frequent')),
    ("OHE",OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

In [41]:
trf1=ColumnTransformer([
    ("num",num_pipeline,num_cols),
    ("cat",cat_pipeline,cat_cols)
])

In [42]:
trf2=KNeighborsRegressor()

In [43]:
pipe=Pipeline([
    ("trf1",trf1),
    ("trf2",trf2)
])

In [44]:
pipe.fit(X_train,Y_train)

In [45]:
Y_pred=pipe.predict(X_test)

In [46]:
score=r2_score(Y_test,Y_pred)
mse=mean_squared_error(Y_test,Y_pred)
mae=mean_absolute_error(Y_test,Y_pred)
print("r2 score is ",score)
print("MSE is",mse)
print("MAE is",mae)

r2 score is  0.710987953519632
MSE is 3793399026.4684753
MAE is 41230.18378552971


In [47]:
param_grid = {
    'trf2__n_neighbors': [3,5,7,9,11,15,21,25,35],
    'trf2__weights': ['uniform', 'distance'],
    'trf2__metric': ['euclidean', 'manhattan', 'minkowski'],
    'trf2__p': [1, 2]
}


In [48]:
grid=GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1
)

In [49]:
grid.fit(X_train,Y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [50]:
best_model=grid.best_estimator_
best_parametes=grid.best_params_
best_score=grid.best_score_

In [51]:
print(best_parametes)
print(best_score)

{'trf2__metric': 'manhattan', 'trf2__n_neighbors': 9, 'trf2__p': 1, 'trf2__weights': 'distance'}
0.7378766825620956


In [52]:
Y_pred1=best_model.predict(X_test)

In [53]:
r2=r2_score(Y_test,Y_pred1)
mse=mean_squared_error(Y_test,Y_pred1)
mae=mean_absolute_error(Y_test,Y_pred1)

In [54]:
print("r2 score is",r2)
print("Mean squared error is ",mse)
print("Mean absolute error is",mae)

r2 score is 0.7408643153825459
Mean squared error is  3401259794.2621236
Mean absolute error is 39126.89530908075
