In [1]:
import pandas as pd

In [2]:
data=pd.read_csv("housing.csv")

In [3]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [43]:
data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
18629,-121.9,37.1,23.0,1708.0,287.0,670.0,238.0,6.4517,356600.0,<1H OCEAN
10386,-117.64,33.61,14.0,5232.0,810.0,3041.0,839.0,5.826,247900.0,<1H OCEAN
13717,-117.19,34.08,22.0,2467.0,555.0,1567.0,494.0,2.6536,84700.0,INLAND
3139,-118.17,34.87,9.0,1507.0,293.0,761.0,278.0,3.0184,87900.0,INLAND
5754,-118.28,34.18,47.0,2243.0,339.0,911.0,319.0,7.4046,446800.0,<1H OCEAN


In [5]:
data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [6]:
data.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [7]:
X=data.drop(columns='median_house_value')
Y=data['median_house_value']

In [32]:
num_cols=X.select_dtypes(include=['float64']).columns
cat_cols=X.select_dtypes(include=['object']).columns

In [46]:
print(num_cols)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')


In [33]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [34]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=42)

In [35]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
7061,-118.02,33.93,35.0,2400.0,398.0,1218.0,408.0,4.1312,<1H OCEAN
14689,-117.09,32.79,20.0,2183.0,534.0,999.0,496.0,2.8631,NEAR OCEAN
17323,-120.14,34.59,24.0,1601.0,282.0,731.0,285.0,4.2026,NEAR OCEAN
10056,-121.0,39.26,14.0,810.0,151.0,302.0,138.0,3.1094,INLAND
15750,-122.45,37.77,52.0,3188.0,708.0,1526.0,664.0,3.3068,NEAR BAY


In [45]:
X_test.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [61]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [62]:
trf1=ColumnTransformer([
    ("num",num_pipeline,num_cols),
    ("OHE",OneHotEncoder(sparse_output=False,handle_unknown='ignore'),cat_cols)
],remainder='drop')


In [63]:
Pipe1=Pipeline([
    ("prep",trf1),
    ("RG",Ridge())
])

In [64]:
Pipe2=Pipeline([
    ("Prep",trf1),
    ("KNG",KNeighborsRegressor())
])

In [65]:
Pipe3=Pipeline([
    ("Prep",trf1),
    ("DCR",DecisionTreeRegressor())
])

In [66]:
Voting=VotingRegressor(
    estimators=[
        ("RG",Pipe1),
        ("KNG",Pipe2),
        ("DCR",Pipe3)
    ],
)

In [67]:
Voting.fit(X_train,Y_train)

In [68]:
Y_pred=Voting.predict(X_test)

In [69]:
r2=r2_score(Y_test,Y_pred)
MSE=mean_squared_error(Y_test,Y_pred)
MAE=mean_absolute_error(Y_test,Y_pred)

In [70]:
print("R2 score is",r2)
print("Mean Squared Error is", MSE)
print("Mean Absolute Error is",MAE)

R2 score is 0.7559634298812696
Mean Squared Error is 3203077860.5416374
Mean Absolute Error is 38581.81107474482


In [71]:
param_grid = {
    'RG__RG__alpha': [0.1, 1, 10, 100],
    
    'KNG__KNG__n_neighbors': [3, 5, 7, 9, 11],
    'KNG__KNG__weights': ['uniform', 'distance'],
    'KNG__KNG__p': [1, 2], 
    
    'DCR__DCR__max_depth': [5, 10, 15, 20, None],
    'DCR__DCR__min_samples_split': [2, 5, 10],
    'DCR__DCR__min_samples_leaf': [1, 2, 4]
}

In [72]:
grid=GridSearchCV(
    estimator=Voting,
    param_grid=param_grid,
    cv=10,
    scoring='r2',
    verbose=2,
    n_jobs=-1
)

In [73]:
grid.fit(X_train,Y_train)

Fitting 10 folds for each of 3600 candidates, totalling 36000 fits


In [75]:
best_model=grid.best_estimator_
best_param=grid.best_params_
best_score=grid.best_score_

In [77]:
print("Best parameters are \t ",best_param)
print("Best Score is \t",best_score)

Best parameters are 	  {'DCR__DCR__max_depth': 20, 'DCR__DCR__min_samples_leaf': 2, 'DCR__DCR__min_samples_split': 10, 'KNG__KNG__n_neighbors': 5, 'KNG__KNG__p': 1, 'KNG__KNG__weights': 'distance', 'RG__RG__alpha': 1}
Best Score is 	 0.7692520707878938


In [78]:
Y_pred1=best_model.predict(X_test)

In [79]:
r2=r2_score(Y_test,Y_pred1)
MSE=mean_squared_error(Y_test,Y_pred1)
MAE=mean_absolute_error(Y_test,Y_pred1)

In [80]:
print("r2 score is",r2)
print("Mean Squared Error is",MSE)
print("Mean Absolute Error is",MAE)

r2 score is 0.7695215725727435
Mean Squared Error is 3025121799.8414097
Mean Absolute Error is 37460.965943138035
