In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

The total_bedrooms has 207 null values, so fill it with the mean of the column

### feature enginering

In [4]:
df.total_bedrooms = df.total_bedrooms.fillna(np.mean(df.total_bedrooms))

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [7]:
df['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [8]:
df.ocean_proximity = df.ocean_proximity.map({"<1H OCEAN":1,"INLAND":2,"NEAR OCEAN":3,"NEAR BAY":4,"ISLAND":5})

In [9]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,4
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,4
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,4
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,4
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,4


In [10]:
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

Data Splitting

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Regression with Decision Tree

In [12]:
DT = DecisionTreeRegressor()
DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [13]:
y_pred = DT.predict(X_test)

In [14]:
mse = mean_squared_error(y_test, y_pred)
print
print("Mean Squared Error: ", mse)

Mean Squared Error:  4833974491.953973


In [15]:
print('Training Score: ', DT.score(X_train, y_train))
print('Testing Score: ', DT.score(X_test, y_test))
print("R2: ",r2_score(y_pred,y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, DT.predict(X_test))))

Training Score:  1.0
Testing Score:  0.6456092816393799
R2:  0.6501807304564239
RMSE:  69526.78974290394


### Regression with KNN

In [16]:
KNN_Reg = KNeighborsRegressor(n_neighbors=3)
KNN_Reg.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=3)

In [17]:
y_pred = KNN_Reg.predict(X_test)

In [18]:
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)

Mean Squared Error:  10726221040.76386


In [19]:
KNN_Reg.fit(X_train, y_train)

y_pred =KNN_Reg.predict(X_test)
print('Training Score: ', KNN_Reg.score(X_train, y_train))
print('Testing Score: ', KNN_Reg.score(X_test, y_test))
print("R2: ",r2_score(y_pred,y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, KNN_Reg.predict(X_test))))

Training Score:  0.5924433146990813
Testing Score:  0.2136340011189085
R2:  -0.41619406970011674
RMSE:  103567.47095861644


### Regression with Linear Regression


In [20]:
LR = LinearRegression()
LR.fit(X_train, y_train)

LinearRegression()

In [21]:
y_pred = LR.predict(X_test)

In [22]:
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)

Mean Squared Error:  4728192458.218032


In [23]:
LR.fit(X_train, y_train)

y_pred =LR.predict(X_test)
print('Training Score: ', LR.score(X_train, y_train))
print('Testing Score: ', LR.score(X_test, y_test))
print("R2: ",r2_score(y_pred,y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, LR.predict(X_test))))

Training Score:  0.6310752209119637
Testing Score:  0.6533644261871482
R2:  0.4565377292040653
RMSE:  68761.8532197761


### Conclusion

- Decision Tree Regression has the best performance among the three models in terms of R2 Score and MSE, although overfitting is a concern.
- KNN Regression performed the worst, with a negative R2 score and very high MSE, suggesting the model is not suitable for this data.
- Linear Regression had a lower MSE than KNN but is still relatively high, indicating issues with the linearity of the data.