In [1]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# 1.Read the data
data=pd.read_csv("housing.csv")

In [3]:
data['income_cat']=pd.cut(data['median_income'],bins=[0,1.5,3,4.5,6.0,np.inf],labels=[1,2,3,4,5])

In [4]:
# 2. Create a stratified test set based on income category
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_idx,test_idx in split.split(data,data['income_cat']):
    strat_train_set=data.loc[train_idx]
    strat_test_set=data.loc[test_idx]

In [5]:
housing=strat_train_set.copy()

In [6]:
# 3.Seperate predictors and labels
housing_labels=housing["median_house_value"].copy()
housing=housing.drop("median_house_value",axis=1)

In [7]:
# 4. Separate numerical and categorical columns
num_attributes=housing.drop("ocean_proximity",axis=1).columns.tolist()
cat_attributes=["ocean_proximity"]

In [8]:
# 5. Pipeline
# 5.1 Numerical Pipeline 
num_pipeline=Pipeline([
    ("imputer",SimpleImputer(strategy="median")),
    ("scaler",StandardScaler()),
])
# 5.2 Categorical Pipeline
cat_pipeline=Pipeline([
    ("onehot",OneHotEncoder(handle_unknown="ignore"))
])
# 5.3 Full pipeline
full_pipeline=ColumnTransformer([
    ("num",num_pipeline,num_attributes),
    ("cat",cat_pipeline,cat_attributes),
])

In [9]:
# 6. Transform the data
housing_prepared = full_pipeline.fit_transform(housing)

In [10]:
# 7. Testing different models
# 7.1 Linear Regression
lin_reg=LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

In [11]:
# 7.2 Decison Tree
des_tree=DecisionTreeRegressor(random_state=42)
des_tree.fit(housing_prepared,housing_labels)

In [12]:
# 7.3 Random Forst
ran_forest=RandomForestRegressor(random_state=42)
ran_forest.fit(housing_prepared,housing_labels)

In [13]:
# Predict 
lin_pred=lin_reg.predict(housing_prepared)
des_pred=des_tree.predict(housing_prepared)
ran_pred=ran_forest.predict(housing_prepared)

In [14]:
# claculate error
lin_rmse=mean_squared_error(housing_labels,lin_pred,squared=False)
des_rmse=mean_squared_error(housing_labels,des_pred,squared=False)
ran_rmse=mean_squared_error(housing_labels,ran_pred,squared=False)
print("Linear Regression RMSE:", lin_rmse)
print("Decision Tree RMSE:", des_rmse)
print("Random Forest RMSE:", ran_rmse)

Linear Regression RMSE: 68866.78550087014
Decision Tree RMSE: 0.0
Random Forest RMSE: 18337.634681213494


