In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [3]:
# Read the raw housing table and, in the same chain, attach an income bucket.
# income_cat cuts median_income into five labelled bins (1..5); it exists only
# so that the train/test split further down can be stratified on it.
housing = pd.read_csv("housing.csv").assign(
    income_cat=lambda frame: pd.cut(
        frame["median_income"],
        bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
        labels=[1, 2, 3, 4, 5],
    )
)


In [4]:
from sklearn.model_selection import StratifiedShuffleSplit
# Stratified 80/20 split on income_cat, seeded so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so there is exactly one (train, test) pair: take it directly
# instead of looping over a one-element generator.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields *positional* row indices, so rows must be selected with .iloc.
# Label-based .loc only gives the same rows while the frame still carries the
# default 0..n-1 index, and would pick wrong rows (or raise) otherwise.
# income_cat was only needed for stratification, so it is dropped from both sets.
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")
    

In [5]:
# Since we have dropped the income_cat column, the lines below would now throw an error.
# strat_train_set['income_cat'].value_counts()
# # strat_train_set['income_cat'].value_counts().sum()
# strat_test_set['income_cat'].value_counts()

In [6]:
# 5789/5265=1.095....

In [7]:
# 1447/1316=1.095... -> the ratio is almost identical in the train and test sets, i.e. the stratified split kept the same proportions in both

In [5]:
print(f"Rows in train set: {len(strat_train_set)}\nRows in test set: {len(strat_test_set)}\n")

Rows in train set: 16512
Rows in test set: 4128



In [9]:
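# corr_matrix = housing.corr() would raise an error here because the frame still contains a non-numeric (categorical) attribute; restrict it to numeric columns first, e.g. housing.corr(numeric_only=True)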

In [10]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [11]:
# now we will work on only training dataset ...... so we will create a copy for the training dataset

In [12]:
# NOTE: `housing` is rebound here. Up to this point it was the full dataset read
# from disk; from this cell on it is an independent copy of the training split,
# so later edits cannot touch strat_train_set.
housing=strat_train_set.copy()


In [13]:
# Separate the predictors and labels

In [14]:
# Target vector for the regressors: a copy of the median_house_value column,
# taken before that column is removed from the predictors.
housing_labels=housing["median_house_value"].copy()

In [15]:
# Keep only the predictors. This cell is not idempotent: `housing` is overwritten,
# so running it a second time raises KeyError because the column is already gone.
housing=housing.drop("median_house_value",axis=1)

In [16]:
# Separate the numerical and categorical attributes

In [17]:
# ocean_proximity is the only column treated as categorical; every remaining
# predictor column goes to the numeric branch of the preprocessing.
cat_attribs = ["ocean_proximity"]
num_attribs = housing.columns.drop(cat_attribs).tolist()

In [18]:
num_attribs

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [19]:
cat_attribs

['ocean_proximity']

In [20]:
# Making pipelines: there are two ways to do this - (1) use the Pipeline class imported from scikit-learn, or (2) define the pipeline ourselves. The imported Pipeline is used below.

In [21]:
# Numerical Pipeline

In [22]:
# Numeric branch: fill gaps with each column's median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [23]:
num_pipeline

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [24]:
# Categorical attribute

In [25]:
# Categorical branch: one-hot encode, with handle_unknown="ignore" so that a
# category not seen during fit does not abort transform.
# (An ordinal encoder step could be used here instead if ordinal codes are preferred.)
cat_pipeline = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

In [26]:
cat_pipeline

0,1,2
,steps,"[('onehot', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [27]:
# Full_pipeline

In [28]:
# Route each group of columns through its own branch and concatenate the results:
# numeric columns -> num_pipeline, categorical columns -> cat_pipeline.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ]
)

In [29]:
full_pipeline

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [30]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,-121.46,38.52,29,3873,797.0,2237,706,2.1736,INLAND
15502,-117.23,33.09,7,5320,855.0,2015,768,6.3373,NEAR OCEAN
2908,-119.04,35.37,44,1618,310.0,667,300,2.8750,INLAND
14053,-117.13,32.75,24,1877,519.0,898,483,2.2264,NEAR OCEAN
20496,-118.70,34.28,27,3536,646.0,1837,580,4.4964,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14,6665,1231.0,2026,1001,5.0900,<1H OCEAN
12661,-121.42,38.51,15,7901,1422.0,4769,1418,2.8139,INLAND
19263,-122.72,38.44,48,707,166.0,458,172,3.1797,<1H OCEAN
19140,-122.70,38.31,14,3155,580.0,1208,501,4.1964,<1H OCEAN


In [31]:
# Fit the preprocessing on the training predictors and transform them in one call.
# The fitted statistics (medians, scaling, categories) therefore come from the
# training split only.
housing_prepared = full_pipeline.fit_transform(housing)

In [32]:
housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

In [33]:
print(housing_prepared.shape)

(16512, 13)


In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [35]:
# Train the model 

In [36]:
# Baseline model: linear regression with default settings.
lin_reg=LinearRegression()

In [37]:
# Fit on the preprocessed training predictors against the house-value labels.
# As the last expression of the cell, the fitted estimator is also displayed.
lin_reg.fit(housing_prepared, housing_labels)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [38]:
# Predict on the same rows the model was trained on (in-sample predictions).
lin_preds=lin_reg.predict(housing_prepared)

In [39]:
# Training-set RMSE: predictions are scored against the labels the model was fit
# on, so treat this as an optimistic estimate rather than a generalisation error.
lin_rmse=root_mean_squared_error(housing_labels, lin_preds)

In [40]:
print(lin_rmse)

69050.56219504567


In [41]:
print(f"The root mean squared error for linear regrssion is {lin_rmse}")

The root mean squared error for linear regrssion is 69050.56219504567


In [42]:
# Decision_tree

In [43]:
# Decision tree with default settings (seeded), scored on its own training data.
dec_reg = DecisionTreeRegressor(random_state=42)
# fit() returns the estimator itself, so fitting and predicting can be chained.
dec_preds = dec_reg.fit(housing_prepared, housing_labels).predict(housing_prepared)
dec_rmse = root_mean_squared_error(y_true=housing_labels, y_pred=dec_preds)

In [44]:
print(f"The root mean squared error for decison tree regressor is {dec_rmse}")

The root mean squared error for decison tree regressor is 0.0


In [45]:
# An RMSE of 0.0 on the training set means the decision tree has memorised the training data, i.e. it is overfitting

In [46]:
# Random_Forest_Regressor

In [47]:
# Random forest with default settings (seeded), scored on its own training data.
# fit() returns the fitted estimator, so construction and fitting share a line.
for_reg = RandomForestRegressor(random_state=42).fit(housing_prepared, housing_labels)
for_preds = for_reg.predict(housing_prepared)
for_rmse = root_mean_squared_error(y_true=housing_labels, y_pred=for_preds)

In [48]:
print(f"The root mean squared error for random_forest_regressor is {for_rmse}")

The root mean squared error for random_forest_regressor is 18342.366362322846


In [49]:
# Side-by-side recap of the three training-set RMSE values, one per line.
print(
    f"the rmse of linear regression is {lin_rmse}",
    f"the rmse of decison tree regressor is {dec_rmse}",
    f"the rmse of random forest regressor is {for_rmse}",
    sep="\n",
)

the rmse of linear regression is 69050.56219504567
the rmse of decison tree regressor is 0.0
the rmse of random forest regressor is 18342.366362322846


In [50]:
# Next, use cross-validation to get a more realistic error estimate for the decision tree regressor; we also run it for all three models and compare them


In [60]:
from sklearn.model_selection import cross_val_score

In [61]:
# Linear regression: refit on the full training set and keep its in-sample predictions.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
# Fix: predict with lin_reg itself. This line previously called for_reg (the
# random forest), so lin_preds silently held the wrong model's predictions.
lin_preds = lin_reg.predict(housing_prepared)
# 10-fold cross-validation. The scorer returns *negative* RMSE (higher is better),
# so the sign is flipped to get ordinary RMSE values, one per fold.
lin_rmses = -cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

In [62]:
# Decision tree: refit on the full training set, then score with 10-fold CV.
dec_reg = DecisionTreeRegressor(random_state=42).fit(housing_prepared, housing_labels)
dec_preds = dec_reg.predict(housing_prepared)
# The scorer reports negative RMSE; negate it to obtain one RMSE value per fold.
dec_rmses = -cross_val_score(
    dec_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

In [63]:
# Random forest: same recipe as the other two models so the results are comparable.
for_reg = RandomForestRegressor(random_state=42)
for_preds = for_reg.fit(housing_prepared, housing_labels).predict(housing_prepared)
# Per-fold RMSE from 10-fold CV (the scorer is negated RMSE, hence the minus sign).
for_rmses = -cross_val_score(
    estimator=for_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

In [64]:
print(pd.Series(lin_rmses).describe())

count       10.000000
mean     69204.322755
std       2500.382157
min      65318.224029
25%      67124.346106
50%      69404.658178
75%      70697.800632
max      73003.752739
dtype: float64


In [65]:
print(pd.Series(dec_rmses).describe())

count       10.000000
mean     69081.361563
std       2420.500173
min      64770.563939
25%      67525.053996
50%      69027.994020
75%      70675.556581
max      73280.387324
dtype: float64


In [66]:
print(pd.Series(for_rmses).describe())

count       10.000000
mean     49432.126788
std       2239.797830
min      45940.427717
25%      47726.327336
50%      49230.482778
75%      50904.664037
max      53301.087485
dtype: float64


In [None]:
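# We can see here that the random forest regressor is the best fit for this dataset: its mean cross-validated RMSE (~49,400) is far below the other two.
# Linear regression (~69,200) and the decision tree regressor (~69,100) give almost the same cross-validated RMSE. The decision tree is
# only marginally ahead, and that gap (~120) is much smaller than the fold-to-fold std (~2,400-2,500), so it is not a meaningful
# difference. The decision tree's training RMSE of 0.0 also shows that it overfits heavily.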