### Imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib as plt 


### Reading the dataset

In [5]:
housing = pd.read_csv ('housing.csv')

In [6]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.5+ MB


In [8]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### Finding the missing values

In [9]:
housing.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

### Split data into train and test sets

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
train, test=train_test_split(housing, test_size=0.2, random_state=42)

In [12]:
train.shape , test.shape

((16512, 10), (4128, 10))

### Cleaning data

In [13]:
train.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [14]:
test.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

### Handling Missing Values

In [15]:
from sklearn.impute import SimpleImputer 

imputer=SimpleImputer(strategy="median")

### Removing ocean_proximity

In [16]:
## Remove the ocean proximity since is a categorical data

train_num = train.drop("ocean_proximity" ,axis=1)
test_num = test.drop ("ocean_proximity" , axis =1)

In [17]:
train_num.head() ## Read the train data to see if ocean proximity has been droped

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0


In [18]:
test_num.head()  ## Read the test data to see if ocean proximity has been droped

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0
15663,-122.44,37.8,52.0,3830.0,,1310.0,963.0,3.4801,500001.0
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.725,278000.0


In [19]:
# fitting the imputer with the train data

imputer.fit(train_num)

SimpleImputer(strategy='median')

In [20]:
imputer.statistics_

array([-1.1851e+02,  3.4260e+01,  2.9000e+01,  2.1290e+03,  4.3700e+02,
        1.1670e+03,  4.1000e+02,  3.5458e+00,  1.7985e+05])

In [21]:
train_num.median().values

array([-1.1851e+02,  3.4260e+01,  2.9000e+01,  2.1290e+03,  4.3700e+02,
        1.1670e+03,  4.1000e+02,  3.5458e+00,  1.7985e+05])

In [22]:
x= imputer.transform(train_num)

### Handling Categorical Features it train data

In [23]:
train_cat=train[["ocean_proximity"]]

In [24]:
train_cat.head(10)

Unnamed: 0,ocean_proximity
14196,NEAR OCEAN
8267,NEAR OCEAN
17445,NEAR OCEAN
14265,NEAR OCEAN
2271,INLAND
17848,<1H OCEAN
6252,<1H OCEAN
9389,NEAR BAY
6113,<1H OCEAN
6061,<1H OCEAN


In [25]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder=OneHotEncoder()

train_cat_1hot=cat_encoder.fit_transform(train_cat)

train_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [26]:
cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

### Handling Categorical Features it test data

In [27]:
test_cat=test[["ocean_proximity"]]

In [28]:
test_cat.head(10)

Unnamed: 0,ocean_proximity
20046,INLAND
3024,INLAND
15663,NEAR BAY
20484,<1H OCEAN
9814,NEAR OCEAN
13311,INLAND
7113,<1H OCEAN
7668,<1H OCEAN
18246,NEAR BAY
5723,<1H OCEAN


In [29]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder=OneHotEncoder()

test_cat_1hot=cat_encoder.fit_transform(test_cat)

test_cat_1hot

<4128x5 sparse matrix of type '<class 'numpy.float64'>'
	with 4128 stored elements in Compressed Sparse Row format>

In [30]:
cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

### Feature engineering

In [31]:
## Create the most importan attributes in the train data

train["rooms_per_household"]=train["total_rooms"]/train["households"]
train["bedrooms_per_room"]=train["total_bedrooms"]/train["total_rooms"]
train["population_per_household"]=train["population"]/train["households"]

In [32]:
## Create the most importan attributes in the test data

test["rooms_per_household"]=test["total_rooms"]/test["households"]
test["bedrooms_per_room"]=test["total_bedrooms"]/test["total_rooms"]
test["population_per_household"]=test["population"]/test["households"]

In [33]:
##  look at the correlation matrix for the train

corr_matrix=train.corr()

corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.690647
rooms_per_household         0.158485
total_rooms                 0.133989
housing_median_age          0.103706
households                  0.063714
total_bedrooms              0.047980
population_per_household   -0.022030
population                 -0.026032
longitude                  -0.046349
latitude                   -0.142983
bedrooms_per_room          -0.257419
Name: median_house_value, dtype: float64

In [34]:
## look at the correlation matrix for the test

corr_matrix=test.corr()

corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.677502
total_rooms                 0.134697
rooms_per_household         0.130928
housing_median_age          0.113585
households                  0.074249
total_bedrooms              0.056667
population                 -0.019003
longitude                  -0.044062
population_per_household   -0.121853
latitude                   -0.149295
bedrooms_per_room          -0.249196
Name: median_house_value, dtype: float64

### Feature Scaling

In [35]:
train_labels=train['median_house_value'].copy()

# drop the labels from the train set via the pandas drop function

train=train.drop('median_house_value', axis=1)

# axis=1 means that median_house_value should be dropped column wise, meaning, the whole column will be dropped

train_num=train.drop("ocean_proximity", axis=1)

#  re-do since the additional columns rooms_per_household, bedrooms_per_room and population_per_household where added

train_labels.head()

14196    103000.0
8267     382100.0
17445    172600.0
14265     93400.0
2271      96500.0
Name: median_house_value, dtype: float64

In [36]:
train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN,5.017657,0.200576,3.691814
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN,4.473545,0.232703,1.738095
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN,5.645833,0.174486,2.723214
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN,4.002817,0.258269,3.994366
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND,6.268421,0.18094,2.3


In [37]:
test_labels=test['median_house_value'].copy()

# drop the labels from the test set via the pandas drop function

test=test.drop('median_house_value', axis=1)

# axis=1 means that median_house_value should be dropped column wise, meaning, the whole column will be dropped

test_num=test.drop("ocean_proximity", axis=1)

#  re-do since the additional columns rooms_per_household, bedrooms_per_room and population_per_household where added

test_labels.head()

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
Name: median_house_value, dtype: float64

In [38]:
test.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND,4.192201,,3.877437
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND,5.039384,,2.679795
15663,-122.44,37.8,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY,3.977155,,1.360332
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN,6.163636,,3.444444
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.725,NEAR OCEAN,5.492991,,2.483645


In [44]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([('imputer',SimpleImputer(strategy="median")),('std_scaler', StandardScaler())])

train_num_tr=num_pipeline.fit_transform(train_num)
train_num_tr

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.17491646,
        -0.2117846 ,  0.05137609],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.40283542,
         0.34218528, -0.11736222],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.08821601,
        -0.66165785, -0.03227969],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.60675918,
         0.99951387,  0.02030568],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.40217517,
        -0.79086209,  0.00707608],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.85144571,
         1.69520292, -0.08535429]])

In [45]:
test_num_ts=num_pipeline.fit_transform(test_num)
test_num_ts

array([[ 0.25541734,  0.22194113, -0.30073951, ..., -0.43367721,
        -0.16042695,  0.52022395],
       [ 0.02976613, -0.20947715,  0.098724  , ..., -0.13050312,
        -0.16042695, -0.16293193],
       [-1.46454628,  1.03788441,  1.85636346, ..., -0.51063381,
        -0.16042695, -0.91557569],
       ...,
       [-1.2689819 ,  0.80810728, -0.30073951, ...,  0.65618087,
        -1.45960578, -0.09978765],
       [-0.120668  ,  0.5548835 ,  0.57808022, ..., -0.04116447,
        -0.48579064, -0.21494651],
       [ 0.57634349, -0.64089543, -0.93988113, ..., -0.50645235,
         0.81025277,  0.43607226]])

### Colunm Transformation

In [46]:
### traisform the training set

from sklearn.compose import ColumnTransformer

num_attribs=list(train_num)
cat_attribs=["ocean_proximity"]
full_pipeline=ColumnTransformer([("num", num_pipeline, num_attribs), ("cat", OneHotEncoder(), cat_attribs) ])

train_prepared = full_pipeline.fit_transform(train)
train_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [48]:
### transforming the test set

num_attribs=list(test_num)
cat_attribs=["ocean_proximity"]
full_pipeline=ColumnTransformer([("num", num_pipeline, num_attribs), ("cat", OneHotEncoder(), cat_attribs) ])

test_prepared = full_pipeline.fit_transform(test)
test_prepared

array([[ 0.25541734,  0.22194113, -0.30073951, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02976613, -0.20947715,  0.098724  , ...,  0.        ,
         0.        ,  0.        ],
       [-1.46454628,  1.03788441,  1.85636346, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.2689819 ,  0.80810728, -0.30073951, ...,  0.        ,
         0.        ,  0.        ],
       [-0.120668  ,  0.5548835 ,  0.57808022, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.57634349, -0.64089543, -0.93988113, ...,  0.        ,
         0.        ,  0.        ]])

###  1.2 Modelling

### 1.2.1 Load Model

In [49]:
### desplaing the shape of the train prepared and test prepared dataset
train_prepared.shape

(16512, 16)

In [50]:
test_prepared.shape

(4128, 16)

### Training the model using Linear Regression model

In [51]:
from sklearn.linear_model import LinearRegression

lin_reg=LinearRegression()

lin_reg.fit(train_prepared, train_labels)

LinearRegression()

### 1.2.2 Predict Values 

#### Making  Prediction with the Test dataset Values 

In [60]:
some_data=train.iloc[:5]

some_labels=train_labels.iloc[:5]

# transform / prepare some data

some_data_prepared=full_pipeline.transform(some_data)

# make predictions

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [182974.12012367 292677.38786781 247791.02308928 149456.30677509
 165371.56589941]


In [61]:
print("Labels:",list(some_labels))

Labels: [103000.0, 382100.0, 172600.0, 93400.0, 96500.0]


#### Making  Prediction with the test dataset Values 

In [54]:
some_data_ts=test.iloc[:5]

some_labels_ts=test_labels.iloc[:5]

# transform / prepare some data

some_data_prepared_ts=full_pipeline.transform(some_data_ts)

# make predictions

print("Predictions:", lin_reg.predict(some_data_prepared_ts))

Predictions: [ 39136.16884551 139369.60077975 295742.47398626 284745.25969939
 266570.79093541]


In [55]:
print("Labels:",list(some_labels_ts))

Labels: [47700.0, 45800.0, 500001.0, 218600.0, 278000.0]


### 1.2.3 Evaluate Model 

### Measure regression model’s RMSE on the whole Train and Test set

In [56]:
from sklearn.metrics import mean_squared_error

#### For the training dataset

In [62]:
train_predictions=lin_reg.predict(train_prepared)

lin_mse_tr=mean_squared_error(train_labels, train_predictions)

lin_rmse_tr=np.sqrt(lin_mse_tr)
lin_rmse_tr

67593.20745775253

#### For the Testing dataset

In [58]:
test_predictions=lin_reg.predict(test_prepared)

lin_mse=mean_squared_error(test_labels, test_predictions)

lin_rmse=np.sqrt(lin_mse)
lin_rmse

69089.54570492625