# Housing Value Prediction using Support Vector Machine

### In this notebook we will work on the housing dataset and use a support vector machine to predict the housing values for California 

#### Imports

In [153]:
import numpy as np 
import pandas as pd
from sklearn.svm import SVR
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

#### Loading Data

In [133]:
#imports 
import os
import tarfile
from six.moves import urllib

#Constants
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

#Method to download the data
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data(HOUSING_URL, HOUSING_PATH)

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing_df = load_housing_data(HOUSING_PATH)
print(housing_df.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


#### Data Exploration

In [134]:
housing_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [135]:
housing_df.corr()

  housing_df.corr()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.924664,-0.108197,0.044568,0.069608,0.099773,0.05531,-0.015176,-0.045967
latitude,-0.924664,1.0,0.011173,-0.0361,-0.066983,-0.108785,-0.071035,-0.079809,-0.14416
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.320451,-0.296244,-0.302916,-0.119034,0.105623
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.93038,0.857126,0.918484,0.19805,0.134153
total_bedrooms,0.069608,-0.066983,-0.320451,0.93038,1.0,0.877747,0.979728,-0.007723,0.049686
population,0.099773,-0.108785,-0.296244,0.857126,0.877747,1.0,0.907222,0.004834,-0.02465
households,0.05531,-0.071035,-0.302916,0.918484,0.979728,0.907222,1.0,0.013033,0.065843
median_income,-0.015176,-0.079809,-0.119034,0.19805,-0.007723,0.004834,0.013033,1.0,0.688075
median_house_value,-0.045967,-0.14416,0.105623,0.134153,0.049686,-0.02465,0.065843,0.688075,1.0


In [136]:
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


### Data Preprocessing

In [137]:
housing_df.pop("ocean_proximity")
"ocean_proximity" not in housing_df.columns

True

In [138]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6 

class CombinedAttributes(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        rooms_per_household = X[:, population_ix] / X[:, rooms_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]

            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        
        else:
            return np.c_[rooms_per_household, population_per_household]


In [139]:
att_adder = CombinedAttributes(True)
housing_extra_attributes = att_adder.transform(housing_df.values)
housing_extra_attributes


array([[-122.23      ,   37.88      ,   41.        , ...,    0.36590909,
           2.55555556,    0.14659091],
       [-122.22      ,   37.86      ,   21.        , ...,    0.33821665,
           2.10984183,    0.15579659],
       [-122.24      ,   37.85      ,   52.        , ...,    0.33810498,
           2.80225989,    0.12951602],
       ...,
       [-121.22      ,   39.43      ,   17.        , ...,    0.44676131,
           2.3256351 ,    0.21517303],
       [-121.32      ,   39.43      ,   18.        , ...,    0.3983871 ,
           2.12320917,    0.21989247],
       [-121.24      ,   39.37      ,   16.        , ...,    0.49802513,
           2.61698113,    0.22118492]])

In [126]:
temp_df = pd.DataFrame(housing_extra_attributes)
columns = list(housing.columns)
columns.append("rooms_per_household")
columns.append("population_per_household")
columns.append("bedrooms_per_room")
temp_df.set_axis = columns
#changing the names of the columns after preproccesing might have to change the transformer function

In [127]:
housing_df = housing_extra_attributes

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)


In [128]:
housing_labels = train_set.pop("median_house_value")
housing_labels

14196    103000.0
8267     382100.0
17445    172600.0
14265     93400.0
2271      96500.0
           ...   
11284    229200.0
11964     97800.0
5390     222100.0
860      283500.0
15795    325000.0
Name: median_house_value, Length: 16512, dtype: float64

In [154]:
svm = SVR()
svm.fit(train_set, housing_labels)
housing_predictions = svm.predict(train_set)
mse = mean_squared_error(housing_labels, housing_predictions)
np.sqrt(mse)

118715.78016441481

In [158]:
svm = SVR(kernel='linear')
svm.fit(train_set, housing_labels)
housing_predictions = svm.predict(train_set)
mse = mean_squared_error(housing_labels, housing_predictions)
np.sqrt(mse)

93857.63711986113

In [159]:
svm = SVR(kernel='rbf')
svm.fit(train_set, housing_labels)
housing_predictions = svm.predict(train_set)
mse = mean_squared_error(housing_labels, housing_predictions)
np.sqrt(mse)

118715.78016441481

In [160]:
svm = SVR(kernel='linear', gamma=3)
svm.fit(train_set, housing_labels)
housing_predictions = svm.predict(train_set)
mse = mean_squared_error(housing_labels, housing_predictions)
np.sqrt(mse)

93857.63711986113