In [None]:
from google.colab import drive
# Attach Google Drive to the Colab VM at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Path of the housing CSV file itself (a file path, not a directory),
# located under /content/drive.
HOUSING_PATH = "/content/drive/My Drive/Colab Notebooks/ml_class/housing.csv"

In [None]:
import os
import tarfile
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
  """Load the housing dataset into a pandas DataFrame.

  Args:
    housing_path: Either the path of ``housing.csv`` itself (which is what
      ``HOUSING_PATH`` holds) or the directory that contains it. ``str`` and
      ``os.PathLike`` values are accepted.

  Returns:
    pandas.DataFrame parsed from the CSV.

  Raises:
    FileNotFoundError: If the CSV does not exist, even after extracting a
      ``housing.tgz`` found in the same directory.
  """
  housing_path = os.fspath(housing_path)

  # Resolve the CSV location without assuming which form the caller passed.
  if housing_path.lower().endswith(".csv"):
    csv_path = housing_path
  else:
    csv_path = os.path.join(housing_path, "housing.csv")
  data_dir = os.path.dirname(csv_path) or "."
  tgz_path = os.path.join(data_dir, "housing.tgz")

  # Unpack only when the CSV is not there yet, and unpack next to the archive
  # (not into the current working directory) so the read below can find it.
  if not os.path.isfile(csv_path) and os.path.isfile(tgz_path):
    # NOTE: extractall() trusts the member paths stored in the archive; only
    # use this with archives you control.
    with tarfile.open(tgz_path) as housing_tgz:
      housing_tgz.extractall(path=data_dir)

  return pd.read_csv(csv_path)

# Read the dataset once through the loader instead of parsing the same CSV a
# second time from a duplicated hard-coded path.
housing = load_housing_data()

# `df` is the working frame used by the following cells; the copy keeps it
# independent of `housing`, as it was when each name came from its own read.
df = housing.copy()
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Separate the prediction target from the feature columns.
target_column = "median_house_value"
y = df[target_column]
X = df.drop(columns=target_column)

# Preview both pieces in one call; the "\n" item between them reproduces the
# blank lines that separate the two previews.
print(y.head(), "\n", X.head(), sep="\n")

0    452600.0
1    358500.0
2    352100.0
3    341300.0
4    342200.0
Name: median_house_value, dtype: float64


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income ocean_proximity  
0       322.0       126.0         8.3252        NEAR BAY  
1      2401.0      1138.0         8.3014        NEAR BAY  
2       496.0       177.0         7.2574        NEAR BAY  
3       558.0       219.0         5.6431        NEAR BAY  
4       565.0       259.0         3.8462        NEAR BAY  


In [None]:
from sklearn.model_selection import train_test_split
# Hold out a fixed fraction of the rows for evaluation; the seed makes the
# split repeatable across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify numerical and categorical columns by dtype; each group gets its
# own preprocessing below.
numerical_attribs = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_attribs = X_train.select_dtypes(include=['object']).columns

# Numerical pipeline: fill missing values with the median strategy, then
# apply StandardScaler.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Full pipeline: route each column group to its transformer.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a
# category that was not present when the pipeline was fitted, instead of
# raising at transform time (e.g. a rare category that landed only in the
# test split).
full_pipeline = ColumnTransformer([
    ('num', numerical_pipeline, numerical_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_attribs),
])

# Fit on the training data only, so no statistics are learned from the
# test split.
X_train_prepared = full_pipeline.fit_transform(X_train)

# Transform the test data using the already-fitted pipeline.
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import numpy as np

# Build a support vector regressor with a Radial Basis Function (RBF) kernel,
# a popular choice for non-linear data, and fit it on the prepared training
# data. fit() returns the estimator itself, so the calls can be chained.
svr_reg = SVR(kernel='rbf', C=1.0).fit(X_train_prepared, y_train)

# Predict on the held-out test set.
housing_predictions_svr = svr_reg.predict(X_test_prepared)

# Mean Squared Error (MSE) on the test set, and its square root (RMSE).
mse_svr = mean_squared_error(y_test, housing_predictions_svr)
rmse_svr = np.sqrt(mse_svr)

print(f"MSE (SVR): {mse_svr}")
print(f"RMSE (SVR): {rmse_svr}")

MSE (SVR): 13669672829.960045
RMSE (SVR): 116917.37608225753


Support Vector Regression (SVR) is a type of Support Vector Machine (SVM) that is used for regression tasks. While SVMs are typically used for classification, SVR adapts the principles of SVM for continuous outputs. The main goal of SVR is to find a function that approximates the target values, ensuring that most predictions are within a specific distance (epsilon) from the actual target.

In SVR, instead of minimizing classification error, we minimize a different objective, which aims to fit the data within an epsilon margin. If the predictions fall within the epsilon margin, no penalty is applied, but if the predictions fall outside this margin, penalties are applied, similar to how errors are penalized in traditional regression models.

Key concepts in SVR include:

Kernel Function: The kernel function, such as the Radial Basis Function (RBF) kernel used here, transforms the data into a higher-dimensional space to capture non-linear relationships between features and target values.

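C (Regularization Parameter): Controls the trade-off between keeping the regression function smooth (simple) and penalizing training points whose predictions fall outside the epsilon margin. A larger C penalizes those deviations more heavily, so the model follows the training data more closely by tolerating a more complex function (at the risk of overfitting), while a smaller C tolerates more deviations and simplifies the model.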

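Epsilon (ε): Defines a margin of tolerance around the predicted function: no penalty is applied if the prediction is within ε of the actual target. This makes SVR insensitive to minor deviations and noise in the target values, and because errors beyond the margin are penalized only linearly, it also limits the influence of outliers.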

The model aims to minimize errors while avoiding overfitting. After fitting the model to the training data, predictions are made, and evaluation metrics such as Mean Squared Error (MSE) and Root Mean Squared Error (RMSE) are used to measure model performance.