In [1]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))

housing = load_housing_data()

In [2]:
from sklearn.model_selection import train_test_split
strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2,  random_state=42)

In [3]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [4]:
from sklearn.svm import SVR  
import numpy as np
svm_model = SVR(kernel="linear", C=1.0)
housing_num = housing.select_dtypes(include=[np.number])
housing_num_im=imputer.fit_transform(housing_num)

In [5]:
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder(sparse_output=False)
housing_cat = housing[["ocean_proximity"]]
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)


In [6]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
housing_num_std_scaled = std_scaler.fit_transform(housing_num)

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn import set_config

num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
set_config(display='diagram')
num_pipeline

0,1,2
,steps,"[('simpleimputer', ...), ('standardscaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [8]:
from sklearn.compose import make_column_selector, make_column_transformer
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)

In [9]:
housing_prepared = preprocessing.fit_transform(housing)

In [21]:
housing_prepared[:10]

array([[ 1.17299302, -1.35041487,  0.42853749,  1.57055658,  1.3767992 ,
         1.0810111 ,  1.50750741,  0.37969797,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 1.26802809, -1.37853628, -1.47350948, -0.8094394 , -0.90071778,
        -0.64384214, -0.87870693,  0.42006824,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-1.3529389 ,  0.98834939, -0.04697426,  1.9942892 ,  2.44108192,
         1.36319568,  2.5938282 , -0.09231969,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-1.12785585,  0.75869118, -0.28473013,  0.64655814,  0.23083338,
         0.66126154,  0.39481962,  0.68299933,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 1.79322185, -1.08326143, -1.63201339, -1.11790554, -1.18180373,
        -1.20380217, -1.25575517, -1.25556038,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-1.46297949,  1.011783

In [10]:
svm_model.fit(housing_prepared, housing_labels)
svm_pipe = make_pipeline(preprocessing, svm_model)
svm_pipe.fit(housing, housing_labels)

0,1,2
,steps,"[('columntransformer', ...), ('svr', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [11]:
predict=svm_pipe.predict(strat_test_set.drop("median_house_value", axis=1))
predict[:10]

array([162295.88798127, 184459.08612828, 198259.74567282, 187975.29565309,
       183881.2458248 , 177998.48457249, 172700.8513016 , 181708.61569991,
       192151.18357278, 200642.65997558])

In [14]:
import numpy as np
def mean_squared_error_self(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

In [None]:
test_data=strat_test_set.drop('median_house_value',axis=1)
# test_data[:10]
test_value=strat_test_set['median_house_value'].copy()
test_value[:10]
type(test_value)

pandas.core.series.Series

In [19]:
predict[:10]

array([162295.88798127, 184459.08612828, 198259.74567282, 187975.29565309,
       183881.2458248 , 177998.48457249, 172700.8513016 , 181708.61569991,
       192151.18357278, 200642.65997558])

In [None]:

test_value=strat_test_set['median_house_value'].copy()
test_data=strat_test_set.drop('median_house_value',axis=1)
svm_rmse = mean_squared_error_self(test_value, predict)
svm_rmse

np.float64(13139175148.721407)

In [None]:
test_value.shape

(4128,)

In [None]:
predict.shape

(4128,)

In [None]:
!pip install -U scikit-learn

