<a href="https://colab.research.google.com/github/SoheilBadri2000/hoSKlearnKerasTensorflow/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Import

In [None]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import matplotlib.pyplot as plt

def load_housing_data():
  """Return the housing dataset as a DataFrame, fetching it on first use.

  The CSV is read from ``datasets/housing/housing.csv``. If that file is
  missing, it is extracted from ``datasets/housing.tgz``; if the tarball is
  missing too, it is downloaded first.

  Returns:
    pandas.DataFrame parsed from the extracted CSV.

  Raises:
    urllib.error.URLError: if the download is needed and fails.
    tarfile.TarError: if the tarball cannot be read.
  """
  datasets_dir = Path("datasets")
  tarball_path = datasets_dir / "housing.tgz"
  csv_path = datasets_dir / "housing" / "housing.csv"

  # Key on the CSV rather than the tarball: a tarball left behind by an
  # interrupted run (downloaded but never extracted) must still be extracted.
  if not csv_path.is_file():
    if not tarball_path.is_file():
      datasets_dir.mkdir(parents=True, exist_ok=True)
      url = "https://github.com/ageron/data/raw/main/housing.tgz"
      urllib.request.urlretrieve(url, tarball_path)
    with tarfile.open(tarball_path) as housing_tarball:
      if hasattr(tarfile, "data_filter"):
        # Extraction filters are available: refuse absolute paths, links
        # escaping the destination, device files, etc. in the downloaded archive.
        housing_tarball.extractall(path="datasets", filter="data")
      else:
        housing_tarball.extractall(path="datasets")
  return pd.read_csv(csv_path)

# Load the housing table once; later cells read this module-level `housing` frame.
housing = load_housing_data()

In [None]:
# Separate the regression target from the features. The guard keeps this cell
# idempotent: once the column has been dropped, re-running the cell is a no-op
# instead of raising KeyError on the missing "median_house_value" column.
if "median_house_value" in housing.columns:
  housing_labels = housing["median_house_value"].copy()
  housing = housing.drop("median_house_value", axis=1)

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
import numpy as np
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel


class ClusterSimilarity(BaseEstimator, TransformerMixin):
  """Transformer producing RBF-kernel similarities to KMeans cluster centers.

  ``fit`` runs KMeans on the input; ``transform`` returns, for every input row,
  its ``rbf_kernel`` value against each fitted cluster center.

  Parameters:
    n_clusters: number of clusters handed to KMeans.
    gamma: ``gamma`` handed to ``rbf_kernel`` in ``transform``.
    random_state: ``random_state`` handed to KMeans.
  """

  def __init__(self, n_clusters=10, gamma=1.,random_state=42): # no *args or **kwargs
    # Hyperparameters are stored unchanged under the same attribute names.
    self.random_state = random_state
    self.gamma = gamma
    self.n_clusters = n_clusters

  def fit(self, X, y=None, sample_weight=None):
    """Fit KMeans on X (optionally weighted) and return this transformer.

    ``y`` is accepted but not used.
    """
    self.kmeans_ = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
    self.kmeans_.fit(X, sample_weight=sample_weight)
    return self # always return self

  def transform(self, X):
    """Return the RBF kernel between the rows of X and the fitted cluster centers."""
    centers = self.kmeans_.cluster_centers_
    return rbf_kernel(X, centers, gamma=self.gamma)

  def get_feature_names_out(self, name=None):
    """Return one output name per cluster; the ``name`` argument is not used."""
    feature_names = []
    for i in range(self.n_clusters):
      feature_names.append(f"Cluster {i} similarity")
    return feature_names



def column_ratio(X):
  """Divide column 0 of X by column 1, keeping the result as a single column.

  List-style column indices are used so both operands stay two-dimensional.
  """
  numerator = X[:, [0]]
  denominator = X[:, [1]]
  return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
  """Feature-name callback: always reports a single output column named "ratio".

  Both arguments are accepted for the callback signature but are not used.
  """
  names_out = ["ratio"] # feature names out
  return names_out

def ratio_pipeline():
  """Build a fresh pipeline: median imputation, column ratio, standardization.

  A new pipeline object is created on every call.
  """
  imputer = SimpleImputer(strategy="median")
  ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
  scaler = StandardScaler()
  return make_pipeline(imputer, ratio_step, scaler)

# Pipeline for the columns routed to the "log" transformer:
# median imputation, np.log, then standardization.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Similarity-to-cluster-center features computed from the coordinates.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Applied to every column that no transformer below claims explicitly.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

log_columns = ["total_bedrooms", "total_rooms", "population", "households", "median_income"]
geo_columns = ["latitude", "longitude"]

# Each ratio entry gets its own pipeline instance from ratio_pipeline().
preprocessing = ColumnTransformer(
    transformers=[
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, log_columns),
        ("geo", cluster_simil, geo_columns),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,  # one column remaining: housing_median_age
)

In [None]:
# Fit the ColumnTransformer on the whole feature frame and transform it in the same call.
housing_prepared = preprocessing.fit_transform(housing)



In [None]:
# Wrap the transformed matrix in a DataFrame labelled with the transformer's
# output feature names, keeping the original row index.
prepared_columns = preprocessing.get_feature_names_out()
housing_prepared_df = pd.DataFrame(housing_prepared, columns=prepared_columns, index=housing.index)

# Bare last expression so the notebook renders the frame.
housing_prepared_df

Unnamed: 0,bedrooms__ratio,rooms_per_house__ratio,people_per_house__ratio,log__total_bedrooms,log__total_rooms,log__population,log__households,log__median_income,geo__Cluster 0 similarity,geo__Cluster 1 similarity,...,geo__Cluster 6 similarity,geo__Cluster 7 similarity,geo__Cluster 8 similarity,geo__Cluster 9 similarity,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN,remainder__housing_median_age
0,-1.029988,0.628559,-0.049597,-1.634226,-1.129255,-1.692016,-1.561311,1.858786,0.281136,1.424140e-07,...,0.599828,0.000353,1.563580e-17,0.030667,0.0,0.0,0.0,1.0,0.0,0.982143
1,-0.888897,0.327041,-0.092512,1.312871,1.648839,1.028461,1.440853,1.852703,0.282877,1.690647e-07,...,0.619006,0.000311,2.011404e-17,0.029230,0.0,0.0,0.0,1.0,0.0,-0.607019
2,-1.291686,1.155620,-0.025843,-1.103136,-0.449227,-1.107018,-1.097678,1.567130,0.268964,1.617171e-07,...,0.618766,0.000307,1.788255e-17,0.027277,0.0,0.0,0.0,1.0,0.0,1.856182
3,-0.449613,0.156966,-0.050329,-0.811591,-0.636925,-0.947530,-0.807218,1.032566,0.263215,1.534794e-07,...,0.614666,0.000313,1.621955e-17,0.026783,0.0,0.0,0.0,1.0,0.0,1.856182
4,-0.639087,0.344711,-0.085616,-0.571285,-0.311482,-0.930649,-0.578372,0.218044,0.263215,1.534794e-07,...,0.614666,0.000313,1.621955e-17,0.026783,0.0,0.0,0.0,1.0,0.0,1.856182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,0.165994,-0.155023,-0.049110,-0.174260,-0.280761,-0.385615,-0.247885,-1.698929,0.216816,7.578801e-11,...,0.003357,0.003884,8.886321e-20,0.941756,0.0,1.0,0.0,0.0,0.0,-0.289187
20636,0.021671,0.276881,0.005021,-1.427361,-1.439476,-1.556094,-1.697841,-0.649557,0.212251,4.800116e-11,...,0.003854,0.006487,3.210384e-20,0.984476,0.0,1.0,0.0,0.0,0.0,-0.845393
20637,0.021134,-0.090318,-0.071735,0.182203,0.122265,-0.148116,0.122682,-1.516731,0.245294,8.058617e-11,...,0.005102,0.005938,5.720885e-20,0.980104,0.0,1.0,0.0,0.0,0.0,-0.924851
20638,0.093467,-0.040211,-0.091225,-0.051560,-0.133391,-0.563455,-0.171521,-1.317404,0.240178,5.806173e-11,...,0.005801,0.008737,2.619651e-20,0.992376,0.0,1.0,0.0,0.0,0.0,-0.845393



# Q1: Grid search over SVR hyperparameters

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline

# NOTE(review): the `from sklearn import preprocessing` import in this cell rebinds
# the name `preprocessing`, which earlier cells use for the ColumnTransformer.
# Re-running those earlier cells' consumers after this cell will hit the sklearn
# module instead of the transformer.

# Linear kernel: 8 values of C.
linear_grid = {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]}

# RBF kernel: 7 values of C x 6 values of gamma = 42 combinations.
rbf_grid = {
    'kernel': ['rbf'],
    'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
    'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
}

# 8 + 42 = 50 candidates; with cv=5 that is 250 fits.
param_grid = [linear_grid, rbf_grid]

svm_reg = SVR()
grid_search = GridSearchCV(
    estimator=svm_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=4,
)

# NOTE(review): housing_prepared was produced by fitting the preprocessing on all
# rows before cross-validation, so every fold's validation rows took part in that
# fit — confirm this is acceptable for the reported scores.
grid_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
