In [None]:
import os
import tarfile
import urllib

# Base URL that the dataset archive URL is built from.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory (relative to the working directory) used for the archive and its contents.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the housing tarball.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [None]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and the extracted files;
        created if it does not exist.

    Errors raised by the download or by ``tarfile`` propagate unchanged.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape `housing_path` when the interpreter supports filters.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [None]:
# Download and unpack the dataset using the default URL and target directory.
fetch_housing_data()

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Load the raw dataset and preview its first rows.
housing = load_housing_data()
housing.head()

In [None]:
# Column dtypes and non-null counts (reveals columns with missing values).
housing.info()

In [None]:
# Frequency of each value in the `ocean_proximity` column.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics of the numeric columns.
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# One 50-bin histogram per numeric column.
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
import numpy as np
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a train part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected positionally with ``iloc``.
    test_ratio : float
        Fraction of rows (between 0 and 1) that goes to the test part; the
        test size is ``int(len(data) * test_ratio)``.
    random_state : int or None, optional
        Seed for a dedicated NumPy generator, making the split reproducible.
        With ``None`` (default) NumPy's global random state is used, as before.

    Returns
    -------
    tuple of (train, test) DataFrames.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    # A negative or >1 ratio would otherwise slice the index array in a
    # surprising way instead of failing.
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
    

In [None]:
# Random 80/20 split; no seed is passed, so the result depends on NumPy's global random state.
train_set, test_set = split_train_test(housing, 0.2)

In [None]:
# Display the training part of the random split.
train_set

In [None]:
import numpy as np
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into (train, test) by hashing the values of ``id_column``.

    A row goes to the test part when ``test_set_check`` returns True for its
    identifier; all remaining rows form the train part.
    """
    def _belongs_to_test(id_):
        return test_set_check(id_, test_ratio)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

# Build the synthetic identifier on a separate frame. Writing it into
# `housing` would carry the helper column into every later split and feed it
# to the models as an extra numeric feature.
housing_with_id = housing.assign(
    id=housing["longitude"] * 1000 + housing["latitude"]
)

train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

In [None]:
# Sizes of both parts and the fraction of rows that ended up in the test part.
len(train_set), len(test_set), len(test_set)/len(housing)

In [None]:
import pandas as pd 

# Bucket `median_income` into five intervals; the resulting column is the
# stratification key for the stratified splits further down.
housing['income_cat'] = pd.cut(housing['median_income'], bins= [0., 1.5, 3.0, 4.5, 6, np.inf])
housing

In [None]:
# Bar chart of row counts per income bucket, ordered by bucket.
ax = housing['income_cat'].value_counts().sort_index().plot.bar()
ax.set_xlabel("income_cat")
ax.set_ylabel("count")
ax.set_title("Income Category Distribution")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit as SS
# Ten seeded stratified 80/20 splits keyed on `income_cat`.
splitter = SS(n_splits=10, test_size=0.2, random_state=42)
# Collects one [train, test] pair per split.
strat_splits = []

for train_index, test_index in splitter.split(housing, housing['income_cat']):
    # The splitter yields positional indices, hence `iloc`.
    strat_train_set_n = housing.iloc[train_index]
    strat_test_set_n = housing.iloc[test_index]
    strat_splits.append([strat_train_set_n, strat_test_set_n])

In [None]:
# Single seeded, stratified 80/20 split; these two frames are used for the rest of the notebook.
strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2, stratify=housing["income_cat"],random_state=42)

In [None]:
# `income_cat` was only needed for stratification. Rebinding instead of
# dropping in place avoids mutating the objects returned by the splitter, and
# `errors="ignore"` lets the cell be re-run without raising a KeyError.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [None]:
# Explore on a copy so nothing done to `housing` can alter the training set.
housing = strat_train_set.copy()
# Geographic scatter; the low alpha makes dense areas stand out.
housing.plot(kind='scatter', x='longitude', y = 'latitude', grid=True, alpha = 0.2)


In [None]:
import folium
import branca.colormap as cm

# Work on a copy of the training set for the map and the exploration that follows.
housing = strat_train_set.copy()

# Centre the map on the mean coordinates of the data.
center = [housing["latitude"].mean(), housing["longitude"].mean()]
m = folium.Map(location=center, zoom_start=6, tiles="OpenStreetMap")

# Colour scale spanning the observed range of `median_house_value`.
vmin = housing["median_house_value"].min()
vmax = housing["median_house_value"].max()
colormap = cm.linear.YlOrRd_09.scale(vmin, vmax) 
colormap.caption = "Median house value"
colormap.add_to(m)

# Marker radius grows with sqrt(population) and is clipped to the range [1, 12].
pop = housing["population"].fillna(0).to_numpy()
radius = np.sqrt(pop) / 30  
radius = np.clip(radius, 1, 12)

# One circle per row, filled with the colour of its house value.
for (lat, lon, val, r) in zip(
    housing["latitude"], housing["longitude"],
    housing["median_house_value"], radius
):
    folium.CircleMarker(
        location=[lat, lon],
        radius=float(r),
        # NOTE(review): color=None with opacity=0.0 presumably hides the outline — confirm against folium docs.
        color=None,
        fill=True,
        fill_color=colormap(val),
        fill_opacity=0.35,
        opacity=0.0,
    ).add_to(m)

# Last expression: render the map as the cell output.
m

In [None]:
# Pairwise correlations between the numeric columns only.
corr_matrix = housing.corr(numeric_only=True)

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Columns shown in the pairwise scatter matrix.
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(housing[attributes], figsize=(12,8))
# `plt.show` without parentheses only echoed the function object; call it.
plt.show()

In [None]:
# Scatter of median income against the target; the low alpha reveals dense regions.
housing.plot(kind='scatter', x = 'median_income', y= 'median_house_value', grid= True, alpha=0.1)

In [None]:
# Derived ratio columns, added to the exploration frame `housing`.
housing['rooms_per_house'] = housing['total_rooms']/ housing['households']
housing['bedroom_ratio'] = housing['total_bedrooms']/ housing['total_rooms']
housing['people_per_house'] = housing['population']/ housing['households']

In [None]:
# Recompute the correlations now that the ratio columns exist.
corr_matrix = housing.corr(numeric_only=True)
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Start again from the training set: features without the target column...
housing = strat_train_set.drop('median_house_value',axis=1)
# ...and the target as an independent copy.
housing_labels = strat_train_set['median_house_value'].copy()

In [None]:
from sklearn.impute import SimpleImputer
# Median imputation, fitted on the numeric columns only.
imputer = SimpleImputer(strategy='median')
housing_num = housing.select_dtypes(include= [np.number])
imputer.fit(housing_num)

In [None]:
# Per-column statistics learned by the imputer (strategy is 'median').
imputer.statistics_

In [None]:
# Medians computed directly with pandas, for comparison with the imputer's statistics.
housing_num.median().values

In [None]:
# Apply the fitted imputer to the numeric training columns.
X = imputer.transform(housing_num)

In [None]:
# Double brackets keep the categorical column as a one-column DataFrame.
housing_cat = housing[['ocean_proximity']]
housing_cat.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column; handle_unknown="ignore" is set so
# categories not seen during fit do not raise at transform time.
encoder = OneHotEncoder(handle_unknown="ignore")
housing_cat_1hot = encoder.fit_transform(housing_cat) 

In [None]:
# Names of the generated one-hot columns.
encoder.get_feature_names_out()

In [None]:
# One-hot block as a DataFrame aligned on the training index.
cat_cols = encoder.get_feature_names_out(housing_cat.columns)
housing_cat_df = pd.DataFrame(
    housing_cat_1hot.toarray(),
    columns=cat_cols,
    index=housing_cat.index
)

# Numeric block: run it through the fitted imputer so missing values are
# replaced by the training medians. Previously the raw columns were joined
# and NaNs flowed into the scalers that consume `housing_prepared`.
housing_num = housing.drop(columns=["ocean_proximity"])
housing_num = pd.DataFrame(
    imputer.transform(housing_num),
    columns=housing_num.columns,
    index=housing_num.index,
)
housing_prepared = housing_num.join(housing_cat_df)

housing_prepared.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every column of `housing_prepared` to [-1, 1]. Despite the `_num_`
# in the result name, the input also contains the one-hot columns.
min_max_scaler = MinMaxScaler(feature_range=(-1,1))
housing_num_min_max_scaled = min_max_scaler.fit_transform(housing_prepared)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every column of `housing_prepared`.
std_scaler = StandardScaler()
housing_num_std_scaled = std_scaler.fit_transform(housing_prepared)
# NOTE(review): this shows the shape of the min-max array, not of the
# standardised one computed just above — confirm which was intended.
housing_num_min_max_scaled.shape

In [None]:
# Rebind X to the min-max-scaled training matrix.
X = housing_num_min_max_scaled

In [None]:
# Separate the test features from the test target.
housing_test = strat_test_set.drop("median_house_value", axis=1)
test_labels = strat_test_set["median_house_value"].copy()

# Numeric block: fill missing values with the medians learned on the training
# data. Previously the raw columns were used and NaNs reached the scaler.
housing_test_num = housing_test.drop(columns=["ocean_proximity"])
housing_test_num = pd.DataFrame(
    imputer.transform(housing_test_num),
    columns=housing_test_num.columns,
    index=housing_test_num.index,
)

housing_test_cat = housing_test[["ocean_proximity"]]

# Only `transform` is called: the encoder and scaler keep what they learned
# from the training data.
test_cat_1hot = encoder.transform(housing_test_cat)
test_cat_df = pd.DataFrame(
    test_cat_1hot.toarray(),
    columns=encoder.get_feature_names_out(housing_test_cat.columns),
    index=housing_test_cat.index
)

housing_test_prepared = housing_test_num.join(test_cat_df)

housing_test_std_scaled = std_scaler.transform(housing_test_prepared)

In [None]:
# Shape of the standardised training matrix.
housing_num_std_scaled.shape

In [None]:
# Shape of the standardised test matrix.
housing_test_std_scaled.shape

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Features and target for both splits, taken straight from the stratified sets.
X_train = strat_train_set.drop("median_house_value", axis=1)
y_train = strat_train_set["median_house_value"].copy()

X_test  = strat_test_set.drop("median_house_value", axis=1)
y_test  = strat_test_set["median_house_value"].copy()

# Every column except `ocean_proximity` is handled as numeric.
num_cols = X_train.drop(columns=["ocean_proximity"]).columns
cat_cols = ["ocean_proximity"]

# Numeric columns: median imputation then standardisation.
# Categorical column: one-hot encoding.
preprocess = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

In [None]:
# Fit the preprocessing on the training features only, then apply it to the test features.
X_train_prepared = preprocess.fit_transform(X_train)
X_test_prepared  = preprocess.transform(X_test)

X_train_prepared.shape, X_test_prepared.shape

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline: ordinary least squares on the prepared training matrix.
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train)

# RMSE on the test split (square root of the mean squared error).
preds = lin_reg.predict(X_test_prepared)
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit a ridge model for several regularisation strengths and print the test RMSE of each.
# NOTE(review): the alphas are compared on the test split; if one is picked
# from these numbers, the test RMSE is no longer an unbiased estimate —
# consider cross-validation on the training data instead.
for a in [0.1, 1.0, 10.0, 100.0]:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train_prepared, y_train)
    preds = ridge.predict(X_test_prepared)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(a, rmse)

In [None]:
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer


In [None]:
def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, keeping the result as a single 2-D column."""
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature name list for the ratio transformer.

    Both arguments are ignored; the result is always a single name.
    """
    names = ["ratio"]
    return names

def ratio_pipeline():
    """Build a fresh pipeline: median imputation, column ratio, then standardisation."""
    steps = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)

# Median imputation followed by log1p; output feature names mirror the input
# names ("one-to-one"). This pipeline contains no scaling step.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log1p, feature_names_out="one-to-one"), 
)

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Fit KMeans on the input and expose RBF similarities to the cluster centres.

    Parameters
    ----------
    n_clusters : int
        Number of KMeans clusters, and therefore of output columns.
    gamma : float
        ``gamma`` passed to ``rbf_kernel``.
    random_state : int or None
        Seed forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        # Hyperparameters are stored unchanged under their own names.
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit the underlying KMeans model on ``X``; ``y`` is not used."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF kernel between the rows of ``X`` and the fitted centres."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, input_features=None):
        """Return one name per cluster; ``input_features`` is ignored."""
        names = []
        for i in range(self.n_clusters):
            names.append(f"Cluster {i} similarity")
        return names

In [None]:
# Rebuild features and target from the stratified sets so this part of the
# notebook does not rely on the variables of the earlier preprocessing cells.
X_train = strat_train_set.drop("median_house_value", axis=1)
y_train = strat_train_set["median_house_value"].copy()

X_test  = strat_test_set.drop("median_house_value", axis=1)
y_test  = strat_test_set["median_house_value"].copy()

In [None]:
# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

# Seeded so the cluster centres are reproducible.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)

preprocessing = ColumnTransformer(
    transformers=[
        # Ratio features: the first listed column is divided by the second.
        ("bedrooms_ratio", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),

        # log1p of the listed count and income columns.
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),

        # Similarity of each row's coordinates to the KMeans cluster centres.
        ("geo", cluster_simil, ["latitude", "longitude"]),

        # All object-dtype columns go through the categorical pipeline.
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    # Columns not named above are median-imputed and standardised.
    remainder=make_pipeline(
        SimpleImputer(strategy="median"),
        StandardScaler()
    )
)

In [None]:
# Fit on the training data only (the target is passed along to fit_transform), then transform the test data.
X_train_prepared = preprocessing.fit_transform(X_train, y_train) 
X_test_prepared  = preprocessing.transform(X_test)

X_train_prepared.shape, X_test_prepared.shape

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Linear regression on the engineered feature matrix.
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train)

# RMSE on the test split.
preds = lin_reg.predict(X_test_prepared)
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Preprocessing and regressor chained in one estimator, so raw feature frames
# can be passed directly to fit and predict.
full_model = Pipeline([
    ("preprocess", preprocessing),
    ("model", LinearRegression())
])

full_model.fit(X_train, y_train)
# RMSE on the test split.
preds = full_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse