In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    if full_paths:
        print("\n".join(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import pandas as pd
import numpy as np

# **Load Data**

In [None]:
HOUSING_PATH = "/kaggle/input/california-housing-prices/"

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``. Defaults to ``HOUSING_PATH``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on ``<housing_path>/housing.csv``.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
housing = load_housing_data()
housing.head()

# **Look at Data**

In [None]:
housing.info()

Notes:
1. There are some null "total_bedrooms" values
2. "ocean_proximity" is of type object

In [None]:
housing["ocean_proximity"].value_counts()

Notes: very skewed data.

In [None]:
housing.describe()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
housing.hist(bins=50, figsize=(20,15))
plt.show()

# **Create Test Set**

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

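Purely random sampling like this introduces a risk of sampling bias: the test set may not be representative of an important attribute such as median income. The cells below therefore build a stratified split instead.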

In [None]:
# Bucket median_income into five categories labelled 1-5 (bin edges 0, 1.5, 3.0, 4.5, 6.0, inf).
# The resulting "income_cat" column is what the stratified split below stratifies on.
housing["income_cat"] = pd.cut(housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf], labels=[1,2,3,4,5])

In [None]:
housing["income_cat"].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split on "income_cat", so train and test keep the same
# income-category proportions as the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields *positional* row numbers, so select with .iloc. Using .loc only
# works by coincidence while the frame still has its default 0..n-1 index.
# .copy() makes both sets independent frames that later cells can modify safely.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [None]:
strat_train_set["income_cat"].value_counts() / len(strat_train_set)

In [None]:
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

Now remove "income_cat" so the data is back to its original state

In [None]:
# Remove the helper column from both sets by rebinding the names rather than
# mutating in place: in-place drops on frames derived from a slice can trigger
# SettingWithCopyWarning. errors="ignore" keeps the cell safe to re-run once the
# column is already gone.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

# **Discover and Visualize Data to get insights**

In [None]:
housing = strat_train_set.copy()

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# Geographic scatter plot: marker size is population / 100 and marker colour is
# median_house_value on the "jet" colormap, with a colorbar for the colour scale.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, 
             s=housing["population"]/100, label="population", figsize=(10,7), 
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend()

A clustering algorithm using population density might be useful; it could be used to add a new attribute such as "distance to cluster centroid".

In [None]:
# Correlate numeric columns only: "ocean_proximity" holds strings, and pandas >= 2.0
# raises on non-numeric columns instead of silently skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
corr_matrix["median_house_value"].sort_values(ascending=False)

This misses out on non-linear correlations

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize = (12, 8))

The main diagonal shows a histogram of each attribute (plotting an attribute against itself would just give a straight line).

In [None]:
housing.plot(kind="scatter", x="median_income", y = "median_house_value", alpha=0.1)

Some "quirks" are showing. There are some horizontal lines around:
1. 500K
2. 450K
3. 350K
4. 280K
5. few more below that

Explore some attribute combinations

In [None]:
# New column name -> (numerator column, denominator column); each new attribute is
# the element-wise ratio of the two existing columns.
ratio_columns = {
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_rooms": ("total_bedrooms", "total_rooms"),
    "population_per_household": ("population", "households"),
}
for new_column, (numerator, denominator) in ratio_columns.items():
    housing[new_column] = housing[numerator] / housing[denominator]

In [None]:
# Recompute correlations, now including the new ratio attributes. Restrict to numeric
# columns: "ocean_proximity" holds strings, and pandas >= 2.0 raises on non-numeric
# columns instead of silently skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
corr_matrix["median_house_value"].sort_values(ascending=False)

# **Prepare Data for ML algorithm**

In [None]:
# Start again from the stratified training set, separating the predictors (every
# column except the target) from the labels (the target column "median_house_value").
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

Data Cleaning: total_bedrooms missing values

In [None]:
# Three manual ways to deal with the missing "total_bedrooms" values, kept for
# reference only (SimpleImputer is used in the following cells instead).
# option 1: drop the rows where "total_bedrooms" is missing
#housing.dropna(subset=["total_bedrooms"])
# option 2: drop the whole "total_bedrooms" column
#housing.drop("total_bedrooms", axis=1)
# option 3: fill the missing values with the column median
#median = housing["total_bedrooms"].median()
#housing["total_bedrooms"].fillna(median, inplace=True)

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [None]:
housing_num = housing.drop("ocean_proximity", axis=1)

In [None]:
imputer.fit(housing_num)

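The imputer calculates the median of every numeric attribute, not just "total_bedrooms", because we can't assume that the test set (or new data) won't have missing values in other columns.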

In [None]:
imputer.statistics_

In [None]:
housing_num.median().values

In [None]:
X = imputer.transform(housing_num)

This gives a numpy array

In [None]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

**Handling Text and Categorical Attributes:**

In [None]:
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
ord_enc = OrdinalEncoder()
housing_cat_encoded = ord_enc.fit_transform(housing_cat)

In [None]:
housing_cat_encoded[:10]

In [None]:
ord_enc.categories_

Problem: ML algorithms may assume that two nearby values are more similar than two distant ones, which is not appropriate for "ocean_proximity". That assumption is fine for ordered categories such as ["bad", "good", "very good", "excellent"].

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
cat_enc = OneHotEncoder()

In [None]:
housing_cat_1hot = cat_enc.fit_transform(housing_cat)

In [None]:
housing_cat_1hot

In [None]:
housing_cat_1hot.toarray()

In [None]:
cat_enc.categories_

Note:
If a categorical attribute has many categories (e.g. career), one-hot encoding produces a large number of input features; consider replacing it with a useful numerical feature instead. Example: replace "ocean_proximity" with "distance_from_ocean", or "country_code" with "country_gdp_per_capita" and "country_population".

Alternatively, we can replace it with a learnable low-dimension vector called an embedding. This is an example of representation learning.

**Custom Transformers:**

Class needs to implement three methods:
1. fit()
2. transform()
3. fit_transform()

We get number 3 (fit_transform()) for free if we use TransformerMixin as a base class. Adding BaseEstimator as a base class (while avoiding \*args and \*\*kwargs in the constructor) also gives two extra methods, get_params() and set_params(), which are useful for automatic hyperparameter tuning.

Example that adds the combined attributes discussed above:

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Column positions read by CombinedAttributesAdder.transform().
# NOTE(review): presumably these match the column order of the numeric housing
# frame (total_rooms, total_bedrooms, population, households) - confirm if columns change.
rooms_idx, bedrooms_idx, population_idx, households_idx = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio attributes as extra columns on the right of a 2-D array.

    Always appends rooms-per-household and population-per-household, in that
    order. When ``add_bedrooms_per_room`` is true, bedrooms-per-room is appended
    as a third extra column.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to append the bedrooms-per-room column.
    """

    def __init__(self, add_bedrooms_per_room = True): # explicit keyword only, no *args / **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer has no state to learn."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended.

        ``X`` is indexed as ``X[:, column]`` using the module-level ``*_idx``
        positions, so it must support 2-D positional indexing.
        """
        rooms = X[:, rooms_idx]
        households = X[:, households_idx]
        extra_columns = [rooms / households, X[:, population_idx] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_idx] / rooms)
        return np.c_[(X, *extra_columns)]

In [None]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

Transformer for feature selection using CART

Note: NNs often expect an input value ranging from 0 to 1

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Numeric preprocessing, applied in order: median imputation, the combined ratio
# attributes from CombinedAttributesAdder (default settings), then standard scaling.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

Every step except the last must be a transformer (i.e. it must have a fit_transform() method); the last step can be any estimator.

In [None]:
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

In [None]:
# Route columns by type: the numeric columns listed in num_attribs go through
# num_pipeline, and the columns in cat_attribs are one-hot encoded.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

In [None]:
housing_prepared = full_pipeline.fit_transform(housing)

By default, unmentioned columns will be dropped. You can set the "remainder" hyperparameter to any transformer, or to "passthrough", if you want those columns handled differently.

In [None]:
type(housing_prepared)

In [None]:
housing_prepared

# **Select and Train a Model**

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

In [None]:
print("Labels:", list(some_labels))

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# RMSE of the linear model on housing_prepared, the same data it was fitted on,
# so this measures fit to the training set rather than generalization.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

In [None]:
lin_rmse

Most districts’ median_house_value values range between 120,000 and 265,000, so a typical prediction error of 68,628 is not very satisfying. This is an example of a model underfitting the training data.

The main ways to fix underfitting are to:
1. select a more powerful model
2. feed the training algorithm with better features
3. reduce the constraints on the model (in case of regularization)

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Seed the tree so that it, and every score derived from it, is reproducible on
# Restart & Run All. This matches the random_state=42 used for the data splits.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

**Better evaluation using cross-validation**

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)

In [None]:
tree_rmse_score = np.sqrt(-scores)

In [None]:
def display_scores(s):
    """Print the scores followed by their mean and standard deviation.

    Parameters
    ----------
    s : array-like with ``mean()`` and ``std()`` methods
        The scores to summarise, e.g. the per-fold RMSE values.
    """
    # Lazy callables keep the original evaluate-then-print order, line by line.
    report = (
        ("Scores:", lambda: s),
        ("Mean:", lambda: s.mean()),
        ("Standard Deviation:", lambda: s.std()),
    )
    for label, compute in report:
        print(label, compute())

In [None]:
display_scores(tree_rmse_score)

In [None]:
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)

In [None]:
lin_rmse_score = np.sqrt(-scores)

In [None]:
display_scores(lin_rmse_score)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Seed the forest so that the model and its cross-validation scores are reproducible
# on Restart & Run All. This matches the random_state=42 used for the data splits.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_score = np.sqrt(-scores)
display_scores(forest_rmse_score)

In [None]:
'''
import joblib
joblib.dump(my_model, "my_model.pkl")
my_model_loaded = joblib.load("my_model.pkl")
'''

Fine tuning models

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Two sub-grids: 3 x 5 = 15 combinations of n_estimators / max_features, plus
# 2 x 3 = 6 more combinations with bootstrap explicitly set to False.
param_grid = [
    {'n_estimators': [3,10,30], 'max_features': [2,4,6,8,10]},
    {'bootstrap':[False], 'n_estimators': [3,10], 'max_features':[2,3,4]},
]

# Seed the forest so the selected hyperparameters and scores are reproducible between
# runs. Without a seed, best_params_ can change from one execution to the next.
forest_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring = 'neg_mean_squared_error', return_train_score=True)

In [None]:
grid_search.fit(housing_prepared, housing_labels)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
# Show the cross-validated RMSE of each hyperparameter combination.
# Scores are negative MSE, hence the sign flip before the square root.
cvres = grid_search.cv_results_
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for candidate_rmse, params in zip(rmse_per_candidate, cvres["params"]):
    print(candidate_rmse, params)

Grid search can also be used for finding out whether or not to add some features

Can use RandomizedSearchCV instead

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_

In [None]:
feature_importances

In [None]:
#display next to corresponding attribs
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attribs = num_attribs + extra_attribs + cat_one_hot_attribs

In [None]:
sorted(zip(feature_importances, attribs), reverse=True)

Can drop features with lower importance

# **Evaluate on test set**

In [None]:
# Evaluate the best model found by the grid search on the held-out test set.
final_model = grid_search.best_estimator_

# Split the test set into predictors and labels, mirroring the training set.
x_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform(), not fit_transform(): the preprocessing must not be re-fitted on test data.
x_prepared = full_pipeline.transform(x_test)

final_predictions = final_model.predict(x_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
final_rmse

In [None]:
from scipy import stats

Compute a 95% confidence interval for the generalization error using scipy.stats.t.interval()

In [None]:
confidence = 0.95

# Per-sample squared errors on the test set.
squared_errors = (final_predictions - y_test)**2

# t-interval around the mean squared error; the square root turns it into an
# interval for the RMSE.
degrees_of_freedom = len(squared_errors) - 1
mse_interval = stats.t.interval(
    confidence,
    degrees_of_freedom,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_interval)

# **3. Question: Try adding a transformer in the preparation pipeline to select only the most important attributes.**

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    """Return the positions of the ``k`` largest values of ``arr``, in ascending order.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of scores (e.g. feature importances).
    k : int
        Number of positions to select; must satisfy ``0 <= k <= len(arr)``.

    Returns
    -------
    numpy.ndarray
        Sorted integer indices of the ``k`` largest entries (empty when ``k == 0``).

    Raises
    ------
    ValueError
        If ``k`` is negative or larger than the number of elements.
    """
    values = np.asarray(arr)
    if not 0 <= k <= values.size:
        raise ValueError(
            "k must be between 0 and %d, got %r" % (values.size, k)
        )
    if k == 0:
        # A slice of [-0:] selects the whole array, so "select nothing" needs its own branch.
        return np.array([], dtype=np.intp)
    # argpartition places the k largest values in the last k slots without a full sort.
    return np.sort(np.argpartition(values, -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the ``k`` columns with the highest feature importance.

    Parameters
    ----------
    feature_importances : array-like
        One importance score per input column.
    k : int
        Number of columns to keep.

    Attributes
    ----------
    feature_indices_ : numpy.ndarray
        Sorted column indices selected by ``fit``.
    """

    def __init__(self, feature_importances, k):
        # Store each argument unmodified under its own parameter name: get_params(),
        # set_params() and clone() look hyperparameters up by that name, which is
        # what lets GridSearchCV tune "k". No printing here, because the constructor
        # runs again on every clone.
        self.feature_importances = feature_importances
        self.k = k

    def fit(self, X, y=None):
        """Compute the indices of the top ``self.k`` features and return ``self``."""
        # Use self.k, not a global k, so that set_params(k=...) actually takes effect.
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self

    def transform(self, X):
        """Return only the selected columns of ``X``."""
        return X[:, self.feature_indices_]

In [None]:
k = 10
top_k_feature_indices = indices_of_top_k(feature_importances, k)
top_k_feature_indices

In [None]:
np.array(attribs)[top_k_feature_indices]

In [None]:
sorted(zip(feature_importances, attribs), reverse=True)[:k]

In [None]:
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

In [None]:
housing_prepared.shape

In [None]:
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)

In [None]:
housing_prepared_top_k_features.shape

# **4.Question: Try creating a single pipeline that does the full data preparation plus the final prediction.**

In [None]:
type(grid_search.best_estimator_)

In [None]:
# Single end-to-end pipeline: preprocessing, top-k feature selection, then a random
# forest configured with the best hyperparameters found by the earlier grid search.
# The forest is seeded so predictions are reproducible; best_params_ is unpacked
# last so it would override the seed if it ever contained one.
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances=feature_importances, k=k)),
    ('forest_reg', RandomForestRegressor(**{'random_state': 42, **grid_search.best_params_}))
])

In [None]:
k

In [None]:
prepare_select_and_predict_pipeline

In [None]:
prepare_select_and_predict_pipeline.fit(housing, housing_labels)

In [None]:
# Take the same first four rows for both inputs and labels so the two printed lists
# line up. The previous "ome_data" typo left the stale 5-row some_data in use.
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

print("Predictions:\t", prepare_select_and_predict_pipeline.predict(some_data))
print("Labels:\t\t", list(some_labels))

# **5. Question: Automatically explore some preparation options using GridSearchCV.**

In [None]:
# Search over the imputation strategy and the number of features kept.
# range() excludes its upper bound, so add 1 to include the "keep every feature" case.
param_grid = {
    'preparation__num__imputer__strategy': ['mean', 'median'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1))
}

In [None]:
grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

In [None]:
grid_search_prep.fit(housing, housing_labels)

In [None]:
grid_search_prep.best_params_