# In [None]:  (notebook cell prompt left over from the export)
import os
import tarfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from six.moves import urllib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into *housing_path*.

    The download is skipped when ``housing.csv`` is already present in
    *housing_path*.  Previously the mere existence of the directory was
    enough to skip it, so an empty or half-populated directory was never
    filled in.

    Args:
        housing_url: URL of the ``.tgz`` archive to retrieve.
        housing_path: Directory that receives ``housing.tgz`` and its
            extracted contents; created if it does not exist.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    if os.path.isfile(csv_path):
        return  # data already available, nothing to fetch

    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)

    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # NOTE(review): extractall() trusts the member paths inside the archive;
    # only point housing_url at a source you trust.
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from *housing_path* into a pandas DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Column positions read by CombinedAttributesAdder.transform.
# NOTE(review): presumably these match the column order of the numeric
# housing array fed to the transformer -- confirm against the dataset.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns of a 2-D array.

    Two ratios are always appended (rooms per household, population per
    household); a third (bedrooms per room) is appended when
    ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X with the derived ratio columns appended on the right."""
        households = X[:, household_ix]
        derived = [
            X[:, rooms_ix] / households,
            X[:, population_ix] / households,
        ]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # Indexing np.c_ with a tuple is the same as listing the items inline.
        return np.c_[tuple([X] + derived)]

        
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the named columns of a DataFrame and return their values."""

    def __init__(self, attribute_names):
        # Column labels used to index the incoming frame in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names].values``."""
        selected = X[self.attribute_names]
        return selected.values

    
class MyLabelBinarizer(TransformerMixin):
    """Wrap ``LabelBinarizer`` behind ``fit(x, y)`` / ``transform(x, y)``.

    The second argument of both methods is accepted and ignored.
    NOTE(review): presumably this exists so the encoder can sit inside a
    Pipeline, which passes ``(X, y)`` -- confirm.
    """

    def __init__(self, *args, **kwargs):
        # Every argument is forwarded untouched to LabelBinarizer.
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on *x* and return this wrapper."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return the wrapped encoder's transform of *x*."""
        binarized = self.encoder.transform(x)
        return binarized
# Load the raw data and derive an income bucket used only for stratification.
housing = load_housing_data()
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Cap the bucket at 5.  Assign the result back: calling
# .where(..., inplace=True) on a selected column is chained assignment and
# does not reliably modify the frame (it does nothing under copy-on-write).
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so this loop body runs exactly once.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # Drop the helper column while building each subset, so no in-place
    # mutation of a slice of `housing` is needed afterwards.
    strain_set = housing.loc[train_index].drop("income_cat", axis=1)
    stest_set = housing.loc[test_index].drop("income_cat", axis=1)

# From here on `housing` is the training predictors only.
housing = strain_set.drop("median_house_value", axis=1)
housing_labels = strain_set["median_house_value"].copy()


# Stand-alone run of the attribute adder on the raw values (without the
# bedrooms-per-room column).
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# Split the predictors into the categorical column and everything else.
housing_cat = housing["ocean_proximity"]
housing_num = housing.drop("ocean_proximity", axis=1)
cat_attribs = ["ocean_proximity"]
num_attribs = list(housing_num)

# Numeric branch: select columns, impute medians, add ratios, standardise.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the column and binarize its labels.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', MyLabelBinarizer()),
])

# Both branches run on the same input; their outputs are concatenated.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])


def display_scores(scores):
    """Print *scores*, then its mean and its standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    print("Scores:", scores)
    mean_score = scores.mean()
    print("Mean:", mean_score)
    spread = scores.std()
    # The label contains a tab character, exactly as in the original output.
    print("Standard\tdeviation:", spread)
    
# Fit the preprocessing pipeline on the training predictors.
housing_prepared = full_pipeline.fit_transform(housing)

# Small sample pushed through the already-fitted pipeline.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

# Linear regression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# The prediction line was commented out, which left `housing_predictions`
# undefined for the error computation below.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)  # RMSE measured on the training data itself


# Decision tree regression
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)  # RMSE measured on the training data itself

# Cross-validation for linear regression with scikit-learn.
# The scorer returns negated MSE, hence the sign flip before the root.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

# Cross-validation for the decision tree with scikit-learn
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
# Report the tree scores like the other models; they were computed but
# never shown.
display_scores(tree_rmse_scores)



    

# Random forest regression
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)  # RMSE measured on the training data itself
print(forest_rmse)  # was `forest_rsme`, an undefined name

# Random forest cross-validation with scikit-learn.
# The scorer returns negated MSE, hence the sign flip before the root.
scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)



# Grid search hyperparameter tuning

# `param_grid` was never defined in this file.
# NOTE(review): the values below are a starting grid -- tune as needed.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
# Fit once; the original fused a second fit call onto the constructor line
# (a syntax error) and then fitted again.
grid_search.fit(housing_prepared, housing_labels)

feature_importances = grid_search.best_estimator_.feature_importances_
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]

# `encoder` was undefined; fetch the LabelBinarizer that was fitted inside
# the categorical branch of the feature union.
fitted_cat_pipeline = dict(full_pipeline.transformer_list)["cat_pipeline"]
encoder = fitted_cat_pipeline.named_steps["label_binarizer"].encoder
cat_one_hot_attribs = list(encoder.classes_)

attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# Print the ranking: a bare expression in the middle of a script or cell
# shows nothing.
ranked_importances = sorted(zip(feature_importances, attributes), reverse=True)
print(ranked_importances)


# Final evaluation on the held-out test set
final_model = grid_search.best_estimator_

y_test = stest_set["median_house_value"].copy()
X_test = stest_set.drop("median_house_value", axis=1)

# transform() only: the pipeline keeps whatever it learned when it was fitted.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
# NOTE(review): the original comment reported a value of 47,766.0 here --
# confirm; the forest is created without a random_state so results vary.
final_rmse = np.sqrt(final_mse)
final_rmse

