In [None]:
import os
import tarfile
from six.moves import urllib

# Remote repository root and dataset location. HOUSING_PATH doubles as the
# relative path on the remote side and the local download directory.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = "{}{}/housing.tgz".format(DOWNLOAD_ROOT, HOUSING_PATH)

def fetch_housing_data(housing_url=HOUSING_URL,
                       housing_path=HOUSING_PATH):
    """Download the housing tarball and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Local directory that receives the archive and its extracted
        contents; created if it does not exist yet.

    Returns
    -------
    None
        The function works purely through side effects on the file system.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises,
    # so the file handle is never leaked.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this with a trusted download source.
        housing_tgz.extractall(path=housing_path)
# Run the download-and-extract step with the default URL and target directory.
fetch_housing_data()

In [None]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
# Load the CSV into a DataFrame and show its first rows as the cell output.
housing = load_housing_data()
housing.head()

In [None]:
housing.info()
# Note: the total_bedrooms column contains NaN (missing) values.

In [None]:
# Count how many rows carry each distinct ocean_proximity value.
housing['ocean_proximity'].value_counts()

In [None]:
# Show the summary table produced by DataFrame.describe() for the housing data.
housing.describe()

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Seed NumPy's global random generator so that this notebook's output is
# identical at every run.
import numpy as np
np.random.seed(42)

In [None]:
def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a train part and a test part.

    The rows are shuffled with NumPy's global random generator; the first
    ``int(len(data) * test_ratio)`` shuffled positions form the test set and
    the remaining positions form the train set.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)``, both selected from ``data`` by position.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    test_positions, train_positions = order[:n_test], order[n_test:]
    return data.iloc[train_positions], data.iloc[test_positions]


In [None]:
# Hold out 20% of the rows as a test set using the hand-written splitter,
# then report the size of each part.
train_set, test_set = split_train_test(housing, test_ratio=0.2)
print(len(train_set), 'Train + ', len(test_set), ' Test')

In [None]:
# Alternatively, scikit-learn's train_test_split can perform the split.
from sklearn.model_selection import train_test_split
train_set1, test_set1 = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Peek at the first rows of the scikit-learn test split.
test_set1.head()

In [None]:
# Histogram of the median_income column.
housing['median_income'].hist()

In [None]:
# Bucket median_income into categories of width 1.5 (rounded up).
housing['income_cat'] = np.ceil(housing['median_income'] / 1.5)
# Keep categories below 5 and set everything else to 5.0. The result is
# assigned back explicitly: calling where(..., inplace=True) on the selected
# column is chained in-place assignment, which pandas warns about and which
# does not update the frame under Copy-on-Write.
housing['income_cat'] = housing['income_cat'].where(housing['income_cat'] < 5, 5.0)
housing['income_cat'].value_counts()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified shuffle split (20% test) using income_cat as the strata.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    # split() yields positional row indices, so select by position (iloc)
    # rather than by label (loc); the two only coincide for a default index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
    

In [None]:
# Share of each income category in the full dataset.
housing['income_cat'].value_counts()/len(housing)

In [None]:
# Share of each income category within the stratified test set. The counts
# come from the test set, so they are normalised by the test set's own size.
strat_test_set['income_cat'].value_counts()/len(strat_test_set)

In [None]:
def income_cat_proportions(data):
    """Return each ``income_cat`` value's row count divided by ``len(data)``."""
    category_counts = data['income_cat'].value_counts()
    return category_counts / len(data)
# Purely random 80/20 split, used as the baseline for the comparison below.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Income-category proportions side by side: full data, stratified test set,
# and random test set, ordered by category.
compare_props = pd.DataFrame({
    'Overall': income_cat_proportions(housing),
    'Stratified': income_cat_proportions(strat_test_set),
    'Random': income_cat_proportions(test_set),
})
compare_props = compare_props.sort_index()

In [None]:
# Display the comparison table as the cell output.
compare_props

In [None]:
# Remove the helper income_cat column from both stratified sets, in place.
for set_ in (strat_train_set, strat_test_set):
    set_.drop(columns='income_cat', inplace=True)

Discover and visualize the data to gain insights

In [None]:
# Work on a copy of the stratified training set so exploration cannot modify it.
housing1 = strat_train_set.copy()
# Low alpha lets overlapping points accumulate, which makes dense areas visible.
housing1.plot.scatter(x='longitude', y='latitude', alpha=0.1)
plt.show()

In [None]:
# Scatter plot by longitude/latitude: marker size is population / 100 and
# marker colour follows median_house_value on the 'jet' colormap.
# (The stray trailing comma after the call, which turned the statement into a
# 1-tuple expression, has been removed.)
housing1.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
              s=housing1['population']/100, label='populations',
              c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True)
plt.legend()

In [None]:
import matplotlib.image as mpimg
# NOTE(review): imread() is called without a file name here; the image path
# still has to be supplied for this cell to load anything — TODO confirm the
# intended image file.
california_img = mpimg.imread()