In [None]:
from zlib import crc32

import os
import tarfile
import urllib

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from pandas.plotting import scatter_matrix

In [None]:
# Apply matplotlib's 'fivethirtyeight' style sheet; this changes the global
# plotting defaults, so every figure drawn after this cell is affected.
plt.style.use('fivethirtyeight')

In [None]:
# Load the housing dataset into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
housing = pd.read_csv('./data/housing.csv')

In [None]:
# Preview the first rows (pandas default: 5) to see the columns and sample values.
housing.head()

In [None]:
# Print a per-column summary: dtype and non-null count, plus memory usage.
# Useful for spotting columns with missing values.
housing.info()

In [None]:
# Count how many rows fall into each distinct value of 'ocean_proximity'.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
housing.describe()

In [None]:
# Draw one 50-bin histogram per numeric column on a single 20x10 inch figure.
housing.hist(bins=50, figsize=(20, 10))
# Render the figure explicitly (also suppresses the array-of-Axes repr).
plt.show()

In [None]:
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; any object supporting ``len()`` and ``.iloc`` works.
    test_ratio : float
        Fraction of rows assigned to the test set, in ``[0, 1]``. The test
        set size is ``int(len(data) * test_ratio)`` (truncated).
    random_state : int, optional
        Seed for the shuffle. When given, the same seed always produces the
        same split. When ``None`` (the default) the global NumPy random
        state is used, exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(train_set, test_set)`` selected from ``data`` by position.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``; a negative ratio would
        otherwise slice from the end and silently give a wrong split.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            'test_ratio must be between 0 and 1, got {!r}'.format(test_ratio)
        )

    n_rows = len(data)
    if random_state is None:
        # Keep the legacy behaviour: honour any np.random.seed() set by the caller.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        # A private generator makes the split reproducible without touching
        # the global random state.
        shuffled_indices = np.random.RandomState(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Purely random 80/20 split. No seed is set in the visible cells, so the
# membership of each set is expected to change from run to run.
train_set, test_set = split_train_test(housing, 0.2)
# Display the resulting sizes as the cell output.
len(train_set), len(test_set)

In [None]:
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2 ** 32

In [None]:
def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train/test sets based on a hash of an ID column.

    Each value of ``data[id_column]`` is passed to ``test_set_check``; rows
    for which it returns ``True`` form the test set, all others the training
    set. Membership depends only on the row's identifier and ``test_ratio``,
    not on row order or random state.

    Returns a ``(train_set, test_set)`` tuple.
    """
    test_mask = data[id_column].apply(
        lambda row_id: test_set_check(row_id, test_ratio)
    )
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
# reset_index() moves the existing row index into a new 'index' column,
# which is then used as the per-row identifier for the hash-based split.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')

In [None]:
# Alternative identifier built from the coordinates: longitude * 1000 + latitude.
# NOTE: test_set_check casts the id to int64 before hashing, so the fractional
# part of this float id is discarded; rows whose ids share the same integer
# part always land in the same set.
housing_with_id['id'] = housing['longitude'] * 1000 + housing['latitude']
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'id')

In [None]:
# scikit-learn's random 80/20 split; the fixed random_state makes it reproducible.
# This overwrites the train_set / test_set produced by the earlier cells.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Bucket median_income into five categories labelled 1-5, using the bin
# edges 0, 1.5, 3.0, 4.5, 6 and +inf. The new column is added to `housing`
# itself and is used below as the stratification key.
housing['income_cat'] = pd.cut(housing['median_income'], bins=[0, 1.5, 3.0, 4.5, 6., np.inf], labels=[1, 2, 3, 4, 5])
# Quick look at how many rows fall into each income category.
housing['income_cat'].hist()

In [None]:
# Stratified 80/20 split: both sets keep the income_cat proportions of the
# full dataset. The fixed random_state makes the split reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) pair directly instead of looping.
train_index, test_index = next(split.split(housing, housing['income_cat']))
# split() yields row *positions*, so select with .iloc; label-based .loc is
# only equivalent while `housing` has a default RangeIndex.
# .copy() makes the results independent frames that are safe to modify later.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [None]:
# Share of each income category in the stratified test set
# (per-category count divided by the number of test rows).
strat_test_set['income_cat'].value_counts() / len(strat_test_set)

In [None]:
# income_cat is no longer needed in the split sets, so remove it from both.
# Reassigning (instead of inplace=True) avoids mutating a frame derived from
# `housing`, and errors='ignore' keeps the cell safe to re-run after the
# column is already gone.
strat_train_set = strat_train_set.drop(columns='income_cat', errors='ignore')
strat_test_set = strat_test_set.drop(columns='income_cat', errors='ignore')

In [None]:
# Explore a copy of the training set so the original split stays untouched.
# NOTE: this rebinds `housing`; from here on it refers to the training rows
# only, so earlier cells re-run after this one would see the smaller frame.
housing = strat_train_set.copy()

In [None]:
# Scatter plot of the rows by longitude (x) and latitude (y).
housing.plot(kind='scatter', x='longitude', y='latitude')

In [None]:
# Same plot with mostly transparent markers (alpha=0.1), so overlapping
# points accumulate and dense areas stand out.
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)

In [None]:
# Geographic scatter plot that encodes two more variables:
#   - marker size  (s): population / 100
#   - marker colour (c): median_house_value, mapped through the 'jet' colormap
# NOTE(review): sharex=False is presumably a workaround for the x-axis labels
# disappearing when a colorbar is drawn - confirm before removing.
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
             s=housing['population'] / 100, label='population', figsize=(10,7), c='median_house_value',
             cmap=plt.get_cmap('jet'), colorbar=True, sharex=False
             )
# Show the legend entry created by label='population'.
plt.legend()

In [None]:
# Pairwise correlation between the numeric columns.
# Numeric columns are selected explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead
# (`ocean_proximity` appears to be such a column here). select_dtypes gives
# the same result on older and newer pandas versions.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
# Correlation of every column with median_house_value, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# List the columns that made it into the correlation matrix.
corr_matrix.columns

In [None]:
# Plot every pairwise scatter between a hand-picked subset of attributes
# (the full matrix over all columns would be too large to read).
attributes = ['median_income', 'median_house_value', 'total_rooms', 'housing_median_age']
scatter_matrix(housing[attributes], figsize=(20, 15))
plt.show()