In [1]:
# Goals of this step: 
#
# Partition all_data --> train_set | test_set.
# Partition train_set --> predictors | labels.
#
# As preliminary, regenerate the test and training sets from previous step.
import pandas as pd
# Location of the housing CSV.
# NOTE: this is an absolute path on the author's machine; adjust it when running elsewhere.
datapath="/Users/jasonmiller/Source/MachineLearning/datasets/housing/housing.csv"
all_data=pd.read_csv(datapath)
# Consider whether the most important feature needs special handling:
# rank every numeric column by its correlation with the target, strongest first.
# numeric_only=True (pandas >= 1.5) restricts the computation to numeric columns;
# without it, pandas >= 2.0 raises on any non-numeric column (if present) instead of
# silently dropping it as older versions did.
all_data.corr(numeric_only=True)["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

In [2]:
# Consider whether the train/test partition is fair w.r.t. the median_income feature (the one most correlated with house value).
# First measure distribution in all_data.
import numpy as np
def cutup(df, column="median_income",
          bins=(0.0, 1.5, 3.0, 4.5, 6.0, np.inf),
          labels=(1, 2, 3, 4, 5)):
    """Bin one numeric column of ``df`` into labelled categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str, default "median_income"
        Name of the column to discretise.
    bins : sequence of numbers
        Bin edges passed to ``pd.cut``. Intervals are right-inclusive, so the
        defaults give (0, 1.5], (1.5, 3], (3, 4.5], (4.5, 6], (6, inf).
    labels : sequence
        One label per interval; must have ``len(bins) - 1`` entries.

    Returns
    -------
    pandas.Series
        Categorical series aligned with ``df``'s index. Values falling outside
        every interval (e.g. exactly equal to the lowest edge) come back as NaN.
    """
    # Tuples are used as defaults to avoid mutable default arguments; pd.cut is
    # handed plain lists, exactly as the original hard-coded call did.
    return pd.cut(df[column], bins=list(bins), labels=list(labels))
# A fuller metric would compare the whole bin distribution of two sets.
# Here we compare only the critical category: the top income bin (label 5).
temp_all = cutup(all_data)
# Fraction of all rows whose income lands in bin 5.
temp_all.value_counts().loc[5] / len(temp_all)

0.11443798449612404

In [3]:
# Try: random split.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(all_data, random_state=42, test_size=0.2)
# Fraction of the training rows that land in the top income bin (label 5).
temp_random = cutup(train_set)
temp_random.value_counts().loc[5] / len(temp_random)

0.1156734496124031

In [5]:
# Try: stratified split.
from sklearn.model_selection import StratifiedShuffleSplit
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split(X, y) stratifies on y, which is normally the labels.
# Trick: instead of y, give it the bin labels that we generated with cutup(),
# so each income bin is represented proportionally in both partitions.
# n_splits=1, so take the single (train, test) index pair directly rather than looping.
train_index, test_index = next(splitter.split(all_data, temp_all))
# The splitter yields positional row numbers, so select with .iloc.
# .loc would treat them as index labels, which only coincides with positions
# while all_data keeps its default 0..n-1 index.
train_set = all_data.iloc[train_index]
test_set = all_data.iloc[test_index]
# Fraction of the training rows that land in the top income bin (label 5).
temp_stratified = cutup(train_set)
temp_stratified.value_counts()[5]/len(temp_stratified)

0.1144016472868217

In [7]:
# The numbers are not significantly different!
# The book concluded they were different and used the stratified shuffle.
# The difference could be a random effect.
# I will stick with random splitting since it introduces fewer metaparameters.

# Done!
# Final partition: redo the random 80/20 split (same seed as the earlier random-split
# cell), then separate the target column from the remaining columns of the training set.
train_set, test_set = train_test_split(all_data, random_state=42, test_size=0.2)
# Target values, copied so later edits to train_labels cannot touch train_set.
train_labels = train_set["median_house_value"].copy()
# Everything except the target column.
train_predictors = train_set.drop(columns=["median_house_value"])