In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def read_data(path,
              save_file=True,
              return_file=True,
              set_index=None):
    '''
    Read a CSV file into a DataFrame and optionally cache it as a pickle.

    Parameters
    ----------
    path: str
          path to the CSV file
    save_file: bool, default True
          when True, dump the loaded DataFrame to "output/data.pkl"
          (the "output" directory is created if it is missing)
    return_file: bool, default True
          when True, return the loaded DataFrame; otherwise return None
    set_index: int, str, sequence or None, default None
          forwarded to ``pd.read_csv`` as ``index_col``

    Returns
    -------
    pandas.DataFrame or None
          the loaded data when ``return_file`` is True, else None
    '''

    data = pd.read_csv(path, index_col=set_index)

    if save_file:
        # joblib.dump does not create missing parent directories, so make
        # sure the target folder exists before writing.
        os.makedirs("output", exist_ok=True)
        joblib.dump(data, "output/data.pkl")

    if return_file:
        return data

def split_input_output(dataset,
                       target_column,
                       save_file=True,
                       return_file=True):
    '''
    Separate a dataset into its target column and the remaining columns.

    Parameters
    ----------
    dataset: pandas.DataFrame
          full dataset containing ``target_column``
    target_column: str
          name of the column to use as the model output
    save_file: bool, default True
          when True, dump the two parts to "output/output_df.pkl" and
          "output/input_df.pkl" (the "output" directory is created if
          it is missing)
    return_file: bool, default True
          when True, return the two parts; otherwise return None

    Returns
    -------
    tuple or None
          ``(output_df, input_df)`` -- note the target comes FIRST --
          when ``return_file`` is True, else None
    '''

    output_df = dataset[target_column]
    # drop() returns a new frame; the caller's dataset is left untouched.
    input_df = dataset.drop(columns=[target_column])

    if save_file:
        # joblib.dump does not create missing parent directories.
        os.makedirs("output", exist_ok=True)
        joblib.dump(output_df, "output/output_df.pkl")
        joblib.dump(input_df, "output/input_df.pkl")

    if return_file:
        return output_df, input_df

def split_train_test(x, y, TEST_SIZE, random_state=123, stratify=None):
    '''
    Split features and target into train and test partitions.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    x: array-like
          input features
    y: array-like
          target values, row-aligned with ``x``
    TEST_SIZE: float or int
          forwarded to ``train_test_split`` as ``test_size``
    random_state: int, default 123
          seed forwarded to ``train_test_split``; the default keeps the
          split identical to the previously hard-coded one
    stratify: array-like or None, default None
          forwarded to ``train_test_split`` as ``stratify``; pass the
          class labels (row-aligned with ``x``) for classification
          problems so class proportions are preserved in both parts

    Returns
    -------
    tuple
          ``(x_train, x_test, y_train, y_test)``
    '''
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=TEST_SIZE,
        random_state=random_state,
        stratify=stratify,
    )

    return x_train, x_test, y_train, y_test

def split_data(data_input, data_ouput, return_file=False, TEST_SIZE=0.2):
    '''
    Split data into train, validation and test sets and pickle every part.

    The split is done in two passes with the same ``TEST_SIZE``: the test
    set is held out first, then the validation set is carved out of the
    remaining training rows. The validation size is therefore relative to
    the rows left after the first pass, not to the full dataset.

    Parameters
    ----------
    data_input: array-like
          input features
    data_ouput: array-like
          target values, row-aligned with ``data_input``
    return_file: bool, default False
          when True, return the six parts; otherwise return None
    TEST_SIZE: float or int, default 0.2
          forwarded to ``split_train_test`` for both passes

    Returns
    -------
    tuple or None
          ``(x_train, y_train, x_valid, y_valid, x_test, y_test)`` when
          ``return_file`` is True, else None

    Notes
    -----
    The six parts are always written to the "output" directory (created
    if it is missing), regardless of ``return_file``.
    '''

    # Pass 1: hold out the test set.
    x_train, x_test, y_train, y_test = split_train_test(
        data_input, data_ouput, TEST_SIZE)

    # Pass 2: carve the validation set out of what is left for training.
    x_train, x_valid, y_train, y_valid = split_train_test(
        x_train, y_train, TEST_SIZE)

    # joblib.dump does not create missing parent directories.
    os.makedirs("output", exist_ok=True)

    artifacts = [
        (x_train, "output/x_train.pkl"),
        (y_train, "output/y_train.pkl"),
        (x_valid, "output/x_valid.pkl"),
        (y_valid, "output/y_valid.pkl"),
        (x_test, "output/x_test.pkl"),
        (y_test, "output/y_test.pkl"),
    ]
    for part, target_path in artifacts:
        joblib.dump(part, target_path)

    if return_file:
        return x_train, y_train, x_valid, y_valid, x_test, y_test

In [4]:
# Pipeline configuration.
DATA_PATH = "data/train.csv"
TARGET_COLUMN = "SalePrice"
INDEX_COLUMN = "Id"
TEST_SIZE = 0.2

# Load the raw CSV, using the Id column as the index.
data_house = read_data(DATA_PATH, set_index=INDEX_COLUMN)

# Separate the target from the features (the target is returned first).
output_df, input_df = split_input_output(data_house, TARGET_COLUMN)

# Produce the train / validation / test partitions.
X_train, y_train, X_valid, y_valid, X_test, y_test = split_data(
    input_df,
    output_df,
    return_file=True,
    TEST_SIZE=TEST_SIZE,
)

In [5]:
# Sanity check: shape of the held-out test target.
y_test.shape

(292,)

In [6]:
# Display the training feature frame to inspect the result of the split.
X_train

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1332,80,RL,55.0,10780,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,7,2006,WD,Normal
814,20,RL,75.0,9750,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,Shed,500,4,2007,COD,Normal
1262,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2009,WD,Normal
269,30,RM,71.0,6900,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1027,20,RL,73.0,9300,Pave,,Reg,Lvl,AllPub,Inside,...,143,0,,,,0,4,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,50,RL,79.0,9490,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,8,2006,WD,Normal
421,90,RM,78.0,7060,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,11,2008,WD,Alloca
410,60,FV,85.0,10800,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2008,New,Partial
481,20,RL,98.0,16033,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,3,2006,WD,Normal
