In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import glob
import cv2
import os

def load_house_attributes(inputPath, min_houses_per_zip=25):
    """Load the house attribute table and drop sparsely represented zip codes.

    The input is read as a space-separated file without a header row and
    with five columns: bedrooms, bathrooms, area, zipcode and price.
    Three squared features (``bedrooms2``, ``bathrooms2``, ``area2``) are
    appended after the original columns.

    Args:
        inputPath: Path (or any object ``pandas.read_csv`` accepts) of the
            attribute file.
        min_houses_per_zip: Zip codes with fewer houses than this are
            removed from the result. Defaults to 25.

    Returns:
        A ``pandas.DataFrame`` with the original and squared columns,
        restricted to sufficiently populated zip codes. Surviving rows
        keep their original index labels.
    """
    columns = ["bedrooms", "bathrooms", "area", "zipcode", "price"]
    df = pd.read_csv(inputPath, sep=" ", header=None, names=columns)

    # Add a squared copy of each size-related feature.
    for feature in ("bedrooms", "bathrooms", "area"):
        df[feature + "2"] = df[feature] ** 2

    # Count houses per zip code once, then remove every row whose zip code
    # falls below the threshold in a single vectorized pass.
    zip_counts = df["zipcode"].value_counts()
    sparse_zips = zip_counts.index[zip_counts < min_houses_per_zip]
    sparse_rows = df.index[df["zipcode"].isin(sparse_zips)]
    return df.drop(index=sparse_rows)


def process_house_attributes(df, train, test):
    """Build the feature matrices for the training and testing splits.

    The continuous columns are min-max scaled with a scaler fitted on
    ``train`` only; the ``zipcode`` column is binarized with a
    ``LabelBinarizer`` fitted on the full ``df`` so both splits share the
    same set of zip-code columns.

    Args:
        df: Frame whose ``zipcode`` column defines the label set.
        train: Training split; must contain the continuous columns and
            ``zipcode``.
        test: Testing split with the same columns as ``train``.

    Returns:
        A tuple ``(trainX, testX)`` of arrays in which the binarized
        zip-code columns come first, followed by the scaled continuous
        columns.
    """
    numeric_cols = [
        "bedrooms", "bathrooms", "area",
        "bedrooms2", "bathrooms2", "area2",
    ]

    # Scaling statistics come from the training split alone.
    scaler = MinMaxScaler()
    scaler.fit(train[numeric_cols])

    # The zip-code vocabulary comes from the complete data set.
    zip_encoder = LabelBinarizer()
    zip_encoder.fit(df["zipcode"])

    def _assemble(split):
        # Categorical block first, continuous block second.
        categorical = zip_encoder.transform(split["zipcode"])
        scaled = scaler.transform(split[numeric_cols])
        return np.hstack([categorical, scaled])

    return (_assemble(train), _assemble(test))