In [1]:
# Libraries
import pandas as pd 
import numpy as np 

# sklearn transformers
from sklearn.preprocessing \
    import StandardScaler, SplineTransformer, PowerTransformer, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Read in data
train_raw = pd.read_csv("data/train.csv")
test_raw = pd.read_csv("data/test.csv")

# Work on a copy: code further down adds and recodes columns on `train` in
# place, and a plain alias would apply those edits to `train_raw` as well.
train = train_raw.copy()

train_raw.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

# Functions

# Generic Data Cleaning

In [56]:
# Recodes every object (string) column as a pandas categorical.
# The frame is modified in place and the same object is handed back.
# @param: train - data in its raw form
# @return: the same pandas data frame with object columns coded as categorical
def to_cat(train):
    text_cols = train.select_dtypes(['object']).columns
    recoded = train[text_cols].apply(lambda series: series.astype('category'))
    train[text_cols] = recoded
    return train

# Recodes a hand-picked set of columns that look numeric in the raw data
# as categorical. The columns are converted one after another, in place,
# and the same frame object is handed back.
# @param: train - data in its raw form
# @return: a pandas data frame with some columns marked as categorical.
def some_num_to_cat(train): 
    numeric_looking = ('MSSubClass', 'YearBuilt', 'YearRemodAdd',
                       'GarageYrBlt', 'MoSold', 'YrSold')
    for name in numeric_looking:
        train[name] = train[name].astype('category')

    return train

# Engineering pre-spec features
# Adds the hand-specified columns below to `train`. The frame is modified in
# place and the same object is returned. The year columns are used in
# arithmetic and ordering comparisons, so they must still be numeric here.
# @param: train - data in its raw form 
# @return: a pandas data frame with added features
def feat_eng(train): 
    # Feature Engineering
    # NewGarage: 1 when GarageYrBlt is present and later than YearBuilt
    train['NewGarage'] = (
        np.where(train['GarageYrBlt'].isnull(), 0, 
            np.where(train['GarageYrBlt'] > train['YearBuilt'], 1, 0))
    )
    train['NewGarage'] = train['NewGarage'].astype('category')

    # YearSinceRmdl: years between YearRemodAdd and the fixed reference year 2016
    train['YearSinceRmdl'] = 2016 - train['YearRemodAdd']

    # Rmdl: 1 when the remodel year is later than the build year
    train['Rmdl'] = np.where(
            train['YearBuilt'] < train['YearRemodAdd'], 1, 0)
    train['Rmdl'] = train['Rmdl'].astype('category')

    # TotalPorchArea: sum of the five deck / porch area columns
    train['TotalPorchArea'] = (
        train['WoodDeckSF'] + train['OpenPorchSF'] + 
        train['EnclosedPorch'] + train['3SsnPorch'] + 
        train['ScreenPorch']
    )

    # PorchYes: 1 when there is any deck / porch area at all
    train['PorchYes'] = np.where(train['TotalPorchArea'] > 0, 1, 0)
    train['PorchYes'] = train['PorchYes'].astype('category')

    # TotalFinishedBsmt
    train['TotalFinishedBsmt'] = train['BsmtFinSF1'] + train['BsmtFinSF2']

    # PercentFinishedBsmt: 0 when TotalBsmtSF is not positive
    train['PercentFinishedBsmt'] = np.where(train['TotalBsmtSF'] > 0, 
        train['TotalFinishedBsmt'] / train['TotalBsmtSF'] * 100, 0)

    # TotalSqFt
    train['TotalSqFt'] = train['GrLivArea'] + train['TotalFinishedBsmt']

    # PercentLowQual
    train['PercentLowQual'] = train['LowQualFinSF'] * 100 / train['TotalSqFt']

    # IsNew: 1 when the sale year equals YearRemodAdd
    train['IsNew'] = np.where(
        train['YrSold'] == train['YearRemodAdd'], 1, 0)
    train['IsNew'] = train['IsNew'].astype('category')

    # House_Age: years between YearRemodAdd and the sale
    train['House_age'] = train['YrSold'] - train['YearRemodAdd']

    # NeighRich: 2 for the first group of neighbourhoods, 0 for the second,
    # 1 for everything else. isin() is needed here: comparing against
    # ('A' or 'B' or 'C') only ever tests 'A', because `or` evaluates to its
    # first truthy operand before the comparison happens.
    in_top_group = train['Neighborhood'].isin(['StoneBr', 'NridgHt', 'NoRidge'])
    in_bottom_group = train['Neighborhood'].isin(['MeadowV', 'IDOTRR', 'BrDale'])
    train['NeighRich'] = np.select(
        condlist = [in_top_group, in_bottom_group], 
        choicelist = [2, 0],
        default = 1
    )
    train['NeighRich'] = train['NeighRich'].astype('category')
    
    return train

# Helper that scores a column on an ad-hoc ordinal scale:
# Ex = 5, Gd = 4, TA = 3, Fa = 2, Po = 1; anything else (missing included) = 0.
# @param: train - data in its raw form 
# @param: col_name - a string name of the column to be converted
# @return: a numpy array with one integer score per row
def ord_scale_1(train, col_name):
    levels = ("Ex", "Gd", "TA", "Fa", "Po")
    scores = [5, 4, 3, 2, 1]
    matches = [train[col_name] == level for level in levels]
    return np.select(condlist = matches, choicelist = scores, default = 0)

# Helper that scores a column on a second ad-hoc ordinal scale:
# GLQ = 6, ALQ = 5, BLQ = 4, REC = 3, LwQ = 2, Unf = 1; anything else = 0.
# @param: train - data in its raw form 
# @param: col_name - a string name of the column to be converted
# @return: a numpy array with one integer score per row
def ord_scale_2(train, col_name):
    levels = ("GLQ", "ALQ", "BLQ", "REC", "LwQ", "Unf")
    scores = [6, 5, 4, 3, 2, 1]
    matches = [train[col_name] == level for level in levels]
    return np.select(condlist = matches, choicelist = scores, default = 0)

# Replaces the ordered categorical columns listed below with integer scores.
# The frame is modified in place and the same object is returned.
# Any value that matches none of a column's listed levels is scored 0.
# @param: train - data in its raw form 
# @return: the same pandas data frame with those columns as integer scores
def ord_encode(train): 
    # Ordinal Scale 1 (Ex / Gd / TA / Fa / Po)
    for name in ('ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 
                 'BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 
                 'GarageCond', 'PoolQC'):
        train[name] = ord_scale_1(train, name)

    # Ordinal Scale 2 (GLQ / ALQ / BLQ / REC / LwQ / Unf)
    for name in ('BsmtFinType1', 'BsmtFinType2'):
        train[name] = ord_scale_2(train, name)

    # Ad-hoc ordinal scales: (column, levels, score given to each level)
    ad_hoc_scales = [
        ('LotShape', ("Reg", "IR1", "IR2", "IR3"), [3, 2, 1, 0]),
        ('LandSlope', ("Gtl", "Mod", "Sev"), [2, 1, 0]),
        ('BsmtExposure', ("Gd", "Av", "Mn", "No"), [4, 3, 2, 1]),
        ('GarageFinish', ("Fin", "RFn", "Unf"), [3, 2, 1]),
        ('Functional', 
            ("Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"), 
            [7, 6, 5, 4, 3, 2, 1, 0]),
    ]
    for name, levels, scores in ad_hoc_scales:
        train[name] = np.select(
            condlist = [train[name] == level for level in levels], 
            choicelist = scores, 
            default = 0
        )

    return train

# Fills missing values with k-nearest-neighbour imputation (KNNImputer).
# The numeric columns are standardised and the categorical columns one-hot
# encoded before the imputer runs; both encodings can be undone afterwards.
# Columns that are in neither list are handed to the imputer unchanged.
# NOTE(review): depending on the scikit-learn version, OneHotEncoder may treat
# NaN as a category of its own, in which case missing categorical values are
# not filled in by the imputer - confirm.
# @param: train - a pandas data frame
# @param: numeric_cols - names of the columns to standardise
# @param: cat_cols - names of the columns to one-hot encode
# @param: neighbors - number of neighbours used by the imputer
# @param: reverse_scale - if True, put numeric columns back on their original scale
# @param: reverse_dummy - if True, collapse the dummy columns back into cat_cols
# @return: the imputed frame on the caller's row index, passed through to_cat
def knn_Impute(train, numeric_cols, cat_cols, neighbors = 5, 
                reverse_scale = True, reverse_dummy = True):
    # Every intermediate frame is built on the caller's row index. `join`
    # aligns on index labels, so frames built on a default 0..n-1 index would
    # only line up when train happens to use that index as well.
    row_index = train.index

    # Scale the numeric columns
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(train[numeric_cols])
    train = train.drop(numeric_cols, axis = 1)
    train = train.join(
        pd.DataFrame(scaled_values, columns = numeric_cols, index = row_index))

    # Dummy the categorical columns 
    dummy = OneHotEncoder(drop = 'first')
    dummy_values = dummy.fit_transform(train[cat_cols]).toarray()
    dummy_names = dummy.get_feature_names_out().tolist()
    train = train.drop(cat_cols, axis = 1)
    train = train.join(
        pd.DataFrame(dummy_values, columns = dummy_names, index = row_index))

    # Knn imputation
    imputer = KNNImputer(n_neighbors = neighbors)
    train = pd.DataFrame(imputer.fit_transform(train), 
                         columns = train.columns, index = row_index)

    # Reverse scaling
    if reverse_scale: 
        no_scale_values = scaler.inverse_transform(train[numeric_cols])
        train = train.drop(numeric_cols, axis = 1)
        train = train.join(
            pd.DataFrame(no_scale_values, columns = numeric_cols, index = row_index))

    # Reverse dummies
    if reverse_dummy: 
        no_dummy_values = dummy.inverse_transform(train[dummy_names]) 
        train = train.drop(dummy_names, axis = 1)
        train = train.join(
            pd.DataFrame(no_dummy_values, columns = cat_cols, index = row_index))

    # Reversal of dummy makes them objects again
    return to_cat(train)

# One-hot encodes the given categorical columns, dropping the first level of
# each (OneHotEncoder(drop = 'first')).
# @param: train - a pandas data frame
# @param: cat_cols - names of the columns to encode
# @return: a new frame with cat_cols replaced by their dummy columns, on the
#          same row index as the input
def dummy_cols(train, cat_cols): 
    dummy = OneHotEncoder(drop = 'first')
    dummy_values = dummy.fit_transform(train[cat_cols]).toarray()
    dummy_names = dummy.get_feature_names_out().tolist()
    # Build the dummy frame on train's own index: join aligns on index labels,
    # so a default 0..n-1 index would only line up when train uses one too.
    dummies = pd.DataFrame(dummy_values, columns = dummy_names, index = train.index)
    train = train.drop(cat_cols, axis = 1)
    train = train.join(dummies)

    return train

# Drops the columns rejected by a VarianceThreshold selector.
# The selector is fitted on every column of the frame.
# @param: train - a pandas data frame
# @param: threshold - variance cut-off handed to VarianceThreshold
# @return: the frame restricted to the columns the selector keeps
def drop_nzv(train, threshold = 0.05): 
    selector = VarianceThreshold(threshold = threshold)
    selector.fit(train)
    keep_mask = selector.get_support()

    return train.loc[:, keep_mask]

# Replaces the numeric columns with their PowerTransformer output
# (the transformer is used with its default method).
# @param: train - a pandas data frame
# @param: numeric_cols - names of the columns to transform
# @param: standardize - forwarded to PowerTransformer(standardize = ...)
# @return: a new frame with numeric_cols replaced by the transformed values,
#          on the same row index as the input
def yeo_johnson(train, numeric_cols, standardize = False):
    yj = PowerTransformer(standardize = standardize)
    yj_values = yj.fit_transform(train[numeric_cols])
    # Build the result on train's own index: join aligns on index labels, so
    # a default 0..n-1 index would only line up when train uses one too.
    transformed = pd.DataFrame(yj_values, columns = numeric_cols, index = train.index)
    train = train.drop(numeric_cols, axis = 1)
    train = train.join(transformed)

    return train

# Replaces the numeric columns with their StandardScaler output.
# @param: train - a pandas data frame
# @param: numeric_cols - names of the columns to scale
# @return: a new frame with numeric_cols replaced by the scaled values, on
#          the same row index as the input
def standardize(train, numeric_cols):
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(train[numeric_cols])
    # Build the result on train's own index: join aligns on index labels, so
    # a default 0..n-1 index would only line up when train uses one too.
    scaled = pd.DataFrame(scaled_values, columns = numeric_cols, index = train.index)
    train = train.drop(numeric_cols, axis = 1)
    train = train.join(scaled)

    return train

# Replaces each listed column with its SplineTransformer basis columns,
# named <column>_ns1, <column>_ns2, ...
# @param: train - a pandas data frame
# @param: cols - names of the columns to expand
# @param: degree - forwarded to SplineTransformer(degree = ...)
# @param: knots - forwarded to SplineTransformer(n_knots = ...)
# @return: a new frame with every column in cols replaced by its basis
#          columns, on the same row index as the input
def add_ns_3(train, cols, degree = 3, knots = 2): 
    spliner = SplineTransformer(degree = degree, n_knots = knots, include_bias = False)

    for i in cols:
        x = train[i].values.reshape(-1, 1)
        basis = spliner.fit_transform(x)
        # Name the columns after the basis that actually came back: its width
        # depends on knots as well as degree, so assuming `degree` columns
        # only works for the default knots = 2.
        new_col_names = [(i + "_ns" + str(j)) for j in range(1, basis.shape[1] + 1)]
        # Built on train's own index so that join lines the rows up by label.
        spline = pd.DataFrame(basis, columns = new_col_names, index = train.index)
        train = train.join(spline)
        train = train.drop(i, axis = 1)
    
    return train

# Drops one column from every pair of highly correlated columns.
# Only the part of the absolute correlation matrix above the diagonal is
# inspected, so a column is dropped when it correlates too strongly with a
# column that appears before it.
# @param: df - a pandas data frame
# @param: threshold - absolute correlation above which a column is dropped
# @return: a new frame without the dropped columns
def drop_high_cor(df, threshold = 0.9):
    abs_corr = df.corr().abs()

    # Boolean mask that keeps only the entries strictly above the diagonal
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    # Collect every column with at least one correlation above the threshold
    redundant = []
    for name in upper.columns:
        if any(upper[name] > threshold):
            redundant.append(name)

    return df.drop(redundant, axis=1)

# Penalized Regression

## Data Cleaning

In [66]:
verbose = True

# Start from a private copy so nothing below can write through to train_raw
# (the helper functions add and recode columns in place).
train = train_raw.copy()

# Set Id and the response aside; both are re-attached at the very end.
# They are None when the column is absent, so the re-attach steps can be
# skipped cleanly instead of failing on an undefined (or stale) variable.
train_Id = train.pop('Id') if 'Id' in train else None
train_rep = train.pop('SalePrice') if 'SalePrice' in train else None

# Add user features (needs the year columns while they are still numeric)
train = feat_eng(train)
if verbose: print("Added user engineered features")

# Marks columns as categorical 
train = to_cat(train)
train = some_num_to_cat(train)
if verbose: print('Encoded user specified variables as categorical')

# Ordinally encodes select variables
train = ord_encode(train)
if verbose: print("Encoded user specified variables to be ordinal")

# Track which variables are numeric and categorical 
numeric_cols = train.select_dtypes(include = np.number).columns
cat_cols = train.select_dtypes('category').columns

# Imputes missing values
train = knn_Impute(train, numeric_cols, cat_cols, reverse_dummy = True)
if verbose: print("Imputed missing values using knn with k = 5")

# Create dummy variable
train = dummy_cols(train, cat_cols)
if verbose: print("Categorical columns were convert into n-1 binary dummy variables")

# Yeo-Johnson on Numerics
train = yeo_johnson(train, numeric_cols)
if verbose: print('Yeo-Johnson Transformation of numeric columns')

# Standardized
train = standardize(train, numeric_cols)
if verbose: print('Numeric columns scaled to mean 0 and unit variance')

# Splines
train = add_ns_3(train, cols = numeric_cols)
if verbose: print('Numeric features transformed into natural cubic splines')

# Drop NZV
train = drop_nzv(train, threshold = 0.01)
if verbose: print("Dropped columns with less than 0.01 variance")

# Drop highly correlated columns
train = drop_high_cor(train, threshold = 0.99)
if verbose: print("Dropped 1 columns from every pair with >0.99 correlation")

# Add log Price back in (only when a response column was set aside)
if train_rep is not None:
    train.insert(loc = 0, column = 'SalePrice', value = np.log(train_rep))
    if verbose: print("Log transformation of response")

# Add Ids back in (only when an Id column was set aside)
if train_Id is not None:
    train.insert(loc = 0, column = 'Id', value = train_Id)

Added user engineered features
Encoded user specified variables as categorical
Encoded user specified variables to be ordinal
Imputed missing values using knn with k = 5
Categorical columns were convert into n-1 binary dummy variables
Yeo-Johnson Transformation of numeric columns
Numeric columns scaled to mean 0 and unit variance
Numeric features transformed into natural cubic splines
Dropped columns with less than 0.01 variance
Dropped 1 columns from every pair with >0.99 correlation
Log transformation of SalePrice


In [586]:
# Cell-level draft of the categorical coding and feature engineering steps.
# It reads the raw housing columns from `train`, so `train` must still hold
# them (with numeric year columns) when this cell runs.

# Code string columns as categorical
train['MSSubClass'] = train['MSSubClass'].astype('category')

train[train.select_dtypes(['object']).columns] = (
    train.select_dtypes(['object'])
    .apply(lambda x: x.astype('category'))
)

# Feature Engineering
# NewGarage: 1 when GarageYrBlt is present and later than YearBuilt
train['NewGarage'] = (
    np.where(train['GarageYrBlt'].isnull(), 0, 
        np.where(train['GarageYrBlt'] > train['YearBuilt'], 1, 0))
)

# YearSinceRmdl: years between YearRemodAdd and the fixed reference year 2016
train['YearSinceRmdl'] = 2016 - train['YearRemodAdd']

# Rmdl: 1 when the remodel year is later than the build year
train['Rmdl'] = np.where(train['YearBuilt'] < train['YearRemodAdd'], 1, 0)

# TotalPorchArea: sum of the five deck / porch area columns
train['TotalPorchArea'] = (
    train['WoodDeckSF'] + train['OpenPorchSF'] + 
    train['EnclosedPorch'] + train['3SsnPorch'] + 
    train['ScreenPorch']
)

# PorchYes: 1 when there is any deck / porch area at all
train['PorchYes'] = np.where(train['TotalPorchArea'] > 0, 1, 0)

# TotalFinishedBsmt
train['TotalFinishedBsmt'] = train['BsmtFinSF1'] + train['BsmtFinSF2']

# PercentFinishedBsmt: 0 when TotalBsmtSF is not positive
train['PercentFinishedBsmt'] = np.where(train['TotalBsmtSF'] > 0, 
    train['TotalFinishedBsmt'] / train['TotalBsmtSF'] * 100, 0)

# TotalSqFt
train['TotalSqFt'] = train['GrLivArea'] + train['TotalFinishedBsmt']

# PercentLowQual
train['PercentLowQual'] = train['LowQualFinSF'] * 100 / train['TotalSqFt']

# IsNew: 1 when the sale year equals YearRemodAdd
train['IsNew'] = np.where(train['YrSold'] == train['YearRemodAdd'], 1, 0)

# House_Age: years between YearRemodAdd and the sale
train['House_age'] = train['YrSold'] - train['YearRemodAdd']

# NeighRich: 2 for the first group of neighbourhoods, 0 for the second,
# 1 for everything else. isin() is needed here: comparing against
# ('A' or 'B' or 'C') only ever tests 'A', because `or` evaluates to its
# first truthy operand before the comparison happens.
train['NeighRich'] = np.select(
    condlist = [
        train['Neighborhood'].isin(['StoneBr', 'NridgHt', 'NoRidge']), 
        train['Neighborhood'].isin(['MeadowV', 'IDOTRR', 'BrDale'])
    ], 
    choicelist = [2, 0],
    default = 1
)

In [587]:
# Returns the first five entries of the selected column(s), for a quick look.
# @param: train - a pandas data frame
# @param: x - the column label (or labels) to select
# @return: the head of train[x]
def get_col(train, x): 
    selected = train[x]
    return selected.head(5)

# Spot-check of the NeighRich column. The value is not rendered because this
# call is not the last expression of the cell.
get_col(train, "NeighRich")

# Scores a categorical column on an ad-hoc ordinal scale:
# Ex = 5, Gd = 4, TA = 3, Fa = 2, Po = 1; anything else (missing included) = 0.
# Same name as the helper defined earlier in the notebook; running this cell
# rebinds that name.
# @param: train - a pandas dataframe
# @param: col_name - a string name of the column to be converted
# @return: a numpy array with one integer score per row
def ord_scale_1(train, col_name):
    levels = ("Ex", "Gd", "TA", "Fa", "Po")
    scores = [5, 4, 3, 2, 1]
    matches = [train[col_name] == level for level in levels]
    return np.select(condlist = matches, choicelist = scores, default = 0)

# Scores a categorical column on a second ad-hoc ordinal scale:
# GLQ = 6, ALQ = 5, BLQ = 4, REC = 3, LwQ = 2, Unf = 1; anything else = 0.
# Same name as the helper defined earlier in the notebook; running this cell
# rebinds that name.
# @param: train - a pandas dataframe
# @param: col_name - a string name of the column to be converted
# @return: a numpy array with one integer score per row
def ord_scale_2(train, col_name):
    levels = ("GLQ", "ALQ", "BLQ", "REC", "LwQ", "Unf")
    scores = [6, 5, 4, 3, 2, 1]
    matches = [train[col_name] == level for level in levels]
    return np.select(condlist = matches, choicelist = scores, default = 0)

# Test: show which scores each helper produces, one line per column
print(
    np.unique(ord_scale_1(train, "ExterCond")),
    np.unique(ord_scale_1(train, "GarageQual")),
    np.unique(ord_scale_2(train, "BsmtFinType2")),
    sep = "\n"
)

[1 2 3 4 5]
[0 1 2 3 4 5]
[0 1 2 4 5 6]


In [588]:
# Ordinal Recoding
# Each column is overwritten with integer scores; a value that matches none
# of the listed levels gets 0.
train['LotShape'] = np.select(
    [train['LotShape'] == level for level in ("Reg", "IR1", "IR2", "IR3")], 
    [3, 2, 1, 0]
)

train['LandSlope'] = np.select(
    [train['LandSlope'] == level for level in ("Gtl", "Mod", "Sev")], 
    [2, 1, 0]
)

train['BsmtExposure'] = np.select(
    [train['BsmtExposure'] == level for level in ("Gd", "Av", "Mn", "No")], 
    [4, 3, 2, 1], 
    default = 0
)

train['GarageFinish'] = np.select(
    [train['GarageFinish'] == level for level in ("Fin", "RFn", "Unf")], 
    [3, 2, 1], 
    default = 0
)

train['Functional'] = np.select(
    [train['Functional'] == level 
        for level in ("Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal")], 
    [7, 6, 5, 4, 3, 2, 1, 0]
)

In [None]:
# Experiment: one-hot encode every categorical column of `train`, then undo
# the encoding again. `train` is rebound several times below, so the order
# of the statements matters.

# Drop ID variable
#train = train.drop('Id', axis = 1)

#train_copy = train
#print(train.shape)

from sklearn.preprocessing import OneHotEncoder
# Create a dummy creation rule (the first level of each column is dropped).
dummy = OneHotEncoder(drop = 'first')

# Get the names of all categorical columns to dummy 
dummy_col_names = train.select_dtypes('category').columns

# Get the dummy values for all of those columns as a dense array
dummy_col_values = dummy.fit_transform(train[dummy_col_names]).toarray()

# Drop the original columns
train = train.drop(dummy_col_names, axis = 1)

## Add the dummy cols to the original dataframe
## (the new frame has a default index, so this assumes train does too)
train = train.join(pd.DataFrame(dummy_col_values, 
    columns = dummy.get_feature_names_out().tolist()))

# Reverse dummy transform: recover the original levels, drop the dummy
# columns again and re-attach the recovered categorical columns
no_dummy_values = dummy.inverse_transform(dummy_col_values) 
train = train.drop(dummy.get_feature_names_out().tolist(), axis = 1)
train = train.join(pd.DataFrame(no_dummy_values, columns = dummy_col_names))
#print(train.shape)

#print(train.head())
#print("this is a seperator")
#print(train_copy.head())

In [357]:
# Extract response (the frame is left as it is when SalePrice is absent)
if 'SalePrice' in train:
    response = train['SalePrice']
    train = train.drop(columns = 'SalePrice')

# Dummies
train = pd.get_dummies(train)

# Center + Scale 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train)
train = pd.DataFrame(scaler.transform(train), columns = train.columns)

# knnImpute 
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors = 5)
imputer.fit(train)
train = pd.DataFrame(imputer.transform(train), columns = train.columns)

# Reverse center + scale for other preprocessing methods.
train = pd.DataFrame(scaler.inverse_transform(train), columns = train.columns)

## NZV - remove all columns whose variance is not above 0.05.
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold = 0.05)
selector.fit(train)
train = train.loc[:, selector.get_support()]

# Corr
# Drops one column from every pair of highly correlated columns and prints
# the names of the dropped columns. Only the part of the absolute correlation
# matrix above the diagonal is inspected, so a column is dropped when it
# correlates too strongly with a column that appears before it.
# Same name as the function defined earlier in the notebook; running this
# cell rebinds that name.
# @param: df - a pandas data frame
# @param: threshold - absolute correlation above which a column is dropped
# @return: a new frame without the dropped columns
def drop_high_cor(df, threshold = 0.9):
    abs_corr = df.corr().abs()

    # Boolean mask that keeps only the entries strictly above the diagonal
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    # Collect every column with at least one correlation above the threshold
    to_drop = []
    for name in upper.columns:
        if any(upper[name] > threshold):
            to_drop.append(name)

    print(to_drop)

    return df.drop(to_drop, axis=1)

# Drop every column whose absolute correlation with an earlier column exceeds 0.9
train = drop_high_cor(train, threshold = 0.9)

# Splines
from sklearn.preprocessing import SplineTransformer
# Reverse dummy encoding

# Replaces numeric columns with their SplineTransformer basis columns,
# named <column>_ns1, <column>_ns2, ...
# Same name as the function defined earlier in the notebook; the optional
# `cols` keyword keeps calls written for that version working.
# @param: train - a pandas data frame
# @param: degree - forwarded to SplineTransformer(degree = ...)
# @param: knots - forwarded to SplineTransformer(n_knots = ...)
# @param: cols - names of the columns to expand; every numeric column when None
# @return: a new frame with the expanded columns replaced by their basis
#          columns, on the same row index as the input
def add_ns_3(train, degree = 3, knots = 5, cols = None): 
    if cols is None:
        cols = train.select_dtypes(include = np.number).columns
    spliner = SplineTransformer(degree = degree, n_knots = knots, include_bias = False)

    for i in cols:
        x = train[i].values.reshape(-1, 1)
        basis = spliner.fit_transform(x)
        # Name the columns after the basis that actually came back: its width
        # depends on knots as well as degree, so `degree` names do not fit
        # the default knots = 5.
        new_col_names = [(i + "_ns" + str(j)) for j in range(1, basis.shape[1] + 1)]
        # Built on train's own index so that join lines the rows up by label.
        spline = pd.DataFrame(basis, columns = new_col_names, index = train.index)
        train = train.join(spline)
        # drop returns a new frame, so the result has to be kept
        train = train.drop(i, axis = 1)

    return train


# Yeo-Johnson 

# Log Price


['YearSinceRmdl', 'TotalFinishedBsmt', 'PercentLowQual', 'House_age']


In [358]:
from sklearn.preprocessing import SplineTransformer
# Spline settings: polynomial degree and number of knots
degree = 3
knots = 2

In [359]:
# Unit Test
# Name of the column holding the largest value in each row. It is kept under
# its own name so that `train` stays the processed frame and the cell can be
# re-run (assigning the result back to `train` replaced the frame by a Series).
row_max_col = train.idxmax(axis = 1)
print(row_max_col)

# Number of rows for which no column name was returned
row_max_col.isnull().sum()

0       LotArea
1       LotArea
2       LotArea
3       LotArea
4       LotArea
         ...   
1455    LotArea
1456    LotArea
1457    LotArea
1458    LotArea
1459    LotArea
Length: 1460, dtype: object


0

# Random Forest

# Gradient Boosting

# Neural Network