In [1]:
from functions import *
from settings import *
from pipeline import *

%store -r __RequiredPackages
%store -r __JupyterOptions

In [2]:
__RequiredPackages

In [3]:
__JupyterOptions

Populating the interactive namespace from numpy and matplotlib


In [211]:
# def line_smoother(data, col):
#     smoothed = [data[col][0]]
#     for i in range(1, len(data[col])-1):
#         smoothed.append(np.mean([data[col][i-1], data[col][i], data[col][i+1]]))
#     smoothed.append(data.loc[len(data)-1, col])
#     return smoothed

In [25]:
def feature_testing_pipe(data, keep_features, square_features, target=None, NA_means_not_there_cols=NA_means_not_there_cols, 
                         cont_impute_cols=cont_impute_cols, cat_impute_cols=cat_impute_cols, 
                         dev_seed=dev_seed, new_session=False, new_features=False):
    """
    Run the preprocessing steps and cross-validate several regressors,
    so different feature sets can be compared quickly.

    Parameters
    ----------
    data : DataFrame holding the features and the ``target`` column.
    keep_features : list of column names handed to the models.
    square_features : list of columns (from ``keep_features``) for which a squared
        'sq_<col>' term is added after scaling. May be empty or None.
    target : name of the target column.
    NA_means_not_there_cols, cont_impute_cols, cat_impute_cols : column lists forwarded
        to the preprocessing steps.
    dev_seed : random state used for the fold split.
    new_session, new_features : when either is truthy, ``standard_preprocessing_function``
        is run on ``data`` first.

    Returns
    -------
    (pred_perf_dict, predictions_df): per-model list of fold RMSEs, and a frame with one
    'obs_nr' row per observation plus one '<model><fold>' prediction column per model/fold.

    Notes
    -----
    Legacy call form: ``feature_testing_pipe(data, keep_features, target, ...)`` (three
    positional arguments, as used by the experiment cells) is still accepted. A string in
    the ``square_features`` slot with no ``target`` given is interpreted as the target,
    with no squared features.
    """
    # Backward-compatibility shim for the three-positional-argument call form
    if target is None:
        if isinstance(square_features, str):
            target, square_features = square_features, []
        else:
            raise TypeError("feature_testing_pipe() missing required argument: 'target'")
    if square_features is None:
        square_features = []

    # Transform features that dont create data leakage issues
    if new_features or new_session:
        data = standard_preprocessing_function(data, new_session, NA_means_not_there_cols)

    # Create different splits of train and test
    kf = KFold(n_splits=5, shuffle=True, random_state=dev_seed)

    # Initiate empty objects for further analysis
    model_names = ['LinearRegression', 'KNeighborsRegressor', 'RandomForestRegressor',
                   'GradientBoostingRegressor', 'SVR']
    pred_perf_dict = {name: [] for name in model_names}
    predictions_df = pd.DataFrame({'obs_nr': data.index})

    # Split features and target
    X = data.drop(target, axis=1)
    y = data[target]

    # Fit models while looping through the train/test-splits
    for cv_round, (train_index, test_index) in enumerate(kf.split(X), start=1):
        # kf.split yields positions, not labels: select with iloc so the result is
        # correct for any index. Copies keep the preprocessing away from `X` itself.
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the leakage-sensitive preprocessing on the training fold only
        prepper = leakage_preventive_preprocessing_function(cont_impute_cols, cat_impute_cols, keep_features, square_features)
        X_train = prepper.fit_transform(X_train, y_train)
        X_test = prepper.transform(X_test)

        # Index labels of the test fold, so predictions merge onto 'obs_nr' correctly
        test_obs = X.index[test_index]

        for model_name in model_names:
            pred_perf_dict, predictions_df = fit_model(model_name, X_train, y_train, X_test, y_test, test_obs,
                                                       pred_perf_dict, predictions_df, cv_round)

    return pred_perf_dict, predictions_df
        
def fit_model(model_name, X_train, y_train, X_test, y_test, test_index, 
              pred_perf_dict, predictions_df, cv_round, n_jobs=n_jobs, dev_seed=dev_seed):
    """
    Fit one model on a train fold, score it on the test fold and record the results.

    Parameters
    ----------
    model_name : one of 'LinearRegression', 'KNeighborsRegressor', 'RandomForestRegressor',
        'GradientBoostingRegressor', 'SVR'.
    X_train, y_train, X_test, y_test : the prepared fold data.
    test_index : observation identifiers of the test rows; matched against the
        'obs_nr' column of ``predictions_df``.
    pred_perf_dict : dict of model name -> list of fold RMSEs; appended to in place.
    predictions_df : frame with an 'obs_nr' column; a '<model_name><cv_round>' column
        with the fold predictions is left-merged onto it.
    cv_round : fold number, used in the prediction column name.
    n_jobs, dev_seed : forwarded to the estimators that accept them.

    Returns
    -------
    (pred_perf_dict, predictions_df)

    Raises
    ------
    ValueError : if ``model_name`` is not one of the supported names.
    """
    # Factories (not instances) so only the requested estimator is constructed
    model_factories = {
        'LinearRegression': lambda: LinearRegression(n_jobs=n_jobs),
        'KNeighborsRegressor': lambda: neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=n_jobs),
        'RandomForestRegressor': lambda: RandomForestRegressor(n_estimators=50, random_state=dev_seed, n_jobs=n_jobs),
        'GradientBoostingRegressor': lambda: GradientBoostingRegressor(random_state=dev_seed),
        'SVR': lambda: SVR(kernel='rbf'),
    }
    if model_name not in model_factories:
        raise ValueError("Unknown model_name {!r}; expected one of {}".format(model_name, sorted(model_factories)))

    model = model_factories[model_name]().fit(X_train, y_train)

    preds = model.predict(X_test)
    # Root mean squared error on the test fold
    pred_perf_dict[model_name].append(sqrt(mean_squared_error(y_test, preds)))
    temp_preds = pd.DataFrame({'obs_nr': test_index, model_name + str(cv_round): preds})
    predictions_df = predictions_df.merge(temp_preds, how='left', on='obs_nr')

    return pred_perf_dict, predictions_df

def show_performance(pred_perf_dict):
    """
    Summarise cross-validation scores.

    Takes the dict of model name -> list of per-fold scores and returns a
    one-column DataFrame ('rmsle') holding the mean score per model.
    """
    fold_scores = pd.DataFrame(pred_perf_dict)
    mean_per_model = fold_scores.mean(axis=0)
    return mean_per_model.to_frame(name='rmsle')

In [19]:
def standard_preprocessing_function(data, new_session, NA_means_not_there_cols):
    """
    Prepare the dataset across train and testset.
    No data leakage issues at this stage

    Steps, in order:
      * add 'LogSalePrice' = log('SalePrice')
      * fill NaN with 'Not_present' in `NA_means_not_there_cols`, and with 0 in
        'MasVnrArea' and 'LotFrontage'
      * only when `new_session` is truthy: run `replace_ordinal_values` (defined
        elsewhere) and add the 'Heating_GasA' 0/1 indicator
      * add 'BsmtScore', 'AllBathsSum', 'TotalSFInclBsmnt' and 'YardArea'
      * add a 'Log<col>' (log1p) version of the selected continuous columns
      * drop rows with 'LogTotalSFInclBsmnt' >= 8.9 and reset the index

    Returns the resulting DataFrame.

    NOTE(review): the column assignments are made on the frame that was passed in, so
    the caller's DataFrame is modified as well (at least up to the point where
    `replace_ordinal_values` may return a new object - confirm).
    NOTE(review): 'BsmtScore' multiplies square feet by 'BsmtFinType1'/'BsmtFinType2';
    this presumably needs those columns to be numeric already (i.e. ordinal-encoded,
    either by `new_session=True` or in the input file) - confirm.
    """
    # Replace salesprice with a log scaled version of it
    data['LogSalePrice'] = np.log(data['SalePrice'])
    # Use np.exp on predictions to scale back to actual sales price
    
    # Impute missing values where they are not at random
    data[NA_means_not_there_cols] = data[NA_means_not_there_cols].fillna('Not_present') 
    data['MasVnrArea'] = data['MasVnrArea'].fillna(0)
    data['LotFrontage'] = data['LotFrontage'].fillna(0)
    
    # Transform existing variables
    if new_session:
        data = replace_ordinal_values(data)
        data['Heating_GasA'] = np.where(data['Heating'] == 'GasA', 1, 0)
        #data['IsNew'] = np.where(data['SaleType'] == 'New', 1, 0)
    # yearmonth feature for downward trend
    
    # Create binary features
#     data['HasPorch'] = np.where(np.sum(data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch']], axis=1) > 0, 1, 0)
#     data['HasOpenPorch'] = np.where(data['OpenPorchSF'] > 0, 1, 0)
#     data['HasWoodDeck'] = np.where(data['WoodDeckSF'] > 0, 1, 0)
    
    # Create features from multiple variables
    # Basement score: finished square feet weighted by the finish-type value, summed over both basement areas
    data['BsmtScore'] = data['BsmtFinSF1'] * data['BsmtFinType1'] + data['BsmtFinSF2'] * data['BsmtFinType2']
#     data['YearMo'] = pd.to_numeric(data['YrSold'].astype('str') + data['MoSold'].apply('{:0>2}'.format))
    
    # Plain count of all baths (half baths count the same as full baths here)
    data['AllBathsSum'] = np.sum(data[['BsmtHalfBath', 'HalfBath', 'BsmtFullBath', 'FullBath']], axis=1)
#     data['AllHalfBaths'] = np.sum(data[['BsmtHalfBath', 'HalfBath']], axis=1)
#     data['AllFullBaths'] = np.sum(data[['BsmtFullBath', 'FullBath']], axis=1)
    
#     data['NormalBathScore'] = data['FullBath'] + data['HalfBath'] * 0.5
#     data['BsmtBathScore'] = data['BsmtHalfBath'] + data['BsmtFullBath'] * 0.5
#     data['AllBathScore'] = data['NormalBathScore'] + data['BsmtBathScore']

#     data['DiffBuiltRemod'] = data['YearRemodAdd'] - data['YearBuilt']
#     data['DiffBuiltGarage'] = np.where(data['GarageYrBlt'].isna(), 0, data['GarageYrBlt'] - data['YearBuilt'])
    
#     data['TotalIndoorSF'] = np.sum(data[['1stFlrSF', '2ndFlrSF']], axis=1)  
    data['TotalSFInclBsmnt'] = np.sum(data[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']], axis=1)
#     data['TotalSFInclBsmntOpenPorch'] = np.sum(data[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'OpenPorchSF']], axis=1)
#     data['TotalSFInclBsmntAllPorch'] = np.sum(data[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
#                                                     'OpenPorchSF', 'EnclosedPorch', '3SsnPorch']], axis=1)
#     data['AllPorches'] = np.sum(data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch']], axis=1)
    # Lot area that is not covered by the first floor or the garage
    data['YardArea'] = data['LotArea'] - data['1stFlrSF'] - data['GarageArea']
    
#     # Create dummies
#     neighbor_dummies = pd.get_dummies(data['Neighborhood'])
#     neighbor_dummies.columns = ['Nbrh_dum_' + str(col) for col in neighbor_dummies.columns]
#     data = pd.concat([data, neighbor_dummies.iloc[:, 1:]], axis=1)
    
    # Make a more even distribution for continuous features as well
    # (log1p rather than log, so zero values map to 0)
    for col in ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'TotalSFInclBsmnt', 'GarageArea', 
                #'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'TotalSFInclBsmntOpenPorch', 
                #'TotalSFInclBsmntAllPorch', 'AllPorches', 'WoodDeckSF', 'MasVnrArea',
               'LotArea', 'YardArea', 'BsmtScore']:
        data['Log' + col] = np.log1p(data[col])
        
#     # Join the not often used categories together to other categories
#     replace_garagetype = {'Attchd': 'Attached', 'Detchd': 'Detached', 'BuiltIn': 'Attached', 'CarPort': 'Not_present', 
#                           'not_present': 'not_present', '2Types': 'Attached', 'Basment': 'Detached'}
#     data['GarageTypeClean'] = data['GarageType'].replace(replace_garagetype)
#     replace_mssc = {'A': '1-story', 'B': '1-story', 'C': '1-story', 'D': '1-1/2 story', 'E': '1-1/2 story', 
#                     'F': '2-story', 'G': '2-story',  'H': '2-1/2 story', 'I': 'split', 'J': 'split', 'K': 'duplex',
#                     'L': '1-story', 'M': '1-1/2 story', 'N':'2-story', 'O': 'split', 'P': 'duplex'}
#     data['MSSubClassGR'] = data['MSSubClass'].replace(replace_mssc)
    
    # Remove outliers
    # Keeps rows with log1p(total SF) < 8.9, i.e. total SF below roughly 7,331; the index
    # is reset so it is gap-free again after the rows are dropped
    data = data[data['LogTotalSFInclBsmnt'] < 8.9].reset_index(drop=True)
    
    return data

In [27]:
class leakage_preventive_preprocessing_function():
    """
    Preprocessing steps whose parameters are learned from the data and therefore
    have to be fitted on the training fold only (imputation values, mean-target
    encodings, scaling statistics).

    Usage follows the fit / transform / fit_transform pattern. Neither `fit` nor
    `transform` modifies the DataFrame that is passed in.

    Parameters
    ----------
    cont_impute_cols : continuous columns imputed with the training median.
    cat_impute_cols : categorical columns imputed with the most frequent training value.
    keep_features : columns that are standard-scaled and returned by `transform`.
    square_features : columns (from `keep_features`) for which a squared 'sq_<col>'
        column is appended after scaling.
    """

    # Categorical columns that get a mean-target encoding, stored as 'ME_<col>'
    mean_encode_cols = ['Neighborhood', 'KitchenQual', 'MSSubClass', 'HeatingQC', 'MSZoning', 'HouseStyle']

    def __init__(self, cont_impute_cols, cat_impute_cols, keep_features, square_features):
        self.cont_impute_cols = cont_impute_cols
        self.cat_impute_cols = cat_impute_cols
        self.keep_features = keep_features
        self.square_features = square_features
        self.mean_enc_dict = {}

    def _add_mean_encodings(self, X):
        """Add the 'ME_<col>' columns to X (modifies and returns the frame it is given)."""
        for col in self.mean_encode_cols:
            X['ME_' + col] = X[col].map(self.mean_enc_dict[col])

            # Categories that were not seen while fitting get the overall target mean
            X.loc[X['ME_' + col].isnull(), 'ME_' + col] = self.target_mean
        return X

    def fit(self, X, y):
        """
        Learn all preprocessing parameters from the training features X and target y.

        y is taken positionally (row i of y belongs to row i of X). Returns self.
        """
        y = np.asarray(y)
        has_garage = X['GarageYrBlt'].notna().to_numpy()

        # Fit regression to impute NAs for GarageYrBlt: regress the target on the garage
        # year, then invert the fitted line to find the year at which the predicted target
        # equals the average target of the houses without a garage year.
        self.reg = LinearRegression().fit(X.loc[has_garage, ['GarageYrBlt']], y[has_garage])
        # No rows without a garage year -> nothing to average (and nothing to impute in training)
        self.avg_houseprice_nogarage = np.mean(y[~has_garage]) if (~has_garage).any() else np.nan
        self.garage_yearbuilt_impute = (self.avg_houseprice_nogarage - self.reg.intercept_) / self.reg.coef_[0]

        # Fit imputer to impute missing values
        self.num_imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(X[self.cont_impute_cols])
        self.cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit(X[self.cat_impute_cols])

        # Mean encode based on the current split: mean of the target per category
        y_by_row = pd.Series(y, index=X.index)
        for col in self.mean_encode_cols:
            self.mean_enc_dict[col] = y_by_row.groupby(X[col]).mean()

        # Save mean of target when ME's could not be created due to specific split
        self.target_mean = np.mean(y)

        # The scaler needs the encoded columns as well; build them on a copy so the
        # caller's frame is left untouched.
        # NOTE(review): as before, the scaler is fitted on the features *before* the
        # GarageYrBlt / median / most-frequent imputation, while `transform` scales the
        # imputed values - confirm whether the statistics should come from imputed data.
        encoded = self._add_mean_encodings(X.copy())
        self.standardscaler = preprocessing.StandardScaler().fit(encoded[self.keep_features])

        return self

    def transform(self, X):
        """
        Apply the fitted preprocessing to X.

        Returns a new DataFrame with the scaled `keep_features` columns followed by one
        'sq_<col>' column per entry in `square_features`, indexed like X.
        """
        X = X.copy()

        # Impute missing values based on specific strategy
        X.loc[X['GarageYrBlt'].isna(), 'GarageYrBlt'] = self.garage_yearbuilt_impute

        # Use imputer to impute missing values
        X[self.cont_impute_cols] = self.num_imputer.transform(X[self.cont_impute_cols])
        X[self.cat_impute_cols] = self.cat_imputer.transform(X[self.cat_impute_cols])

        # Transform mean encodings
        X = self._add_mean_encodings(X)

        # Standard-scale the selected features
        scaled = np.asarray(self.standardscaler.transform(X[self.keep_features]))
        out = pd.DataFrame(scaled, columns=list(self.keep_features), index=X.index)

        # After all transformations add squared features (squares of the scaled values)
        for col in self.square_features:
            out['sq_' + col] = out[col] * out[col]

        return out

    def fit_transform(self, X, y):
        """Fit on (X, y) and return the transformed X."""
        return self.fit(X, y).transform(X)

In [14]:
# Import data
# `data` is the training set that the cross-validation experiments below run on;
# `validation` is read from the prepared test file
data = pd.read_csv('prep_train.csv')
validation = pd.read_csv('prep_test.csv')

In [7]:
# Set baseline with mean and median constant predictions
# (every observation gets the same prediction: the mean / median of the target column)
constant_mean   = [np.mean(data[target])] * len(data[target])
constant_median = [np.median(data[target])] * len(data[target])

actuals = np.array(data[target])
# Despite the *_preds names, these hold the RMSE of each constant predictor
mean_preds   = sqrt(mean_squared_error(actuals, constant_mean))
median_preds = sqrt(mean_squared_error(actuals, constant_median))

print("Mean prediction at:   {0:.5f}".format(mean_preds))
print("Median prediction at: {0:.5f}".format(median_preds))

Mean prediction at:   0.39932
Median prediction at: 0.39995


In [73]:
keep_features = [
        'OverallQual',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target)
show_performance(pred_perf_dict)

Unnamed: 0,rmsle
LinearRegression,0.229947
KNeighborsRegressor,0.237087
RandomForestRegressor,0.229287
GradientBoostingRegressor,0.229494
SVR,0.229702


In [98]:
# Try and find out how different area types compare
keep_features = [
        'OverallQual',
        'GrLivArea',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        '1stFlrSF', 
        '2ndFlrSF'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target)
preds2 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'TotalIndoorSF'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds3 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        '1stFlrSF', 
        '2ndFlrSF',
        'GrLivArea'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target)
preds4 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2, preds3, preds4])

Unnamed: 0,rmsle
LinearRegression,0.204217
KNeighborsRegressor,0.281872
RandomForestRegressor,0.215825
GradientBoostingRegressor,0.193606
SVR,0.275269

Unnamed: 0,rmsle
LinearRegression,0.19671
KNeighborsRegressor,0.246562
RandomForestRegressor,0.1932
GradientBoostingRegressor,0.179792
SVR,0.243634

Unnamed: 0,rmsle
LinearRegression,0.203007
KNeighborsRegressor,0.279621
RandomForestRegressor,0.214315
GradientBoostingRegressor,0.191205
SVR,0.271755

Unnamed: 0,rmsle
LinearRegression,0.196403
KNeighborsRegressor,0.247989
RandomForestRegressor,0.191132
GradientBoostingRegressor,0.179707
SVR,0.243619


In [100]:
keep_features = [
        'OverallQual',
        'TotalBsmtSF',
        '1stFlrSF', 
        '2ndFlrSF'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'TotalSFInclBsmnt'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds2 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2])

Unnamed: 0,rmsle
LinearRegression,0.196697
KNeighborsRegressor,0.225451
RandomForestRegressor,0.184216
GradientBoostingRegressor,0.177042
SVR,0.221341

Unnamed: 0,rmsle
LinearRegression,0.197209
KNeighborsRegressor,0.234937
RandomForestRegressor,0.20668
GradientBoostingRegressor,0.17753
SVR,0.228962


In [108]:
keep_features = [
        'OverallQual',
        'LogTotalBsmtSF',
        'Log1stFlrSF', 
        'Log2ndFlrSF'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds2 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2])

Unnamed: 0,rmsle
LinearRegression,0.190614
KNeighborsRegressor,0.184297
RandomForestRegressor,0.184278
GradientBoostingRegressor,0.176834
SVR,0.17926

Unnamed: 0,rmsle
LinearRegression,0.185215
KNeighborsRegressor,0.186901
RandomForestRegressor,0.206448
GradientBoostingRegressor,0.177589
SVR,0.17917


In [112]:
# Check if garage sf or cars is better (strange corners, etc.)
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'LogGarageArea'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds2 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2])

Unnamed: 0,rmsle
LinearRegression,0.174317
KNeighborsRegressor,0.178254
RandomForestRegressor,0.193082
GradientBoostingRegressor,0.170883
SVR,0.168978

Unnamed: 0,rmsle
LinearRegression,0.180188
KNeighborsRegressor,0.17742
RandomForestRegressor,0.179898
GradientBoostingRegressor,0.170174
SVR,0.170336


In [121]:
# Check if boxcox is better than log scaling
keep_features = [
        'OverallQual',
        'BCTotalSFInclBsmnt',
        'GarageCars'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
show_performance(pred_perf_dict)

#Nope

Unnamed: 0,rmsle
LinearRegression,0.175291
KNeighborsRegressor,0.176511
RandomForestRegressor,0.193033
GradientBoostingRegressor,0.17087
SVR,0.169231


In [135]:
# Check if mean encoding or dummies work better for the neighborhoods
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        *dummie_cols
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds2 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2])
# Barely makes a difference, lets stick with mean encoding to keep it easier

Unnamed: 0,rmsle
LinearRegression,0.160549
KNeighborsRegressor,0.167709
RandomForestRegressor,0.168614
GradientBoostingRegressor,0.154145
SVR,0.159973

Unnamed: 0,rmsle
LinearRegression,0.158313
KNeighborsRegressor,0.173963
RandomForestRegressor,0.16974
GradientBoostingRegressor,0.154309
SVR,0.159939


In [140]:
# Check if adding the porches to totalSF will help
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogOpenPorchSF',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmntOpenPorch',
        'GarageCars',
        'ME_Neighborhood',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds2 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmntAllPorch',
        'GarageCars',
        'ME_Neighborhood',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds3 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogOpenPorchSF',
        'LogEnclosedPorch', 
        'Log3SsnPorch', 
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds4 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogAllPorches',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds5 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2, preds3, preds4, preds5])

Unnamed: 0,rmsle
LinearRegression,0.160051
KNeighborsRegressor,0.181946
RandomForestRegressor,0.167875
GradientBoostingRegressor,0.153525
SVR,0.160827

Unnamed: 0,rmsle
LinearRegression,0.160437
KNeighborsRegressor,0.167378
RandomForestRegressor,0.168618
GradientBoostingRegressor,0.154018
SVR,0.159574

Unnamed: 0,rmsle
LinearRegression,0.161433
KNeighborsRegressor,0.167485
RandomForestRegressor,0.167577
GradientBoostingRegressor,0.156026
SVR,0.160651

Unnamed: 0,rmsle
LinearRegression,0.15943
KNeighborsRegressor,0.189555
RandomForestRegressor,0.166518
GradientBoostingRegressor,0.153973
SVR,0.162762

Unnamed: 0,rmsle
LinearRegression,0.160593
KNeighborsRegressor,0.184049
RandomForestRegressor,0.165972
GradientBoostingRegressor,0.155032
SVR,0.161752


In [142]:
# Doesn't do much; maybe it helps if we turn them into dummies
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'HasOpenPorch',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'HasPorch',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds2 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2])
# Doesn't do much either

Unnamed: 0,rmsle
LinearRegression,0.160164
KNeighborsRegressor,0.169851
RandomForestRegressor,0.168784
GradientBoostingRegressor,0.153201
SVR,0.160996

Unnamed: 0,rmsle
LinearRegression,0.160578
KNeighborsRegressor,0.170882
RandomForestRegressor,0.167338
GradientBoostingRegressor,0.15441
SVR,0.161418


In [144]:
# What about wooden deck?
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'HasWoodDeck',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogWoodDeckSF',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds2 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2])
# Too small differences to keep them

Unnamed: 0,rmsle
LinearRegression,0.159597
KNeighborsRegressor,0.170479
RandomForestRegressor,0.166955
GradientBoostingRegressor,0.153508
SVR,0.160534

Unnamed: 0,rmsle
LinearRegression,0.159269
KNeighborsRegressor,0.177172
RandomForestRegressor,0.16663
GradientBoostingRegressor,0.153695
SVR,0.16005


In [147]:
# Lot area is probably going to be the same but lets try anyway
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogLotArea',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds2 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2])
# We'll keep yard area as it's a little better

Unnamed: 0,rmsle
LinearRegression,0.157093
KNeighborsRegressor,0.170011
RandomForestRegressor,0.164839
GradientBoostingRegressor,0.152224
SVR,0.156722

Unnamed: 0,rmsle
LinearRegression,0.157331
KNeighborsRegressor,0.1697
RandomForestRegressor,0.163998
GradientBoostingRegressor,0.151912
SVR,0.156825


In [152]:
# Now that we have some of the most important features, check for outliers that might have a big influence
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
show_performance(pred_perf_dict)
# Removing two outliers in totalSF vs logprice makes a big difference for the regression models

9.371863806132854
8.835355971121606


Unnamed: 0,rmsle
LinearRegression,0.152042
KNeighborsRegressor,0.168519
RandomForestRegressor,0.159457
GradientBoostingRegressor,0.15207
SVR,0.152005


In [155]:
# Compare different ways of representing the bathroom counts
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'FullBath', 'HalfBath', 
        'BsmtFullBath', 'BsmtHalfBath', 
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'NormalBathScore',
        'BsmtBathScore',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds2 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathScore',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds3 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllHalfBaths',
        'AllFullBaths'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds4 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds5 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2, preds3, preds4, preds5])
# Apparently simply adding all the baths up works best

Unnamed: 0,rmsle
LinearRegression,0.145138
KNeighborsRegressor,0.16804
RandomForestRegressor,0.153702
GradientBoostingRegressor,0.14666
SVR,0.148655

Unnamed: 0,rmsle
LinearRegression,0.146835
KNeighborsRegressor,0.165162
RandomForestRegressor,0.15232
GradientBoostingRegressor,0.146207
SVR,0.148664

Unnamed: 0,rmsle
LinearRegression,0.147731
KNeighborsRegressor,0.164705
RandomForestRegressor,0.153767
GradientBoostingRegressor,0.145905
SVR,0.148805

Unnamed: 0,rmsle
LinearRegression,0.145398
KNeighborsRegressor,0.166624
RandomForestRegressor,0.153529
GradientBoostingRegressor,0.146946
SVR,0.147682

Unnamed: 0,rmsle
LinearRegression,0.145558
KNeighborsRegressor,0.164823
RandomForestRegressor,0.152992
GradientBoostingRegressor,0.144543
SVR,0.146867


In [159]:
# Since yearbuilt has the highest correlation with logsaleprice lets add that one first
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds1 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearRemodAdd',
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds2 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds3 = show_performance(pred_perf_dict)

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearRemodAdd',
        'DiffBuiltRemod'
]

pred_perf_dict, predictions_df = feature_testing_pipe(data, keep_features, target, new_features=True)
preds4 = show_performance(pred_perf_dict)

display_side_by_side(*[preds1, preds2, preds3, preds4])
# Adding both gives the best result for the GBR, but the different scales mess with some models (KNN and SVR get worse)

Unnamed: 0,rmsle
LinearRegression,0.1442
KNeighborsRegressor,0.201225
RandomForestRegressor,0.148117
GradientBoostingRegressor,0.140869
SVR,0.336266

Unnamed: 0,rmsle
LinearRegression,0.14115
KNeighborsRegressor,0.180645
RandomForestRegressor,0.146082
GradientBoostingRegressor,0.140484
SVR,0.356484

Unnamed: 0,rmsle
LinearRegression,0.140834
KNeighborsRegressor,0.220265
RandomForestRegressor,0.144089
GradientBoostingRegressor,0.137819
SVR,0.331479

Unnamed: 0,rmsle
LinearRegression,0.140834
KNeighborsRegressor,0.220859
RandomForestRegressor,0.145058
GradientBoostingRegressor,0.139878
SVR,0.342782


In [161]:
# Since YearBuilt has the highest correlation with LogSalePrice, let's add that one first.
# This cell compares the 'ST'-prefixed versions of the year columns.
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
]
candidate_feature_sets = [
        base_features + ['STYearBuilt'],
        base_features + ['STYearRemodAdd'],
        base_features + ['STYearBuilt', 'STYearRemodAdd'],
        base_features + ['STYearRemodAdd', 'DiffBuiltRemod'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2, preds3, preds4 = cv_results
display_side_by_side(*cv_results)
# Already a lot better for the SVR, no effect on the other algorithms

Unnamed: 0,rmsle
LinearRegression,0.1442
KNeighborsRegressor,0.201225
RandomForestRegressor,0.148179
GradientBoostingRegressor,0.140869
SVR,0.171991

Unnamed: 0,rmsle
LinearRegression,0.14115
KNeighborsRegressor,0.180645
RandomForestRegressor,0.146084
GradientBoostingRegressor,0.140484
SVR,0.15541

Unnamed: 0,rmsle
LinearRegression,0.140834
KNeighborsRegressor,0.220265
RandomForestRegressor,0.144074
GradientBoostingRegressor,0.137819
SVR,0.180926

Unnamed: 0,rmsle
LinearRegression,0.140834
KNeighborsRegressor,0.220859
RandomForestRegressor,0.14503
GradientBoostingRegressor,0.139878
SVR,0.17873


In [164]:
# Since YearBuilt has the highest correlation with LogSalePrice, let's add that one first.
# This cell compares the 'StSc'-prefixed versions of the year columns.
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
]
candidate_feature_sets = [
        base_features + ['StScYearBuilt'],
        base_features + ['StScYearRemodAdd'],
        base_features + ['StScYearBuilt', 'StScYearRemodAdd'],
        base_features + ['StScYearRemodAdd', 'StScDiffBuiltRemod'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2, preds3, preds4 = cv_results
display_side_by_side(*cv_results)
# Improves quite a lot for KNN and SVR

Unnamed: 0,rmsle
LinearRegression,0.1442
KNeighborsRegressor,0.162729
RandomForestRegressor,0.148248
GradientBoostingRegressor,0.140797
SVR,0.145876

Unnamed: 0,rmsle
LinearRegression,0.14115
KNeighborsRegressor,0.164191
RandomForestRegressor,0.146101
GradientBoostingRegressor,0.140482
SVR,0.143166

Unnamed: 0,rmsle
LinearRegression,0.140834
KNeighborsRegressor,0.161598
RandomForestRegressor,0.144016
GradientBoostingRegressor,0.137808
SVR,0.142047

Unnamed: 0,rmsle
LinearRegression,0.140834
KNeighborsRegressor,0.162367
RandomForestRegressor,0.145213
GradientBoostingRegressor,0.139873
SVR,0.176178


In [173]:
# Since YearBuilt has the highest correlation with LogSalePrice, let's add that one first.
# NOTE(review): these are the same column names as the earlier raw-year comparison; the
# recorded scores differ, presumably because the pipeline's scaling changed in between -- confirm.
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
]
candidate_feature_sets = [
        base_features + ['YearBuilt'],
        base_features + ['YearRemodAdd'],
        base_features + ['YearBuilt', 'YearRemodAdd'],
        base_features + ['YearRemodAdd', 'DiffBuiltRemod'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2, preds3, preds4 = cv_results
display_side_by_side(*cv_results)
# We had better standard-scale all of them
# We keep both YearBuilt and YearRemodAdd, as multicollinearity does not seem to be a big issue yet

Unnamed: 0,rmsle
LinearRegression,0.1442
KNeighborsRegressor,0.150923
RandomForestRegressor,0.148235
GradientBoostingRegressor,0.140781
SVR,0.147884

Unnamed: 0,rmsle
LinearRegression,0.14115
KNeighborsRegressor,0.151278
RandomForestRegressor,0.146172
GradientBoostingRegressor,0.140533
SVR,0.148798

Unnamed: 0,rmsle
LinearRegression,0.140834
KNeighborsRegressor,0.149231
RandomForestRegressor,0.144091
GradientBoostingRegressor,0.137827
SVR,0.146474

Unnamed: 0,rmsle
LinearRegression,0.140834
KNeighborsRegressor,0.148805
RandomForestRegressor,0.145145
GradientBoostingRegressor,0.139893
SVR,0.145328


In [178]:
# Does the garage build year (raw, or as a difference column) add anything?
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
]
candidate_feature_sets = [
        base_features + ['GarageYrBlt'],
        base_features + ['DiffBuiltGarage'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2 = cv_results
display_side_by_side(*cv_results)

# Adding the year the garage was built does not add anything after that

Unnamed: 0,rmsle
LinearRegression,0.141037
KNeighborsRegressor,0.153189
RandomForestRegressor,0.144167
GradientBoostingRegressor,0.139161
SVR,0.144175

Unnamed: 0,rmsle
LinearRegression,0.140888
KNeighborsRegressor,0.155325
RandomForestRegressor,0.144602
GradientBoostingRegressor,0.138598
SVR,0.146336


In [186]:
# Let's see if any other garage variables can still add something
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
]
candidate_feature_sets = [
        base_features + ['ME_GarageFinish'],
        base_features + ['ME_GarageType'],
        base_features + ['ME_GarageTypeClean'],
        base_features + ['GarageQual'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2, preds3, preds4 = cv_results
display_side_by_side(*cv_results)

Unnamed: 0,rmsle
LinearRegression,0.140345
KNeighborsRegressor,0.149585
RandomForestRegressor,0.144225
GradientBoostingRegressor,0.138385
SVR,0.144939

Unnamed: 0,rmsle
LinearRegression,0.140576
KNeighborsRegressor,0.151648
RandomForestRegressor,0.144131
GradientBoostingRegressor,0.13795
SVR,0.145472

Unnamed: 0,rmsle
LinearRegression,0.140674
KNeighborsRegressor,0.151359
RandomForestRegressor,0.144327
GradientBoostingRegressor,0.137284
SVR,0.145564

Unnamed: 0,rmsle
LinearRegression,0.139735
KNeighborsRegressor,0.151727
RandomForestRegressor,0.144678
GradientBoostingRegressor,0.136936
SVR,0.145483


In [189]:
# Or a combination of the garage variables
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
]
candidate_feature_sets = [
        base_features + ['GarageTypeFinish'],
        base_features + ['GarageTypeQual'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2 = cv_results
display_side_by_side(*cv_results)
# All too little to go on, so we leave out the other garage variables altogether

Unnamed: 0,rmsle
LinearRegression,0.140283
KNeighborsRegressor,0.149761
RandomForestRegressor,0.144266
GradientBoostingRegressor,0.138161
SVR,0.14514

Unnamed: 0,rmsle
LinearRegression,0.139703
KNeighborsRegressor,0.151245
RandomForestRegressor,0.144204
GradientBoostingRegressor,0.13691
SVR,0.14495


In [194]:
# Do the other quality variables improve anything?
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
]
candidate_feature_sets = [
        base_features + ['ExterQual'],
        base_features + ['BsmtQual'],
        base_features + ['KitchenQual'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2, preds3 = cv_results
display_side_by_side(*cv_results)
# It's only a small improvement, but let's keep KitchenQual

Unnamed: 0,rmsle
LinearRegression,0.140317
KNeighborsRegressor,0.151559
RandomForestRegressor,0.144388
GradientBoostingRegressor,0.137929
SVR,0.14701

Unnamed: 0,rmsle
LinearRegression,0.140922
KNeighborsRegressor,0.151377
RandomForestRegressor,0.143748
GradientBoostingRegressor,0.137764
SVR,0.146691

Unnamed: 0,rmsle
LinearRegression,0.138966
KNeighborsRegressor,0.149298
RandomForestRegressor,0.143966
GradientBoostingRegressor,0.137637
SVR,0.144592


In [198]:
# Does the mean encoding work better than the continuous variable as it is now?
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features=[], target=target, new_features=True)
show_performance(pred_perf_dict)
# No difference

Unnamed: 0,rmsle
LinearRegression,0.138964
KNeighborsRegressor,0.149824
RandomForestRegressor,0.143968
GradientBoostingRegressor,0.137637
SVR,0.144517


In [None]:
# Should we add foundation?
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_Foundation'
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features=[], target=target, new_features=True)
show_performance(pred_perf_dict)
# Again no improvement

In [206]:
# MSSubClass is still a continuous variable while it is actually categorical
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
]
candidate_feature_sets = [
        base_features + ['ME_MSSubClass'],
        base_features + ['ME_MSSubClassGR'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2 = cv_results
display_side_by_side(*cv_results)
# A little better with the normal version

Unnamed: 0,rmsle
LinearRegression,0.138804
KNeighborsRegressor,0.153706
RandomForestRegressor,0.142744
GradientBoostingRegressor,0.135451
SVR,0.143702

Unnamed: 0,rmsle
LinearRegression,0.139026
KNeighborsRegressor,0.152983
RandomForestRegressor,0.142937
GradientBoostingRegressor,0.136937
SVR,0.144408


In [209]:
# Add heating quality, either as-is or mean-encoded
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
]
candidate_feature_sets = [
        base_features + ['HeatingQC'],
        base_features + ['ME_HeatingQC'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2 = cv_results
display_side_by_side(*cv_results)
# Tiny improvement

Unnamed: 0,rmsle
LinearRegression,0.138298
KNeighborsRegressor,0.151367
RandomForestRegressor,0.142181
GradientBoostingRegressor,0.13503
SVR,0.143503

Unnamed: 0,rmsle
LinearRegression,0.138421
KNeighborsRegressor,0.150521
RandomForestRegressor,0.142217
GradientBoostingRegressor,0.135051
SVR,0.143407


In [210]:
# Fireplace count, fireplace quality, or both?
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
]
candidate_feature_sets = [
        base_features + ['Fireplaces'],
        base_features + ['FireplaceQu'],
        base_features + ['Fireplaces', 'FireplaceQu'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2, preds3 = cv_results
display_side_by_side(*cv_results)
# Keep both

Unnamed: 0,rmsle
LinearRegression,0.135785
KNeighborsRegressor,0.149998
RandomForestRegressor,0.14169
GradientBoostingRegressor,0.133341
SVR,0.141751

Unnamed: 0,rmsle
LinearRegression,0.136361
KNeighborsRegressor,0.149398
RandomForestRegressor,0.140714
GradientBoostingRegressor,0.132493
SVR,0.140971

Unnamed: 0,rmsle
LinearRegression,0.135825
KNeighborsRegressor,0.151032
RandomForestRegressor,0.140888
GradientBoostingRegressor,0.132807
SVR,0.140938


In [220]:
# Check the YearMo trend
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
]
candidate_feature_sets = [
        base_features + ['ME_YearMo'],
        base_features + ['YearMo_smoothline'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2 = cv_results
display_side_by_side(*cv_results)
# Don't keep it

Unnamed: 0,rmsle
LinearRegression,0.136441
KNeighborsRegressor,0.157748
RandomForestRegressor,0.140872
GradientBoostingRegressor,0.133917
SVR,0.144943

Unnamed: 0,rmsle
LinearRegression,0.136022
KNeighborsRegressor,0.154871
RandomForestRegressor,0.142674
GradientBoostingRegressor,0.133975
SVR,0.141744


In [223]:
# Check the GasA dummy
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'Heating_GasA'
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features=[], target=target, new_features=True)
show_performance(pred_perf_dict)
# Nope

Unnamed: 0,rmsle
LinearRegression,0.135862
KNeighborsRegressor,0.155604
RandomForestRegressor,0.140012
GradientBoostingRegressor,0.132767
SVR,0.141482


In [231]:
# Check MasVnrArea (log-transformed column)
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'LogMasVnrArea'
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features=[], target=target, new_features=True)
show_performance(pred_perf_dict)
# Nope

Unnamed: 0,rmsle
LinearRegression,0.135489
KNeighborsRegressor,0.153077
RandomForestRegressor,0.141111
GradientBoostingRegressor,0.133171
SVR,0.144171


In [233]:
# Check MasVnrType (mean-encoded column)
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'ME_MasVnrType'
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features=[], target=target, new_features=True)
show_performance(pred_perf_dict)
# Nope

Unnamed: 0,rmsle
LinearRegression,0.135731
KNeighborsRegressor,0.152667
RandomForestRegressor,0.140348
GradientBoostingRegressor,0.13312
SVR,0.141147


In [235]:
# Check MSZoning (mean-encoded column)
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'ME_MSZoning'
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features=[], target=target, new_features=True)
show_performance(pred_perf_dict)
# A tiny bit of improvement

Unnamed: 0,rmsle
LinearRegression,0.135079
KNeighborsRegressor,0.153694
RandomForestRegressor,0.139581
GradientBoostingRegressor,0.13279
SVR,0.143123


In [236]:
# CentralAir
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'ME_MSZoning',
        'CentralAir'
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features=[], target=target, new_features=True)
show_performance(pred_perf_dict)
# Tiny bit

Unnamed: 0,rmsle
LinearRegression,0.133871
KNeighborsRegressor,0.154038
RandomForestRegressor,0.139504
GradientBoostingRegressor,0.132892
SVR,0.139072


In [240]:
# SaleCondition (mean-encoded column); this cell's list does not include CentralAir
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'ME_MSZoning',
        'ME_SaleCondition'
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features=[], target=target, new_features=True)
show_performance(pred_perf_dict)
# Some went up but most down

Unnamed: 0,rmsle
LinearRegression,0.133773
KNeighborsRegressor,0.156406
RandomForestRegressor,0.139823
GradientBoostingRegressor,0.133523
SVR,0.14127


In [245]:
# Is the house new?
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'ME_MSZoning',
        'IsNew'
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features=[], target=target,
    new_features=True, new_session=True)
show_performance(pred_perf_dict)
# Nope

Unnamed: 0,rmsle
LinearRegression,0.1346
KNeighborsRegressor,0.156778
RandomForestRegressor,0.140791
GradientBoostingRegressor,0.13334
SVR,0.14249


In [247]:
# HouseStyle vs MSSubClass: both, MSSubClass only, HouseStyle only
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'ME_MSZoning',
]
candidate_feature_sets = [
        # both variables
        base_features + ['ME_HouseStyle'],
        # ME_MSSubClass only (the base as-is)
        list(base_features),
        # ME_HouseStyle only: drop ME_MSSubClass while keeping the column order
        [feature for feature in base_features if feature != 'ME_MSSubClass'] + ['ME_HouseStyle'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2, preds3 = cv_results
display_side_by_side(*cv_results)
# Just keep as is

Unnamed: 0,rmsle
LinearRegression,0.134803
KNeighborsRegressor,0.154983
RandomForestRegressor,0.139704
GradientBoostingRegressor,0.132454
SVR,0.141357

Unnamed: 0,rmsle
LinearRegression,0.135079
KNeighborsRegressor,0.153694
RandomForestRegressor,0.139581
GradientBoostingRegressor,0.13279
SVR,0.143123

Unnamed: 0,rmsle
LinearRegression,0.135098
KNeighborsRegressor,0.155776
RandomForestRegressor,0.140096
GradientBoostingRegressor,0.134425
SVR,0.139739


In [248]:
# Basement finish type
keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'ME_MSZoning',
        'BsmtFinType1'
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features=[], target=target, new_features=True)
show_performance(pred_perf_dict)
# Nope

Unnamed: 0,rmsle
LinearRegression,0.133187
KNeighborsRegressor,0.155495
RandomForestRegressor,0.13988
GradientBoostingRegressor,0.133238
SVR,0.141223


In [254]:
# Basement score, raw vs. log-transformed
base_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'ME_MSZoning',
]
candidate_feature_sets = [
        base_features + ['BsmtScore'],
        base_features + ['LogBsmtScore'],
]

# feature_testing_pipe expects square_features as its third argument; pass it
# explicitly (empty = no squared terms) so that target is not bound to it.
# NOTE(review): assumes the pipe accepts an empty square_features list -- confirm.
cv_results = []
for keep_features in candidate_feature_sets:
    pred_perf_dict, predictions_df = feature_testing_pipe(
        data, keep_features, square_features=[], target=target, new_features=True)
    cv_results.append(show_performance(pred_perf_dict))

preds1, preds2 = cv_results
display_side_by_side(*cv_results)
# Improves a bit

Unnamed: 0,rmsle
LinearRegression,0.131189
KNeighborsRegressor,0.153897
RandomForestRegressor,0.137753
GradientBoostingRegressor,0.132359
SVR,0.142519

Unnamed: 0,rmsle
LinearRegression,0.132242
KNeighborsRegressor,0.15508
RandomForestRegressor,0.137688
GradientBoostingRegressor,0.132323
SVR,0.142355


In [20]:
# Add squared features: start with YearBuilt only
square_features = ['YearBuilt']

keep_features = [
        'OverallQual',
        'LogTotalSFInclBsmnt',
        'GarageCars',
        'ME_Neighborhood',
        'LogYardArea',
        'AllBathsSum',
        'YearBuilt',
        'YearRemodAdd',
        'ME_KitchenQual',
        'ME_MSSubClass',
        'HeatingQC',
        'Fireplaces',
        'FireplaceQu',
        'ME_MSZoning',
        'BsmtScore',
]

# square_features was defined above but never handed to the pipe; feature_testing_pipe
# takes it as its third positional argument (before target), so pass it explicitly.
pred_perf_dict, predictions_df = feature_testing_pipe(
    data, keep_features, square_features, target, new_session=True, new_features=True)
show_performance(pred_perf_dict)

Unnamed: 0,rmsle
LinearRegression,0.13074
KNeighborsRegressor,0.154771
RandomForestRegressor,0.138236
GradientBoostingRegressor,0.132101
SVR,0.139437


In [26]:
# Squared terms for both year columns
square_features = ['YearBuilt', 'YearRemodAdd']

# Columns fed to the pipe, written as grouped chunks (order is unchanged)
keep_features = (
    ['OverallQual', 'LogTotalSFInclBsmnt', 'GarageCars', 'ME_Neighborhood']
    + ['LogYardArea', 'AllBathsSum', 'YearBuilt', 'YearRemodAdd']
    + ['ME_KitchenQual', 'ME_MSSubClass', 'HeatingQC']
    + ['Fireplaces', 'FireplaceQu', 'ME_MSZoning', 'BsmtScore']
)

# Cross-validate with the squared terms and show the per-model scores
pred_perf_dict, predictions_df = feature_testing_pipe(
    data=data,
    keep_features=keep_features,
    square_features=square_features,
    target=target,
    new_features=True,
)
show_performance(pred_perf_dict)

sq_YearBuilt
sq_YearRemodAdd
sq_YearBuilt
sq_YearRemodAdd
sq_YearBuilt
sq_YearRemodAdd
sq_YearBuilt
sq_YearRemodAdd
sq_YearBuilt
sq_YearRemodAdd
sq_YearBuilt
sq_YearRemodAdd
sq_YearBuilt
sq_YearRemodAdd
sq_YearBuilt
sq_YearRemodAdd
sq_YearBuilt
sq_YearRemodAdd
sq_YearBuilt
sq_YearRemodAdd


Unnamed: 0,rmsle
LinearRegression,0.130529
KNeighborsRegressor,0.155395
RandomForestRegressor,0.137993
GradientBoostingRegressor,0.132267
SVR,0.13741


In [28]:
# Squared terms: both year columns plus the log total square footage
square_features = ['YearBuilt', 'YearRemodAdd'] + ['LogTotalSFInclBsmnt']

# Columns fed to the pipe, written as grouped chunks (order is unchanged)
keep_features = (
    ['OverallQual', 'LogTotalSFInclBsmnt', 'GarageCars', 'ME_Neighborhood']
    + ['LogYardArea', 'AllBathsSum', 'YearBuilt', 'YearRemodAdd']
    + ['ME_KitchenQual', 'ME_MSSubClass', 'HeatingQC']
    + ['Fireplaces', 'FireplaceQu', 'ME_MSZoning', 'BsmtScore']
)

# Cross-validate with the squared terms and show the per-model scores
pred_perf_dict, predictions_df = feature_testing_pipe(
    data=data,
    keep_features=keep_features,
    square_features=square_features,
    target=target,
    new_features=True,
)
show_performance(pred_perf_dict)

Unnamed: 0,rmsle
LinearRegression,0.12936
KNeighborsRegressor,0.153657
RandomForestRegressor,0.138766
GradientBoostingRegressor,0.13117
SVR,0.145612


In [32]:
# Squared terms: year columns, log total square footage and fireplace quality
square_features = ['YearBuilt', 'YearRemodAdd'] + ['LogTotalSFInclBsmnt', 'FireplaceQu']

# Columns fed to the pipe, written as grouped chunks (order is unchanged)
keep_features = (
    ['OverallQual', 'LogTotalSFInclBsmnt', 'GarageCars', 'ME_Neighborhood']
    + ['LogYardArea', 'AllBathsSum', 'YearBuilt', 'YearRemodAdd']
    + ['ME_KitchenQual', 'ME_MSSubClass', 'HeatingQC']
    + ['Fireplaces', 'FireplaceQu', 'ME_MSZoning', 'BsmtScore']
)

# Cross-validate with the squared terms and show the per-model scores
pred_perf_dict, predictions_df = feature_testing_pipe(
    data=data,
    keep_features=keep_features,
    square_features=square_features,
    target=target,
    new_features=True,
)
show_performance(pred_perf_dict)

Unnamed: 0,rmsle
LinearRegression,0.129167
KNeighborsRegressor,0.15382
RandomForestRegressor,0.138523
GradientBoostingRegressor,0.131116
SVR,0.145724


In [None]:
# Final feature set
# Columns that additionally get a squared term
square_features = ['YearBuilt', 'YearRemodAdd'] + ['LogTotalSFInclBsmnt', 'FireplaceQu']

# Columns kept for modelling, written as grouped chunks (order is unchanged)
keep_features = (
    ['OverallQual', 'LogTotalSFInclBsmnt', 'GarageCars', 'ME_Neighborhood']
    + ['LogYardArea', 'AllBathsSum', 'YearBuilt', 'YearRemodAdd']
    + ['ME_KitchenQual', 'ME_MSSubClass', 'HeatingQC']
    + ['Fireplaces', 'FireplaceQu', 'ME_MSZoning', 'BsmtScore']
)