In [1]:
from functions import *
from settings import *

%store -r __RequiredPackages
%store -r __JupyterOptions

In [2]:
# Display the value that `%store -r __RequiredPackages` restored in the first cell
__RequiredPackages

In [3]:
# Display the value that `%store -r __JupyterOptions` restored in the first cell
__JupyterOptions

Populating the interactive namespace from numpy and matplotlib


In [4]:
# Read train.csv into `data` and test.csv into `validation`
data, validation = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [29]:
def feature_testing_pipe(data, keep_features, target, NA_means_not_there_cols=NA_means_not_there_cols, 
                         cont_impute_cols=cont_impute_cols, cat_impute_cols=cat_impute_cols, 
                         dev_seed=dev_seed, new_session=False, new_features=False,
                         mean_enc_cols=None, scale_cont_cols=None):
    """
    Cross-validate a fixed set of regressors on one set of features.

    Optionally runs standard_preprocessing_function on `data`, then for each of
    five shuffled KFold splits fits leakage_preventive_preprocessing_function on
    the training rows only and hands the prepared folds to fit_model for every
    model. Used to quickly compare different sets of features.

    Parameters
    ----------
    data : DataFrame holding the features and the `target` column.
    keep_features : columns the models are trained on.
    target : name of the column to predict.
    NA_means_not_there_cols : forwarded to standard_preprocessing_function.
    cont_impute_cols, cat_impute_cols : forwarded to the per-fold preprocessor.
    dev_seed : random state of the KFold splitter.
    new_session, new_features : when either is true, `data` is first passed
        through standard_preprocessing_function.
    mean_enc_cols : column(s) to mean-encode inside each fold, forwarded to the
        per-fold preprocessor.
    scale_cont_cols : column(s) to log-scale, forwarded to
        standard_preprocessing_function.

    Returns
    -------
    (pred_perf_dict, predictions_df) : the per-model list of fold scores and the
        frame of out-of-fold predictions, both as filled in by fit_model.

    Rows are selected by position, so `data` does not need a gap-free index.
    """
    # Transform features that dont create data leakage issues
    if new_features or new_session:
        data = standard_preprocessing_function(data, new_session, NA_means_not_there_cols, scale_cont_cols)

    # Create different splits of train and test
    kf = KFold(n_splits=5, shuffle=True, random_state=dev_seed)

    # Initiate empty objects for further analysis
    model_names = ['LinearRegression', 'KNeighborsRegressor', 'RandomForestRegressor',
                   'GradientBoostingRegressor', 'SVR']
    pred_perf_dict = {name: [] for name in model_names}
    predictions_df = pd.DataFrame({'obs_nr': data.index})

    # Split features and target
    X = data.drop(target, axis=1)
    y = data[target]

    # Fit models while looping through the train/test-splits
    for cv_round, (train_pos, test_pos) in enumerate(kf.split(X), start=1):
        # KFold yields positions, not labels: use iloc, and copy so the
        # preprocessor never works on a view of X
        X_train, X_test = X.iloc[train_pos].copy(), X.iloc[test_pos].copy()
        y_train, y_test = y.values[train_pos], y.values[test_pos]

        # 'obs_nr' in predictions_df holds index labels, so translate the
        # test positions to labels before they are used as the merge key
        test_obs = X.index[test_pos].to_numpy()

        # Fit the leakage-sensitive steps on the training fold only
        prepper = leakage_preventive_preprocessing_function(cont_impute_cols, cat_impute_cols,
                                                            mean_enc_cols, keep_features)
        X_train = prepper.fit_transform(X_train, y_train)
        X_test = prepper.transform(X_test)

        # Fit models
        for model_name in model_names:
            pred_perf_dict, predictions_df = fit_model(model_name, X_train, y_train, X_test, y_test, test_obs,
                                                       pred_perf_dict, predictions_df, cv_round)

    return pred_perf_dict, predictions_df
        
def fit_model(model_name, X_train, y_train, X_test, y_test, test_index, 
              pred_perf_dict, predictions_df, cv_round, n_jobs=n_jobs, dev_seed=dev_seed):
    """
    Fit one regressor on a train fold and record its test-fold results.

    Parameters
    ----------
    model_name : one of 'LinearRegression', 'KNeighborsRegressor',
        'RandomForestRegressor', 'GradientBoostingRegressor', 'SVR'.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : data the model is scored on.
    test_index : 'obs_nr' values of the test rows, used as the merge key.
    pred_perf_dict : dict of model name -> list of fold scores; the new score
        is appended in place.
    predictions_df : frame with an 'obs_nr' column; the predictions are
        left-merged in as column `model_name + str(cv_round)`.
    cv_round : number of the current cross-validation round.
    n_jobs : passed to the estimators that accept it.
    dev_seed : random state of the tree-based estimators.

    Returns
    -------
    (pred_perf_dict, predictions_df)

    Raises
    ------
    ValueError : if `model_name` is not one of the supported names.
    """
    # Factories rather than instances, so only the requested estimator is built
    model_factories = {
        'LinearRegression': lambda: LinearRegression(n_jobs=n_jobs),
        'KNeighborsRegressor': lambda: neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=n_jobs),
        'RandomForestRegressor': lambda: RandomForestRegressor(n_estimators=50, random_state=dev_seed,
                                                               n_jobs=n_jobs),
        'GradientBoostingRegressor': lambda: GradientBoostingRegressor(random_state=dev_seed),
        'SVR': lambda: SVR(kernel='rbf'),
    }
    if model_name not in model_factories:
        raise ValueError('Unknown model_name {!r}; expected one of {}'.format(model_name,
                                                                             sorted(model_factories)))

    model = model_factories[model_name]().fit(X_train, y_train)
    preds = model.predict(X_test)

    # Score = square root of the mean squared error on the test fold
    pred_perf_dict.setdefault(model_name, []).append(sqrt(mean_squared_error(y_test, preds)))

    # One prediction column per model and round, aligned on obs_nr
    temp_preds = pd.DataFrame({'obs_nr': test_index, model_name + str(cv_round): preds})
    predictions_df = predictions_df.merge(temp_preds, how='left', on='obs_nr')

    return pred_perf_dict, predictions_df

def show_performance(pred_perf_dict):
    """
    Average the per-fold scores of every model.

    Takes the dict of model name -> list of scores and returns a one-column
    DataFrame ('rmsle') indexed by model name.
    """
    fold_scores = pd.DataFrame(pred_perf_dict)
    mean_per_model = fold_scores.mean(axis=0)
    return mean_per_model.to_frame(name='rmsle')

def standard_preprocessing_function(data, new_session, NA_means_not_there_cols, scale_cont_cols):
    """
    Prepare the dataset across train and testset.
    No data leakage issues at this stage: every step is row-wise or uses fixed constants.

    Parameters
    ----------
    data : DataFrame to prepare. It is copied; the caller's frame is left untouched.
    new_session : accepted for interface compatibility; not used by this function.
    NA_means_not_there_cols : columns in which a missing value is filled with 'Not_present'.
    scale_cont_cols : a column name or an iterable of column names; for each one
        a 'Log<col>' column holding log1p of the values is added. None adds none.

    Returns
    -------
    A new DataFrame with the derived columns added, ordinal columns recoded by
    replace_ordinal_values, rows with log1p(TotalSFInclBsmnt) >= 8.9 removed and
    the index reset.
    """
    # Work on a copy so re-running a cell never sees a half-modified input frame
    data = data.copy()

    # Add a log scaled version of the sales price (use np.exp on predictions to scale back).
    # Skipped for frames that carry no SalePrice column.
    if 'SalePrice' in data.columns:
        data['LogSalePrice'] = np.log(data['SalePrice'])

    # Impute missing values where they are not at random
    data[NA_means_not_there_cols] = data[NA_means_not_there_cols].fillna('Not_present')
    data['LotFrontage'] = data['LotFrontage'].fillna(0)

    # Create aggregate features
    data['AllBathsSum'] = data[['BsmtHalfBath', 'HalfBath', 'BsmtFullBath', 'FullBath']].sum(axis=1)
    data['TotalSFInclBsmnt'] = data[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']].sum(axis=1)
    data['YardArea'] = data['LotArea'] - data['1stFlrSF'] - data['GarageArea']

    # Accept a single column name as well as a collection of names
    if scale_cont_cols is None:
        log_cols = []
    elif isinstance(scale_cont_cols, str):
        log_cols = [scale_cont_cols]
    else:
        log_cols = list(scale_cont_cols)

    # Make a more even distribution for continuous features as well
    for col in log_cols:
        data['Log' + col] = np.log1p(data[col])

    # Transform existing variables
    data = replace_ordinal_values(data)

    # Remove outliers. The threshold is applied to log1p(TotalSFInclBsmnt) computed here,
    # so the filter works whether or not that column was listed in scale_cont_cols.
    log_total_sf = np.log1p(data['TotalSFInclBsmnt'])
    data = data[log_total_sf < 8.9].reset_index(drop=True)

    return data


class leakage_preventive_preprocessing_function():
    """
    Preprocessing steps whose parameters are learned from data and therefore
    must be fitted on the training rows only.

    fit() learns a GarageYrBlt fill value, median / most-frequent imputers,
    per-category target means (mean encoding) and a StandardScaler;
    transform() applies them to any frame and returns the scaled
    `keep_features` columns. Neither method modifies the frame it is given.

    Parameters
    ----------
    cont_impute_cols : columns imputed with the training median.
    cat_impute_cols : columns imputed with the most frequent training value.
    mean_enc_cols : a column name or an iterable of column names to mean-encode;
        each produces a 'ME_<col>' feature. None encodes nothing.
    keep_features : columns returned by transform().
    """

    def __init__(self, cont_impute_cols, cat_impute_cols, mean_enc_cols, keep_features):
        self.cont_impute_cols = cont_impute_cols
        self.cat_impute_cols = cat_impute_cols
        self.mean_enc_cols = mean_enc_cols
        self.keep_features = keep_features
        self.mean_enc_dict = {}

    @staticmethod
    def _as_list(cols):
        """Normalise None / a single column name / an iterable of names to a list."""
        if cols is None:
            return []
        if isinstance(cols, str):
            return [cols]
        return list(cols)

    def _build_features(self, X):
        """Return a copy of X with all fitted imputations and mean encodings applied."""
        feats = X.copy()

        # Impute missing garage years with the value learned in fit()
        feats.loc[feats['GarageYrBlt'].isna(), 'GarageYrBlt'] = self.garage_yearbuilt_impute

        # Use imputers to fill the remaining missing values
        feats[self.cont_impute_cols] = self.num_imputer.transform(feats[self.cont_impute_cols])
        feats[self.cat_impute_cols] = self.cat_imputer.transform(feats[self.cat_impute_cols])

        # Mean encodings; categories not seen during fit get the overall target mean
        for col in self._as_list(self.mean_enc_cols):
            encoded = feats[col].map(self.mean_enc_dict[col])
            feats['ME_' + col] = encoded.fillna(self.target_mean)

        return feats

    def fit(self, X, y):
        """
        Learn all preprocessing parameters from the training rows.

        `y` must be row-aligned with `X` by position (array or Series); its
        index is ignored. Returns self.
        """
        y_values = np.asarray(y)
        has_garage_year = X['GarageYrBlt'].notna().to_numpy()

        # Regress the target on GarageYrBlt and invert the line at the mean target of
        # the rows without a garage year: the fill value is the year whose predicted
        # target equals that mean.
        self.reg = LinearRegression().fit(X.loc[has_garage_year, ['GarageYrBlt']],
                                          y_values[has_garage_year])
        if (~has_garage_year).any():
            self.avg_houseprice_nogarage = np.mean(y_values[~has_garage_year])
        else:
            self.avg_houseprice_nogarage = np.nan
        slope = self.reg.coef_[0]
        if np.isfinite(self.avg_houseprice_nogarage) and slope != 0:
            self.garage_yearbuilt_impute = (self.avg_houseprice_nogarage - self.reg.intercept_) / slope
        else:
            # No missing rows to learn from (or a flat fit): fall back to the observed median
            self.garage_yearbuilt_impute = X['GarageYrBlt'].median()

        # Fit imputers for the remaining missing values
        self.num_imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(X[self.cont_impute_cols])
        self.cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit(X[self.cat_impute_cols])

        # Mean encode based on the current split, grouping the target by position so that
        # X is not modified and no global `target` name is needed
        target_by_row = pd.Series(y_values)
        self.mean_enc_dict = {
            col: target_by_row.groupby(X[col].to_numpy()).mean()
            for col in self._as_list(self.mean_enc_cols)
        }

        # Fallback for categories that do not occur in this split
        self.target_mean = np.mean(y_values)

        # Fit the scaler on the same imputed / encoded representation that transform() scales
        train_feats = self._build_features(X)
        self.standardscaler = preprocessing.StandardScaler().fit(train_feats[self._as_list(self.keep_features)])

        return self

    def transform(self, X):
        """Apply the fitted steps to X and return the scaled `keep_features` as a new DataFrame."""
        keep = self._as_list(self.keep_features)
        feats = self._build_features(X)
        scaled = self.standardscaler.transform(feats[keep])
        return pd.DataFrame(np.asarray(scaled), columns=keep, index=X.index)

    def fit_transform(self, X, y):
        """Fit on (X, y) and return the transformed X."""
        self.fit(X, y)
        return self.transform(X)
    
def replace_ordinal_values(X):
    """
    Recode columns whose stored type does not match their meaning.

    Ordered string categories are replaced by integer ranks and the numeric
    MSSubClass codes by letters. The replacement is done in place on X, and
    the same object is returned.
    """
    # Rating scales shared by several columns
    quality = {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0}
    quality_or_absent = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Not_present": 0}
    basement_finish = {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1, "Not_present": 0}

    # MSSubClass codes, lettered A..P in ascending order
    subclass_codes = (20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190)
    subclass_letters = dict(zip(subclass_codes, "ABCDEFGHIJKLMNOP"))

    recodings = {
        # Categorical to numerical
        "LotShape": {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3},
        "LandSlope": {"Gtl": 2, "Mod": 1, "Sev": 0},
        "ExterQual": quality,
        "ExterCond": quality,
        "BsmtQual": quality_or_absent,
        "BsmtCond": quality_or_absent,
        "BsmtExposure": {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, "Not_present": 0},
        "BsmtFinType1": basement_finish,
        "BsmtFinType2": basement_finish,
        "HeatingQC": quality,
        "CentralAir": {"Y": 1, "N": 0},
        "KitchenQual": quality,
        "Functional": {"Typ": 7, "Min1": 6, "Min2": 5, "Mod": 4, "Maj1": 3, "Maj2": 2, "Sev": 1, "Sal": 0},
        "FireplaceQu": quality_or_absent,
        "GarageFinish": {"Fin": 3, "RFn": 2, "Unf": 1, "Not_present": 0},
        "GarageQual": quality_or_absent,
        "GarageCond": quality_or_absent,
        "PavedDrive": {"Y": 2, "P": 1, "N": 0},
        "PoolQC": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Not_present": 0},
        "Fence": {"GdPrv": 4, "MnPrv": 3, "GdWo": 2, "MnWw": 1, "Not_present": 0},
        # Numerical to categorical
        "MSSubClass": subclass_letters,
    }
    X.replace(recodings, inplace=True)

    return X

In [7]:
# Separate the log-price target from the predictors, then hold out part of the rows as a test set
y = data['LogSalePrice']
X = data.drop(columns=['SalePrice', 'LogSalePrice'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=testset_size, random_state=dev_seed
)

tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(1095, 365, 1095, 365)

In [15]:
# NOTE(review): `scale_cont_cols`, `mean_enc_cols` and `keep_features` are not defined in this
# notebook; presumably they are provided by `from settings import *` — confirm before running.

# X_train / X_test are slices of `data`, which must already have been through
# standard_preprocessing_function (it is the only code that creates the 'LogSalePrice'
# column read when splitting), and their 'SalePrice' column was dropped at the split, so the
# standard preprocessing is not repeated on them. Only the hold-out frame loaded from
# test.csv still needs it.
# NOTE(review): standard_preprocessing_function also removes outlier rows — confirm that this
# is wanted for the hold-out frame.
validation = standard_preprocessing_function(
    validation,
    new_session=False,
    NA_means_not_there_cols=NA_means_not_there_cols,
    scale_cont_cols=scale_cont_cols,
)

# Arguments are passed by keyword so none of them can be bound to the wrong parameter
prepper = leakage_preventive_preprocessing_function(
    cont_impute_cols=cont_impute_cols,
    cat_impute_cols=cat_impute_cols,
    mean_enc_cols=mean_enc_cols,
    keep_features=keep_features,
)

# Fit on the training rows only, then apply the fitted steps to the other frames
X_train = prepper.fit_transform(X_train, y_train)
X_test = prepper.transform(X_test)
validation = prepper.transform(validation)

In [26]:
# Recode the ordinal / nominal columns of `data` with the shared helper instead of keeping a
# second copy of the mapping table in this cell. The helper replaces in place and returns
# the same frame.
data = replace_ordinal_values(data)

In [None]:
# Write both frames to CSV in the working directory; index=False leaves the row index out of the files
data.to_csv('prep_train.csv', index=False)
validation.to_csv('prep_test.csv', index=False)