## Project: Predicting house sale prices
## Part 2: Test sample scoring and final model selection
**Data:** housing data for the city of Ames, Iowa, USA, 2006 to 2010  
**Data description:** https://s3.amazonaws.com/dq-content/307/data_description.txt  
**Source:** https://www.tandfonline.com/doi/abs/10.1080/10691898.2011.11889627  
**Source pdf:** https://www.tandfonline.com/doi/pdf/10.1080/10691898.2011.11889627?needAccess=true


In [1]:
# Set up the autoreload extension for faster debugging
# (automatically reloads code changes made in the project subpackages)
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [2]:
# Make the parent of the current working directory (the main project
# directory) importable, so the project packages can be imported below.
import sys
import os

# os.path.realpath('') resolves to the current working directory, so
# `parent` is the directory one level above it.
parent = os.path.dirname(os.path.realpath(''))

# Register the parent directory on sys.path only once: this cell may be
# re-run many times in one session and every plain append would add a
# duplicate entry.
if parent not in sys.path:
    sys.path.append(parent)

# From here on, modules located in the parent directory can be imported.

In [3]:
# Project packages import
import gp23package.data.make_dataset as gp23md
import gp23package.explore_visualise.eda as gp23eda
import gp23package.features.build_features as gp23feat
import gp23package.models.hyperparameters_model as gp23hyperparam
import gp23package.models.train_model as gp23train
# Pylance may highlight these package imports as unresolved (nothing to worry about)
# https://github.com/microsoft/pylance-release/blob/main/TROUBLESHOOTING.md#unresolved-import-warnings

# Standard Python libraries import
from IPython.display import display, HTML #  tidied-up display
from time import time #  project timer
from itertools import chain # for list iterations

# plots
import matplotlib.pyplot as plt
import seaborn as sn

# Statistics
from scipy import stats
from scipy.stats import mstats

# Sklearn
from sklearn.feature_selection import (SelectKBest, chi2, f_regression, RFE, mutual_info_regression,
                                      SequentialFeatureSelector, SelectFromModel)
from sklearn.preprocessing import (normalize, MinMaxScaler)
from sklearn.metrics import (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score)
from sklearn.model_selection import (train_test_split, cross_val_score, KFold, GridSearchCV)
from sklearn.linear_model import (LinearRegression, Ridge, ElasticNet, Lasso, TheilSenRegressor, RANSACRegressor,
                                  HuberRegressor , SGDRegressor, Lars, ElasticNet, RidgeCV)

#statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.compat import lzip
import statsmodels as sm
import statsmodels.stats.stattools as smt
import statsmodels.stats.diagnostic as smd

# Other
from dython import nominal # Correlations between categorical variables

# Necessary packages
import gp23package
import numpy as np
import pandas as pd
import pickle # dump models

#turning on plot display in JN
%matplotlib inline 
# Pandas display limits: how many columns / rows are printed and how wide a
# single cell may be before its content is truncated.
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 250)

### 8. Importing models & data transformations

In [5]:
# Loading saved models and transformation objects.
# Paths are assembled from separate components so they resolve on every
# operating system; a hard-coded 'data\processed' string only works on Windows
# and contains an invalid string escape sequence.
# NOTE(security): pickle.load can execute arbitrary code while loading, so
# only files coming from a trusted source must be read here.
def _load_pickle(*path_parts):
    """Return the object unpickled from a file below the project directory.

    Args:
        *path_parts: Path components relative to `parent`, the last one
            being the file name.

    Returns:
        The object stored in the pickle file.
    """
    with open(os.path.join(parent, *path_parts), 'rb') as file_open:
        return pickle.load(file_open)


test_model_list = _load_pickle('models', 'model_list.sav')
test_boxCox_dict = _load_pickle('data', 'processed', 'boxCox_dict.sav')
test_data_optbin = _load_pickle('data', 'processed', 'data_optbin.sav')
# NOTE(review): the content of var_dict.sav is bound to `scoring_dict` here -
# confirm that the following cells read it under this name.
scoring_dict = _load_pickle('data', 'processed', 'var_dict.sav')

# Loading test datasets
X_test = pd.read_csv(os.path.join(parent, 'data', 'interim', 'X_test.csv'))
y_test = pd.read_csv(os.path.join(parent, 'data', 'interim', 'y_test.csv'))


### 9. Scoring new data - test dataset

In [8]:
# MANUAL_INPUT
# Scoring a new vector of data.
# For this we need the list of variables used by the final models. If there is
# more than one final model, the variable lists of all models are concatenated.
# Attention: for scoring we need the original variable lists with exactly the
# same order of variables, hence every de-duplication below keeps the order of
# first appearance (dict.fromkeys) instead of going through an unordered set.
# In our current case: var_dict["BKWD_20"] and var_dict["FWD_10"].
# NOTE(review): var_dict, chosen_bestModels and discrete are not created in
# the cells visible above (var_dict.sav is loaded under the name
# `scoring_dict`), and the recorded run of this cell stopped with
# KeyError: 'variable_set' - confirm where these objects come from and which
# key of chosen_bestModels holds the variable set names.

scoring_vars = list(dict.fromkeys(var_dict["BKWD_20"] + var_dict["FWD_10"]))


scoring_varsRaw = []
# List of prefixes differentiating engineered variables from original variables
prefixes = ["WOE_", "Box_"]

# Taking all variable sets used by the final models
for var in chosen_bestModels["variable_set"].unique():
    # For each variable in the variable set
    for i in var_dict[var]:
        # Removing every known prefix to get back to the raw variable name
        for prefix in prefixes:
            i = i.removeprefix(prefix)
        # Adding the raw variable name to the scoring variables (with more
        # than one variable list there can be duplicates)
        scoring_varsRaw.append(i)

# Deduplicating variable names while keeping the order of first appearance
scoring_varsRaw = list(dict.fromkeys(scoring_varsRaw))
print('*** raw scoring_vars ***')
print(scoring_varsRaw,'\n')

# Remark: in this project we divided data to train_valid / test samples after preliminary cleaning. In real-world cases
# we will not have access to incoming data, hence all the preliminary data cleaning would have to be done on new data
# as well.

# Limiting number of input variables. An explicit copy is taken because new
# (engineered) columns are added to X_test afterwards; assigning columns to a
# selection of another frame can trigger pandas' SettingWithCopyWarning.
X_test = X_test[scoring_varsRaw].copy()

# Transforming y metric
# We are not winsorizing the test data outcome variable as this would lead to false fit metrics
y2_test = np.log(y_test)

# Raw scoring variables that are also listed in `discrete`
# (kept in the order of scoring_varsRaw)
_discrete_lookup = set(discrete)
scoringFinal_vars = [name for name in scoring_varsRaw if name in _discrete_lookup]
print('*** discrete variables intersection ***')
print(scoringFinal_vars,'\n')
print('*** Variables used for scoring ***')
print(scoring_vars,'\n')

KeyError: 'variable_set'

In [None]:
# Transforming input variables

# WoE transformation: applied to every variable that appears both in
# `categorical` and among the raw scoring variables. var_transform and
# data_optbin come from outside this cell.
woe_inputs = set(categorical).intersection(scoring_varsRaw)
for cat_name in woe_inputs:
    X_test["WOE_" + cat_name] = var_transform(
        data=X_test,
        var_name=cat_name,
        optbin_dict=data_optbin,
    )

# Box-Cox power transformation: applied to every variable that appears both in
# `continuous` and among the raw scoring variables. The lambda of each variable
# is read from boxCox_dict instead of being re-estimated on the test data
# (per the original note, those lambdas were calculated on the train_valid
# dataset, which prevents information leakage between samples).
boxcox_inputs = set(continuous).intersection(scoring_varsRaw)
for cont_name in boxcox_inputs:
    shifted_values = X_test[cont_name] + 1
    X_test["Box_" + cont_name] = stats.boxcox(x=shifted_values, lmbda=boxCox_dict[cont_name])