# Business understanding and set-up

This notebook provides a simplified way to **load, inspect, evaluate and save existing datasets and models** that may be used for the interactive web app.

## Libraries and dashboard

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import joblib
import os, glob
import math

# Evaluation Metrics
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer, fbeta_score, accuracy_score, confusion_matrix, f1_score, precision_recall_curve, recall_score, precision_score, roc_auc_score
from scipy.sparse import csr_matrix
import scipy.stats as stats

In [2]:
# Dashboard
dataset_loc = "paris"   # "berlin", "paris", "amsterdam", "barcelona"
dataset_date = "2020-03-16"  # berlin: "2020-03-17", paris: "2020-03-16", amsterdam: "2020-03-14", barcelona: "2020-03-16", 
model_run = "2020-08-26"     # date of dataset/model creation (determines subfolder for saves of datasets/models)

pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)
pd.options.display.max_seq_items = 300

As mentioned further above, it is necessary to **explicitly declare target** as the notebook was previously set-up to enable analysis and modeling on varying target features.

## Global functions and variables

In [3]:
# "save_load": Function for saving and loading datasets/models (joblib)
def save_load(data=False, title="unknown", file_format="pkl", function="save", dataset_loc=dataset_loc, dataset_date=dataset_date, model_run=model_run):
    if function=="save":
        if file_format=="pkl":
            joblib.dump(data, f"data/{dataset_loc}_{dataset_date}/{model_run}/{title}.pkl")
        elif file_format=="app":
            joblib.dump(data, f"data/{dataset_loc}_{dataset_date}/{title}.pkl")
        else:
            print("Please enter a valid file_format (default is pkl; 'app' for data used in web app).")
    elif function=="load":
        if file_format=="pkl":
            return joblib.load(f"data/{dataset_loc}_{dataset_date}/{model_run}/{title}.pkl")
        elif file_format=="csv":
            return pd.read_csv(f"data/{dataset_loc}_{dataset_date}/{title}.csv")
        elif file_format=="csv.gz":
            return pd.read_csv(f"data/{dataset_loc}_{dataset_date}/{title}.csv.gz")
        elif file_format=="geojson":
            return pd.read_csv(f"data/{dataset_loc}_{dataset_date}/{title}.geojson")
        else:
            print("Please enter a valid file_format (default is pkl).")

In [4]:
# "model_eval": Function for final evaluation of "best model"
def model_eval(y, y_pred, model="reg"):
    """
    Please always specify the type of model:
    Regression: model="reg"
    Binary Classification: model="bclf"
    Multiclass Classification: model="clf"
    """
    if model=="reg":
        print("MSE: {:.2f}".format(mean_squared_error(y, y_pred)))
        print("RMSE: {:.2f}".format(
        mean_squared_error(y, y_pred, squared=False)))
        print("MAE: {:.2f}".format(mean_absolute_error(y, y_pred)))
        print("R2: {:.2f}".format(r2_score(y, y_pred)))
        print("MAPE: {:.2f}".format(mean_absolute_percentage_error(y, y_pred)))
        print("MAPE median: {:.2f}".format(median_absolute_percentage_error(y, y_pred)))

    elif model=="bclf":
        print("Accuracy: {:.2f}".format(accuracy_score(y, y_pred)))
        print("Recall: {:.2f}".format(recall_score(y, y_pred)))
        print("Precision: {:.2f}".format(precision_score(y, y_pred)))
        print("F1 Score: {:.2f}".format(f1_score(y, y_pred)))
        print("ROC/AUC: {:.2f}".format(roc_auc_score(y, y_pred)))
        print("Confusion Matrix: \n" + str(confusion_matrix(y, y_pred)))

    elif model=="clf":
        print("Accuracy: {:.2f}".format(accuracy_score(y, y_pred)))
        print("Recall: {:.2f}".format(recall_score(y, y_pred, average='weighted')))
        print("Precision: {:.2f}".format(precision_score(y, y_pred, average='weighted')))
        print("F1 Score: {:.2f}".format(f1_score(y, y_pred, average='weighted')))
        print("Confusion Matrix: \n" + str(confusion_matrix(y, y_pred)))
    
    else:
        print("Please revise your parameters (e.g. provide a valid model).")

In [5]:
# "mean_absolute_percentage_error": Function for mean absolute percentage error (MAPE)
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [6]:
# "median_absolute_percentage_error": Function for median absolute percentage error (MAPE median)
def median_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.median(np.abs((y_true - y_pred) / y_true)) * 100

In [7]:
# "get_feat_importances": Function for retrieving feature importances
def get_feat_importances(model, column_names):
    model=model
    feat_importances = pd.DataFrame(model.feature_importances_,
                 columns=['weight'],
                 index=column_names)
    feat_importances.sort_values('weight', inplace=True, ascending=False)
    return feat_importances

## Inspect datasets

**Import raw dataset**

In [8]:
# Import raw dataset as DataFrame (as csv-file)
data_raw = save_load(title="listings", file_format="csv.gz", function="load")

  if (await self.run_code(code, result,  async_=asy)):


In [9]:
# Assign data_raw to data (in order to always keep a freshly imported data_raw) and set id as index
data = data_raw.copy()
data.set_index('id', inplace=True)

**Import data_clean or data_engineered**

In [10]:
# Import data_engineered
data_engineered = save_load(title="data_engineered", function="load")

**Import X_test and preprocessor**

In [11]:
X_test = save_load(title="X_test", function="load")
preprocessor = save_load(title="preprocessor", function="load")
X_test_prep = preprocessor.transform(X_test)

**Import existing model**

In [12]:
# Load existing model
#best_model = save_load(title="best_model_xgb_reg", function="load")
#best_cv = save_load(title="best_cv_xgb_reg", function="load")

**Inspect datasets**

In [26]:
list(set(data_engineered.zipcode))

['zip_75012',
 'zip_75019',
 'zip_75013',
 'zip_75005',
 'zip_75016',
 'zip_75006',
 'zip_75015',
 'zip_75116',
 'zip_75001',
 'zip_75003',
 'zip_75017',
 'zip_75002',
 'zip_75011',
 'zip_75009',
 'zip_75018',
 'zip_75020',
 'zip_75004',
 'zip_other',
 'zip_75010',
 'zip_75014',
 'zip_75008',
 'zip_75007']

# Model selection for web application

**data_engineered**

In [14]:
# Load or assign data_engineered
#APP_data_engineered = save_load(title="data_engineered", function="load", dataset_loc=dataset_loc, dataset_date=dataset_date, model_run=model_run)
#APP_data_engineered = data_engineered.copy()

In [15]:
# Save best model and cv
#save_load(APP_data_engineered, title="APP_data_engineered", file_format="app", function="save")

**Best model**

In [16]:
# Load or assign best model
#APP_best_model = save_load(title="best_model_xgb_reg", function="load", dataset_loc=dataset_loc, dataset_date=dataset_date, model_run=model_run)
#APP_best_model = best_model_xgb_reg

In [17]:
# Save best model and cv
#save_load(APP_best_model, title="APP_best_model", file_format="app", function="save")

**Preprocessor**

In [18]:
# Load or assign preprocessor
#APP_preprocessor = save_load(title="preprocessor", function="load", dataset_loc=dataset_loc, dataset_date=dataset_date, model_run=model_run)
#APP_preprocessor = X_train_prep_preprocessor

In [19]:
# Save preprocessor
#save_load(APP_preprocessor, title="APP_preprocessor", file_format="app", function="save")

**X_test**

In [20]:
# Load or assign X_test
#APP_X_test = save_load(title="X_test", function="load", dataset_loc=dataset_loc, dataset_date=dataset_date, model_run=model_run)
#APP_X_test = X_test

In [21]:
# Save X_test
#save_load(APP_X_test, title="APP_X_test", file_format="app", function="save")

**MAPE_median**

In [22]:
# Load or assign MAPE_median
#APP_MAPE_median = save_load(title="MAPE_median_xgb_reg", function="load", dataset_loc=dataset_loc, dataset_date=dataset_date, model_run=model_run)
#APP_MAPE_median = MAPE_median_xgb_reg

In [23]:
# Save MAPE_median
#save_load(APP_MAPE_median, title="APP_MAPE_median", file_format="app", function="save")

**Zipcode**

In [27]:
# Load or assign zipcode
#APP_zipcode = save_load(title="MAPE_median_xgb_reg", function="load", dataset_loc=dataset_loc, dataset_date=dataset_date, model_run=model_run)
#APP_zipcode = list(set(data_engineered.zipcode))
#APP_zipcode.sort()

In [29]:
# Save zipcode
#save_load(APP_zipcode, title="APP_zipcode", file_format="app", function="save")