In [41]:
# Import libraries

# General
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, make_pipeline # Same, but with the latter it is not necessary to name estimator and transformer
#from imblearn.pipeline import Pipeline as Imb_Pipe
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN

# Feature Selection
from sklearn.feature_selection import SelectKBest, chi2, f_classif, GenericUnivariateSelect, mutual_info_classif
import eli5

# Predictive Modeling (Models)
from sklearn.dummy import DummyClassifier, DummyRegressor
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_predict, cross_val_score, cross_validate, KFold
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, NuSVC, SVR
from sklearn.linear_model import LinearRegression, LogisticRegression, PassiveAggressiveRegressor, ElasticNet, SGDRegressor, RANSACRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor, VotingClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, IsolationForest
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from xgboost import XGBClassifier, XGBRegressor
from scipy.stats import randint

# Evaluation Metrics
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer, fbeta_score, accuracy_score, confusion_matrix, f1_score, precision_recall_curve, recall_score, precision_score
from scipy.sparse import csr_matrix

# Neural Networks
from keras import models, layers, optimizers, regularizers
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG

In [2]:
# Import data_engineered
data = pd.read_pickle("saves/data_engineered.pkl")

In [3]:
# Alternative: Import from csv
#data_types_engineered = pd.read_csv('saves/types_engineered.csv')['types']
#data = pd.read_csv("saves/data_engineered.csv", dtype=data_types_engineered.to_dict())
#data.set_index('id', inplace=True)

In [4]:
# Dashboard
target = 'price_log' # for regression: 'occupancy_rate', 'price_log' | for classification: 'occupancy_class'
drop_cols = ['occupancy_rate'] # additional columns to drop: 'occupancy_class', 'occupancy_rate'
scoring = 'r2' # for regression: 'neg_mean_squared_error', 'r2', 'neg_mean_poisson_deviance', 'neg_median_absolute_error' | for classification: "f1", "recall", "precision", "accuracy", "roc_auc"
test_size = 0.2
random_state = 42

# Preprocessing (Train/Test Split and Pipeline)

In [5]:
# Drop columns
data = data.drop(drop_cols, axis=1)

## Preprocessing pipeline

In [6]:
# Create list for categorical predictors/features (used in "Scaling with Preprocessing Pipeline") 
cat_features = list(data.columns[data.dtypes==object])
#cat_features.remove("neighbourhood")
#cat_features.remove("zipcode")
cat_features

['cancellation_policy',
 'neighbourhood',
 'property_type',
 'room_type',
 'zipcode']

In [7]:
# Create list for numerical predictors/features (removing target column, used in "Scaling with Preprocessing Pipeline")
num_features = list(data.columns[data.dtypes!=object])
num_features.remove(target)
num_features

['accommodates',
 'accommodates_per_bed',
 'am_balcony',
 'am_breakfast',
 'am_child_friendly',
 'am_elevator',
 'am_essentials',
 'am_nature_and_views',
 'am_pets_allowed',
 'am_private_entrance',
 'am_smoking_allowed',
 'am_tv',
 'am_white_goods',
 'availability_90',
 'bathrooms_log',
 'bedrooms',
 'calculated_host_listings_count',
 'first_review_days',
 'host_is_superhost',
 'instant_bookable',
 'last_review_days',
 'latitude',
 'longitude',
 'maximum_nights',
 'minimum_nights_log',
 'occupancy_class',
 'price_extra_fees_sqrt',
 'price_extra_people',
 'review_scores_rating_sqrt',
 'text_len_sqrt',
 'wk_mth_discount']

In [8]:
# Build preprocessor pipeline
# Pipeline for numerical features
num_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

# Pipeline for categorical features 
cat_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
    ('1hot', OneHotEncoder(drop='first', handle_unknown='error'))
])

# Complete pipeline
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [9]:
# Function for getting column names after preprocessing
def get_column_names_from_ColumnTransformer(column_transformer):    
    col_name = []
    for transformer_in_columns in column_transformer.transformers_[:-1]:#the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names()
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names    
        elif isinstance(names,str):
            col_name.append(names)
    return col_name

## Train/test split

In [10]:
# Define predictors and target variable
X = data.drop([target], axis=1)
y = data[target]

In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size,
                                                        random_state=random_state,
                                                        shuffle=True)
#                                                   stratify=y) # Use stratify=y if labels are inbalanced (e.g. most wines are 5 or 6; check with value_counts()!)

In [12]:
# Saving preprocessed X_train and X_test
X_train_prep_preprocessor = preprocessor.fit(X_train)
X_train_prep_cols = get_column_names_from_ColumnTransformer(X_train_prep_preprocessor)

X_train_prep = X_train_prep_preprocessor.transform(X_train)
X_train_num_prep = num_pipeline.fit_transform(X_train[num_features])
X_test_prep = X_train_prep_preprocessor.transform(X_test)

## Outlier Detection

In [13]:
# Preprocess data
#train_outl = num_pipeline.fit_transform(X_train[num_features], y_train)

In [14]:
# Fit DBSCAN model
#outl_model = DBSCAN(eps=3.0, min_samples=10).fit(train_outl)
#outl_labels = outl_model.labels_

In [15]:
# Display results (# of outliers)
#pd.Series(outl_labels).value_counts()

In [16]:
# Illustrate results
#plt.figure(figsize=(10,10))
#
#unique_labels = set(outl_labels)
#colors = ['blue', 'red']
#
#for color,label in zip(colors, unique_labels):
#    sample_mask = [True if l == label else False for l in outl_labels]
#    plt.plot(train_outl[:,0][sample_mask], train_outl[:, 1][sample_mask], 'o', color=color);
#plt.xlabel('accommodates_per_bed');
#plt.ylabel('accommodates_per_room');

Interpreting the results:

- https://www.kaggle.com/kevinarvai/outlier-detection-practice-uni-multivariate
- https://datascience.stackexchange.com/questions/46092/how-do-we-interpret-the-outputs-of-dbscan-clustering

## Perform Feature Selection (add most useful to modeling pipeline)

In [17]:
# Set X_fs to desired variable
X_fs = X_train[num_features]    # X_train_prep, X_train_num_prep, X_train[num_features]
#X_fs = pd.DataFrame(X_fs, columns = X_train_prep_cols)

**GenericUnivariateSelect** (Classification and Regression)

In [18]:
# Apply GenericUnivariateSelect
trans_GUS = GenericUnivariateSelect(score_func=lambda X, y: X.mean(axis=0), mode='k_best', param=15) #mode='percentile', 'k_best'
X_train_GUS = trans_GUS.fit_transform(X_fs, y_train)

**mutual_info_classif** (Classification)

In [19]:
# Fit mutual_info_classif
#X_train_mic = mutual_info_classif(X_fs, y_train)

In [20]:
# Plot feature importance
#plt.subplots(1, figsize=(26, 1))
#sns.heatmap(X_train_mic[:, np.newaxis].T, cmap='Blues', cbar=False, linewidths=1, annot=True)
#plt.yticks([], [])
#plt.gca().set_xticklabels(X_fs.columns, rotation=45, ha='right', fontsize=12)
#plt.suptitle("Feature Importance (mutual_info_classif)", fontsize=18, y=1.2)
#plt.gcf().subplots_adjust(wspace=0.2)
#pass

In [21]:
# Apply GenericUnivariateSelect to reduce features (optional)
#trans_mic = GenericUnivariateSelect(score_func=mutual_info_classif, mode='k_best', param=15) #mode='percentile', 'k_best', 
#X_train_mic_GUS = trans_mic.fit_transform(X_fs, y_train)

In [22]:
# Print kept features
#print("We started with {0} features but retained only {1} of them!".format(
#    X_fs.shape[1] - 1, X_train_mic_GUS.shape[1]))

#columns_retained_Select = X_fs.columns[trans_mic.get_support()].values
#pd.DataFrame(X_train_mic_GUS, columns=columns_retained_Select).head()

**chi2** (Classification)

**mutual_info_regression** (Regression)

# Predictive Modeling: Regression ("price_log")

## Apply Regression Models

In [23]:
# Print current setting for TARGET
target_upper = target.upper()
y_upper = y_train.name.upper()
print("You are currently using " + f"\033[1m{target_upper}\033[0m" + " as the target")
print("The target variable y is currently set to " + f"\033[1m{y_upper}\033[0m")

You are currently using [1mPRICE_LOG[0m as the target
The target variable y is currently set to [1mPRICE_LOG[0m


In [42]:
# Select models for comparison
models={'Baseline': DummyRegressor(strategy='mean'),
        'LinReg': LinearRegression(),
        'Passive Aggressive' : PassiveAggressiveRegressor(),
#        'RANSAC' : RANSACRegressor(),
        'ElasticNet' : ElasticNet(),
        'Stochastic Gradient Descent': SGDRegressor(max_iter=1000, tol=1e-3),
        'Decision Tree': DecisionTreeRegressor(criterion="mse", max_depth=3,random_state=random_state),
        'Random Forest': RandomForestRegressor(random_state=random_state, max_features='sqrt', n_jobs=-1),
        'Gradient Boost': GradientBoostingRegressor(random_state=random_state),
        'XGBoost': XGBRegressor(),
        'AdaBoost': AdaBoostRegressor(random_state=random_state),
        'SVR': SVR()
       }

In [43]:
# Calculate and display results
results = pd.DataFrame(columns=['Model', 'MSE', 'RMSE', 'R2'])
i = 0
for m in models.items():
    # Building a full pipeline with our preprocessor and a Classifier
    pipe = Pipeline([('preprocessor', preprocessor), (m[0], m[1])])
    # Making predictions on the training set using cross validation as well as calculating the probabilities
    y_train_pred = cross_val_predict(pipe,
                                     X_train,
                                     y_train.values.ravel(),
                                     cv=5,
                                     verbose=4,
                                     n_jobs=-1)
    # Calculating metrices
    temp = pd.DataFrame(
        {
            'Model': m[0],
            'MSE': "{:.2f}".format(mean_squared_error(y_train, y_train_pred)),
            'RMSE': "{:.2f}".format(mean_squared_error(
                y_train, y_train_pred, squared=False)),
            'MAE': "{:.2f}".format(mean_absolute_error(y_train, y_train_pred)),
            'R2': "{:.2f}".format(r2_score(y_train, y_train_pred))
        },
        index=[i])
    i += 1
    results = pd.concat([results, temp])
results

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.4s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.6s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_

Unnamed: 0,Model,MSE,RMSE,R2,MAE
0,Baseline,0.33,0.58,-0.0,0.46
1,LinReg,0.14,0.38,0.57,0.29
2,Passive Aggressive,0.24,0.49,0.27,0.38
3,ElasticNet,0.33,0.58,-0.0,0.46
4,Stochastic Gradient Descent,0.25,0.5,0.26,0.38
5,Decision Tree,0.2,0.45,0.4,0.35
6,Random Forest,0.13,0.37,0.6,0.28
7,Gradient Boost,0.13,0.36,0.61,0.27
8,XGBoost,0.13,0.36,0.62,0.27
9,AdaBoost,0.18,0.43,0.45,0.33


## Reg Model 1: XGBoost

In [26]:
# Print current setting for TARGET
target_upper = target.upper()
y_upper = y_train.name.upper()
print("You are currently using " + f"\033[1m{target_upper}\033[0m" + " as the target")
print("The target variable y is currently set to " + f"\033[1m{y_upper}\033[0m")

You are currently using [1mPRICE_LOG[0m as the target
The target variable y is currently set to [1mPRICE_LOG[0m


In [27]:
# Create pipeline to use in RandomSearchCV and GridSearchCV
pipeline_xgb_reg = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_reg', XGBRegressor(n_estimators=110,
                              random_state=random_state,
                              max_depth=5,
                              max_features=20,
                              scoring=scoring,
                              n_jobs=-1))
])

### Hyperparameter Pre-Tuning with RandomizedSearchCV

In [28]:
# Display possible hyperparameters for XGBoost Regressor
test_xgb_reg = XGBRegressor()
test_xgb_reg.get_params().keys()

dict_keys(['base_score', 'booster', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'gamma', 'importance_type', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'n_estimators', 'n_jobs', 'nthread', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'seed', 'silent', 'subsample', 'verbosity'])

**Default values for XGBRegressor** (as base for hyperparameter search):

max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, silent=None, objective='reg:linear', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, importance_type='gain'

In [29]:
# Define hyperparameter distribution
param_distribs_xgb_reg = {
    'xgb_reg__n_estimators': randint(low=10, high=200),
    'xgb_reg__max_depth': randint(low=1, high=10),
    'xgb_reg__learning_rate': [0.05, 0.1, 0.2, 0.3]
}

In [30]:
# Create and fit RandomizedSearchCV, save "best_model"
rnd_xgb_reg = RandomizedSearchCV(pipeline_xgb_reg, param_distribs_xgb_reg, cv=5, n_iter=20, 
                           return_train_score=True, verbose=4, n_jobs=-1, random_state=random_state)

best_model_rnd_xgb_reg = rnd_xgb_reg.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.6min finished




In [31]:
# Display best_score_, best_params_ and best_estimator_
print('Best score:\n{:.2f}'.format(rnd_xgb_reg.best_score_))
print("Best parameters:\n{}".format(rnd_xgb_reg.best_params_))
#print("Best estimator:\n{}".format(grid_rf_reg.best_estimator_))

Best score:
0.64
Best parameters:
{'xgb_reg__learning_rate': 0.05, 'xgb_reg__max_depth': 8, 'xgb_reg__n_estimators': 184}


**Determine Feature Importance**

In [32]:
# Fit XGB pipeline
pipeline_xgb_reg_fi = pipeline_xgb_reg.fit(X_train, y_train)



In [33]:
# Get feature names from pipeline
onehot_columns = list(
    pipeline_xgb_reg_fi.named_steps['preprocessor'].named_transformers_['cat'].
    named_steps['1hot'].get_feature_names(input_features=cat_features))
features_prep_list = list(num_features)
features_prep_list.extend(onehot_columns)

In [34]:
# Determine and print feature importance
eli5.explain_weights(pipeline_xgb_reg_fi.named_steps['xgb_reg'], top=50, feature_names=features_prep_list)

Weight,Feature
0.4021,room_type_Private room
0.0434,bedrooms
0.0344,room_type_Shared room
0.0275,zipcode_zip_13359
0.0258,neighbourhood_Mitte
0.0187,property_type_Boutique hotel
0.0175,bathrooms_log
0.0175,accommodates
0.0161,am_tv
0.0142,neighbourhood_Wedding


### Hyperparameter Tuning with GridSearchCV

In [35]:
# Define hyperparameter grid
param_grid_xgb_reg = [
    {'xgb_reg__n_estimators': [190, 230, 290], 'xgb_reg__max_features': [45, 60, 75]},
    {'xgb_reg__bootstrap': [False], 'xgb_reg__n_estimators': [190, 230, 290], 'xgb_reg__max_features': [45, 60, 75]},
]

In [36]:
# Create and fit GridSearchCV, save "best_model"
grid_xgb_reg = GridSearchCV(pipeline_xgb_reg, param_grid_xgb_reg, cv=5, 
                           return_train_score=True, verbose=4, n_jobs=-1)

grid_xgb_reg.fit(X_train, y_train)
best_model_xgb_reg = grid_xgb_reg.best_estimator_

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  4.1min finished




In [37]:
# Display best_score_, best_params_ and best_estimator_
print('Best score:\n{:.2f}'.format(grid_xgb_reg.best_score_))
print("Best parameters:\n{}".format(grid_xgb_reg.best_params_))
#print("Best estimator:\n{}".format(grid_rf_reg.best_estimator_))

Best score:
0.64
Best parameters:
{'xgb_reg__max_features': 45, 'xgb_reg__n_estimators': 290}


**Detailed evaluation with training set**

In [38]:
# Predict target with "best model"
y_train_pred_xgb_reg = best_model_xgb_reg.predict(X_train_prep)



ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
# Final evaluation of "best model"
print("MSE: {:.2f}".format(mean_squared_error(y_test, y_train_pred_xgb_reg))),
print("RMSE: {:.2f}".format(mean_squared_error(y_test, y_train_pred_xgb_reg, squared=False))),
print("MAE: {:.2f}".format(mean_absolute_error(y_test, y_train_pred_xgb_reg))),
print("R2: {:.2f}".format(r2_score(y_test, y_train_pred_xgb_reg))),

In [None]:
# Display confidence interval (scipy stats)
confidence = 0.95
squared_errors = (y_train_pred_xgb_reg - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(), 
                          scale=stats.sem(squared_errors)))

## Reg Model 2: Supply Vector Machine

In [45]:
# Print current setting for TARGET
target_upper = target.upper()
y_upper = y_train.name.upper()
print("You are currently using " + f"\033[1m{target_upper}\033[0m" + " as the target")
print("The target variable y is currently set to " + f"\033[1m{y_upper}\033[0m")

You are currently using [1mPRICE_LOG[0m as the target
The target variable y is currently set to [1mPRICE_LOG[0m


In [47]:
# Create pipeline to use in RandomSearchCV and GridSearchCV
pipeline_svm_reg = Pipeline([('preprocessor', preprocessor),
                             ('svm_reg',
                              SVR(kernel='rbf',
                                  C=1,
                                  degree=3))])

### Hyperparameter Pre-Tuning with RandomizedSearchCV

In [48]:
# Display possible hyperparameters for Supply Vector Machine
test_svr_reg = SVR()
test_svr_reg.get_params().keys()

dict_keys(['C', 'cache_size', 'coef0', 'degree', 'epsilon', 'gamma', 'kernel', 'max_iter', 'shrinking', 'tol', 'verbose'])

**Default values for Supply Vector Machine** (as base for hyperparameter search):

kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1

In [49]:
# Define hyperparameter distribution
param_distribs_svm_reg = {
    'svm_reg__kernel': ['linear', 'poly', 'rbf'],
    'svm_reg__C': [0.1,0.5,1,2,5,10,50,100,500,1000],
    'svm_reg__degree': randint(low=1, high=5)
}

In [None]:
# Create and fit RandomizedSearchCV, save "best_model"
rnd_svm_reg = RandomizedSearchCV(pipeline_svm_reg, param_distribs_svm_reg, cv=5, n_iter=5, 
                           return_train_score=True, verbose=4, n_jobs=-1, random_state=random_state)

best_model_rnd_svm_reg = rnd_svm_reg.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
# Display best_score_, best_params_ and best_estimator_
print('Best score:\n{:.2f}'.format(rnd_svm_reg.best_score_))
print("Best parameters:\n{}".format(rnd_svm_reg.best_params_))
#print("Best estimator:\n{}".format(grid_rf_reg.best_estimator_))

## Final Evaluation with Testing Set

In [None]:
# Transform X_test for final evaluation
#X_test_prep = preprocessor.transform(X_test)

In [None]:
# Predict target with "best model"
#y_pred_rf_reg = best_model_rf_reg.predict(X_test_prep)

In [None]:
# Final evaluation of "best model"
#print("MSE: {:.2f}".format(mean_squared_error(y_test, y_pred_rf_reg))),
#print("RMSE: {:.2f}".format(mean_squared_error(y_test, y_pred_rf_reg, squared=False))),
#print("MAE: {:.2f}".format(mean_absolute_error(y_test, y_pred_rf_reg))),
#print("R2: {:.2f}".format(r2_score(y_test, y_pred_rf_reg))),

In [None]:
# Display confidence interval (scipy stats)
#confidence = 0.95
#squared_errors = (y_pred_rf_reg - y_test) ** 2
#np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
#                         loc=squared_errors.mean(), 
#                         scale=stats.sem(squared_errors)))

# Predictive Modeling: Neural Networks ("price_log")

In [None]:
# Print current setting for TARGET
target_upper = target.upper()
y_upper = y_train.name.upper()
print("You are currently using " + f"\033[1m{target_upper}\033[0m" + " as the target")
print("The target variable y is currently set to " + f"\033[1m{y_upper}\033[0m")

In [None]:
# Build the model
model_nn_seq = models.Sequential()
model_nn_seq.add(layers.Dense(128, input_shape=(X_train_prep.shape[1],), kernel_regularizer=regularizers.l1(0.005), activation='relu'))
model_nn_seq.add(layers.Dense(256, kernel_regularizer=regularizers.l1(0.005), activation='relu'))
model_nn_seq.add(layers.Dense(256, kernel_regularizer=regularizers.l1(0.005), activation='relu'))
model_nn_seq.add(layers.Dense(512, kernel_regularizer=regularizers.l1(0.005), activation='relu'))
model_nn_seq.add(layers.Dense(1, activation='linear'))

# Compile the model
model_nn_seq.compile(loss='mean_squared_error',
            optimizer='adam',
            metrics=['mean_squared_error'])

# Model summary
print(model_nn_seq.summary())

# Visualize the neural network
#SVG(model_to_dot(model_nn_seq, show_layer_names=False, show_shapes=True).create(prog='dot', format='svg'))


In [None]:
# Train the model
#model_nn_seq_start = time.time()

model_nn_seq_history = model_nn_seq.fit(X_train_prep,
                  y_train,
                  epochs=20,
                  batch_size=256,
                  validation_split=0.2)

#model_nn_seq_end = time.time()

#print(f"Time taken to run: {round((model_nn_seq_end - model_nn_seq_start)/60,1)} minutes")

In [None]:
# Evaluate the model
#model_nn_seq_model_evaluation(model_nn_seq, skip_epochs=2, X_train=X_train, X_test=X_test)

#score_nn_seq = model_nn_seq.evaluate(X_train_prep, y_train,verbose=1)
#print(score_nn_seq)