## 1. |mp0rt

In [1]:
# importing all the libraries needed for project, should all be in Pipfile

from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
import datetime
import joblib
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [2]:
# Show up to 500 columns when displaying a DataFrame, then load listings.csv
# with the listing `id` column as the index.
pd.set_option('display.max_columns', 500)
df_list = pd.read_csv('listings.csv', index_col='id')

## 2. Wr4ng|

In [3]:
def wrangle(X):
    """Clean a raw listings frame (as read from listings.csv) for modelling.

    The input is copied first, so the caller's frame is never modified.

    Steps
    -----
    * Drop URL, free-text, empty and redundant columns (raises ``KeyError``
      if any of them is absent).
    * ``host_verifications``: replace the stringified list with the number of
      entries it holds; ``'[]'`` and missing values count as 0.
    * ``'t'``/``'f'`` flag columns become booleans (anything but ``'t'`` is
      ``False``); ``license`` becomes ``True`` where a value is present.
    * ``neighbourhood_cleansed`` is renamed to ``neighborhood``.
    * Rows without ``bathrooms_text`` are dropped; the column is replaced by
      ``shared_bath`` (bool) and ``num_baths`` (float).
    * ``price`` is parsed from ``'$1,234.00'`` style strings to float.
    * ``first_review`` / ``last_review`` become Julian dates (NaN if missing).

    Parameters
    ----------
    X : pandas.DataFrame
        Raw listings data containing every column referenced above.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """

    # Work on a copy so the caller's frame is left untouched
    X = X.copy()

    ## DROP unwanted columns; high-cardinality, urls, empty, redundant, etc.
    drop_cols = ['listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
                 'neighborhood_overview', 'picture_url', 'host_url',
                 'host_name', 'host_since', 'host_location', 'host_about',
                 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
                 'neighbourhood', 'neighbourhood_group_cleansed', 'bathrooms',
                 'amenities', 'calendar_updated', 'has_availability',
                 'calendar_last_scraped', 'minimum_minimum_nights',
                 'maximum_minimum_nights', 'minimum_maximum_nights',
                 'maximum_maximum_nights']
    X = X.drop(columns=drop_cols)

    ## Tally the number of host contact methods that have been verified.
    def num_contact(items):
        """Count the entries of a stringified list like "['email', 'phone']"."""
        if not isinstance(items, str):
            # Missing value (NaN/None): nothing has been verified
            return 0
        cleaned = items.replace('[', '').replace(']', '').replace("'", '')
        # Ignore blank tokens so that '[]' counts as 0 instead of 1
        return sum(1 for token in cleaned.split(',') if token.strip())
    X['host_verifications'] = X['host_verifications'].apply(num_contact)

    ## Convert "t/f" columns to BOOL columns (missing values become False).
    flag_cols = ['host_has_profile_pic', 'host_identity_verified',
                 'host_is_superhost', 'instant_bookable']
    for col in flag_cols:
        X[col] = X[col].eq('t')

    # Convert license column to BOOL: True wherever any license value is present.
    # `notna` is used instead of a `type(...) != float` check so that a real
    # float value is not mistaken for a missing one.
    X['license'] = X['license'].notna()

    ## Shorten/rename 'neighbourhood_cleansed' column as 'neighborhood'
    X = X.rename(columns={'neighbourhood_cleansed': 'neighborhood'})

    ### BATHROOM FEATURES
    # Drop rows with no bathroom description; `.copy()` gives an independent
    # frame so the column assignments below do not raise SettingWithCopyWarning.
    X = X.dropna(subset=['bathrooms_text']).copy()

    def parse_baths_shared(text):
        """True when the bathroom description mentions 'shared' (any case)."""
        # Lower-cased because some values are capitalised, e.g. "Shared half-bath"
        return 'shared' in text.lower()

    def parse_baths(text):
        """Number of baths as a float; any 'half-bath' description is 0.5."""
        if 'half-bath' in text.lower():
            return 0.5
        # Otherwise the description starts with the count, e.g. "1.5 shared baths"
        return float(text.split(' ')[0])

    X['shared_bath'] = X['bathrooms_text'].apply(parse_baths_shared)
    X['num_baths'] = X['bathrooms_text'].apply(parse_baths)
    X = X.drop(columns='bathrooms_text')
    ### END BATH

    # Change 'price' from a "$1,234.00" style string to FLOAT
    X['price'] = X['price'].apply(lambda x: float(x.replace('$', '').replace(',', '')))

    # Convert the 'first_review' and 'last_review' date strings to Julian dates
    def datestojd(dt_str):
        """Julian date of a date string; missing values stay NaN."""
        if pd.isna(dt_str):
            return float('nan')
        return pd.to_datetime(dt_str).to_julian_date()
    for col in ('first_review', 'last_review'):
        X[col] = X[col].apply(datestojd)

    return X

## €dα


In [22]:
# Wrangled copy of the raw listings, used as input for the pandas-profiling report
df_profile = wrangle(df_list)

In [None]:
# Build a pandas-profiling report of the wrangled features and their properties,
# then show it inside the notebook
profile = ProfileReport(df_profile)
profile.to_notebook_iframe()

# m0d3|

In [27]:
# Wrangled copy of the raw listings, used for the train/test split and the
# RandomForestRegressor models
df_model = wrangle(df_list)

## Train//tesT :: :: :: SΦLi_|_

In [28]:
# Column the models will learn to predict
target = 'price'

# Initial separation: feature matrix (everything but the target) and target vector
X_init = df_model.drop(columns=target)
y_init = df_model[target]

# Sanity check: both should report the same number of rows
print(X_init.shape)
print(y_init.shape)

In [30]:
# First split: hold out 10% of the rows as the TEST set. The remaining
# "subsequent" set is split again into train/validation in the next cell.
# random_state is fixed so the split is reproducible.
X_subseq, X_test, y_subseq, y_test = train_test_split(X_init, y_init, test_size=0.1, random_state=13)

# Display test and subseq shapes
print(X_test.shape)
print(X_subseq.shape)

In [33]:
# Second split: carve the VALidation set out of the subsequent set.
# 0.1114 of the remaining 90% is roughly 10% of all rows (0.9 * 0.1114 ~= 0.100),
# which is why this fraction is used instead of 0.1.
X_train, X_val, y_train, y_val = train_test_split(X_subseq, y_subseq, test_size=0.1114, random_state=17)

# Display shapes; the VAL set should be about the same size as the test set
print(X_val.shape)
print(X_train.shape)

### B4SSLI3N

In [43]:
# Baseline prediction: always guess the mean training price (one copy per training row)
baseline_pred = [y_train.mean()] * len(y_train)

# Baseline mean absolute error, measured on the training set itself;
# every model below should beat this number
baseline_mae = mean_absolute_error(y_train,baseline_pred)


print("Baseline Prediction: ", baseline_pred[0])
print("Baseline MAE: ", baseline_mae)

Baseline Prediction:  147.78483525208415
Baseline MAE:  82.89001996891902


## PIGH:P:LINE

### GrdSRCH M0de|
    - initial hyperparam search

In [13]:
# Pipeline for the first grid search: ordinal-encode, impute missing values,
# then fit a random forest regressor (seeded for reproducibility)
pipe_01 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestRegressor(random_state = 13,
                           n_jobs = -2)
)

# Range of params to try out
params_01 = {
    # Keys are '<pipeline step>__<hyperparameter>'; values are the candidates to try.
    # NOTE(review): np.arange with a float step may include or miss values near
    # the end point because of rounding — confirm the intended max_samples fractions.
    'randomforestregressor__max_depth' : range(200, 1201, 100),
    'randomforestregressor__n_estimators' :  range(50, 351, 10),
    'randomforestregressor__max_samples' : np.arange(0.1, 0.9, 0.1)
    }

In [None]:
# model_01 :: initial grid search over `params_01` with 5-fold cross-validation.
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score rather than the MAE reported in this notebook
# — confirm that is intended.
model_01 = GridSearchCV(
    estimator = pipe_01,
    param_grid = params_01,
    cv = 5,
    n_jobs = -2,
    verbose = 3,
)

In [44]:
# WARNING: This is a very large search (originally estimated at ~34,000 fits);
# make sure your CPU can run it locally before running this cell.
# NOTE(review): 11 depths x 31 estimator counts x ~8 sample fractions x 5 folds
# is closer to ~13,600 fits for the grid defined above — confirm the estimate.
# NOTE: the recorded output of this cell is a NameError; the cell that defines
# `model_01` must be run first.
model_01.fit(X_train, y_train)

NameError: name 'model_01' is not defined

### F34_|_ :: :: :: |3p0rT :
    - Using pipeline 'pipe_01'

In [16]:
# Fit `pipe_01` itself before reading its importances. No earlier cell fits it
# directly (GridSearchCV fits clones of the estimator it is given), so without
# this line the cell only works with leftover kernel state.
pipe_01.fit(X_train, y_train)

# Importance of each feature according to the fitted random forest
importances = pipe_01.named_steps["randomforestregressor"].feature_importances_

# Feature names, in the column order the pipeline was trained on.
# Assumes the pipeline keeps exactly one output column per input column
# (nothing dropped by the encoder/imputer) — TODO confirm.
features = X_train.columns

# FEATURE IMPORTANCES as a labelled Series, sorted ascending
feature_importances = pd.Series(importances, index=features).sort_values()

# Show the 20 most important features (ascending) with their weights
feature_importances.tail(20)

review_scores_rating      0.015461
review_scores_checkin     0.015807
property_type             0.016095
availability_365          0.017377
availability_30           0.017386
reviews_per_month         0.017642
first_review              0.019246
minimum_nights            0.020169
availability_90           0.022007
beds                      0.023287
availability_60           0.025618
accommodates              0.025731
longitude                 0.027004
host_id                   0.028576
latitude                  0.029103
minimum_nights_avg_ntm    0.047569
bedrooms                  0.054194
neighborhood              0.058383
number_of_reviews         0.085911
num_baths                 0.243179
dtype: float64

### 5tr|pd M0d3| :

     - Using the top 10 features by importance

In [None]:
# Smaller model with 10 high-importance features (2nd pipeline and model).
# The column list is defined once so the train and validation sets cannot drift apart.
# NOTE(review): this hand-picked list does not match the last 10 rows of the
# importance output displayed above (e.g. 'last_review' is not in the displayed
# top 20) — confirm which run it was taken from.
features_02 = ['host_id', 'last_review', 'minimum_nights', 'latitude',
               'minimum_nights_avg_ntm', 'first_review', 'bedrooms',
               'number_of_reviews', 'neighborhood', 'num_baths']

# TRAINing set for 2nd model
X_train2 = X_train[features_02]

# VALidation set for 2nd model
X_val2 = X_val[features_02]

In [None]:
# Pipeline number 2; also served as model number 2, for initial heroku deployment.
# Hyperparameters are fixed by hand here rather than searched.
pipe_02 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestRegressor(
        n_estimators = 250,
        max_depth = 150,
        max_samples = 0.5,
        random_state = 13,
        n_jobs = -2)
)

In [None]:
# Fit model 2 on the 10-feature training set before persisting it; no earlier
# cell fits `pipe_02`, so dumping it as-is would save an unfitted pipeline.
pipe_02.fit(X_train2, y_train)

# !!!Create this file for folder in app>ml_models folder of price_prediction_api
#    will ALSO need to change file_path in services>prices.py as well
filename = 'working_model.sav'
joblib.dump(pipe_02, filename)

### 3|GR :: :: :: M0d3| 

In [36]:
# The TOP 20 features from the importance output above, used for the following
# grid searches. The list is defined once so the train and validation sets
# always share exactly the same columns.
features_03 = ['review_scores_rating', 'review_scores_checkin', 'property_type',
               'availability_365', 'availability_30', 'reviews_per_month',
               'first_review', 'minimum_nights', 'availability_90', 'beds',
               'availability_60', 'accommodates', 'longitude', 'host_id', 'latitude',
               'minimum_nights_avg_ntm', 'bedrooms', 'neighborhood',
               'number_of_reviews', 'num_baths']

X_train3 = X_train[features_03]

X_val3 = X_val[features_03]

In [None]:
# 3rd pipeline for grid search: ordinal-encode, impute missing values, then a
# seeded random forest regressor whose hyperparameters are searched below
pipe_03 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestRegressor(random_state = 13,
                           n_jobs = -2)
)

# Parameter ranges for pipe_03
params_03 = {
    # Keys are '<pipeline step>__<hyperparameter>'; values are the candidates to try.
    # NOTE(review): np.arange with a float step may include or miss values near
    # the end point because of rounding — confirm the intended max_samples fractions.
    'randomforestregressor__max_depth' : range(50, 200, 10),
    'randomforestregressor__n_estimators' :  range(350, 550, 10),
    'randomforestregressor__max_samples' : np.arange(0.05, 0.5, 0.05)
    }

In [None]:
# Model number 3: grid search over `params_03` for pipe_03, 5-fold cross-validation.
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score rather than the MAE reported in this notebook
# — confirm that is intended.
model_03 = GridSearchCV(
    estimator = pipe_03,
    param_grid = params_03,
    cv = 5,
    n_jobs = -2,
    verbose = 3,
)

In [None]:
# WARNING: ~13k fits; make sure your CPU or server can handle it.
# Fits the search on the 20-feature training set.
model_03.fit(X_train3, y_train)

In [None]:
# Display the hyperparameter combination that scored best in the model 3 grid search
model_03.best_params_

In [None]:
# Display model 3 mean absolute error on the training and validation sets
# (compare against the baseline MAE computed earlier)
print('Training MAE:', mean_absolute_error(y_train, model_03.predict(X_train3)))
print('Validation MAE:', mean_absolute_error(y_val, model_03.predict(X_val3)))

In [None]:
# Tuned params for a second model 3 grid search: the max_depth range is shifted
# lower because of the previous fit's best params
params_03_2 = {
    # Keys are '<pipeline step>__<hyperparameter>'; values are the candidates to try.
    'randomforestregressor__max_depth' : range(20, 150, 10),
    'randomforestregressor__n_estimators' :  range(350, 550, 10),
    'randomforestregressor__max_samples' : np.arange(0.05, 0.5, 0.05)
    }

In [None]:
# Model 3.2: same pipeline as model 3, searched over `params_03_2` with the
# number of CV folds DOUBLED from 5 to 10.
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score rather than the MAE reported in this notebook
# — confirm that is intended.
model_03_2 = GridSearchCV(
    estimator = pipe_03,
    param_grid = params_03_2,
    cv = 10,
    n_jobs = -2,
    verbose = 3,
)

In [None]:
# WARNING: ~23k fits; make sure you have the time/RAM/processors to run this.
# Fits the search on the 20-feature training set.
model_03_2.fit(X_train3, y_train)

In [None]:
# Display the hyperparameter combination that scored best in the model 3.2 grid search
model_03_2.best_params_

In [None]:
# Display model 3.2 mean absolute errors on the training and validation sets
print('Training MAE:', mean_absolute_error(y_train, model_03_2.predict(X_train3)))
print('Validation MAE:', mean_absolute_error(y_val, model_03_2.predict(X_val3)))

### _|_UN1nG :
    - Using best params from model 3 and 3.2 to tune hyper params further

In [19]:
# PIPELINE number 4: ordinal-encode, impute missing values, then a seeded
# random forest regressor whose hyperparameters are searched below
pipe_04 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestRegressor(random_state = 13,
                           n_jobs = -2)
)

# Parameter ranges for 'pipe_04': shallower trees, more estimators and smaller
# sample fractions than the model 3 grids
params_04 = {
    # Keys are '<pipeline step>__<hyperparameter>'; values are the candidates to try.
    'randomforestregressor__max_depth' : range(10, 70, 5),
    'randomforestregressor__n_estimators' :  range(480, 600, 5),
    'randomforestregressor__max_samples' : np.arange(0.05, 0.13, 0.025)
    }

In [None]:
# Model number 4: grid search over `params_04` for pipe_04, 5-fold cross-validation.
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score rather than the MAE reported in this notebook
# — confirm that is intended.
model_04 = GridSearchCV(
    estimator = pipe_04,
    param_grid = params_04,
    cv = 5,
    n_jobs = -2,
    verbose = 3,
)

In [None]:
# WARNING: not as bad as before, but it may still take some time/power.
# NOTE(review): the grid defined above works out to about 12 depths x 24
# estimator counts x 4 sample fractions x 5 folds = 5,760 fits, not the
# 7k-10k originally noted here — confirm.
model_04.fit(X_train3, y_train)

In [None]:
# Model 4 mean absolute errors on the training and validation sets
print('Training MAE:', mean_absolute_error(y_train, model_04.predict(X_train3)))
print('Validation MAE:', mean_absolute_error(y_val, model_04.predict(X_val3)))

In [None]:
# Display the hyperparameter combination that scored best in the model 4 grid search
model_04.best_params_

In [20]:
# Tuning a little more, making model 4.2

# Params for model 4.2: a narrower grid than `params_04`
params_04_2 = {
    # Keys are '<pipeline step>__<hyperparameter>'; values are the candidates to try.
    # NOTE(review): np.arange with a float step may include or miss values near
    # the end point (0.1 here) because of rounding — confirm the intended fractions.
    'randomforestregressor__max_depth' : range(20, 40, 4),
    'randomforestregressor__n_estimators' :  range(480, 530, 5),
    'randomforestregressor__max_samples' : np.arange(0.025, 0.1, 0.025)
    }

In [21]:
# Model 4.2: grid search over `params_04_2` for pipe_04, 10-fold cross-validation.
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score rather than the MAE reported in this notebook
# — confirm that is intended.
model_04_2 = GridSearchCV(
    estimator = pipe_04,
    param_grid = params_04_2,
    cv = 10,
    n_jobs = -2,
    verbose = 3,
)

In [None]:
# WARNING: not so bad, only ~2000 fits; the smallest grid search of them all.
# Fits the search on the 20-feature training set.
model_04_2.fit(X_train3, y_train)

In [None]:
# Model 4.2 mean absolute errors on the training and validation sets
print('Training MAE:', mean_absolute_error(y_train, model_04_2.predict(X_train3)))
print('Validation MAE:', mean_absolute_error(y_val, model_04_2.predict(X_val3)))

In [None]:
# Display the best hyperparameter settings found by the model 4.2 grid search;
# the MVP model is meant to be built from these
model_04_2.best_params_

# MVP M0d3| :

In [None]:
"""The best grid search results by MVP launch came from model 4.2"""

# Create pipeline/model 'best_v_01', best version/MVP number 1.0,
# using the best params found by 'model_04_2' (requires that search to be fitted).
# The forest settings were previously hard-coded to 250 / 150 / 0.5, the same
# values as `pipe_02`; none of those lie inside the model 4.2 grid, so they
# could not have been its best params. They are now taken from the search
# itself: the `best_params_` keys ('randomforestregressor__...') address the
# forest step of this pipeline directly.
best_v_01 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestRegressor(
        random_state = 13,
        n_jobs = -2)
).set_params(**model_04_2.best_params_)

In [None]:
# Fit the MVP pipeline on the 20-feature training set; this fitted model is the
# one intended for the heroku app
best_v_01.fit(X_train3, y_train)

In [None]:
"""MVP File creatino and dump using MVP model 'best_v_01'"""

# !!!Create this file for the app>ml_models folder of price_prediction_api;
#    you will ALSO need to change file_path in services>prices.py
# !!!Replace 'working_model.sav' with the 'best_model_v_01.sav' generated below
# Serialises the fitted MVP pipeline to disk with joblib
filename = 'best_model_v_01.sav'
joblib.dump(best_v_01, filename)