In [3]:
#Import Dependencies
import pandas as pd
import numpy as np
import tensorflow

import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

In [4]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\melis\anaconda3\envs\pythondata\lib\site-packages (0.0)


In [5]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [7]:
# Reference only: this mirrors the code that built the price bins in the cleaning notebook.
# Each bin spans ten dollars: lower edges 0, 10, ..., 790 and upper edges lower + 9.99.
bin_lowers = range(0, 800, 10)
bin_uppers = [round((lower + 9.99), 3) for lower in bin_lowers]

# `bins` holds the edges (a leading 0 followed by every upper edge);
# `group_names` holds one "bin <lower>-<upper>" label per bin.
bins = [0] + bin_uppers
group_names = [f"bin {lower}-{upper}" for lower, upper in zip(bin_lowers, bin_uppers)]

print(group_names)

['bin 0-9.99', 'bin 10-19.99', 'bin 20-29.99', 'bin 30-39.99', 'bin 40-49.99', 'bin 50-59.99', 'bin 60-69.99', 'bin 70-79.99', 'bin 80-89.99', 'bin 90-99.99', 'bin 100-109.99', 'bin 110-119.99', 'bin 120-129.99', 'bin 130-139.99', 'bin 140-149.99', 'bin 150-159.99', 'bin 160-169.99', 'bin 170-179.99', 'bin 180-189.99', 'bin 190-199.99', 'bin 200-209.99', 'bin 210-219.99', 'bin 220-229.99', 'bin 230-239.99', 'bin 240-249.99', 'bin 250-259.99', 'bin 260-269.99', 'bin 270-279.99', 'bin 280-289.99', 'bin 290-299.99', 'bin 300-309.99', 'bin 310-319.99', 'bin 320-329.99', 'bin 330-339.99', 'bin 340-349.99', 'bin 350-359.99', 'bin 360-369.99', 'bin 370-379.99', 'bin 380-389.99', 'bin 390-399.99', 'bin 400-409.99', 'bin 410-419.99', 'bin 420-429.99', 'bin 430-439.99', 'bin 440-449.99', 'bin 450-459.99', 'bin 460-469.99', 'bin 470-479.99', 'bin 480-489.99', 'bin 490-499.99', 'bin 500-509.99', 'bin 510-519.99', 'bin 520-529.99', 'bin 530-539.99', 'bin 540-549.99', 'bin 550-559.99', 'bin 560-569.

In [8]:
# Load the cleaned listings written by the data-cleaning step and preview the first rows.
file = "data_cleaning/Output/cleaned_data.csv"
df = pd.read_csv(filepath_or_buffer=file)
df.head(5)

Unnamed: 0,zipcode,room_type,accommodates,bedrooms,bathrooms,beds,minimum_nights,number_of_reviews,price,price_bin
0,94117,Entire home/apt,3,1.0,1.0,2.0,1,240,170.0,bin 170-179.99
1,94110,Entire home/apt,5,2.0,1.0,3.0,30,111,235.0,bin 230-239.99
2,94117,Private room,2,1.0,4.0,1.0,32,19,65.0,bin 60-69.99
3,94117,Private room,2,1.0,4.0,1.0,32,8,65.0,bin 60-69.99
4,94117,Entire home/apt,4,2.0,1.5,2.0,5,28,703.0,bin 700-709.99


## Create features (X) and target (y)

In [9]:
# Feature matrix (X): every column except the raw price and its binned label.
target_related_columns = ["price", "price_bin"]
data = df.drop(columns=target_related_columns)
feature_names = data.columns
data.head()

Unnamed: 0,zipcode,room_type,accommodates,bedrooms,bathrooms,beds,minimum_nights,number_of_reviews
0,94117,Entire home/apt,3,1.0,1.0,2.0,1,240
1,94110,Entire home/apt,5,2.0,1.0,3.0,30,111
2,94117,Private room,2,1.0,4.0,1.0,32,19
3,94117,Private room,2,1.0,4.0,1.0,32,8
4,94117,Entire home/apt,4,2.0,1.5,2.0,5,28


In [10]:
# Target (y): the price-bin label, forced to str element by element.
# The column already looks like strings, but without this conversion the model chokes.
target = df["price_bin"].map(str)

# List the distinct price bins present in the data.
df.groupby("price_bin").count().index

Index(['bin 10-19.99', 'bin 100-109.99', 'bin 110-119.99', 'bin 120-129.99',
       'bin 130-139.99', 'bin 140-149.99', 'bin 150-159.99', 'bin 160-169.99',
       'bin 170-179.99', 'bin 180-189.99', 'bin 190-199.99', 'bin 20-29.99',
       'bin 200-209.99', 'bin 210-219.99', 'bin 220-229.99', 'bin 230-239.99',
       'bin 240-249.99', 'bin 250-259.99', 'bin 260-269.99', 'bin 270-279.99',
       'bin 280-289.99', 'bin 290-299.99', 'bin 30-39.99', 'bin 300-309.99',
       'bin 310-319.99', 'bin 320-329.99', 'bin 330-339.99', 'bin 340-349.99',
       'bin 350-359.99', 'bin 360-369.99', 'bin 370-379.99', 'bin 380-389.99',
       'bin 390-399.99', 'bin 40-49.99', 'bin 400-409.99', 'bin 410-419.99',
       'bin 420-429.99', 'bin 430-439.99', 'bin 440-449.99', 'bin 450-459.99',
       'bin 460-469.99', 'bin 470-479.99', 'bin 480-489.99', 'bin 490-499.99',
       'bin 50-59.99', 'bin 500-509.99', 'bin 510-519.99', 'bin 520-529.99',
       'bin 530-539.99', 'bin 540-549.99', 'bin 550-559.99', '

## LabelEncode features (X) - room_type and zipcode

In [11]:
# Multi-column label encoder adapted from Stack Overflow:
# https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns with one call.

    Each selected column is replaced by integer codes produced by a fresh
    ``LabelEncoder``. The object keeps no fitted state: ``fit`` is a no-op and
    every ``transform`` call re-fits a new encoder per column, so codes are only
    consistent within a single call. Encode the full frame once (before any
    train/test split) rather than transforming splits separately.
    """

    def __init__(self,columns = None):
        """Store the column names to encode; ``None`` means encode every column."""
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """Do nothing and return ``self`` (present only for the fit/transform protocol)."""
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        Returns a copy of X; the input frame is left unmodified.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X,y).transform(X)


In [12]:
# Replace the two categorical feature columns with integer codes (no one-hot encoding is applied).
categorical_columns = ['room_type', 'zipcode']
column_encoder = MultiColumnLabelEncoder(columns=categorical_columns)

data2 = column_encoder.fit_transform(data)
data2.head()

Unnamed: 0,zipcode,room_type,accommodates,bedrooms,bathrooms,beds,minimum_nights,number_of_reviews
0,14,0,3,1.0,1.0,2.0,1,240
1,8,0,5,2.0,1.0,3.0,30,111
2,14,1,2,1.0,4.0,1.0,32,19
3,14,1,2,1.0,4.0,1.0,32,8
4,14,0,4,2.0,1.5,2.0,5,28


In [13]:
# Count the rows for every (room_type, zipcode) code pair to see how the encoded categories are populated.
data2.groupby(['room_type', 'zipcode' ]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,accommodates,bedrooms,bathrooms,beds,minimum_nights,number_of_reviews
room_type,zipcode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,16,16,16,16,16,16
0,1,228,228,228,228,228,228
0,2,300,300,300,300,300,300
0,3,6,6,6,6,6,6
0,4,134,134,134,134,134,134
...,...,...,...,...,...,...,...
2,17,5,5,5,5,5,5
2,19,2,2,2,2,2,2
2,23,1,1,1,1,1,1
2,24,39,39,39,39,39,39


##  Create Train Test Split


In [15]:
from sklearn.model_selection import train_test_split

# A small hold-out fraction (10%) is used on purpose: with a larger test split some
# price-bin labels end up only in the test data and the model chokes on them.
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data2,
    target,
    test_size=.1,
    random_state=4,
)

## Encode y labels (label encoder and one hot encode)

In [21]:
# Map the string price-bin labels to integer class ids.
# The encoder is fit on the training labels only and reused for the test labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# One-hot encode with an explicit width. Without num_classes, to_categorical sizes
# each matrix from the largest id it sees, so a test split that lacks the highest
# class id would get fewer columns than the training matrix.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=num_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=num_classes)

# Both target matrices must have one column per class.
assert y_train_categorical.shape[1] == num_classes
assert y_test_categorical.shape[1] == num_classes

# Number of one-hot columns (classes) in y_train and y_test.
num_classes

78

## Scale the features/data (X)
Scale the features to the [0, 1] range with MinMaxScaler. The scaler is fit on the training split only and then applied to both the training and test splits.

In [22]:
# Learn the per-column min/max from the training split only, then apply that same
# scaling to both splits so no information from the test data is used for fitting.
X_minmax = MinMaxScaler()
X_train_scaled = X_minmax.fit_transform(X_train)
X_test_scaled = X_minmax.transform(X_test)

X_train_scaled

array([[2.69230769e-01, 0.00000000e+00, 1.33333333e-01, ...,
        1.42857143e-01, 2.00000002e-08, 0.00000000e+00],
       [4.23076923e-01, 0.00000000e+00, 1.33333333e-01, ...,
        1.42857143e-01, 1.00000001e-08, 2.07064555e-02],
       [3.84615385e-01, 5.00000000e-01, 6.66666667e-02, ...,
        7.14285714e-02, 1.00000001e-08, 6.09013398e-03],
       ...,
       [3.84615385e-02, 0.00000000e+00, 6.66666667e-02, ...,
        7.14285714e-02, 0.00000000e+00, 9.62241169e-02],
       [9.23076923e-01, 0.00000000e+00, 2.00000000e-01, ...,
        1.42857143e-01, 2.90000003e-07, 8.89159562e-02],
       [8.46153846e-01, 0.00000000e+00, 6.66666667e-02, ...,
        7.14285714e-02, 2.90000003e-07, 1.82704019e-02]])

## Train the Model (Deep Learning)

In [23]:
# Show (rows, feature columns) of the scaled training matrix. The second value
# (X_train_scaled.shape[1]) is the input dimension used when building the model.
X_train_scaled.shape

(7007, 8)

In [24]:
# Show (rows, classes) of the one-hot training targets. The second value
# (y_train_categorical.shape[1]) is the number of classes the output layer must produce.
y_train_categorical.shape

(7007, 78)

In [25]:
#create the deep learning model

# Sequential and Dense are imported from tensorflow.keras here; the import cell at the
# top of the notebook binds the same names from the standalone `keras` package.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Derive the layer sizes from the data instead of hard-coding them, so the network
# still matches if the number of features or price-bin classes changes.
n_features = X_train_scaled.shape[1]
n_classes = y_train_categorical.shape[1]

# Create an empty sequential model
model = Sequential()

# First hidden layer: 100 units (neurons), each connected to every input feature.
model.add(Dense(100, activation='relu', input_dim=n_features))

# Second hidden layer: another 100 fully connected units.
model.add(Dense(100, activation='relu'))

# Output layer: one unit per class; softmax turns the outputs into class probabilities.
model.add(Dense(n_classes, activation="softmax"))

In [26]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the targets are
# one-hot encoded), and accuracy reported as a training metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [27]:
# Print the layer-by-layer architecture and parameter counts of the network.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               900       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 78)                7878      
Total params: 18,878
Trainable params: 18,878
Non-trainable params: 0
_________________________________________________________________


In [None]:
 # Use the training data to fit (train) the model
# Training options: 100 passes over the training data, reshuffled every epoch,
# with one log line per epoch (verbose=2).
fit_options = dict(epochs=100, shuffle=True, verbose=2)
model.fit(X_train_scaled, y_train_categorical, **fit_options)

Train on 7007 samples
Epoch 1/100
7007/7007 - 1s - loss: 3.6558 - accuracy: 0.0588
Epoch 2/100
7007/7007 - 1s - loss: 3.2687 - accuracy: 0.0945
Epoch 3/100
7007/7007 - 0s - loss: 3.1983 - accuracy: 0.0983
Epoch 4/100
7007/7007 - 0s - loss: 3.1727 - accuracy: 0.1060
Epoch 5/100
7007/7007 - 1s - loss: 3.1555 - accuracy: 0.1089
Epoch 6/100
7007/7007 - 0s - loss: 3.1440 - accuracy: 0.1112
Epoch 7/100
7007/7007 - 0s - loss: 3.1284 - accuracy: 0.1180
Epoch 8/100
7007/7007 - 0s - loss: 3.1194 - accuracy: 0.1167
Epoch 9/100
7007/7007 - 0s - loss: 3.1093 - accuracy: 0.1193
Epoch 10/100
7007/7007 - 0s - loss: 3.1022 - accuracy: 0.1243
Epoch 11/100
7007/7007 - 0s - loss: 3.0955 - accuracy: 0.1199
Epoch 12/100
7007/7007 - 0s - loss: 3.0888 - accuracy: 0.1243
Epoch 13/100
7007/7007 - 0s - loss: 3.0818 - accuracy: 0.1277
Epoch 14/100
7007/7007 - 1s - loss: 3.0750 - accuracy: 0.1272
Epoch 15/100
7007/7007 - 1s - loss: 3.0702 - accuracy: 0.1274
Epoch 16/100
7007/7007 - 1s - loss: 3.0650 - accuracy: 0.

## Evaluate the model

In [None]:
# Sanity check before evaluation: (rows, features) of the scaled training matrix.
X_train_scaled.shape

In [None]:
# Sanity check before evaluation: (rows, classes) of the one-hot training targets.
y_train_categorical.shape

In [None]:
# Sanity check before evaluation: (rows, classes) of the one-hot test targets.
# Only the last expression of a cell is displayed, so the bare `y_test_categorical`
# line that used to precede this one had no effect and has been dropped.
y_test_categorical.shape

In [None]:
# Sanity check before evaluation: (rows, features) of the scaled test matrix.
X_test_scaled.shape

In [None]:
# Evaluate on the held-out split; evaluate() returns the loss followed by the
# compiled metrics (accuracy here).
test_metrics = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = test_metrics

print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

## Score the model

In [None]:
# Keras models have no scikit-learn style .score() method; use evaluate(), which
# returns [loss, accuracy] for this model. The reported value is classification
# accuracy, not an r2 score, so the labels say so.
train_loss, train_accuracy = model.evaluate(X_train_scaled, y_train_categorical, verbose=0)
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=0)

print(f"Deep Learning Training Data Score (accuracy): {train_accuracy}")
print(f"Deep Learning, Neural network Testing Data Score (accuracy): {test_accuracy}")


## Hyperparameter Tuning

Use GridSearchCV to tune the model's parameters

In [None]:
# Create the GridSearchCV model
# GridSearchCV needs a scikit-learn estimator, and n_estimators / max_depth are
# random-forest hyperparameters, so the search is run over a RandomForestClassifier
# rather than the Keras network (which is not a scikit-learn estimator and has
# neither parameter).
# NOTE(review): the cells that use `grid` pass the one-hot targets, which
# scikit-learn treats as a multilabel problem — confirm this is the intended setup.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [250, 300, 350],
              'max_depth': [125, 150, 175]}

# random_state mirrors the seed used for the train/test split so runs are repeatable.
grid = GridSearchCV(RandomForestClassifier(random_state=4), param_grid, verbose=3)

In [None]:
# Train the model with GridSearch
# Runs the cross-validated search over param_grid on the scaled training features
# and the one-hot encoded training targets.
grid.fit(X_train_scaled, y_train_categorical)

In [None]:
# Report the best parameter combination found by the search and its mean cross-validated score.
print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Training score:
# Score of the best estimator found by the search, measured on the training split.
grid.score(X_train_scaled, y_train_categorical)

In [None]:
# Testing score:
# Score of the best estimator found by the search, measured on the held-out test split.
grid.score(X_test_scaled, y_test_categorical)

In [None]:
# Make prediction and save to variable for report.
# Predictions come from the best estimator found by the grid search, on the scaled test features.
predictions = grid.predict(X_test_scaled)

In [None]:
# Print Classification Report
# NOTE(review): y_test_categorical is one-hot encoded, so `predictions` must be an
# indicator matrix of the same shape for this call to work — confirm against the
# estimator wrapped by `grid`.
from sklearn.metrics import classification_report
print(classification_report(y_test_categorical, predictions))

In [None]:
#????????????????????  WHAT IS THIS?
%matplotlib notebook
from yellowbrick.classifier import ClassificationReport
viz = ClassificationReport(RandomForestClassifier())
viz.fit(X_train_scaled, y_train)
viz.score(X_test_scaled, y_test)
viz.finalize()
viz.show(outpath="classificationreport.png")

## Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'models.sav'
joblib.dump(model, filename)

### Correlation Matrix

In [None]:
corrMatrix = df.corr()
print (corrMatrix)
sn.heatmap(corrMatrix, annot=True)
plt.tight_layout()
plt.autoscale()

plt.savefig('correlation_matrix.png', bbox_inches='tight', pad_inches=0.0)
plt.show()