In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf

In [2]:
#Read in the csv
airbnb_df = pd.read_csv('listings_clean_nan.csv')
airbnb_df.head()

Unnamed: 0.1,Unnamed: 0,source,name,host_id,host_url,host_name,host_since,host_location,host_response_time,host_response_rate,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,reviews_per_month
0,0,city scrape,Home in Kapaau · ★4.94 · Studio · 1 bed · 1 bath,31314577,https://www.airbnb.com/users/show/31314577,Marc,4/16/2015,"Waimea, HI",within an hour,100%,...,4.81,5.0,4.88,4.88,4.81,f,4,4,0,1.41
1,1,city scrape,Rental unit in Pahala · ★4.56 · 2 bedrooms · 2...,13268454,https://www.airbnb.com/users/show/13268454,Madaline,3/18/2014,"San Francisco, CA",a few days or more,0%,...,4.56,5.0,4.78,4.33,4.44,f,1,1,0,0.09
2,2,city scrape,Cabin in Mountain View · 1 bedroom · 2 beds · ...,251605183,https://www.airbnb.com/users/show/251605183,Kathleen,3/27/2019,"Kurtistown, HI",within an hour,100%,...,,,,,,t,3,1,2,
3,3,city scrape,Home in Kailua-Kona · 2 bedrooms · 3 beds · 1 ...,504489199,https://www.airbnb.com/users/show/504489199,Kelly And Zain,3/8/2023,"Kailua-Kona, HI",within an hour,100%,...,,,,,,f,3,3,0,
4,4,city scrape,Rental unit in Waikoloa Village · ★New · 1 bed...,336576760,https://www.airbnb.com/users/show/336576760,Tim,2/16/2020,"Myrtle Beach, SC",within a few hours,97%,...,,,,,,f,79,79,0,


In [3]:
# Look at the shape
airbnb_df.shape

(34040, 61)

In [4]:
# Look at what would drop
airbnb_df.dropna().shape

(0, 61)

In [5]:
# Look at the columns
airbnb_df.columns

Index(['Unnamed: 0', 'source', 'name', 'host_id', 'host_url', 'host_name',
       'host_since', 'host_location', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_has_profile_pic',
       'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'latitude', 'longitude',
       'property_type', 'room_type', 'accommodates', 'bathrooms_text',
       'bedrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights',
       'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated',
       'has_availability', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'calendar_last_scraped',
       'number_of_reviews', 'number_of_reviews_ltm', 'number_of

In [6]:
#Create a Data Frame with needed columns for ML algorithm
airbnb_sh_df = airbnb_df [['host_response_rate','host_acceptance_rate','host_is_superhost',
            'host_identity_verified','price',
            'review_scores_rating','instant_bookable','reviews_per_month']]
airbnb_sh_df.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,price,review_scores_rating,instant_bookable,reviews_per_month
0,100%,98%,t,t,$175.00,4.94,f,1.41
1,0%,0%,f,f,$225.00,4.56,f,0.09
2,100%,93%,f,t,$100.00,,t,
3,100%,97%,t,t,$102.00,,f,
4,97%,21%,t,t,$209.00,,f,


In [7]:
# Look at counts
airbnb_sh_df.count()

host_response_rate        31487
host_acceptance_rate      32403
host_is_superhost         34018
host_identity_verified    34038
price                     32748
review_scores_rating      25869
instant_bookable          34040
reviews_per_month         25858
dtype: int64

In [8]:
# Drop NA rows
airbnb_sh_df_dna = airbnb_sh_df.dropna()

In [9]:
# Look at Counts after dropping NAs
airbnb_sh_df_dna.count()

host_response_rate        23548
host_acceptance_rate      23548
host_is_superhost         23548
host_identity_verified    23548
price                     23548
review_scores_rating      23548
instant_bookable          23548
reviews_per_month         23548
dtype: int64

In [10]:
#Look at datatypes
airbnb_sh_df_dna.dtypes

host_response_rate         object
host_acceptance_rate       object
host_is_superhost          object
host_identity_verified     object
price                      object
review_scores_rating      float64
instant_bookable           object
reviews_per_month         float64
dtype: object

In [11]:
# Look at shape
airbnb_sh_df_dna.shape

(23548, 8)

In [12]:
# Convert host response rate to integer
airbnb_sh_df_dna.host_response_rate = airbnb_sh_df_dna.host_response_rate.str.replace('%','').astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airbnb_sh_df_dna.host_response_rate = airbnb_sh_df_dna.host_response_rate.str.replace('%','').astype('int')


In [13]:
# Convert host acceptance rate to integer
airbnb_sh_df_dna.host_acceptance_rate = airbnb_sh_df_dna.host_acceptance_rate.str.replace('%','').astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airbnb_sh_df_dna.host_acceptance_rate = airbnb_sh_df_dna.host_acceptance_rate.str.replace('%','').astype('int')


In [14]:
# Convert price to float
airbnb_sh_df_dna['price'] = airbnb_sh_df_dna['price'].str.replace('$', '').str.replace(',', '').astype('float64')

  airbnb_sh_df_dna['price'] = airbnb_sh_df_dna['price'].str.replace('$', '').str.replace(',', '').astype('float64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airbnb_sh_df_dna['price'] = airbnb_sh_df_dna['price'].str.replace('$', '').str.replace(',', '').astype('float64')


In [15]:
# Encode Super Host column
airbnb_sh_df_dna['host_is_superhost'] = airbnb_sh_df_dna['host_is_superhost'].replace({'t': 1, 'f': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airbnb_sh_df_dna['host_is_superhost'] = airbnb_sh_df_dna['host_is_superhost'].replace({'t': 1, 'f': 0})


In [16]:
# Encode Host Profile column
# airbnb_sh_df_dna['host_has_profile_pic'] = airbnb_sh_df_dna['host_has_profile_pic'].replace({'t': 1, 'f': 0})

In [17]:
# Encode Host Identity column
airbnb_sh_df_dna['host_identity_verified'] = airbnb_sh_df_dna['host_identity_verified'].replace({'t': 1, 'f': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airbnb_sh_df_dna['host_identity_verified'] = airbnb_sh_df_dna['host_identity_verified'].replace({'t': 1, 'f': 0})


In [18]:
# Encode Availability column
# airbnb_sh_df_dna['has_availability'] = airbnb_sh_df_dna['has_availability'].replace({'t': 1, 'f': 0})

In [19]:
# Encode Instant Bookable column
airbnb_sh_df_dna['instant_bookable'] = airbnb_sh_df_dna['instant_bookable'].replace({'t': 1, 'f': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airbnb_sh_df_dna['instant_bookable'] = airbnb_sh_df_dna['instant_bookable'].replace({'t': 1, 'f': 0})


In [20]:
# View Updated Data Frame
airbnb_sh_df_dna

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,price,review_scores_rating,instant_bookable,reviews_per_month
0,100,98,1,1,175.0,4.94,0,1.41
1,0,0,0,0,225.0,4.56,0,0.09
6,100,100,1,0,800.0,4.82,1,0.51
7,100,100,1,1,500.0,4.85,1,0.91
8,100,100,1,1,125.0,4.96,0,2.74
...,...,...,...,...,...,...,...,...
34034,89,99,0,0,288.0,4.81,1,1.35
34035,100,99,0,1,292.0,5.00,0,0.13
34036,100,98,1,1,345.0,5.00,1,1.72
34038,100,100,0,1,600.0,5.00,0,0.59


In [21]:
#Look at datatypes
airbnb_sh_df_dna.dtypes

host_response_rate          int64
host_acceptance_rate        int64
host_is_superhost           int64
host_identity_verified      int64
price                     float64
review_scores_rating      float64
instant_bookable            int64
reviews_per_month         float64
dtype: object

In [22]:
# Remove Super Host target from features data
y = airbnb_sh_df_dna.host_is_superhost.values
X = airbnb_sh_df_dna.drop(columns="host_is_superhost").values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [23]:
# Preprocess numerical data for the neural network

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [24]:
# View X to confirm the data scaled
X

array([[1.00e+02, 9.80e+01, 1.00e+00, ..., 4.94e+00, 0.00e+00, 1.41e+00],
       [0.00e+00, 0.00e+00, 0.00e+00, ..., 4.56e+00, 0.00e+00, 9.00e-02],
       [1.00e+02, 1.00e+02, 0.00e+00, ..., 4.82e+00, 1.00e+00, 5.10e-01],
       ...,
       [1.00e+02, 9.80e+01, 1.00e+00, ..., 5.00e+00, 1.00e+00, 1.72e+00],
       [1.00e+02, 1.00e+02, 1.00e+00, ..., 5.00e+00, 0.00e+00, 5.90e-01],
       [1.00e+02, 7.60e+01, 1.00e+00, ..., 5.00e+00, 1.00e+00, 1.10e-01]])

# Compile, Train and Evaluate the Model - Attempt 5

In [26]:
# Define the deep learning model
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=20, activation="relu", input_dim=7))
nn_model.add(tf.keras.layers.Dense(units=40, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
184/184 - 1s - loss: 0.5048 - accuracy: 0.7496 - 616ms/epoch - 3ms/step
Loss: 0.5048128962516785, Accuracy: 0.7496178150177002


In [27]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

184/184 - 0s - loss: 0.5048 - accuracy: 0.7496 - 257ms/epoch - 1ms/step
Loss: 0.5048128962516785, Accuracy: 0.7496178150177002


# Compile, Train and Evaluate the Model - Attempt 6

In [28]:
# Define the deep learning model
nn_model2 = tf.keras.models.Sequential()
nn_model2.add(tf.keras.layers.Dense(units=20, activation="relu", input_dim=7))
nn_model2.add(tf.keras.layers.Dense(units=40, activation="relu"))
nn_model2.add(tf.keras.layers.Dense(units=60, activation="relu"))
nn_model2.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model2.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model2.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
184/184 - 0s - loss: 0.4647 - accuracy: 0.7753 - 367ms/epoch - 2ms/step
Loss: 0.46470871567726135, Accuracy: 0.7752675414085388


In [29]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model2.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

184/184 - 0s - loss: 0.4647 - accuracy: 0.7753 - 276ms/epoch - 2ms/step
Loss: 0.46470871567726135, Accuracy: 0.7752675414085388


## Compile, Train and Evaluate the Model - Attempt 7

In [30]:
# Compile, Train and Evaluate the Model - Attempt 3
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 50  # Adjusted number of neurons
hidden_nodes_layer2 = 30  # Adjusted number of neurons
hidden_nodes_layer3 = 10 # added extra layer
nn = tf.keras.models.Sequential()
# First hidden layer
# Changed activation to "sigmoid"
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="sigmoid"))
# Second hidden layer
# Changed activation to "sigmoid"
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))
# Third hidden layer
# Changed activation to "tanh"
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="tanh"))
# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 50)                400       
                                                                 
 dense_11 (Dense)            (None, 30)                1530      
                                                                 
 dense_12 (Dense)            (None, 10)                310       
                                                                 
 dense_13 (Dense)            (None, 1)                 11        
                                                                 
Total params: 2251 (8.79 KB)
Trainable params: 2251 (8.79 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 1

In [31]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

184/184 - 0s - loss: 0.5040 - accuracy: 0.7461 - 405ms/epoch - 2ms/step
Loss: 0.5040199756622314, Accuracy: 0.7460505962371826
