# ADS Capstone Project: Airbnb Pricing Prediction
#### By Renetta Nelson, Michael Nguyen and Jacqueline Urenda

In [59]:
#convert 
sample_indices = np.random.choice(X_train.shape[0], 5000, replace=False)
X_train_sample = X_train.iloc[sample_indices, :]
y_train_sample = y_train.iloc[sample_indices]

In [60]:
fit baseline model
baseline_rf = RandomForestRegressor(random_state=42)
baseline_rf.fit(X_train_sample, y_train_sample)

RandomForestRegressor(random_state=42)

In [61]:
#importance
importances = baseline_rf.feature_importances_

In [63]:
#sort by importance(10)
indices = np.argsort(importances)[-10:]
X_train_reduced = X_train.iloc[:, indices]

In [65]:
top_features = X_train.columns[indices]

# Select top features from the sample
X_train_sample_reduced = X_train_sample[top_features]

In [66]:
#set up hyperparameters
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
max_features = ['auto', 'sqrt']
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}

In [67]:
rf = RandomForestRegressor(random_state=42)

In [68]:
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

In [69]:
#fit random search
rf_random.fit(X_train_sample_reduced, y_train_sample)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [70]:
best_params = rf_random.best_params_
print("Best parameters: ", best_params)

Best parameters:  {'n_estimators': 400, 'max_features': 'auto', 'max_depth': 30}


In [72]:
rf_best = RandomForestRegressor(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], max_features=best_params['max_features'], random_state=42)
rf_best.fit(X_train_reduced, y_train)

RandomForestRegressor(max_depth=30, n_estimators=400, random_state=42)

In [73]:
# Reduce test set
X_test_reduced = X_test[top_features]

# Predict on test set
y_pred = rf_best.predict(X_test_reduced)

In [75]:
#mse
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse}")

Test MSE: 49695.08159226718


In [76]:
#SHAP and interpret
importances = rf_best.feature_importances_
feature_importance_df = pd.DataFrame({"Feature": top_features, "Importance": importances})
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)
print(feature_importance_df)

                  Feature  Importance
2              listing_id    0.401886
9          minimum_nights    0.332864
8  host_neighbourhood_cat    0.160420
5            accommodates    0.041656
7                bedrooms    0.018130
6                bathroom    0.014212
1     review_scores_value    0.011840
0       property_type_cat    0.010397
4       number_of_reviews    0.005509
3  review_scores_location    0.003087


In [78]:
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
n = len(y_test)  # Number of observations
p = X_test.shape[1]  # Number of predictors
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

In [79]:
metrics = pd.DataFrame({"Metric": ["MSE", "RMSE", "R2", "Adjusted R2", "MAPE"],
                        "Value": [mse, rmse, r2, adjusted_r2, mape]})

In [84]:
print(tabulate(metrics))

-  -----------  ------------
0  MSE          49695.1
1  RMSE           222.924
2  R2               0.971798
3  Adjusted R2      0.971798
4  MAPE            11.7113
-  -----------  ------------
