# Data Modelling

In [33]:
# importing standard librbaries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
%matplotlib inline

In [34]:
# importing pre-processed data - look in 'Data Preparation - Seatle AirBnB' for detailed steps

clean_listings_df = pd.read_csv('./listings_clean.csv')

## Feature Scaling 

In [35]:
# At this step we will normalise and standartise our data by doing it within sisngle pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
    ('norm_scaler', MinMaxScaler()),   
])

In [36]:
# We receive our output in array
clean_listings_tr = num_pipeline.fit_transform(clean_listings_df)

# Building new df with tranformed data

clean_listings_tr = pd.DataFrame(clean_listings_tr, columns = clean_listings_df.columns,
                                  index = clean_listings_df.index) 

## Train / test split

In [37]:
#Split into explanatory and response variables

X = clean_listings_df.drop(['price_x'], axis = 1)
y = clean_listings_df['price_x'].copy()

#Split into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 

## Linear Regression

In [38]:
# Testing on Linear regression

lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit

LinearRegression(normalize=True)

In [39]:
#Predict and score the model
y_test_preds = lm_model.predict(X_test)

"The r-squared score for your model was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

'The r-squared score for your model was 0.5926181831561046 on 280363 values.'

Low R-squared score for Linear Regression indicates the fact that model is underfitting. This is likely to happen due to the inability of linear regression to find more complex non-linear connection between attributes. 

## Random Forest

In [40]:
# Let's use Random Forest to find more comlex non-linear relationships in data

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [41]:
# Grid search feature will help us to find the right combination of max_features and n_estimators for our model

param_grid = [
    {'n_estimators':[3,10,30],'max_features':[2,4,6,8]},
    {'bootstrap':[False],'n_estimators':[3,10], 'max_features':[2,3,4]},
    ]

forest_reg = RandomForestRegressor()

# We also specify number of cross validations

grid_search = GridSearchCV(forest_reg, param_grid, cv = 5,
                           scoring = 'neg_mean_squared_error',
                           return_train_score = True)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [42]:
# if we continue to search with higher, the score may improve 

# You can also get the best estimator directly 

grid_search.best_estimator_

# If GridSearchCV is initialized with refit=True (which is the default), 
# then once it finds the best estimator using cross- validation, it retrains 
# it on the whole training set.

# Evaluations scores are also available
cvres = grid_search.cv_results_

for mean_score, params in zip (cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

14.80742708442583 {'max_features': 2, 'n_estimators': 3}
14.665805582906884 {'max_features': 2, 'n_estimators': 10}
14.613858613892374 {'max_features': 2, 'n_estimators': 30}
14.810062287829982 {'max_features': 4, 'n_estimators': 3}
14.666078935916905 {'max_features': 4, 'n_estimators': 10}
14.61007382638659 {'max_features': 4, 'n_estimators': 30}
14.790648218434555 {'max_features': 6, 'n_estimators': 3}
14.662041825042882 {'max_features': 6, 'n_estimators': 10}
14.615406630169824 {'max_features': 6, 'n_estimators': 30}
14.798437482208067 {'max_features': 8, 'n_estimators': 3}
14.641115818377665 {'max_features': 8, 'n_estimators': 10}
14.614589346257208 {'max_features': 8, 'n_estimators': 30}
14.623864852835249 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
14.608476539061162 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
14.620798103748253 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
14.60672985081835 {'bootstrap': False, 'max_features': 3, 'n_es

In [43]:
#Predict and score the model
y_test_preds = grid_search.predict(X_test)

"The r-squared score for your model was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

'The r-squared score for your model was 0.9804751416020936 on 280363 values.'

In [44]:
# So far this is the best model we could produce 
grid_search.best_estimator_

#bootstrap=False, max_features=4, n_estimators=10

RandomForestRegressor(bootstrap=False, max_features=4, n_estimators=10)

### Feature Importances

In [47]:
# Let's analyse importance of predicting attributes

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances


attributes = X_train.columns

sorted(zip(feature_importances,attributes), reverse = True)

[(0.17803787847413619, 'beds'),
 (0.1561785751766652, 'bathrooms'),
 (0.1407561800352261, 'cleaning_fee'),
 (0.058305333258753665, 'room_type_Imputed_dPrivate room'),
 (0.04739203449245012, 'month'),
 (0.04190388206405753, 'extra_people'),
 (0.040399538368748454, 'number_of_reviews'),
 (0.03358554898580498, 'review_scores_rating'),
 (0.03258392770250782, 'host_since_year'),
 (0.03077110956808467, 'availability_30'),
 (0.030580900547716467, 'minimum_nights'),
 (0.0220225469519129, 'maximum_nights'),
 (0.020248377924861068, 'review_scores_location'),
 (0.018321796325899368, 'neighbourhood_group_cleansed_Imputed_dDowntown'),
 (0.015259618163329686, 'room_type_Imputed_dShared room'),
 (0.011592652541617511, 'property_type_Imputed_dHouse'),
 (0.011048907621513268, 'host_is_superhost_Imputed_dt'),
 (0.010982550085170958, 'neighbourhood_group_cleansed_Imputed_dQueen Anne'),
 (0.010978990797787132, 'host_response_time_Imputed_dwithin a few hours'),
 (0.010901643275036375, 'host_response_time_I