# Data Modelling

In [75]:
# importing standard librbaries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
%matplotlib inline

In [76]:
# importing pre-processed data - look in Data Preparation - Seatle AirBnB for detailed steps

clean_listings_df = pd.read_csv('./listings_clean.csv')

In [77]:
clean_listings_df.columns.to_list()

['latitude',
 'longitude',
 'bathrooms',
 'beds',
 'price',
 'cleaning_fee',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'availability_30',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_location',
 'calculated_host_listings_count',
 'days_with_abnb',
 'neighbourhood_group_cleansed_dBeacon Hill',
 'neighbourhood_group_cleansed_dCapitol Hill',
 'neighbourhood_group_cleansed_dCascade',
 'neighbourhood_group_cleansed_dCentral Area',
 'neighbourhood_group_cleansed_dDelridge',
 'neighbourhood_group_cleansed_dDowntown',
 'neighbourhood_group_cleansed_dInterbay',
 'neighbourhood_group_cleansed_dLake City',
 'neighbourhood_group_cleansed_dMagnolia',
 'neighbourhood_group_cleansed_dNorthgate',
 'neighbourhood_group_cleansed_dOther neighborhoods',
 'neighbourhood_group_cleansed_dQueen Anne',
 'neighbourhood_group_cleansed_dRainier Valley',
 'neighbourhood_group_cleansed_dSeward Park',
 'neighbourhood_group_cleansed_dUniversity District',
 'neighbourhood_group_cleansed

## Feature Scaling 

In [78]:
# At this step we will normalise and standartise our data by doing it within sisngle pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
    ('norm_scaler', MinMaxScaler()),   
])

In [79]:
clean_listings_tr = num_pipeline.fit_transform(clean_listings_df)

# We receive our output in array
clean_listings_tr

array([[0.5747621 , 0.26155454, 0.125     , ..., 0.        , 0.        ,
        0.        ],
       [0.58717761, 0.29189643, 0.125     , ..., 0.        , 0.        ,
        0.        ],
       [0.54600265, 0.27028569, 0.5625    , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.0147397 , 1.        , 0.125     , ..., 0.        , 0.        ,
        0.        ],
       [0.55744009, 0.80225955, 0.125     , ..., 0.        , 0.        ,
        0.        ],
       [0.59621594, 0.42542029, 0.1875    , ..., 0.        , 0.        ,
        0.        ]])

In [80]:
# Building new df with tranformed data

clean_listings_tr = pd.DataFrame(clean_listings_tr, columns = clean_listings_df.columns,
                                  index = clean_listings_df.index) 

In [81]:
clean_listings_tr

Unnamed: 0,latitude,longitude,bathrooms,beds,price,cleaning_fee,extra_people,minimum_nights,maximum_nights,availability_30,...,property_type_Imputed_dChalet,property_type_Imputed_dCondominium,property_type_Imputed_dDorm,property_type_Imputed_dHouse,property_type_Imputed_dLoft,property_type_Imputed_dOther,property_type_Imputed_dTent,property_type_Imputed_dTownhouse,property_type_Imputed_dTreehouse,property_type_Imputed_dYurt
0,0.574762,0.261555,0.1250,0.000000,0.066327,0.000000,0.016667,0.000000,0.00364,0.466667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.587178,0.291896,0.1250,0.000000,0.132653,0.133333,0.000000,0.001001,0.00089,0.433333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.546003,0.270286,0.5625,0.428571,0.974490,1.000000,0.083333,0.003003,0.00029,0.033333,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.584330,0.271441,0.1250,0.071429,0.081633,0.000000,0.000000,0.000000,0.01124,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.559996,0.253370,0.2500,0.142857,0.438776,0.416667,0.050000,0.000000,0.01124,1.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,0.697449,0.328679,0.2500,0.142857,0.345918,0.766667,0.000000,0.002002,0.01124,0.600000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3814,0.632863,0.560043,0.1250,0.071429,0.060204,0.166667,0.083333,0.001001,0.00028,0.200000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3815,0.014740,1.000000,0.1250,0.000000,0.074490,0.116667,0.066667,0.000000,0.00006,0.966667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3816,0.557440,0.802260,0.1250,0.000000,0.080612,0.150000,0.000000,0.002002,0.01124,1.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Linear Regression

In [82]:
#Split into explanatory and response variables

numerical_vars = ['latitude','longitude','bathrooms','beds',
                  'cleaning_fee','extra_people','minimum_nights','maximum_nights',
                 'availability_30','number_of_reviews','review_scores_rating','review_scores_location',
                 'calculated_host_listings_count','days_with_abnb', 'room_type_dPrivate room',
                 'room_type_dShared room', 'bed_type_dCouch','bed_type_dFuton',
                  'bed_type_dPull-out Sofa','bed_type_dReal Bed', 'host_identity_verified_Imputed_dt',
 'property_type_Imputed_dBed & Breakfast',
 'property_type_Imputed_dBoat',
 'property_type_Imputed_dBungalow',
 'property_type_Imputed_dCabin',
 'property_type_Imputed_dCamper/RV',
 'property_type_Imputed_dChalet',
 'property_type_Imputed_dCondominium',
 'property_type_Imputed_dDorm',
 'property_type_Imputed_dHouse',
 'property_type_Imputed_dLoft',
 'property_type_Imputed_dOther',
 'property_type_Imputed_dTent',
 'property_type_Imputed_dTownhouse',
 'property_type_Imputed_dTreehouse',
 'property_type_Imputed_dYurt',
 'instant_bookable_dt',
 'require_guest_phone_verification_dt',
 'host_response_time_Imputed_dwithin a day',
 'host_response_time_Imputed_dwithin a few hours',
 'host_response_time_Imputed_dwithin an hour',
 'host_is_superhost_Imputed_dt',
 'neighbourhood_group_cleansed_dBeacon Hill',
 'neighbourhood_group_cleansed_dCapitol Hill',
 'neighbourhood_group_cleansed_dCascade',
 'neighbourhood_group_cleansed_dCentral Area',
 'neighbourhood_group_cleansed_dDelridge',
 'neighbourhood_group_cleansed_dDowntown',
 'neighbourhood_group_cleansed_dInterbay',
 'neighbourhood_group_cleansed_dLake City',
 'neighbourhood_group_cleansed_dMagnolia',
 'neighbourhood_group_cleansed_dNorthgate',
 'neighbourhood_group_cleansed_dOther neighborhoods',
 'neighbourhood_group_cleansed_dQueen Anne',
 'neighbourhood_group_cleansed_dRainier Valley',
 'neighbourhood_group_cleansed_dSeward Park',
 'neighbourhood_group_cleansed_dUniversity District',
 'neighbourhood_group_cleansed_dWest Seattle',]

X = clean_listings_df.drop(['price'], axis = 1)
#X = clean_listings_df[numerical_vars].copy()
y = clean_listings_df['price'].copy()

#Split into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 

In [83]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 58 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   latitude                                           3818 non-null   float64
 1   longitude                                          3818 non-null   float64
 2   bathrooms                                          3818 non-null   float64
 3   beds                                               3818 non-null   float64
 4   cleaning_fee                                       3818 non-null   float64
 5   extra_people                                       3818 non-null   float64
 6   minimum_nights                                     3818 non-null   int64  
 7   maximum_nights                                     3818 non-null   int64  
 8   availability_30                                    3818 non-null   int64  
 9   number_o

In [85]:
# Testing on Linear regression

lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit

LinearRegression(normalize=True)

In [86]:
#Predict and score the model
y_test_preds = lm_model.predict(X_test)

"The r-squared score for your model was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

'The r-squared score for your model was 0.5642126824593363 on 1146 values.'

## Random Forest

In [89]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [91]:
param_grid = [
    {'n_estimators':[3,10,30],'max_features':[2,4,6,8]},
    {'bootstrap':[False],'n_estimators':[3,10], 'max_features':[2,3,4]},
    ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv = 5,
                           scoring = 'neg_mean_squared_error',
                           return_train_score = True)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [92]:
# if we continue to search with higher, the score may improve 

# You can also get the best estimator directly 

grid_search.best_estimator_

# If GridSearchCV is initialized with refit=True (which is the default), 
# then once it finds the best estimator using cross- validation, it retrains 
# it on the whole training set.

# Evaluations scores are also available
cvres = grid_search.cv_results_

for mean_score, params in zip (cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

75.91702852042765 {'max_features': 2, 'n_estimators': 3}
66.52902460385056 {'max_features': 2, 'n_estimators': 10}
64.05073079640788 {'max_features': 2, 'n_estimators': 30}
77.69284457160252 {'max_features': 4, 'n_estimators': 3}
64.47938206511037 {'max_features': 4, 'n_estimators': 10}
63.21501597118757 {'max_features': 4, 'n_estimators': 30}
72.41860443653307 {'max_features': 6, 'n_estimators': 3}
65.16605662578161 {'max_features': 6, 'n_estimators': 10}
61.62929537708438 {'max_features': 6, 'n_estimators': 30}
69.67923600725037 {'max_features': 8, 'n_estimators': 3}
63.79203677681307 {'max_features': 8, 'n_estimators': 10}
61.02915636337593 {'max_features': 8, 'n_estimators': 30}
71.50493080680538 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
65.60186259728438 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
71.82965234888242 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
65.46217567445909 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10

In [93]:
#Predict and score the model
y_test_preds = grid_search.predict(X_test)

"The r-squared score for your model was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

'The r-squared score for your model was 0.6040592904927777 on 1146 values.'

In [94]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30)

## Logistic Regression

In [20]:
# first we will attempt with the same predictors as we used for linear regression
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression() # Instantiate
log_model.fit(X_train, y_train) #Fit

predictions = log_model.predict(X_test)

ValueError: Unknown label type: 'continuous'

## SVM

In [21]:
from sklearn.svm import SVC 

svc_model = SVC()

svc_model.fit(X_train, y_train)

predictions = svc_model.predict(X_test)

ValueError: Unknown label type: 'continuous'

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

#it may be a good idea to normalise the data before running it through SVM

print (confusion_matrix(y_test, predictions))

print('\n')

print(classification_report(y_test, predictions))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


              precision    recall  f1-score   support

        26.0       0.00      0.00      0.00         1
        28.0       0.00      0.00      0.00         2
        29.0       0.00      0.00      0.00         4
        30.0       0.00      0.00      0.00         2
        31.0       0.00      0.00      0.00         1
        34.0       0.00      0.00      0.00         1
        35.0       0.00      0.00      0.00        11
        36.0       0.00      0.00      0.00         1
        37.0       0.00      0.00      0.00         1
        38.0       0.00      0.00      0.00         2
        39.0       0.00      0.00      0.00         4
        40.0       0.00      0.00      0.00         9
        41.0       0.00      0.00      0.00         3
        42.0       0.00      0.00      0.00         7
        45.0       0.00      0.00      0.00         9
        46.0 

  _warn_prf(average, modifier, msg_start, len(result))
