# Data Modelling

In [1]:
# importing standard librbaries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
%matplotlib inline

In [2]:
# importing pre-processed data - look in Data Preparation - Seatle AirBnB for detailed steps

clean_listings_df = pd.read_csv('./listings_clean.csv')

In [3]:
clean_listings_df.columns.to_list()

['latitude',
 'longitude',
 'bathrooms',
 'beds',
 'price',
 'cleaning_fee',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'availability_30',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_location',
 'calculated_host_listings_count',
 'days_with_abnb',
 'neighbourhood_cleansed_dAlki',
 'neighbourhood_cleansed_dArbor Heights',
 'neighbourhood_cleansed_dAtlantic',
 'neighbourhood_cleansed_dBelltown',
 'neighbourhood_cleansed_dBitter Lake',
 'neighbourhood_cleansed_dBriarcliff',
 'neighbourhood_cleansed_dBrighton',
 'neighbourhood_cleansed_dBroadview',
 'neighbourhood_cleansed_dBroadway',
 'neighbourhood_cleansed_dBryant',
 'neighbourhood_cleansed_dCedar Park',
 'neighbourhood_cleansed_dCentral Business District',
 'neighbourhood_cleansed_dColumbia City',
 'neighbourhood_cleansed_dCrown Hill',
 'neighbourhood_cleansed_dDunlap',
 'neighbourhood_cleansed_dEast Queen Anne',
 'neighbourhood_cleansed_dEastlake',
 'neighbourhood_cleansed_dFairmount Park',
 'neighbourh

## Linear Regression

In [4]:
#Split into explanatory and response variables

numerical_vars = ['latitude','longitude','bathrooms','beds',
                  'cleaning_fee','extra_people','minimum_nights','maximum_nights',
                 'availability_30','number_of_reviews','review_scores_rating','review_scores_location',
                 'calculated_host_listings_count','days_with_abnb']

#X = clean_listings_df.drop(['price'], axis = 1)
X = clean_listings_df[numerical_vars].copy()
y = clean_listings_df['price'].copy()

#Split into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 

In [12]:
X

Unnamed: 0,latitude,longitude,bathrooms,beds,cleaning_fee,extra_people,minimum_nights,maximum_nights,availability_30,number_of_reviews,review_scores_rating,review_scores_location,calculated_host_listings_count,days_with_abnb
0,47.636289,-122.371025,1.0,1.0,0.0,5.0,1,365,14,207,95.000000,9.000000,2,1607.0
1,47.639123,-122.365666,1.0,1.0,40.0,0.0,2,90,13,43,96.000000,10.000000,6,1047.0
2,47.629724,-122.369483,4.5,7.0,300.0,25.0,4,30,1,20,97.000000,10.000000,2,571.0
3,47.638473,-122.369279,1.0,2.0,0.0,0.0,1,1125,0,0,94.539262,9.608916,1,789.0
4,47.632918,-122.372471,2.0,3.0,125.0,15.0,1,1125,30,38,92.000000,9.000000,1,1497.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,47.664295,-122.359170,2.0,3.0,230.0,0.0,3,1125,18,1,80.000000,10.000000,8,266.0
3814,47.649552,-122.318309,1.0,2.0,50.0,25.0,2,29,6,2,100.000000,10.000000,1,82.0
3815,47.508453,-122.240607,1.0,1.0,35.0,20.0,1,7,29,0,94.539262,9.608916,1,5.0
3816,47.632335,-122.275530,1.0,1.0,45.0,0.0,3,1125,30,0,94.539262,9.608916,1,366.0


In [5]:
# Testing on Linear regression

lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit

LinearRegression(normalize=True)

In [6]:
#Predict and score the model
y_test_preds = lm_model.predict(X_test)

"The r-squared score for your model was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

'The r-squared score for your model was 0.49432322986086963 on 1146 values.'

## Logistic Regression

In [8]:
# first we will attempt with the same predictors as we used for linear regression
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression() # Instantiate
log_model.fit(X_train, y_train) #Fit

predictions = log_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## SVM

In [10]:
from sklearn.svm import SVC 

svc_model = SVC()

svc_model.fit(X_train, y_train)

predictions = svc_model.predict(X_test)

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

#it may be a good idea to normalise the data before running it through SVM

print (confusion_matrix(y_test, predictions))

print('\n')

print(classification_report(y_test, predictions))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


              precision    recall  f1-score   support

        26.0       0.00      0.00      0.00         1
        28.0       0.00      0.00      0.00         2
        29.0       0.00      0.00      0.00         4
        30.0       0.00      0.00      0.00         2
        31.0       0.00      0.00      0.00         1
        34.0       0.00      0.00      0.00         1
        35.0       0.00      0.00      0.00        11
        36.0       0.00      0.00      0.00         1
        37.0       0.00      0.00      0.00         1
        38.0       0.00      0.00      0.00         2
        39.0       0.00      0.00      0.00         4
        40.0       0.00      0.00      0.00         9
        41.0       0.00      0.00      0.00         3
        42.0       0.00      0.00      0.00         7
        45.0       0.00      0.00      0.00         9
        46.0 

  _warn_prf(average, modifier, msg_start, len(result))
