First, import any dependencies

In [10]:
import pandas as pd
import helperFunctions
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import math

Next, grab the data we're working with. We'll be using both the Boston and Seattle listing data that Airbnb provided.

In [11]:
# Read the raw Airbnb listings export for each city (paths are relative to
# the notebook's directory).
boston_listings_csv = '../data/boston/listings.csv'
seattle_listings_csv = '../data/seattle/listings.csv'
dfBL = pd.read_csv(boston_listings_csv)
dfSL = pd.read_csv(seattle_listings_csv)

We want to compare and contrast our models, so we want to run them on similar parameters. The Boston data actually has a few more columns than the Seattle data, so we'll remove them. Seattle doesn't have any columns that Boston doesn't.

In [12]:
# Compare the two schemas and keep only the columns both cities share.
BL, SL = set(dfBL.columns), set(dfSL.columns)
boston_only_cols = BL - SL
seattle_only_cols = SL - BL
print(f"Boston only cols: {boston_only_cols}")
print(f"Seattle only cols: {seattle_only_cols}")
# Only Boston is trimmed here; the Seattle-only set is just reported.
dfBL.drop(columns=boston_only_cols, inplace=True)

Boston only cols: {'access', 'interaction', 'house_rules'}
Seattle only cols: set()


We also want to focus our analysis on short-term rentals, so we'll remove any listings that require a minimum stay of more than a week.

In [13]:
# Drop, in place, every listing whose minimum stay exceeds seven nights.
for city_listings in (dfBL, dfSL):
    long_stay_rows = city_listings.index[city_listings['minimum_nights'] > 7]
    city_listings.drop(index=long_stay_rows, inplace=True)

These columns are either meaningless (e.g. ids and URLs that are always present), about the dataset itself (scrape metadata), can't be trusted to be accurate (latitude/longitude), redundant with other data (a significant number of the location columns), beyond the scope of what we're trying to do here (anything that would require image processing or NLP), or are subtly just the price itself (the weekly and monthly prices). We want to predict the price from the house, not from the price!

In [14]:
# Columns removed before modelling: identifiers and URLs, scrape metadata,
# location fields not used here, free text that would need NLP, and the
# weekly/monthly prices (which would leak the target).
# Each name appears exactly once ('calendar_updated' used to be listed twice).
dropList = [
    # identifiers, URLs and scrape metadata
    'id', 'last_scraped', 'listing_url', 'scrape_id', 'thumbnail_url',
    'medium_url', 'host_id', 'host_url', 'host_name',
    # location
    'city', 'state', 'market', 'country', 'latitude', 'longitude',
    'calendar_last_scraped', 'is_location_exact', 'smart_location',
    'requires_license', 'license', 'picture_url', 'square_feet',
    # alternative price columns
    'monthly_price', 'weekly_price',
    'jurisdiction_names', 'neighbourhood_group_cleansed',
    'experiences_offered', 'calendar_updated', 'country_code',
    'host_thumbnail_url', 'host_picture_url', 'has_availability',
    # free text and remaining location descriptors
    'host_location', 'description', 'name', 'street', 'neighbourhood',
    'host_neighbourhood', 'neighbourhood_cleansed', 'host_listings_count',
]

These columns are categorical: each one takes its values from some enumeration, so we'll convert them to dummy variables.

In [15]:
# Columns that createLM one-hot encodes with pd.get_dummies.
dummyCols = [
    'cancellation_policy',
    'bed_type',
    'room_type',
    'property_type',
    'host_response_time',
    'zipcode',
]
# Columns that createLM expands via helperFunctions.complex_category_to_dummy
# (presumably each cell holds a list of values -- confirm against the helper).
listedEnumCols = [
    'amenities',
    'host_verifications',
]

Various cleaning steps. Some cells lack data that is implicitly zero (if there's no cost for having extra people, that cost is $0.00). For some columns we want to remove the rows whose data is incomplete, for some we only care whether there is data or not, and for some we need to convert the values to booleans.

In [16]:
# Replaced in createLM by a "<col>_null" flag recording whether the cell
# was empty; the original column is then dropped.
isNullCols = [
    'xl_picture_url', 'transit', 'host_about', 'notes',
    'neighborhood_overview', 'space', 'summary',
]

# Missing values in these columns are filled with 0 in createLM.
immuteZero = [
    'security_deposit', 'cleaning_fee', 'extra_people', 'reviews_per_month',
]

# NOTE: despite the name, createLM does not impute a mean here -- it drops
# rows missing any of these values and then drops the columns themselves.
immuteMean = [
    'first_review_day', 'first_review_month', 'first_review_year',
    'last_review_day', 'last_review_month', 'last_review_year',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    'host_acceptance_rate', 'host_response_rate',
]

# Rows missing any of these values are dropped in createLM.
removeNans = [
    'bathrooms', 'bedrooms', 'beds',
    'host_identity_verified', 'host_has_profile_pic',
    'host_total_listings_count', 'host_is_superhost',
    'host_since_month', 'host_since_day', 'host_since_year',
]

# Passed one at a time to helperFunctions.convert_column_to_bool in createLM.
boolCols = [
    'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
    'instant_bookable', 'require_guest_profile_picture',
    'require_guest_phone_verification',
]

The following function cleans the data and returns a linear model, the training data, test data, and r2 scores.

In [19]:
def createLM(df_in):
    """Clean one city's listings and fit a linear price model.

    The input frame is deep-copied first, so the caller's data is not
    modified. Cleaning is driven by the notebook-level column lists
    (dropList, immuteZero, immuteMean, removeNans, boolCols, dummyCols,
    listedEnumCols, isNullCols) and by the project-local helperFunctions
    module.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Listings table. Must contain 'price' and the columns the lists
        above refer to (the *_day/_month/_year columns are presumably
        created by helperFunctions.convert_date_to_numeric -- confirm).

    Returns
    -------
    tuple
        (r2_scores_test, r2_scores_train, lm_model, X_train, X_test,
        y_train, y_test, x_reduce), in the order produced by
        helperFunctions.find_optimal_lm_mod.
    """
    df = df_in.copy(deep=True)
    df.drop(dropList, axis=1, inplace=True)
    helperFunctions.convert_date_to_numeric(df)
    helperFunctions.convert_dollars_to_numeric(df)

    # Assign the filled column back instead of calling
    # df[col].fillna(0, inplace=True): the chained in-place form acts on a
    # temporary under pandas Copy-on-Write and would leave df unchanged.
    for col in immuteZero:
        df[col] = df[col].fillna(0)

    # NOTE(review): despite the list's name nothing is mean-imputed; rows
    # missing any of these values are dropped, then the columns themselves
    # are removed. Kept as-is to preserve the published results.
    df.dropna(subset=immuteMean, inplace=True)
    df.drop(immuteMean, axis=1, inplace=True)

    df.dropna(subset=removeNans, inplace=True)

    for col in boolCols:
        helperFunctions.convert_column_to_bool(df, col)

    # Create dummy columns
    for col in dummyCols:
        df = pd.get_dummies(df, columns=[col], drop_first=True, dummy_na=True)
    for cat in listedEnumCols:
        df = helperFunctions.complex_category_to_dummy(df, cat)
        df.drop(cat, axis=1, inplace=True)

    # Keep only a "was this field empty?" flag for these columns.
    for col in isNullCols:
        df[f"{col}_null"] = df[col].isnull()
        df.drop([col], axis=1, inplace=True)

    y = df['price']
    X = df.drop(['price'], axis=1)

    # Drop columns that hold a single distinct value (NaN counts as a value).
    constant_cols = [col for col in X.columns if len(X[col].unique()) < 2]
    X.drop(constant_cols, axis=1, inplace=True)

    # Cutoffs handed to the helper: 1%, 0.5%, 0.1% and 0% of the row count.
    n_rows = y.count()
    cutoffs = [math.floor(n_rows * frac) for frac in (.01, .005, .001, .0)]

    (r2_scores_test, r2_scores_train, lm_model,
     X_train, X_test, y_train, y_test, x_reduce) = helperFunctions.find_optimal_lm_mod(X, y, cutoffs)
    return (r2_scores_test, r2_scores_train, lm_model,
            X_train, X_test, y_train, y_test, x_reduce)

In [20]:
# Fit one model per city; each call returns the same 8-tuple.
(
    r2_scores_test_bl, r2_scores_train_bl, lm_model_bl,
    X_train_bl, X_test_bl, y_train_bl, y_test_bl, reduce_X_bl,
) = createLM(dfBL)
(
    r2_scores_test_sl, r2_scores_train_sl, lm_model_sl,
    X_train_sl, X_test_sl, y_train_sl, y_test_sl, reduce_X_sl,
) = createLM(dfSL)

Let's take a look at how well our models did:

In [21]:
# One line per city: test-set R2 scores first, then training-set scores.
print(
    f"Boston R2 test vs train: {r2_scores_test_bl}, {r2_scores_train_bl}",
    f"Seattle R2 test vs train: {r2_scores_test_sl}, {r2_scores_train_sl}",
    sep="\n",
)

Boston R2 test vs train: [0.6350014277928068, 0.634810778797647, 0.6342499488163085, 0.6297554364519994], [0.7136322436336984, 0.7208443641744989, 0.7235681432923979, 0.7285550593194414]
Seattle R2 test vs train: [0.6248170484947038, 0.6240394411679842, 0.6193711285610737, 0.6173039668513339], [0.675060810604111, 0.676146867858381, 0.6794988047083405, 0.6964800603661554]


Finally, we can inspect our models. We're interested in what the biggest impact factors for both cities are, so we can compare and contrast. We'll also look at which factor had the biggest positive impact.

It's worth noting that our model for Seattle appears to be taking the location and using that to negatively impact the price. This doesn't mean that every location is bad for your price - just relative to the "base location", which happened to be the column removed from the set when creating the dummy variables. It does mean that location won't appear as our "most positive" impact coefficient. This is a good example of why looking at the absolute coefficient, and just determining the magnitude of impact a feature has on the variance of the model, can be more valuable than looking at the raw coefficient.

In [22]:
# Build the coefficient table for the Boston model from the fitted model and
# the reduced feature matrix, then display its first 30 rows.
# NOTE(review): the rows shown below are ordered by descending abs_coefs,
# so coef_weights presumably sorts by that column -- confirm in the helper.
df_coef_bl = helperFunctions.coef_weights(lm_model_bl, reduce_X_bl)
df_coef_bl.head(30)

Unnamed: 0,est_int,coefs,abs_coefs
34,room_type_Shared room,-188553.445708,188553.445708
52,zipcode_02122,-164524.915152,164524.915152
60,zipcode_02132,-154167.354052,154167.354052
33,room_type_Private room,-141605.226098,141605.226098
59,zipcode_02131,-135895.655112,135895.655112
44,zipcode_02111,102792.595289,102792.595289
53,zipcode_02124,-100093.057772,100093.057772
54,zipcode_02125,-99999.413783,99999.413783
63,zipcode_02210,99639.689003,99639.689003
12,bedrooms,98577.559673,98577.559673


In [23]:
# Build the coefficient table for the Seattle model from the fitted model and
# the reduced feature matrix, then display its first 30 rows.
# NOTE(review): the rows shown below are ordered by descending abs_coefs,
# so coef_weights presumably sorts by that column -- confirm in the helper.
df_coef_sl = helperFunctions.coef_weights(lm_model_sl, reduce_X_sl)
df_coef_sl.head(30)

Unnamed: 0,est_int,coefs,abs_coefs
5,host_has_profile_pic,-256854.907655,256854.907655
33,room_type_Shared room,-161143.360501,161143.360501
59,zipcode_98133,-142802.988973,142802.988973
53,zipcode_98118,-112963.900357,112963.900357
45,zipcode_98106,-111371.97765,111371.97765
47,zipcode_98108,-104150.380289,104150.380289
105,'email',-98250.025703,98250.025703
58,zipcode_98126,-89400.310783,89400.310783
44,zipcode_98105,-86093.464105,86093.464105
57,zipcode_98125,-79812.24634,79812.24634


In [24]:
# Seattle row(s) carrying the largest (most positive) raw coefficient.
largest_coef_sl = df_coef_sl['coefs'].max()
df_coef_sl.loc[df_coef_sl['coefs'] == largest_coef_sl]

Unnamed: 0,est_int,coefs,abs_coefs
71,"""Wireless Internet""",72922.599878,72922.599878


In [25]:
# Boston row(s) carrying the largest (most positive) raw coefficient.
largest_coef_bl = df_coef_bl['coefs'].max()
df_coef_bl.loc[df_coef_bl['coefs'] == largest_coef_bl]

Unnamed: 0,est_int,coefs,abs_coefs
44,zipcode_02111,102792.595289,102792.595289


In [26]:
# Variance of the 'bedrooms' column in the reduced Boston feature matrix
# (presumably checked to put the large 'bedrooms' coefficient in context).
reduce_X_bl['bedrooms'].var()

0.6146425140661573