First, import any dependencies

In [163]:
import pandas as pd
import helperFunctions
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import normalize
import math

Next, grab the data we're working with. We'll be using both the Boston and Seattle listing data that Airbnb provided.

In [153]:
# Read the raw Airbnb listings export for each city.
boston_listings_csv = '../data/boston/listings.csv'
seattle_listings_csv = '../data/seattle/listings.csv'
dfBL = pd.read_csv(boston_listings_csv)
dfSL = pd.read_csv(seattle_listings_csv)

We want to compare and contrast our models, so we want to run them on similar parameters. The Boston data actually has a few more columns than the Seattle data, so we'll remove them. Seattle doesn't have any columns that Boston doesn't.

In [154]:
# Column names present in each city's frame.
BL, SL = set(dfBL.columns), set(dfSL.columns)
boston_only_cols = BL - SL
seattle_only_cols = SL - BL
print(f"Boston only cols: {boston_only_cols}")
print(f"Seattle only cols: {seattle_only_cols}")
# Drop the Boston-only columns so both frames share the same schema.
dfBL.drop(columns=list(boston_only_cols), inplace=True)

Boston only cols: {'access', 'interaction', 'house_rules'}
Seattle only cols: set()


We also want to focus our analysis on short term rentals, so we'll remove any rentals that require more than a week of renting.

In [155]:
# Remove, in place, every listing whose minimum stay exceeds 7 nights.
for listings in (dfBL, dfSL):
    exceeds_week = (listings['minimum_nights'] > 7).to_numpy()
    listings.drop(index=listings.index[exceeds_week], inplace=True)

These columns are either meaningless (e.g., id, URLs that are always present), about the dataset itself (scraping metadata, URLs), can't be trusted to be accurate (latitude/longitude), redundant with other data (a significant number of the location columns), beyond the scope of what we're trying to do here (anything that would require image processing or NLP), or are subtly just the price itself. We want to predict the price from the house, not from the price!

In [156]:
# Columns removed before modelling. Each name appears exactly once
# (the original list repeated 'calendar_updated').
drop_list = [
    # identifiers, scrape metadata and URLs
    'id', 'last_scraped', 'listing_url', 'scrape_id', 'thumbnail_url',
    'medium_url', 'host_id', 'host_url', 'host_name',
    # location fields
    'city', 'state', 'market', 'country', 'latitude', 'longitude',
    'calendar_last_scraped', 'is_location_exact', 'smart_location',
    'requires_license', 'license', 'picture_url', 'square_feet',
    # alternative encodings of the price itself
    'monthly_price', 'weekly_price',
    'jurisdiction_names', 'neighbourhood_group_cleansed',
    'experiences_offered', 'calendar_updated', 'country_code',
    'host_thumbnail_url', 'host_picture_url', 'has_availability',
    # free text and remaining location/host fields
    'host_location', 'description', 'name', 'street', 'neighbourhood',
    'host_neighbourhood', 'neighbourhood_cleansed', 'host_listings_count',
    'host_has_profile_pic',
]

These columns have some sort of enumeration to them

In [157]:
# Single-valued categorical columns, expanded with pd.get_dummies in createLM.
dummy_cols = [
    'cancellation_policy',
    'bed_type',
    'room_type',
    'property_type',
    'host_response_time',
    'zipcode',
]
# Columns expanded via helperFunctions.complex_category_to_dummy in createLM.
listed_enum_cols = [
    'amenities',
    'host_verifications',
]

Various cleaning steps. Some cells lack data that is implicitly zero (if there's no cost for having extra people, that cost is $0.00). For some columns we want to remove rows whose data is incomplete, for some we only care whether data is present or not, and some need to be converted to booleans.

In [158]:
# Columns replaced by a "<col>_null" flag recording whether the value is missing.
is_null_cols = [
    'xl_picture_url', 'transit', 'host_about', 'notes',
    'neighborhood_overview', 'space', 'summary',
]

# Columns where a missing value is filled with 0.
immute_zero_cols = [
    'security_deposit', 'cleaning_fee', 'extra_people', 'reviews_per_month',
]

# Rows missing any of these columns are dropped.
remove_nan_cols = [
    'bathrooms', 'bedrooms', 'beds',
    'host_identity_verified', 'host_total_listings_count', 'host_is_superhost',
    'host_since_month', 'host_since_day', 'host_since_year',
    'first_review_day', 'first_review_month', 'first_review_year',
    'last_review_day', 'last_review_month', 'last_review_year',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    'host_acceptance_rate', 'host_response_rate',
]

# Columns passed to helperFunctions.convert_column_to_bool.
bool_cols = [
    'host_is_superhost', 'host_identity_verified', 'instant_bookable',
    'require_guest_profile_picture', 'require_guest_phone_verification',
]

The following function cleans the data and returns a linear model, the training data, test data, and r2 scores.

In [172]:
def createLM(df_in, test_size=.3, random_state=42, min_sum_fraction=.005):
    """Clean a listings frame and fit a linear regression that predicts ``price``.

    The input frame is deep-copied, so ``df_in`` itself is not modified.
    Cleaning is driven by the module-level column lists (``drop_list``,
    ``immute_zero_cols``, ``remove_nan_cols``, ``bool_cols``, ``dummy_cols``,
    ``listed_enum_cols``, ``is_null_cols``) and the ``helperFunctions`` module.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Raw listings data containing a ``price`` column and the columns named
        in the module-level lists.
    test_size : float, default .3
        Passed to ``train_test_split``.
    random_state : int, default 42
        Passed to ``train_test_split``.
    min_sum_fraction : float, default .005
        A feature column is kept only if its column sum is greater than
        ``floor(row_count * min_sum_fraction)``.

    Returns
    -------
    tuple
        ``(r2_test, r2_train, lm_model, X_train, X_test, y_train, y_test,
        X_reduced)`` where ``X_train``/``X_test`` come from the normalized
        feature matrix and ``X_reduced`` is the un-normalized feature frame
        (useful for recovering column names).
    """
    df = df_in.copy(deep=True)
    df.drop(drop_list, axis=1, inplace=True)

    # The return values of these helpers are unused, so they presumably
    # convert columns of ``df`` in place - verify against helperFunctions.
    helperFunctions.convert_date_to_numeric(df)
    helperFunctions.convert_dollars_to_numeric(df)
    helperFunctions.convert_percent_to_numeric(df)

    # Assign the filled column back instead of ``df[col].fillna(0, inplace=True)``:
    # that chained in-place form warns on pandas 2.x and leaves ``df`` unchanged
    # once Copy-on-Write is enabled.
    for col in immute_zero_cols:
        df[col] = df[col].fillna(0)

    # Drop every row that is missing any of the required columns.
    df.dropna(subset=remove_nan_cols, inplace=True)

    for col in bool_cols:
        helperFunctions.convert_column_to_bool(df, col)

    # One-hot encode the simple categoricals. The first level of each is
    # dropped and missing values get their own indicator column.
    df = pd.get_dummies(df, columns=dummy_cols, drop_first=True, dummy_na=True)

    # Expand the list-valued columns, then discard the original column.
    for cat in listed_enum_cols:
        df = helperFunctions.complex_category_to_dummy(df, cat)
        df.drop(cat, axis=1, inplace=True)

    # Replace each of these columns with a flag saying whether it was missing.
    for col in is_null_cols:
        df[f"{col}_null"] = df[col].isnull()
        df.drop([col], axis=1, inplace=True)

    y = df['price']
    X = df.drop(['price'], axis=1)

    # Remove columns that have no variance. These don't contribute to the
    # model and can lead to some weird behavior. NaN counts as a value,
    # matching ``len(X[col].unique())``.
    has_variance = (X.nunique(dropna=False) > 1).to_numpy()
    X = X.loc[:, has_variance]

    # From experimentation: keep only columns whose sum exceeds a small
    # fraction of the row count. Columns are selected by label so the result
    # stays correct even if ``sum()`` returns fewer entries than X has columns.
    min_sum = math.floor(y.count() * min_sum_fraction)
    col_sums = X.sum()
    X_reduced = X.loc[:, col_sums.index[col_sums > min_sum]]

    # NOTE(review): sklearn.preprocessing.normalize defaults to axis=1, i.e.
    # it rescales each ROW (listing) to unit norm rather than scaling each
    # feature column. That is probably why the fitted coefficients are so
    # large - confirm this is intended before interpreting their magnitudes.
    X_reduced_norm = normalize(X_reduced)
    X_train, X_test, y_train, y_test = train_test_split(
        X_reduced_norm, y, test_size=test_size, random_state=random_state)

    # Fit the model and score it on both splits.
    lm_model = LinearRegression()
    lm_model.fit(X_train, y_train)
    r2_scores_test = r2_score(y_test, lm_model.predict(X_test))
    r2_scores_train = r2_score(y_train, lm_model.predict(X_train))
    return (r2_scores_test, r2_scores_train, lm_model,
            X_train, X_test, y_train, y_test, X_reduced)

In [173]:
# Fit one model per city with the default split settings.
(
    r2_scores_test_bl, r2_scores_train_bl, lm_model_bl,
    X_train_bl, X_test_bl, y_train_bl, y_test_bl,
    reduce_X_bl,
) = createLM(dfBL)
(
    r2_scores_test_sl, r2_scores_train_sl, lm_model_sl,
    X_train_sl, X_test_sl, y_train_sl, y_test_sl,
    reduce_X_sl,
) = createLM(dfSL)

Let's take a look at how well our models did:

In [174]:
# Report test-set and training-set R2 for each city.
boston_r2_line = f"Boston R2 test vs train: {r2_scores_test_bl}, {r2_scores_train_bl}"
seattle_r2_line = f"Seattle R2 test vs train: {r2_scores_test_sl}, {r2_scores_train_sl}"
print(boston_r2_line)
print(seattle_r2_line)

Boston R2 test vs train: 0.635982362771986, 0.7312698042475881
Seattle R2 test vs train: 0.6270211145583205, 0.6844648691666639


Finally, we can inspect our models. We're interested in what the biggest impact factors for both cities are, so we can compare and contrast. We'll also look at which factor had the biggest positive impact.

It's worth noting that our model for Seattle appears to be taking the location and using that to negatively impact the price. This doesn't mean that every location is bad for your price - just relative to the "base location", which happened to be the column removed from the set when creating the dummy variables. It does mean it won't appear in our "most positive" impact coefficient. This is a good example of why looking at the absolute coefficient, and just determining the magnitude of impact a feature has on the variance of the model, can be more valuable than looking at the raw coefficient.

In [175]:
# Coefficient table for the Boston model; display its first 30 rows.
df_coef_bl = helperFunctions.coef_weights(lm_model_bl, reduce_X_bl)
df_coef_bl.head(n=30)

Unnamed: 0,est_int,coefs,abs_coefs
76,zipcode_02132,-404140.370819,404140.370819
70,zipcode_02126,-396003.362253,396003.362253
67,zipcode_02122,-393785.407382,393785.407382
75,zipcode_02131,-370350.231683,370350.231683
79,zipcode_02136,-329700.59165,329700.59165
68,zipcode_02124,-305468.748043,305468.748043
69,zipcode_02125,-287561.296508,287561.296508
48,room_type_Shared room,-282568.461292,282568.461292
66,zipcode_02121,-282180.871945,282180.871945
74,zipcode_02130,-256089.652901,256089.652901


In [176]:
# Coefficient table for the Seattle model; display its first 30 rows.
df_coef_sl = helperFunctions.coef_weights(lm_model_sl, reduce_X_sl)
df_coef_sl.head(n=30)

Unnamed: 0,est_int,coefs,abs_coefs
47,room_type_Shared room,-257866.177869,257866.177869
76,zipcode_98133,-207393.888436,207393.888436
62,zipcode_98106,-169372.619186,169372.619186
70,zipcode_98118,-162749.497137,162749.497137
122,'email',-159208.672881,159208.672881
64,zipcode_98108,-151306.831142,151306.831142
68,zipcode_98116,-129948.715928,129948.715928
61,zipcode_98105,-127126.980424,127126.980424
67,zipcode_98115,-125037.275551,125037.275551
88,"""Wireless Internet""",124846.911711,124846.911711


In [177]:
# Row(s) holding the largest (most positive) Seattle coefficient.
max_coef_sl = df_coef_sl['coefs'].max()
df_coef_sl.loc[df_coef_sl['coefs'].eq(max_coef_sl)]

Unnamed: 0,est_int,coefs,abs_coefs
88,"""Wireless Internet""",124846.911711,124846.911711


In [178]:
# Row(s) holding the largest (most positive) Boston coefficient.
max_coef_bl = df_coef_bl['coefs'].max()
df_coef_bl.loc[df_coef_bl['coefs'].eq(max_coef_bl)]

Unnamed: 0,est_int,coefs,abs_coefs
133,'phone',193734.839475,193734.839475
