In [49]:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import re

%matplotlib inline

In [50]:
# Load the listings export that sits next to the notebook and preview the
# first five rows as the cell output.
listings_df = pd.read_csv(filepath_or_buffer='./listings.csv')
listings_df.head(n=5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,,f,,,f,moderate,f,f,1,
1,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...",...,9.0,f,,,t,moderate,f,f,1,1.3
2,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",none,The LOCATION: Roslindale is a safe and diverse...,...,10.0,f,,,f,moderate,t,f,1,0.47
3,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,none,Roslindale is a lovely little neighborhood loc...,...,10.0,f,,,f,moderate,f,f,1,1.0
4,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",none,"I love the proximity to downtown, the neighbor...",...,10.0,f,,,f,flexible,f,f,1,2.25


In [51]:
# Remove the columns that are just URLs, they won't help analysis or predictions
url_cols = [x for x in listings_df.columns if 'url' in x]
listings_df = listings_df.drop(columns=url_cols)

# Remove columns that are all of 1 value. unique() reports NaN as a value, so
# completely empty columns are removed as well. The names are collected first
# and dropped in one call instead of dropping in place while looping over the
# frame's own columns.
constant_cols = [
    name for name in listings_df.columns
    if len(listings_df[name].unique()) == 1
]
listings_df = listings_df.drop(columns=constant_cols)

# Object columns in which every checked value contains '$' are monetary
# columns: strip the currency sign and the thousands separators and turn them
# into floats.
# NOTE(review): str.contains yields NaN for missing cells and mean() skips NaN
# by default, so columns with gaps presumably still qualify -- confirm.
# The patterns are raw strings: "\$" in a normal string literal is an invalid
# escape sequence.
for col in listings_df.select_dtypes('object').columns:
    if listings_df[col].str.contains(r"\$").mean() == 1:
        listings_df[col] = listings_df[col].replace(r'[\$,]', '', regex=True).astype(float)

In [52]:
# Split the column labels by dtype: object columns on one side,
# float/int columns on the other.
cat_cols = listings_df.select_dtypes(include='object').columns
num_cols = listings_df.select_dtypes(include=['float', 'int']).columns

In [53]:
def standardize_text(df, text_field):
    """
    Normalise one free-text column of a DataFrame.

    INPUT: DataFrame and the name of the text column to clean
    OUTPUT: The same DataFrame object with that column rewritten: URLs and
            @-mentions removed, characters outside a small allowed set turned
            into spaces, any remaining '@' spelled out as 'at', and all text
            lower-cased
    """
    # (pattern, replacement) pairs, applied strictly in this order; every
    # pattern is treated as a regular expression.
    substitutions = (
        (r"http\S+", ""),   # URLs, up to the next whitespace
        (r"http", ""),      # any 'http' that is still left
        (r"@\S+", ""),      # '@' followed by non-whitespace
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    )

    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    df[text_field] = cleaned.str.lower()
    return df

def remove_missing_amenities_from_list(amenities_list):
    """
    INPUT: List of amenities strings
    OUTPUT: New list holding, in the original order, only the strings that do
            not contain the substring 'missing'
    """
    kept = []
    for amenity in amenities_list:
        if "missing" in amenity:
            continue
        kept.append(amenity)
    return kept

def clean_and_tokenize_amenities_column(df):
    """
    Replace every cell of the 'amenities' column with a list of amenity tokens.

    INPUT: Listings DataFrame with an 'amenities' column. String cells have the
           characters '{', '}' and '"' stripped, are lower-cased and split on
           commas. Missing cells (NaN/None) become an empty list, and cells
           that already hold a list are re-used unchanged, so running the
           function again on its own output is safe.
    OUTPUT: The same DataFrame object with 'amenities' holding lists of strings
            from which every entry containing 'missing' has been removed

    Empty-string tokens (e.g. produced by a cell with nothing between the
    braces) are kept; filtering them is left to the caller.
    """
    def _tokenize(raw):
        # Check for a list first: pd.isna() on a list returns an array, which
        # has no single truth value.
        if isinstance(raw, list):
            tokens = raw
        elif pd.isna(raw):
            tokens = []
        else:
            tokens = re.sub(r'[{}"]', "", raw).lower().split(",")
        return remove_missing_amenities_from_list(tokens)

    df['amenities'] = df['amenities'].apply(_tokenize)
    return df

In [None]:
# Change amenities column string into a clean tokenized list of strings
listings_df = clean_and_tokenize_amenities_column(listings_df)

# Build the alphabetically sorted vocabulary of distinct amenity tokens.
# Splitting can produce empty-string tokens; the set difference removes that
# placeholder and, unlike list.remove(''), does not raise when none is present.
all_amenities = sorted(
    {word for tokens in listings_df["amenities"] for word in tokens} - {''}
)

In [54]:
# Standardize the rest of the text-heavy columns, one column per call.
# standardize_text hands back the frame it was given, so rebinding
# listings_df on each pass keeps the name pointing at the cleaned data.
for col in ('summary', 'space', 'description', 'neighborhood_overview', 'notes',
            'transit', 'access', 'interaction', 'house_rules'):
    listings_df = standardize_text(listings_df, col)

In [21]:
# Show every remaining column label as a plain Python list
listings_df.columns.tolist()

['id',
 'name',
 'summary',
 'space',
 'description',
 'neighborhood_overview',
 'notes',
 'transit',
 'access',
 'interaction',
 'house_rules',
 'host_id',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'city',
 'zipcode',
 'market',
 'smart_location',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'calendar_updated',
 'availability_30',
 'availability_60',
 'availability_90',
 'availabilit