In [49]:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import re

%matplotlib inline

In [50]:
# Read the raw listings export from the working directory, then preview the first rows
listings_df = pd.read_csv(filepath_or_buffer='./listings.csv')
listings_df.head(n=5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,,f,,,f,moderate,f,f,1,
1,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...",...,9.0,f,,,t,moderate,f,f,1,1.3
2,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",none,The LOCATION: Roslindale is a safe and diverse...,...,10.0,f,,,f,moderate,t,f,1,0.47
3,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,none,Roslindale is a lovely little neighborhood loc...,...,10.0,f,,,f,moderate,f,f,1,1.0
4,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",none,"I love the proximity to downtown, the neighbor...",...,10.0,f,,,f,flexible,f,f,1,2.25


In [51]:
# Remove the columns that are just URLs, they won't help analysis or predictions
url_cols = [x for x in listings_df.columns if 'url' in x]
listings_df = listings_df.drop(columns=url_cols)

# Remove columns that hold exactly one distinct value (as counted by .unique()),
# since a constant column carries no information. Collect them first and drop once
# instead of mutating the frame while looping over its columns.
constant_cols = [
    name for name in listings_df.columns
    if len(listings_df[name].unique()) == 1
]
listings_df = listings_df.drop(columns=constant_cols)

# Columns where every non-missing row contains '$' are monetary columns, turn these into floats.
# Raw strings are used for the patterns: "\$" in a normal string literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
for col in listings_df.select_dtypes('object').columns:
    if listings_df[col].str.contains(r"\$").mean() == 1:
        # Strip the currency symbol and thousands separators before converting
        listings_df[col] = listings_df[col].replace(r'[\$,]', '', regex=True).astype(float)

In [52]:
# Partition the column names by dtype: object columns vs. float/int columns
cat_cols = listings_df.select_dtypes(include='object').columns
num_cols = listings_df.select_dtypes(include=['float', 'int']).columns

In [53]:
def standardize_text(df, text_field):
    """
    INPUT: DataFrame and the name of one of its text columns (text_field)
    OUTPUT: The same DataFrame with that column rewritten: URLs and @-handles removed,
            characters outside a small allowed set replaced by spaces, any leftover '@'
            spelled out as 'at', and all text lower-cased
    """
    # Ordered (pattern, replacement) pairs; they are applied top to bottom, so the
    # URL and @-handle removals run before the character filter and the '@' -> 'at' step.
    substitutions = (
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    )

    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    # Write the result back into the caller's frame (the input is modified, not copied)
    df[text_field] = cleaned.str.lower()
    return df

def remove_missing_amenities_from_list(amenities_list):
    """
    INPUT: List of amenities strings
    OUTPUT: New list with the same strings in the same order, minus every string
            that contains the substring 'missing'
    """
    kept = []
    for amenity in amenities_list:
        if "missing" in amenity:
            continue
        kept.append(amenity)
    return kept

def clean_and_tokenize_amenities_column(df):
    """
    INPUT: Listings DataFrame with an 'amenities' column
    OUTPUT: The same DataFrame with the amenities column cleaned and its elements tokenized

    Each string cell has its braces and double quotes stripped, is lower-cased and split
    on commas, and tokens containing 'missing' are dropped. Cells that are already lists
    (e.g. when the notebook cell is run a second time) are only re-filtered, and cells
    that are neither strings nor lists (e.g. NaN) become an empty list, so the function
    no longer raises TypeError in those cases.
    """
    def _tokenize(raw):
        # Already tokenized on a previous run: keep the tokens, just re-apply the filter
        if isinstance(raw, list):
            return remove_missing_amenities_from_list(raw)
        # Missing / non-text value: no amenities to report
        if not isinstance(raw, str):
            return []
        tokens = re.sub(r'[{}"]', "", raw).lower().split(",")
        return remove_missing_amenities_from_list(tokens)

    df['amenities'] = df['amenities'].apply(_tokenize)
    return df

In [None]:
# Change amenities column string into a clean tokenized list of strings
listings_df = clean_and_tokenize_amenities_column(listings_df)

# Sorted vocabulary of every distinct amenity token across all listings.
# The empty token is filtered out while building the set, so this does not raise
# ValueError (as list.remove('') would) when no listing produced an empty token.
all_amenities = sorted({
    word
    for tokens in listings_df["amenities"]
    for word in tokens
    if word != ''
})

In [54]:
# Standardize rest of the text-heavy columns

# Free-text columns that are passed through standardize_text, one after another
text_heavy_columns = (
    'summary',
    'space',
    'description',
    'neighborhood_overview',
    'notes',
    'transit',
    'access',
    'interaction',
    'house_rules',
)

for col in text_heavy_columns:
    listings_df = standardize_text(listings_df, col)

In [55]:
# Preview only the first rows instead of rendering the entire frame
listings_df.head(10)

Unnamed: 0,id,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,Sunny Bungalow in the City,"cozy, sunny, family home master bedroom high...",the house has an open and cozy feel at the sam...,"cozy, sunny, family home master bedroom high...","roslindale is quiet, convenient and friendly ...",,"the bus stop is 2 blocks away, and frequent b...","you will have access to 2 bedrooms, a living r...",,...,,,,,f,moderate,f,f,1,
1,3075044,Charming room in pet friendly apt,charming and quiet room in a second floor 1910...,small but cozy and quite room with a full size...,charming and quiet room in a second floor 1910...,"the room is in roslindale, a diverse and prima...","if you don't have a us cell phone, you can tex...",plenty of safe street parking bus stops a few...,apt has one more bedroom (which i use) and lar...,"if i am at home, i am likely working in my hom...",...,10.0,10.0,9.0,9.0,t,moderate,f,f,1,1.30
2,6976,Mexican Folk Art Haven in Boston,"come stay with a friendly, middle aged guy in ...","come stay with a friendly, middle aged guy in ...","come stay with a friendly, middle aged guy in ...",the location roslindale is a safe and diverse...,i am in a scenic part of boston with a couple ...,"public transportation from the house, quick p...","i am living in the apartment during your stay,...","about me i'm a laid back, friendly, unmarried...",...,10.0,10.0,9.0,10.0,f,moderate,t,f,1,0.47
3,1436513,Spacious Sunny Bedroom Suite in Historic Home,come experience the comforts of home away from...,most places you find in boston are small howev...,come experience the comforts of home away from...,roslindale is a lovely little neighborhood loc...,please be mindful of the property as it is old...,there are buses that stop right in front of th...,the basement has a washer dryer and gym area ...,we do live in the house therefore might be som...,...,10.0,10.0,10.0,10.0,f,moderate,f,f,1,1.00
4,7651065,Come Home to Boston,"my comfy, clean and relaxing home is one block...","clean, attractive, private room, one block fro...","my comfy, clean and relaxing home is one block...","i love the proximity to downtown, the neighbor...",i have one roommate who lives on the lower lev...,from logan airport and south station you have...,you will have access to the front and side por...,i love my city and really enjoy sharing it wit...,...,10.0,10.0,9.0,10.0,f,flexible,f,f,1,2.25
5,12386020,Private Bedroom + Great Coffee,super comfy bedroom plus your own bathroom in ...,our sunny condo is located on the second and t...,super comfy bedroom plus your own bathroom in ...,we love our corner of roslindale! for quiet wa...,,to reach downtown boston via public transporta...,guests are welcome to share our living room an...,we're busy but always happy to have guests in ...,...,10.0,10.0,9.0,10.0,f,flexible,f,f,1,1.70
6,5706985,New Lrg Studio apt 15 min to Boston,it's a 5 minute walk to rosi square to catch t...,the whole house was recently redone and it 's ...,it's a 5 minute walk to rosi square to catch t...,roslindale is the new hip area of boston clos...,"information about the house, wifi pasword and ...",the commuter rail (needham line) is a 5 minute...,i have an electronic keypad so arrival time ca...,i can interact or not that is up to you this...,...,10.0,10.0,9.0,9.0,f,strict,f,f,3,4.00
7,2843445,"""Tranquility"" on ""Top of the Hill""","we can accommodate guests who are gluten free,...",we provide a bedroom and full shared bath ra...,"we can accommodate guests who are gluten free,...",our neighborhood is residential with friendly ...,we love having company and meeting people from...,we are a few minutes walk to public bus servic...,amenities include an evening snack upon arriva...,we will share with you tips about the area r...,...,10.0,10.0,10.0,10.0,f,moderate,t,t,2,2.38
8,753446,6 miles away from downtown Boston!,nice and cozy apartment about 6 miles away to ...,nice and cozy apartment about 6 miles away to ...,nice and cozy apartment about 6 miles away to ...,roslindale is a primarily residential neighbor...,,,,,...,10.0,10.0,9.0,10.0,f,moderate,f,f,1,5.36
9,849408,Perfect & Practical Boston Rental,this is a cozy and spacious two bedroom unit w...,perfect apartment rental for those in town vis...,this is a cozy and spacious two bedroom unit w...,"this neighborhood truly has it all good, not...",please note that this is a second floor apartm...,plenty of on street parking with no restrictio...,,i'm available for questions anytime before or ...,...,10.0,10.0,9.0,9.0,f,strict,f,f,2,1.01


In [21]:
# Show every remaining column name as a plain Python list
listings_df.columns.tolist()

['id',
 'name',
 'summary',
 'space',
 'description',
 'neighborhood_overview',
 'notes',
 'transit',
 'access',
 'interaction',
 'house_rules',
 'host_id',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'city',
 'zipcode',
 'market',
 'smart_location',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'calendar_updated',
 'availability_30',
 'availability_60',
 'availability_90',
 'availabilit