# Get the data

In [2]:
import pandas as pd
import numpy as np
import datetime
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Show up to 500 columns so wide listing frames are not truncated in cell output.
pd.set_option('display.max_columns', 500)
# Enable IPython's autoreload extension in mode 2, so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
def get_data(file, nrows=None):
    """Load a raw dataset and keep only listings with a usable review history.

    Parameters
    ----------
    file : str
        Name of the dataset to load. Only ``'listings'`` is supported.
    nrows : int, optional
        Number of CSV rows to read; ``None`` (default) reads the whole file.
        The review filter is applied after reading, so the result may hold
        fewer than ``nrows`` rows.

    Returns
    -------
    pandas.DataFrame
        Rows of ``../data/jan/listings.csv`` indexed by ``id``, restricted to
        the columns listed below, with ``host_since``, ``first_review`` and
        ``last_review`` parsed as dates. Only listings with a non-null
        ``review_scores_rating`` and more than two reviews are returned.

    Raises
    ------
    ValueError
        If ``file`` is not a supported dataset name.
    """
    if file != 'listings':
        # Without this guard the function fell through to the return statement
        # and failed with an opaque UnboundLocalError on `data`.
        raise ValueError(f"unsupported file {file!r}; expected 'listings'")

    cols = [ 'id',
             'name',
             'summary',
             'space',
             'description',
             'experiences_offered',
             'neighborhood_overview',
             'notes',
             'transit',
             'access',
             'interaction',
             'house_rules',
             'host_since',
             'host_location',
             'host_about',
             'host_response_time',
             'host_response_rate',
             'host_neighbourhood',
             'host_listings_count',
             'host_total_listings_count',
             'host_verifications',
             'host_identity_verified',
             'street',
             'neighbourhood_cleansed',
             'zipcode',
             'latitude',
             'longitude',
             'is_location_exact',
             'property_type',
             'room_type',
             'accommodates',
             'bathrooms',
             'bedrooms',
             'beds',
             'bed_type',
             'amenities',
             'price',
             'weekly_price',
             'monthly_price',
             'security_deposit',
             'cleaning_fee',
             'guests_included',
             'extra_people',
             'minimum_nights',
             'maximum_nights',
             'availability_30',
             'availability_60',
             'availability_90',
             'availability_365',
             'number_of_reviews',
             'number_of_reviews_ltm',
             'first_review',
             'last_review',
             'review_scores_rating',
             'review_scores_accuracy',
             'review_scores_cleanliness',
             'review_scores_checkin',
             'review_scores_communication',
             'review_scores_location',
             'review_scores_value',
             'instant_bookable',
             'cancellation_policy',
             'require_guest_profile_picture',
             'require_guest_phone_verification',
             'reviews_per_month']
    dates = ['host_since', 'first_review', 'last_review']
    data = pd.read_csv('../data/jan/listings.csv', low_memory=False, parse_dates=dates, usecols=cols,
                       index_col='id', nrows=nrows)

    # Keep listings that have a rating and more than two reviews.
    has_rating = data['review_scores_rating'].notna()
    enough_reviews = data['number_of_reviews'] > 2
    return data[has_rating & enough_reviews]


# Load the listings file; get_data already drops listings without a rating or with fewer than 3 reviews.
listings = get_data('listings')

# Optional: persist the filtered frame for reuse.
# listings.to_csv('../data/jan/listings_3plus.csv')

In [None]:
# Columns grouped as "fixed" (host/location/property attributes).
fixed_feats = [
    'host_since', 'host_location', 'host_neighbourhood', 'street',
    'neighbourhood_cleansed', 'zipcode', 'latitude', 'longitude',
    'property_type', 'room_type', 'bathrooms', 'bedrooms', 'number_of_reviews',
]
# Review-derived columns. Note that 'number_of_reviews' appears in both groups.
review_feats = [
    'number_of_reviews', 'number_of_reviews_ltm', 'first_review', 'last_review',
    'review_scores_accuracy', 'review_scores_checkin', 'review_scores_cleanliness',
    'review_scores_communication', 'review_scores_location', 'review_scores_value',
    'reviews_per_month',
]
# Everything else: columns that belong to neither group, in frame order.
flex_feats = [
    col for col in listings.columns
    if not (col in fixed_feats or col in review_feats)
]

In [None]:
# (rows, columns) of the loaded listings frame.
listings.shape

In [None]:
# Peek at a single row to see every column.
listings.head(1)

# Amenities

## Functions

In [11]:
def decode_amenities(df):
    """Parse the raw ``amenities`` strings into lists of casefolded names.

    Each raw value is expected to look like ``{TV,"Cable TV",Wifi}``: the
    first and last characters are dropped, the remainder is split on commas,
    and every item is stripped of surrounding double quotes and casefolded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amenities`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``amenities``) on ``df``'s index; each cell is a
        list of str. An empty amenity set (``{}``) yields an empty list.
    """
    def str_to_list(strn):
        inner = strn[1:-1]
        if not inner:
            # ''.split(',') returns [''], which would be counted as one amenity.
            return []
        return [item.strip('"').casefold() for item in inner.split(',')]

    # Series.apply replaces the deprecated DataFrame.applymap; to_frame keeps
    # the one-column DataFrame shape callers rely on.
    return df['amenities'].apply(str_to_list).to_frame()

def count_amenities(df):
    """Count the decoded amenities of every listing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amenities`` column accepted by ``decode_amenities``.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``amenities``) on ``df``'s index holding the
        length of each decoded amenity list.
    """
    amenities = decode_amenities(df)
    # Series.map replaces the deprecated DataFrame.applymap.
    return amenities['amenities'].map(len).to_frame()

def has_amenity(df, name, alias=None):
    """Flag listings whose decoded amenity list contains ``name``.

    The comparison is an exact, case-insensitive match against whole amenity
    names (list membership), not a substring search.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amenities`` column accepted by ``decode_amenities``.
    name : str
        Amenity name to look for.
    alias : str, optional
        Suffix for the output column; falls back to ``name`` when empty/None.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ``has_<alias>`` on ``df``'s index with 1 where the
        amenity is present and 0 otherwise.
    """
    if not alias:
        alias = name
    col_name = f'has_{alias}'
    wanted = name.casefold()  # fold once instead of once per row
    # Series.map replaces the deprecated DataFrame.applymap.
    flags = decode_amenities(df)['amenities'].map(lambda items: 1 if wanted in items else 0)
    return flags.to_frame(name=col_name)

def count_amenity(df, name):
    """Return how many listings in ``df`` have the amenity ``name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amenities`` column.
    name : str
        Amenity name, matched as in ``has_amenity``.

    Returns
    -------
    int
        Number of rows flagged 1 by ``has_amenity``.
    """
    data = has_amenity(df, name)
    # Sum the single flag column directly: int() on the one-element Series
    # returned by DataFrame.sum() is deprecated in recent pandas.
    return int(data.iloc[:, 0].sum())

def count_filterables(df, filterables):
    """Count, per listing, how many ``filterables`` entries are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amenities`` column accepted by ``decode_amenities``.
    filterables : iterable of (str or list of str)
        Each entry is either one amenity name or a list of alternative names.
        A list contributes at most 1 to the count, when any of its names is
        found. Matching is exact and case-insensitive.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``amenities``) on ``df``'s index holding the
        number of matched entries for each listing.
    """
    # Normalise every entry to a list of casefolded aliases once, rather than
    # re-folding the names for every row.
    groups = [
        [alias.casefold() for alias in (entry if isinstance(entry, list) else [entry])]
        for entry in filterables
    ]

    def apply_count(amenities):
        return sum(1 for aliases in groups if any(alias in amenities for alias in aliases))

    # Series.apply replaces the deprecated DataFrame.applymap.
    return decode_amenities(df)['amenities'].apply(apply_count).to_frame()

def encode_filterables(df, filterables):
    """Build 0/1 indicator columns for a list of amenities or alias groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``amenities`` column accepted by ``decode_amenities``.
    filterables : iterable of (str or list of str)
        Each entry is either one amenity name or a list of alternative names.
        The output column is named after the entry (for a list, its first
        name) with every non-alphabetic character removed, e.g.
        ``'Free parking on premises'`` -> ``'Freeparkingonpremises'``.

    Returns
    -------
    pandas.DataFrame
        Frame on ``df``'s index with one column per distinct derived name, in
        first-seen order; a cell is 1 when any alias of the group is in the
        listing's amenity list (exact, case-insensitive match), else 0.
    """
    # Group aliases by derived column name. Entries that collapse to the same
    # name are merged instead of being paired with the wrong entry by
    # position, which is what the positional column/entry lookup used to do.
    groups = {}
    for entry in filterables:
        aliases = entry if isinstance(entry, list) else [entry]
        column = ''.join(ch for ch in aliases[0] if ch.isalpha())
        groups.setdefault(column, []).extend(alias.casefold() for alias in aliases)

    amenities = decode_amenities(df)['amenities']
    encoded = pd.DataFrame(index=amenities.index)
    for column, wanted in groups.items():
        # One pass per group rather than one row-wise DataFrame.apply per alias.
        encoded[column] = amenities.map(
            lambda items, wanted=wanted: 1 if any(alias in items for alias in wanted) else 0
        )
    return encoded

In [12]:
# Each entry is one amenity name or a list of alternative spellings for the same amenity;
# encode_filterables names the output column after the first alias of a list.
filterable_features_localized = ['Free parking on premises', ['Hot tub','shared hot tub','private hot tub'],
                      ['Gym','shared gym'], ['Pool','private pool','shared pool']]
# The crib aliases were previously fused into one string ("Pack 'n Play/travel crib', 'crib"),
# so neither spelling could ever match; they are now two separate aliases.
# NOTE(review): the raw data may spell this amenity with a typographic apostrophe — confirm.
filterable_amenities_localized = [['Kitchen','kitchenette','full kitchen',"chef's kitchen"],'Shampoo','Heating','Air conditioning',
                        ['Washing machine','Washer','Washer / Dryer'],'Dryer','Wifi','Breakfast',
                        'Indoor fireplace','Hangers','Iron','Hair dryer',['Dedicated workspace','laptop friendly workspace'],
                        ['TV', 'cable tv'],['Cot',"Pack 'n Play/travel crib",'crib'],'High chair',
                        'Self check-in',['Smoke alarm','Smoke detector'],['Carbon monoxide alarm','carbon monoxide detector'],
                        'Private bathroom','Piano']

In [None]:
# Preview the 0/1 indicator columns produced for the localized amenity groups.
encode_filterables(listings, filterable_amenities_localized).head()

## Exploration

In [None]:
# Alternative amenity spellings to probe for in the data.
# The apostrophe in "Pack ’n Play" was mojibake ("â€™", UTF-8 read as cp1252) and is restored to U+2019.
localized = [['Washer','Washer / Dryer'], ['Pack ’n Play/travel crib', 'crib'], 'Smoke detector', 'carbon monoxide detector',
            'laptop friendly workspace']

In [None]:
# Per listing, how many of the `localized` entries are present among its amenities.
count_filterables(listings, localized)

In [None]:
# NOTE(review): this cell re-declares the lists defined in the "Functions" section
# (only the case of the 'tv' alias differs); consider keeping a single definition.
filterable_features_localized = ['Free parking on premises', ['Hot tub','shared hot tub','private hot tub'],
                      ['Gym','shared gym'], ['Pool','private pool','shared pool']]
# The crib aliases were previously fused into one string ("Pack 'n Play/travel crib', 'crib"),
# so neither spelling could ever match; they are now two separate aliases.
filterable_amenities_localized = [['Kitchen','kitchenette','full kitchen',"chef's kitchen"],'Shampoo','Heating','Air conditioning',
                        ['Washing machine','Washer','Washer / Dryer'],'Dryer','Wifi','Breakfast',
                        'Indoor fireplace','Hangers','Iron','Hair dryer',['Dedicated workspace','laptop friendly workspace'],
                        ['tv', 'cable tv'],['Cot',"Pack 'n Play/travel crib",'crib'],'High chair',
                        'Self check-in',['Smoke alarm','Smoke detector'],['Carbon monoxide alarm','carbon monoxide detector'],
                        'Private bathroom','Piano']

In [None]:
# Per listing, how many of the localized amenity groups are present.
count_filterables(listings, filterable_amenities_localized)

In [None]:
# Inspect the raw (undecoded) amenities string of the listing with id 25023.
listings.loc[25023,'amenities']

In [None]:
# Decode every raw amenities string into a list of casefolded names and preview the result.
ams = decode_amenities(listings)
ams.head(5)

In [None]:
# Number of decoded amenities per listing (single 'amenities' column).
amcounts = count_amenities(listings)

In [None]:
# has_amenity tests list membership, so this flags only listings with an amenity
# named exactly 'smoking' (case-insensitive), not names that merely contain the word.
ams_bool = has_amenity(listings, 'smoking')

In [None]:
# 0/1 flag ('has_tv') for listings whose amenity list contains 'tv'.
has_amenity(listings, 'tv')

In [None]:
# Put the decoded amenity lists next to the 'has_smoking' flag (joined on the listing index).
ams_ = ams.join(ams_bool)

In [None]:
# Show the amenity lists of the listings flagged as having 'smoking'.
ams_[ams_['has_smoking'] == 1][['amenities']]

In [None]:
# Number of listings that have an amenity named exactly 'internet' (case-insensitive).
count_amenity(listings, 'internet')

In [None]:
# Distribution of the number of amenities per listing.
sns.histplot(amcounts)

In [None]:
# Largest number of amenities found on a single listing.
amcounts.max()

In [None]:
# Summary statistics of the amenity counts.
amcounts.describe()

In [46]:
# Store the amenity count on `listings` itself (adds a column in place); later cells read 'amcount'.
listings['amcount'] = count_amenities(listings)

In [None]:
# 0/1 flag for the 'hot water' amenity; the alias keeps the column name free of spaces.
listings['has_hotwater'] = has_amenity(listings, 'hot water', 'hotwater')

In [None]:
# 0/1 flag for the 'free parking on premises' amenity, stored as 'has_freeparking'.
listings['has_freeparking'] = has_amenity(listings, 'free parking on premises', 'freeparking')

In [None]:
# Rating vs. amenity count, coloured by whether the listing offers hot water.
sns.scatterplot(data=listings, x='review_scores_rating', y='amcount', hue='has_hotwater')

In [None]:
# Rating vs. amenity count, coloured by free parking.
# NOTE(review): the notebook-local get_data already keeps only number_of_reviews > 2,
# so this filter looks redundant when `listings` comes from it — confirm.
sns.scatterplot(data=listings[listings['number_of_reviews']>2], x='review_scores_rating', y='amcount', hue='has_freeparking')

In [None]:
# Pairwise correlations between rating, the free-parking flag and the amenity count.
listings[listings['number_of_reviews']>2][['review_scores_rating','has_freeparking','amcount']].corr()

In [24]:
def get_amenities_list(data):
    """Return every distinct decoded amenity name found in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``amenities`` column accepted by ``decode_amenities``.

    Returns
    -------
    list of str
        Distinct amenity names in order of first appearance.
    """
    # A dict keeps first-seen order and gives O(1) membership checks, instead
    # of scanning a growing list for every item.
    seen = {}
    for row in decode_amenities(data)['amenities']:
        for item in row:
            seen.setdefault(item, None)
    return list(seen)

# Distinct amenity names across all listings, in first-seen order.
amenities_list = get_amenities_list(listings)

In [None]:
# Amenity names checked one by one with count_amenity in the cells below.
filterable_features = [
    'Free parking on premises',
    'Hot tub',
    'Gym',
    'Pool',
]
filterable_amenities = [
    'Kitchen', 'Shampoo', 'Heating', 'Air conditioning', 'Washing machine',
    'Dryer', 'Wifi', 'Breakfast', 'Indoor fireplace', 'Hangers', 'Iron',
    'Hair dryer', 'Dedicated workspace', 'TV', 'Cot', 'High chair',
    'Self check-in', 'Smoke alarm', 'Carbon monoxide alarm',
    'Private bathroom', 'Piano',
]


In [None]:
# Number of listings offering each filterable feature (exact, case-insensitive name match).
for feat in filterable_features:
    print(feat,count_amenity(listings, feat))

In [None]:
# Number of listings offering each filterable amenity (exact, case-insensitive name match).
for feat in filterable_amenities:
    print(feat,count_amenity(listings, feat))

In [None]:
# Frequency of each bed type.
listings['bed_type'].value_counts()

In [None]:
# Rows where host_total_listings_count is missing: NaN is the only value not equal to itself.
listings[listings['host_total_listings_count'] != listings['host_total_listings_count']]

# Room / bed number ratios

In [None]:
# Capacity-related columns plus the rating, and their pairwise correlations.
room_cols = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'review_scores_rating']
listings[room_cols].corr()

In [None]:
# Independent copy of the room-related columns for the ratio experiments below.
room_data = listings[room_cols].copy()

In [None]:
def get_room_ratios(df):
    """Compute the guests-per-bedroom ratio for every listing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``accommodates`` and ``bedrooms`` columns. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Single-column frame ``accommodates_to_rooms_ratio`` on ``df``'s index.
        Listings with 0 (or missing) bedrooms get NaN rather than ``inf``.
    """
    # Treat 0 bedrooms as "undefined": dividing by zero would yield inf, which
    # distorts means and correlations computed on the ratio.
    bedrooms = df['bedrooms'].replace(0, np.nan)
    ratio = df['accommodates'] / bedrooms
    # Other candidate ratios, not computed yet:
    #     accommodates / bathrooms
    #     accommodates / beds
    #     beds / bedrooms
    #     beds / bathrooms
    # Build a new frame instead of adding a column to the caller's frame.
    return ratio.to_frame(name='accommodates_to_rooms_ratio')

In [None]:
# Correlate the ratio with the rating; calling .corr() on the one-column ratio
# frame alone only yields a trivial 1x1 matrix.
room_data[['review_scores_rating']].join(get_room_ratios(room_data)).corr()

In [7]:
# NOTE(review): this import rebinds `get_data`, replacing the notebook-local function of the
# same name defined in the first section; consider moving it to the import cell at the top.
from fivestar.data import get_data

In [10]:
# Reload `listings` through the packaged loader. This rebinds the name, so columns added to the
# previous frame earlier in the notebook (e.g. 'amcount') are presumably not on this object —
# TODO confirm what the loader returns.
listings = get_data(local=True)

In [9]:
# Preview the reloaded frame. (`listings2` is never defined in this notebook and raised
# a NameError on a fresh kernel.)
listings.head(2)

Unnamed: 0_level_0,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,host_since,host_location,host_about,host_response_time,host_response_rate,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_identity_verified,street,neighbourhood_cleansed,zipcode,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,reviews_per_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
11551,Arty and Bright London Apartment in Zone 2,Unlike most rental apartments out there my fla...,"Amenities Bedding: 1 Double bed, 1 living room...",Unlike most rental apartments out there my fla...,family,Not even 10 minutes by metro from Victoria Sta...,No Smoking (very strict) Check-in time is afte...,Tons of buses (24hrs) go into central London f...,Guest will have access to the entire apartment,No interaction with guests as you book the ent...,No Smoking (very strict) No pets are allowed i...,2009-10-03,"London, England, United Kingdom","Hello, I'm a friendly Italian man with a very ...",within an hour,100%,Brixton,3.0,3.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,"London, United Kingdom",Lambeth,SW9 8DG,51.46225,-0.11732,t,Apartment,Entire home/apt,4,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Paid par...",$88.00,$645.00,"$2,350.00",$400.00,$30.00,2,$25.00,2,180,17,42,72,344,186,3,2010-03-21,2019-11-17,92.0,9.0,9.0,10.0,10.0,9.0,9.0,t,strict_14_with_grace_period,f,t,1.56
13913,Holiday London DB Room Let-on going,My bright double bedroom with a large window h...,"Hello Everyone, I'm offering my lovely double ...",My bright double bedroom with a large window h...,business,Finsbury Park is a friendly melting pot commun...,For art lovers I can give guest my Tate Member...,The flat only a 10 minute walk to Finsbury Par...,Guest will have access to the self catering ki...,I like to have little chats with my guest over...,I'm an artist and have my artwork up on the wa...,2009-11-16,"London, England, United Kingdom",I am a Multi-Media Visual Artist and Creative ...,within a few hours,70%,LB of Islington,4.0,4.0,"['email', 'phone', 'facebook', 'reviews', 'off...",t,"Islington, Greater London, United Kingdom",Islington,N4 3,51.56802,-0.11121,t,Apartment,Private room,2,1.0,1.0,0.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,""Paid parking off ...",$65.00,$333.00,"$1,176.00",$100.00,$15.00,1,$15.00,1,29,30,60,90,365,20,6,2010-08-18,2019-11-25,97.0,10.0,10.0,10.0,10.0,9.0,9.0,f,moderate,f,f,0.17


# Correlations

In [43]:
# Amenities (or alias groups) used for the correlation matrix below.
# encode_filterables names a group's column after its first alias, so the parking group
# shows up as 'Freeparkingonpremises' even though it also matches street and paid parking.
features = [
    ['Free parking on premises', 'free street parking', 'paid parking on premises', 'paid parking off premises'],
    'Heating',
    'Dryer',
    'Wifi',
    'Breakfast',
    'Indoor fireplace',
    'Hangers',
    'Iron',
    'Hair dryer',
    ['Dedicated workspace', 'laptop friendly workspace'],
    ['TV', 'cable tv'],
    ['Smoke alarm', 'Smoke detector'],
    ['Carbon monoxide alarm', 'carbon monoxide detector'],
]

In [47]:
# `listings` was reloaded just above, so the derived 'amcount' column from the Amenities
# section no longer exists on it; rebuild it here so the correlation cell below also works
# on a top-to-bottom run.
listings['amcount'] = count_amenities(listings)
# Number of `features` entries present on each listing.
listings['filtercount'] = count_filterables(listings, features)

In [48]:
# Rating, one 0/1 column per feature group, and the two amenity totals, joined on the listing index.
matrix = (
    listings[['review_scores_rating']]
    .copy()
    .join(encode_filterables(listings, features))
    .join(listings[['amcount', 'filtercount']])
)

In [49]:
# Pairwise correlations between the rating, each amenity flag and the amenity totals.
matrix.corr()

Unnamed: 0,review_scores_rating,Freeparkingonpremises,Heating,Dryer,Wifi,Breakfast,Indoorfireplace,Hangers,Iron,Hairdryer,Dedicatedworkspace,TV,Smokealarm,Carbonmonoxidealarm,amcount,filtercount
review_scores_rating,1.0,0.140483,0.05123,0.073914,0.076819,0.10856,0.070339,0.066435,0.068851,0.064457,0.076045,0.060388,0.10041,0.073758,0.149912,0.182322
Freeparkingonpremises,0.140483,1.0,0.059096,0.039821,0.050893,0.085854,0.079255,0.11754,0.098696,0.082403,0.109022,0.067653,0.12192,0.127794,0.390594,0.414795
Heating,0.05123,0.059096,1.0,0.076468,0.128094,0.01812,0.035207,0.158717,0.124042,0.10797,0.09966,0.100936,0.127606,0.092018,0.19492,0.283266
Dryer,0.073914,0.039821,0.076468,1.0,0.05433,-0.0008,0.068764,0.089463,0.152034,0.121838,0.121342,0.168442,0.099928,0.103426,0.284734,0.429842
Wifi,0.076819,0.050893,0.128094,0.05433,1.0,0.020681,0.015392,0.089219,0.123374,0.127097,0.091673,0.100152,0.07408,0.058362,0.140795,0.236314
Breakfast,0.10856,0.085854,0.01812,-0.0008,0.020681,1.0,0.063826,0.018102,0.000314,0.029358,0.016918,-0.018005,0.02605,0.020237,0.046193,0.203853
Indoorfireplace,0.070339,0.079255,0.035207,0.068764,0.015392,0.063826,1.0,0.011292,0.0367,0.02665,0.046465,0.088753,0.025686,0.058905,0.153417,0.237906
Hangers,0.066435,0.11754,0.158717,0.089463,0.089219,0.018102,0.011292,1.0,0.40493,0.33442,0.328527,0.098557,0.1681,0.165791,0.365171,0.528649
Iron,0.068851,0.098696,0.124042,0.152034,0.123374,0.000314,0.0367,0.40493,1.0,0.460232,0.31298,0.230514,0.194976,0.171808,0.403236,0.602881
Hairdryer,0.064457,0.082403,0.10797,0.121838,0.127097,0.029358,0.02665,0.33442,0.460232,1.0,0.270733,0.163613,0.169418,0.169163,0.380283,0.561823


In [50]:
# Inspect all columns of a single listing (id 41172620), including the derived counts.
listings.loc[41172620]

name                                    Triple/Twin Private Room Victoria Westminster
summary                             This is a private room near Victoria Station f...
space                               This fantastic house is placed in one of the b...
description                         This is a private room near Victoria Station f...
experiences_offered                                                              none
                                                          ...                        
require_guest_profile_picture                                                       f
require_guest_phone_verification                                                    f
reviews_per_month                                                                   3
amcount                                                                            28
filtercount                                                                         8
Name: 41172620, Length: 66, dtype: object

# Encode Host Verified