In [73]:
from collections import Counter
import ast
import os
import pandas as pd
import re

# Lift the column display limit so wide frames are rendered without
# eliding columns in the middle.
pd.set_option('display.max_columns', None)

In [74]:
data_path = "./data/listings/"

# Read every file in the listings directory and stack them into one frame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop copies the whole accumulated frame on every iteration.
# The directory listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the row order differ between runs.
frames = []
for file_name in sorted(os.listdir(data_path)):
    frames.append(pd.read_csv(os.path.join(data_path, file_name), sep=","))

# pd.concat raises on an empty list, so keep the empty-frame result for an
# empty directory.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [75]:
# Keep one row per listing id: order rows by `last_scraped`, descending,
# then retain the first row seen for each id.
data = (
    data
    .sort_values(by="last_scraped", ascending=False)
    .drop_duplicates(subset=["id"], keep="first")
)

In [81]:
# Print a summary of the frame: index range, column names, non-null counts
# and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74859 entries, 42845 to 37469
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            74859 non-null  int64  
 1   listing_url                                   74859 non-null  object 
 2   scrape_id                                     74859 non-null  int64  
 3   last_scraped                                  74859 non-null  object 
 4   source                                        74859 non-null  object 
 5   name                                          74850 non-null  object 
 6   description                                   73614 non-null  object 
 7   neighborhood_overview                         45659 non-null  object 
 8   picture_url                                   74859 non-null  object 
 9   host_id                                       74859 non-n

In [77]:
# The raw `amenities` values are parsed with ast.literal_eval, i.e. they are
# expected to be the text of a Python literal (presumably a list of labels;
# the later cells iterate over each parsed value).
# Only values that are still strings are parsed, so re-running this cell
# after the column has already been converted is a no-op instead of an error.
data["amenities"] = [
    ast.literal_eval(raw) if isinstance(raw, str) else raw
    for raw in data["amenities"]
]

In [78]:
def normalize_amenity(amenity: str) -> str:
    """Return a canonical form of an amenity label so spelling variants compare equal.

    The label is lower-cased, every character that is neither a word
    character nor whitespace (hyphens, dashes, punctuation) is turned into a
    space, each run of whitespace is collapsed to one plain space, and
    leading/trailing whitespace is removed.

    Args:
        amenity: Raw amenity label.

    Returns:
        The normalized label. It is empty when the input contains no word
        characters.
    """
    amenity = amenity.lower()
    # Hyphens are not word characters, so this also covers "-" and no
    # separate replacement is needed.
    amenity = re.sub(r"[^\w\s]", " ", amenity)
    # \s+ rather than \s{2,}: a lone tab, newline or non-breaking space must
    # also become a plain space, otherwise the same amenity is counted under
    # two different keys.
    amenity = re.sub(r"\s+", " ", amenity)
    return amenity.strip()

In [79]:
# Count the occurrences of each normalized amenity label across all rows.
amenities = Counter(
    normalize_amenity(label)
    for listing_amenities in data["amenities"]
    for label in listing_amenities
)

In [105]:
# Drop rarely seen amenities (to avoid overfitting): keep only labels whose
# count reaches the threshold.
MIN_AMENITY_COUNT = 200
amenities = {
    label: count
    for label, count in amenities.items()
    if count >= MIN_AMENITY_COUNT
}

In [106]:
# Show the (label, count) pairs from most to least frequent.
sorted(amenities.items(), key=lambda pair: pair[1], reverse=True)

[('kitchen', 68536),
 ('smoke alarm', 67430),
 ('long term stays allowed', 66561),
 ('essentials', 63462),
 ('wifi', 62247),
 ('hangers', 57286),
 ('iron', 56349),
 ('hair dryer', 52858),
 ('washer', 51539),
 ('hot water', 51223),
 ('heating', 49684),
 ('tv', 49351),
 ('dishes and silverware', 49149),
 ('free parking on premises', 47539),
 ('refrigerator', 46315),
 ('microwave', 45437),
 ('shampoo', 45111),
 ('cooking basics', 44851),
 ('bed linens', 41322),
 ('air conditioning', 41224),
 ('oven', 34956),
 ('stove', 34296),
 ('private entrance', 32307),
 ('dishwasher', 31827),
 ('fire extinguisher', 30693),
 ('dryer', 30576),
 ('free street parking', 28805),
 ('extra pillows and blankets', 28719),
 ('coffee maker', 27223),
 ('first aid kit', 27120),
 ('bbq grill', 25988),
 ('lockbox', 24277),
 ('hot water kettle', 22917),
 ('toaster', 22242),
 ('wine glasses', 20637),
 ('dining table', 20090),
 ('cleaning products', 19419),
 ('shower gel', 19135),
 ('dedicated workspace', 19055),
 ('fr