In [1]:
# Import dependencies
import pandas as pd
import ast                          # importing abstract syntax tree to allow the parsing of string based lists as lists  

### **Import and Limit CSV**

In [2]:
# Filepath
csv_filepath =  "../../data/processed/cleaned_listings.csv"

# Read CSV
listings_cleaned = pd.read_csv(csv_filepath)

# Limit CSV
listings_amenities = listings_cleaned[['id', 'amenities']].copy()

# Display CSV
listings_amenities

Unnamed: 0,id,amenities
0,696407278180533419,"[""Hair dryer"", ""Central heating"", ""Paid parkin..."
1,696457318817239920,"[""Hair dryer"", ""Central heating"", ""Shampoo"", ""..."
2,696460661218975906,"[""Hair dryer"", ""Central heating"", ""Shampoo"", ""..."
3,696602542310304703,"[""Hair dryer"", ""Room-darkening shades"", ""Keypa..."
4,696638535490228478,"[""Pool table"", ""Dedicated workspace"", ""Exercis..."
...,...,...
15106,1366954101885217722,"[""Heating"", ""Shampoo"", ""Iron"", ""Body soap"", ""B..."
15107,1366970333908472070,"[""Hair dryer"", ""Room-darkening shades"", ""Eleva..."
15108,1366999672978459092,"[""Pool table"", ""Fire extinguisher"", ""Outdoor d..."
15109,1367290343089381102,"[""Keypad"", ""Heating"", ""Iron"", ""Body soap"", ""Be..."


### **Apply Abstract Syntax Tree**
Allows the stringified list in amenities to be turned into an accessible list

In [3]:
# Apply AST
listings_amenities['amenities'] = listings_amenities['amenities'].apply(ast.literal_eval)

### **Explode Dataset**

In [4]:
# Explode amenities Column
exploded_amenities = listings_amenities.explode('amenities')

# Display Exploded amenities
exploded_amenities

Unnamed: 0,id,amenities
0,696407278180533419,Hair dryer
0,696407278180533419,Central heating
0,696407278180533419,Paid parking garage on premises
0,696407278180533419,Shampoo
0,696407278180533419,Iron
...,...,...
15110,1367402901303182875,Fire pit
15110,1367402901303182875,BBQ grill
15110,1367402901303182875,Washer
15110,1367402901303182875,Pool


### **Amenity Grouping**

In [5]:
# Create Amenity Grouping Dictionary
amenity_grouping = {
    # Grouped Coffee makers
    "Coffee": "Coffee maker",
    "Coffee maker: drip coffee maker": "Coffee maker",
    "Coffee maker: Keurig coffee machine": "Coffee maker",

    # Heating / AC
    "Central heating": "Heating",
    "Central air conditioning": "Air conditioning",

    # Washers / Dryers
    "Free washer – In unit": "Washer",
    "Free dryer – In unit": "Dryer",

    # Cooking tools
    "Baking sheet": "Cooking basics",
    "Electric stove": "Stove",
    "Mini fridge": "Refrigerator",
    "Barbecue utensils": "BBQ grill",
    "Stainless steel oven": "Oven",

    # Privacy / security
    "Lock on bedroom door": "Private room lock",
    "Exterior security cameras on property": "Security camera",

    # Furniture
    "Clothing storage: closet": "Clothing storage",
    "Closet": "Clothing storage",
    "Clothing storage: closet and dresser": "Clothing storage",
    "Drying rack for clothing": "Drying rack",
    
    # Patios
    "Private patio or balcony": "Patio or balcony",
    
    # Extras
    "Exercise equipment": "Gym",
    "Shared gym in building": "Gym",
    "Private backyard – Fully fenced": "Backyard"
}

# Apply mapping to exploded_amenities
exploded_amenities['amenities'] = exploded_amenities['amenities'].replace(amenity_grouping)

### **Amenity Selection**

In [6]:
# List of Amenities (to KEEP)
basic_amenities = [
    "Smoke alarm", "Carbon monoxide alarm", "Hot water", "Heating", "Essentials", 
    "Bed linens", "Shampoo", "Shower gel", "Body soap", "Hangers", "Hair dryer",
    "Conditioner", "Fire extinguisher", "Cleaning products", "Private room lock"
]

convenience_amenities = [
    "Wifi", "Kitchen", "Microwave", "Refrigerator", "Freezer", "Cooking basics",
    "Stove", "Oven", "Iron", "Air conditioning", "TV", "Washer", "Dryer", "Dishwasher",
    "Coffee maker", "Hot water kettle", "Self check-in", "Elevator", "Drying rack",
    "Dedicated workspace", "Room-darkening shades", "Free parking on premises",
    "Dishes and silverware", "Dining table", "Toaster", "Extra pillows and blankets",
    "First aid kit", "Wine glasses", "Bathtub", "Outdoor furniture", 'Lockbox', 'Keypad',
    "Paid parking on premises", "Clothing storage", "Blender", "Portable fans",
    "Free street parking", 'Ethernet connection', 'Paid parking off premises',
    "Smart lock", "Rice maker", "Paid street parking off premises"
]

special_amenities = [
    "Hot tub", "Pool", "Private entrance", "BBQ grill", "Fire pit",
    "Outdoor dining area", "Patio or balcony", "Gym", "Lake access",
    "Indoor fireplace", "Sound system", "Game console", "EV charger",
    "Security camera", "Pets allowed", "Books and reading material",
    "Backyard", "City skyline view", "Board games"
]

# might remove these as they're administrative or potentially non relevant at best
other_amenities = [
    'Long term stays allowed', 'Luggage dropoff allowed', 'Laundromat nearby', 
    'Host greets you', 'Single level home', 'Cleaning available during stay'
]

# put together for all amenities to be included
complete_amenities = basic_amenities + convenience_amenities + special_amenities + other_amenities

### **Amenity Standardization**

In [7]:
# Standardized Amenities
amenities_clean = []

# Cycle and filter out the amenities on the lists above
for index, row in exploded_amenities.iterrows():
    if row['amenities'] in complete_amenities:
        amenities_clean.append(row)

# Convert amenities_cleaned back into Dataframe
amenities_cleaned = pd.DataFrame(amenities_clean)

# Drop instances of duplicates
amenities_cleaned = amenities_cleaned.drop_duplicates()

### **Amenity Binary encoding by pd.get_dummies** - Method 1 for ML 
- may be best but also may just be considered to be a lot of noise ( will require a round of feature importance to determine whether all columns are beneficial)

In [12]:
# Use pd.get_dummies to make categories ingestible for ML
amenity_dummies = pd.get_dummies(amenities_cleaned, columns=['amenities'])

# Group by id and then sum rows to limit back to 1 row per listing
listing_amenities = amenity_dummies.groupby('id').sum()

# Clean up column titles
listing_amenities.columns = listing_amenities.columns.str.replace(" ", "_")

listing_amenities

Unnamed: 0_level_0,amenity_classification,amenities_Air_conditioning,amenities_BBQ_grill,amenities_Backyard,amenities_Bathtub,amenities_Bed_linens,amenities_Blender,amenities_Board_games,amenities_Body_soap,amenities_Books_and_reading_material,...,amenities_Single_level_home,amenities_Smart_lock,amenities_Smoke_alarm,amenities_Sound_system,amenities_Stove,amenities_TV,amenities_Toaster,amenities_Washer,amenities_Wifi,amenities_Wine_glasses
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26654,basicconvenienceconveniencebasicbasicconvenien...,1,0,0,1,1,1,0,0,1,...,1,0,1,0,1,0,1,1,1,1
40701,basicconvenienceconvenienceconveniencebasicspe...,1,0,0,1,1,1,0,1,1,...,0,0,1,0,1,0,1,1,1,1
44452,basicconveniencebasicbasicconvenienceconvenien...,1,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,1,1,0
45399,basicconvenienceotherconveniencebasicbasicconv...,1,0,0,1,1,0,0,0,0,...,1,0,1,0,1,0,1,1,1,0
45893,basicconvenienceconveniencebasicbasicconvenien...,1,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1366954101885217722,basicbasicconveniencebasicconveniencebasicconv...,1,1,1,1,1,0,0,1,0,...,0,1,1,0,1,1,1,1,1,0
1366970333908472070,basicconvenienceconvenienceconveniencebasicbas...,1,1,0,1,1,1,1,0,0,...,0,0,1,0,1,1,1,1,1,1
1366999672978459092,basicspecialconveniencespecialconvenienceconve...,1,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
1367290343089381102,conveniencebasicconveniencebasicbasicconvenien...,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### **Amenity Categorization**

In [9]:
# Function to define amenity by predefined lists
def categorize_amenity(amenity):
    if amenity in basic_amenities:
        return 'basic'
    elif amenity in convenience_amenities:
        return 'convenience'
    elif amenity in special_amenities:
        return 'special'
    elif amenity in other_amenities :
        return 'other'

# Apply function to amenities and save to new column
amenities_cleaned['amenity_classification'] = amenities_cleaned['amenities'].apply(categorize_amenity)

### **Amenity Category counts by listing_id**  - Method 2 for ML
- Categorized by a system of 4 categories myself with common sense which might be considered inaccurate.

In [10]:
# Count Categories
amenities_counts = amenities_cleaned.groupby(['id', 'amenity_classification']).size().unstack(fill_value=0)

# Rename Columns
amenities_counts = amenities_counts.rename(columns={
    'basic': 'basic_amenities_count',
    'convenience': 'convenience_amenities_count',
    'special': 'special_amenities_count',
    'other': 'other_amenities_count'
})

amenities_counts

amenity_classification,basic_amenities_count,convenience_amenities_count,other_amenities_count,special_amenities_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26654,12,31,3,6
40701,9,32,2,4
44452,8,17,1,0
45399,10,20,3,2
45893,7,17,0,0
...,...,...,...,...
1366954101885217722,12,28,2,8
1366970333908472070,12,31,2,5
1366999672978459092,3,7,0,4
1367290343089381102,5,11,1,2


In [11]:
top_amenities = amenities_cleaned['amenities'].value_counts().head(20)

display(amenities_cleaned)
top_amenities

Unnamed: 0,id,amenities,amenity_classification
0,696407278180533419,Hair dryer,basic
0,696407278180533419,Heating,basic
0,696407278180533419,Shampoo,basic
0,696407278180533419,Iron,convenience
0,696407278180533419,Microwave,convenience
...,...,...,...
15110,1367402901303182875,Fire pit,special
15110,1367402901303182875,BBQ grill,special
15110,1367402901303182875,Washer,convenience
15110,1367402901303182875,Pool,special


amenities
Smoke alarm              14638
Kitchen                  13805
Wifi                     13671
Air conditioning         13375
Carbon monoxide alarm    13353
Heating                  12620
Hot water                12414
Hangers                  11838
Dishes and silverware    11548
Washer                   11271
Hair dryer               11266
Refrigerator             11127
Essentials               11110
Microwave                10982
Cooking basics           10936
Bed linens               10789
Iron                     10226
Dedicated workspace       9930
Shampoo                   9659
Coffee maker              9335
Name: count, dtype: int64