In [1]:
# Import dependencies
import pandas as pd
import ast                          # importing abstract syntax tree to allow the parsing of string based lists as lists  
from sklearn.preprocessing import StandardScaler

### **Import and make a copy of CSV**

In [2]:
# Filepath
csv_filepath =  "../data/processed/cleaned_listings.csv"

# Read CSV
listings_csv = pd.read_csv(csv_filepath)

# make a copy
listings_df = listings_csv.copy()

In [3]:
listings_df

Unnamed: 0,id,latitude,longitude,neighbourhood,room_type,property_type,standardized_property_type,accommodates,bedrooms,beds,bathrooms,bathrooms_count,amenities,stay_duration,price,number_of_reviews_ly,host_is_superhost,host_listings_count,estimated_occupancy_l365d,estimated_revenue_l365d
0,696407278180533419,43.670920,-79.395190,Annex,Entire home/apt,Entire condo,apartment,3,1,1,1 bath,1.0,"[""Hair dryer"", ""Central heating"", ""Paid parkin...",Mid-term,450.0,2,0,7,56,25200.0
1,696457318817239920,43.655704,-79.399910,Kensington-Chinatown,Private room,Private room in rental unit,budget,2,1,1,1 private bath,1.0,"[""Hair dryer"", ""Central heating"", ""Shampoo"", ""...",Mid-term,78.0,2,1,8,112,8736.0
2,696602542310304703,43.648778,-79.401183,Kensington-Chinatown,Entire home/apt,Entire rental unit,apartment,3,1,1,1 bath,1.0,"[""Hair dryer"", ""Room-darkening shades"", ""Keypa...",Short-term,132.0,4,1,1,224,29568.0
3,696973520016945803,43.612110,-79.539850,Alderwood,Entire home/apt,Entire rental unit,apartment,2,1,2,1 bath,1.0,"[""Hair dryer"", ""Heating"", ""Shampoo"", ""Body soa...",Short-term,81.0,12,0,1,60,4860.0
4,697004718610327555,43.735735,-79.480703,Downsview-Roding-CFB,Entire home/apt,Entire townhouse,home,6,2,3,2 bath,2.5,"[""Room-darkening shades"", ""Keypad"", ""Elevator""...",Mid-term,236.0,15,0,1,90,21240.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9621,1355385238383046824,43.645749,-79.393083,Waterfront Communities-The Island,Entire home/apt,Entire rental unit,apartment,4,1,2,1 bath,1.0,"[""Hair dryer"", ""Elevator"", ""Central heating"", ...",Mid-term,170.0,0,1,1,24,4080.0
9622,1362331480938692703,43.649540,-79.388640,Waterfront Communities-The Island,Entire home/apt,Entire home,home,6,2,3,1 bath,1.0,"[""Hair dryer"", ""Cleaning available during stay...",Mid-term,145.0,0,0,1,6,870.0
9623,1363197896119450651,43.653270,-79.396790,Kensington-Chinatown,Entire home/apt,Entire home,home,2,1,1,1 bath,1.0,"[""Hair dryer"", ""Keypad"", ""Mini fridge"", ""Heati...",Mid-term,114.0,0,1,5,6,684.0
9624,1364280421231205552,43.705119,-79.272289,Clairlea-Birchmount,Private room,Private room in home,budget,2,1,1,2 shared bath,2.0,"[""Hair dryer"", ""Room-darkening shades"", ""Firep...",Short-term,55.0,0,0,2,6,330.0


### **Feature Extraction - Categorical Columns Binary Encoding**

#### use pd.get_dummies on all categorical columns to binary encode for the ML Models

In [4]:
# Apply pd.get_dummies
listings_df = pd.get_dummies(listings_df, columns=['neighbourhood', 'room_type', 'standardized_property_type', 'stay_duration'], dtype=int)

### **Feature Extraction - Amenities**

#### Limit to Necessary Columns and apply Abstract Syntax Tree
Allows the stringified list in amenities to be turned into an accessible list

In [5]:
# Limit CSV to Amenities Column
listings_amenities = listings_df[['id', 'amenities']].copy()

# Apply AST
listings_amenities['amenities'] = listings_amenities['amenities'].apply(ast.literal_eval)

#### Explode Amenities

In [6]:
# Explode amenities Column
exploded_amenities = listings_amenities.explode('amenities')

#### Amenity Basic Cleanup

In [7]:
# Create Amenity Grouping Dictionary
amenity_grouping = {
    # Grouped Coffee makers
    "Coffee": "Coffee maker",
    "Coffee maker: drip coffee maker": "Coffee maker",
    "Coffee maker: Keurig coffee machine": "Coffee maker",

    # Heating / AC
    "Central heating": "Heating",
    "Central air conditioning": "Air conditioning",

    # Washers / Dryers
    "Free washer – In unit": "Washer",
    "Free dryer – In unit": "Dryer",

    # Cooking tools
    "Baking sheet": "Cooking basics",
    "Electric stove": "Stove",
    "Mini fridge": "Refrigerator",
    "Barbecue utensils": "BBQ grill",
    "Stainless steel oven": "Oven",

    # Privacy / security
    "Lock on bedroom door": "Private room lock",
    "Exterior security cameras on property": "Security camera",

    # Furniture
    "Clothing storage: closet": "Clothing storage",
    "Closet": "Clothing storage",
    "Clothing storage: closet and dresser": "Clothing storage",
    "Drying rack for clothing": "Drying rack",
    
    # Patios
    "Private patio or balcony": "Patio or balcony",
    
    # Extras
    "Exercise equipment": "Gym",
    "Shared gym in building": "Gym",
    "Private backyard – Fully fenced": "Backyard"
}

# Apply mapping to exploded_amenities - essentially renaming similar or same amenities
exploded_amenities['amenities'] = exploded_amenities['amenities'].replace(amenity_grouping)

#### Amenity Standardization (list)

In [8]:
# List of Amenities (to KEEP)
basic_amenities = [
    "Smoke alarm", "Carbon monoxide alarm", "Hot water", "Heating", "Essentials", 
    "Bed linens", "Shampoo", "Shower gel", "Body soap", "Hangers", "Hair dryer",
    "Conditioner", "Fire extinguisher", "Cleaning products", "Private room lock"
]

convenience_amenities = [
    "Wifi", "Kitchen", "Microwave", "Refrigerator", "Freezer", "Cooking basics",
    "Stove", "Oven", "Iron", "Air conditioning", "TV", "Washer", "Dryer", "Dishwasher",
    "Coffee maker", "Hot water kettle", "Self check-in", "Elevator", "Drying rack",
    "Dedicated workspace", "Room-darkening shades", "Free parking on premises",
    "Dishes and silverware", "Dining table", "Toaster", "Extra pillows and blankets",
    "First aid kit", "Wine glasses", "Bathtub", "Outdoor furniture", 'Lockbox', 'Keypad',
    "Paid parking on premises", "Clothing storage", "Blender", "Portable fans",
    "Free street parking", 'Ethernet connection', 'Paid parking off premises',
    "Smart lock", "Rice maker", "Paid street parking off premises"
]

special_amenities = [
    "Hot tub", "Pool", "Private entrance", "BBQ grill", "Fire pit",
    "Outdoor dining area", "Patio or balcony", "Gym", "Lake access",
    "Indoor fireplace", "Sound system", "Game console", "EV charger",
    "Security camera", "Pets allowed", "Books and reading material",
    "Backyard", "City skyline view", "Board games"
]

# might remove these as they're administrative or potentially non relevant at best
other_amenities = [
    'Long term stays allowed', 'Luggage dropoff allowed', 'Laundromat nearby', 
    'Host greets you', 'Single level home', 'Cleaning available during stay'
]

# put together for all amenities to be included
complete_amenities = basic_amenities + convenience_amenities + special_amenities + other_amenities

#### Amenity Standardization

In [9]:
# Standardized Amenities
amenities_clean = []

# Cycle and filter out the amenities on the lists above
for index, row in exploded_amenities.iterrows():
    if row['amenities'] in complete_amenities:
        amenities_clean.append(row)

# Convert amenities_cleaned back into Dataframe
amenities_cleaned = pd.DataFrame(amenities_clean)

# Drop instances of duplicates
amenities_cleaned = amenities_cleaned.drop_duplicates()

#### Amenity Binary encoding by pd.get_dummies - **Method 1** for ML 
- may be best but also may just be considered to be a lot of noise ( will require a round of feature importance to determine whether all columns are beneficial)

In [10]:
# Use pd.get_dummies to make categories ingestible for ML
amenity_dummies = pd.get_dummies(amenities_cleaned, columns=['amenities'])

# Group by id and then sum rows to limit back to 1 row per listing
listing_binary_amenities = amenity_dummies.groupby('id').sum().reset_index()

# Clean up column titles
listing_binary_amenities.columns = listing_binary_amenities.columns.str.replace(" ", "_")

# Merge back into dataframe
listings_df = pd.merge(listings_df, listing_binary_amenities, on='id', how='inner')

#### Amenity Categorization

In [11]:
# Function to define amenity by predefined lists
def categorize_amenity(amenity):
    if amenity in basic_amenities:
        return 'basic'
    elif amenity in convenience_amenities:
        return 'convenience'
    elif amenity in special_amenities:
        return 'special'
    elif amenity in other_amenities :
        return 'other'

# Apply function to amenities and save to new column
amenities_cleaned['amenity_classification'] = amenities_cleaned['amenities'].apply(categorize_amenity)

#### Amenity Category counts by listing_id  - **Method 2** for ML
- Categorized by a system of 4 bins, though dictating via common sense might not be considered completely accurate.

In [12]:
# Count Categories
amenities_counts = amenities_cleaned.groupby(['id', 'amenity_classification']).size().unstack(fill_value=0).reset_index()

# Rename Columns
amenities_counts = amenities_counts.rename(columns={
    'basic': 'basic_amenities_count',
    'convenience': 'convenience_amenities_count',
    'special': 'special_amenities_count',
    'other': 'other_amenities_count'
})

# Total Amenity count
amenities_counts['total_amenities_count'] = amenities_counts['basic_amenities_count'] + amenities_counts['convenience_amenities_count'] + amenities_counts['special_amenities_count'] + amenities_counts['other_amenities_count']

# Merge back into dataframe
listings_df = pd.merge(listings_df, amenities_counts, on='id', how='inner')

### **Drop Select Features**
- Drop property_type as we have standardized_property_type.
- Drop bathrooms as we have bathrooms_count.
- Drop amenities as we have binary encoded it
    - Drop basic_amenities_count, convenience_amenities_count, other_amenities_count, special_amenities_count and total_amenities_count as well as we are using Amenity Selection Method 1 (look above)


In [13]:
# Drop ID
listings_output_features = listings_df.drop(columns=[
    'property_type', 'bathrooms', 'amenities', 'basic_amenities_count', 'convenience_amenities_count', 
    'other_amenities_count', 'special_amenities_count', 'total_amenities_count']) 

### **Standard Scaler**

- This will Scale the numerical columns that aren't hot encoded/ represent magnitude rather than those that represent membership (boolean)

- Not all models require to be scaled
    - Linear Regression, kmeans and nn Models definitely benefit
    - Decision Trees and Random Forest Models less so

In [14]:
# Should we scale for this next model? 
scale_data_boolean = True

In [15]:
# Columns to Scale
numerical_columns = [
    'accommodates', 'bedrooms', 'beds', 'bathrooms_count', 'price','number_of_reviews_ly', 'host_is_superhost', 'host_listings_count',
    'estimated_occupancy_l365d', 'estimated_revenue_l365d']

# Create StandardScaler()
scaler = StandardScaler()

# Scale Numerical Columns
if scale_data_boolean:
    scaled_array = scaler.fit_transform(listings_output_features[numerical_columns])
    scaled_df = pd.DataFrame(scaled_array, columns=numerical_columns)
    
    # Drop unscaled numerical columns and replace with scaled
    for col in numerical_columns:
        listings_output_features[col] = scaled_df[col]

### **Check and Save Cleaned Data**

- keeping id, latitude and longitude for later visualization and demo stages

In [16]:
# Check Output Data
listings_output_features

Unnamed: 0,id,latitude,longitude,accommodates,bedrooms,beds,bathrooms_count,price,number_of_reviews_ly,host_is_superhost,...,amenities_Single_level_home,amenities_Smart_lock,amenities_Smoke_alarm,amenities_Sound_system,amenities_Stove,amenities_TV,amenities_Toaster,amenities_Washer,amenities_Wifi,amenities_Wine_glasses
0,696407278180533419,43.670920,-79.395190,-0.159324,-0.491473,-0.697287,-0.473682,2.148153,-0.687940,-1.174763,...,0,0,0,0,0,1,0,1,1,0
1,696457318817239920,43.655704,-79.399910,-0.643838,-0.491473,-0.697287,-0.473682,-0.623993,-0.687940,0.851235,...,0,0,1,0,1,0,1,0,1,0
2,696602542310304703,43.648778,-79.401183,-0.159324,-0.491473,-0.697287,-0.473682,-0.221585,-0.574807,0.851235,...,0,0,1,1,0,1,1,1,0,1
3,696973520016945803,43.612110,-79.539850,-0.643838,-0.491473,0.180248,-0.473682,-0.601637,-0.122277,-1.174763,...,1,0,1,0,1,0,1,0,1,1
4,697004718610327555,43.735735,-79.480703,1.294221,0.620523,1.057782,2.085266,0.553424,0.047422,-1.174763,...,0,0,1,0,1,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9620,1355385238383046824,43.645749,-79.393083,0.325191,-0.491473,0.180248,-0.473682,0.061591,-0.801072,0.851235,...,1,0,1,0,1,1,1,1,1,0
9621,1362331480938692703,43.649540,-79.388640,1.294221,0.620523,1.057782,-0.473682,-0.124709,-0.801072,-1.174763,...,0,0,1,0,1,1,0,1,1,1
9622,1363197896119450651,43.653270,-79.396790,-0.643838,-0.491473,-0.697287,-0.473682,-0.355721,-0.801072,0.851235,...,0,0,1,0,1,0,0,1,1,0
9623,1364280421231205552,43.705119,-79.272289,-0.643838,-0.491473,-0.697287,1.232283,-0.795390,-0.801072,-1.174763,...,0,0,1,0,0,1,1,0,1,0


In [17]:
# Save Cleaned Data as CSV
if scale_data_boolean:
    listings_output_features.to_csv('../data/processed/listings_feature_matrix_scaled.csv', index=False)
else:
    listings_output_features.to_csv('../data/processed/listings_feature_matrix.csv', index=False)
