In [23]:
# Import dependencies
import pandas as pd
import ast                          # importing abstract syntax tree to allow the parsing of string based lists as lists  
from sklearn.preprocessing import StandardScaler
import numpy as np

### **Import and make a copy of CSV**

In [24]:
# Filepath
csv_filepath =  "../../../data/processed/cleaned_listings.csv"

# Read CSV
listings_csv = pd.read_csv(csv_filepath)

# make a copy
listings_df = listings_csv.copy()

In [30]:
listings_df

Unnamed: 0,id,latitude,longitude,downtown_distance,downtown_zone,neighbourhood,neighbourhood_freq,luxury_neighborhood_flag,room_type,property_type,...,bathrooms_count,amenities,min_stay,allow_long_term,price,number_of_reviews_ly,host_is_superhost,host_listings_count,estimated_occupancy_l365d,estimated_revenue_l365d
0,696407278180533419,43.670920,-79.395190,3.215592,Downtown,Annex,288,0,Entire home/apt,condo,...,1.0,"[""Hair dryer"", ""Central heating"", ""Paid parkin...",Short-term,0,450.0,2,0,7,56,25200.0
1,696457318817239920,43.655704,-79.399910,1.784751,City Center,Kensington-Chinatown,219,0,Private room,rental unit,...,1.0,"[""Hair dryer"", ""Central heating"", ""Shampoo"", ""...",Short-term,0,78.0,2,1,8,112,8736.0
2,696602542310304703,43.648778,-79.401183,1.325140,City Center,Kensington-Chinatown,219,0,Entire home/apt,rental unit,...,1.0,"[""Hair dryer"", ""Room-darkening shades"", ""Keypa...",Short-term,0,132.0,4,1,1,224,29568.0
3,696973520016945803,43.612110,-79.539850,12.753377,Outer City,Alderwood,37,0,Entire home/apt,rental unit,...,1.0,"[""Hair dryer"", ""Heating"", ""Shampoo"", ""Body soa...",Short-term,0,81.0,12,0,1,60,4860.0
4,697004718610327555,43.735735,-79.480703,12.802003,Outer City,Downsview-Roding-CFB,80,0,Entire home/apt,townhouse,...,2.5,"[""Room-darkening shades"", ""Keypad"", ""Elevator""...",Short-term,0,236.0,15,0,1,90,21240.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9609,1355385238383046824,43.645749,-79.393083,0.595242,City Center,Waterfront Communities-The Island,1730,1,Entire home/apt,rental unit,...,1.0,"[""Hair dryer"", ""Elevator"", ""Central heating"", ...",Short-term,0,170.0,0,1,1,24,4080.0
9610,1362331480938692703,43.649540,-79.388640,0.781578,City Center,Waterfront Communities-The Island,1730,1,Entire home/apt,home,...,1.0,"[""Hair dryer"", ""Cleaning available during stay...",Short-term,0,145.0,0,0,1,6,870.0
9611,1363197896119450651,43.653270,-79.396790,1.419694,City Center,Kensington-Chinatown,219,0,Entire home/apt,home,...,1.0,"[""Hair dryer"", ""Keypad"", ""Mini fridge"", ""Heati...",Short-term,0,114.0,0,1,5,6,684.0
9612,1364280421231205552,43.705119,-79.272289,11.558048,Outer City,Clairlea-Birchmount,51,0,Private room,home,...,2.0,"[""Hair dryer"", ""Room-darkening shades"", ""Firep...",Short-term,1,55.0,0,0,2,6,330.0


### **Feature Engineering**

In [4]:
# Extra Feature Engineering (taken from Jitesh)
listings_df['bed_bath_interaction'] = listings_df['bedrooms'] * listings_df['bathrooms_count']
listings_df['log_beds'] = np.log1p(listings_df['beds'])
listings_df['log_reviews_ly'] = np.log1p(listings_df['number_of_reviews_ly'])

This next part proved ineffective, just noise

In [5]:
# # Extra Feature Engineering ideas
# listings_df['beds_x_accommodates'] = listings_df['beds'] * listings_df['accommodates']
# listings_df['bedrooms_x_neigh_freq'] = listings_df['bedrooms'] * listings_df['neighbourhood_freq']
# listings_df['bathrooms_x_neigh_freq'] = listings_df['bathrooms_count'] * listings_df['neighbourhood_freq']
# listings_df['beds_per_bedroom'] = listings_df['beds'] / (listings_df['bedrooms'] + 0.1)

### **Feature Extraction - Categorical Columns Binary Encoding**

#### use pd.get_dummies on all categorical columns to binary encode for the ML Models

In [6]:
# Apply pd.get_dummies
listings_df = pd.get_dummies(listings_df, columns=['bathroom_type', 'min_stay', 'downtown_zone', 'neighbourhood', 'room_type', 'property_type', 'standardized_property_type'], dtype=int)

### **Feature Extraction - Amenities**

#### Limit to Necessary Columns and apply Abstract Syntax Tree
Allows the stringified list in amenities to be turned into an accessible list

In [7]:
# Limit CSV to Amenities Column
listings_amenities = listings_df[['id', 'amenities']].copy()

# Apply AST
listings_amenities['amenities'] = listings_amenities['amenities'].apply(ast.literal_eval)

#### Explode Amenities

In [8]:
# Explode amenities Column
exploded_amenities = listings_amenities.explode('amenities')

#### Amenity Basic Cleanup

In [9]:
# Create Amenity Grouping Dictionary
amenity_grouping = {
    # Grouped Coffee makers
    "Coffee": "Coffee maker",
    "Coffee maker: drip coffee maker": "Coffee maker",
    "Coffee maker: Keurig coffee machine": "Coffee maker",

    # Heating / AC
    "Central heating": "Heating",
    "Central air conditioning": "Air conditioning",

    # Washers / Dryers
    "Free washer – In unit": "Washer",
    "Free dryer – In unit": "Dryer",

    # Cooking tools
    "Baking sheet": "Cooking basics",
    "Electric stove": "Stove",
    "Mini fridge": "Refrigerator",
    "Barbecue utensils": "BBQ grill",
    "Stainless steel oven": "Oven",

    # Privacy / security
    "Lock on bedroom door": "Private room lock",
    "Exterior security cameras on property": "Security camera",

    # Furniture
    "Clothing storage: closet": "Clothing storage",
    "Closet": "Clothing storage",
    "Clothing storage: closet and dresser": "Clothing storage",
    "Drying rack for clothing": "Drying rack",
    
    # Patios
    "Private patio or balcony": "Patio or balcony",
    
    # Extras
    "Exercise equipment": "Gym",
    "Shared gym in building": "Gym",
    "Private backyard – Fully fenced": "Backyard"
}

# Apply mapping to exploded_amenities - essentially renaming similar or same amenities
exploded_amenities['amenities'] = exploded_amenities['amenities'].replace(amenity_grouping)

#### Amenity Standardization (list)

In [10]:
# List of Amenities (to KEEP)
basic_amenities = [
    "Smoke alarm", "Carbon monoxide alarm", "Hot water", "Heating", "Essentials", 
    "Bed linens", "Shampoo", "Shower gel", "Body soap", "Hangers", "Hair dryer",
    "Conditioner", "Fire extinguisher", "Cleaning products", "Private room lock"
]

convenience_amenities = [
    "Wifi", "Kitchen", "Microwave", "Refrigerator", "Freezer", "Cooking basics",
    "Stove", "Oven", "Iron", "Air conditioning", "TV", "Dishwasher",
    "Coffee maker", "Hot water kettle", "Self check-in", "Elevator", "Drying rack",
    "Room-darkening shades", "Dishes and silverware", "Toaster", "Extra pillows and blankets",
    "First aid kit", "Wine glasses", "Lockbox", "Keypad", "Clothing storage",
    "Blender", "Portable fans", "Smart lock", "Rice maker", "Ethernet connection"
]

special_amenities = [
    "Hot tub", "Pool", "Private entrance", "BBQ grill", "Fire pit",
    "Outdoor dining area", "Patio or balcony", "Gym", "Lake access",
    "Indoor fireplace", "Sound system", "Game console", "EV charger",
    "Security camera", "Pets allowed", "Backyard", "City skyline view", "Board games"
]

# might remove these as they're administrative or potentially non relevant at best
other_amenities = [
    "Washer", "Dryer", "Bathtub", "Dining table", "Dedicated workspace",
    "Free parking on premises", "Paid parking on premises",
    "Free street parking", "Paid street parking off premises",
    "Books and reading material", "Outdoor furniture",
    "Long term stays allowed", "Luggage dropoff allowed", "Laundromat nearby", 
    "Host greets you", "Single level home", "Cleaning available during stay"
]

# put together for all amenities to be included
complete_amenities = basic_amenities + convenience_amenities + special_amenities + other_amenities

#### Amenity Standardization

In [11]:
# Standardized Amenities
amenities_clean = []

# Cycle and filter out the amenities on the lists above
for index, row in exploded_amenities.iterrows():
    if row['amenities'] in complete_amenities:
        amenities_clean.append(row)

# Convert amenities_cleaned back into Dataframe
amenities_cleaned = pd.DataFrame(amenities_clean)

# Drop instances of duplicates
amenities_cleaned = amenities_cleaned.drop_duplicates()

#### Amenity Binary encoding by pd.get_dummies - **Method 1** for ML 
- may be best but also may just be considered to be a lot of noise ( will require a round of feature importance to determine whether all columns are beneficial)

In [12]:
# Use pd.get_dummies to make categories ingestible for ML
amenity_dummies = pd.get_dummies(amenities_cleaned, columns=['amenities'])

# Group by id and then sum rows to limit back to 1 row per listing
listing_binary_amenities = amenity_dummies.groupby('id').sum().reset_index()

# Clean up column titles
listing_binary_amenities.columns = listing_binary_amenities.columns.str.replace(" ", "_")

# Merge back into dataframe
listings_df = pd.merge(listings_df, listing_binary_amenities, on='id', how='inner')

#### Amenity Categorization

In [13]:
# Function to define amenity by predefined lists
def categorize_amenity(amenity):
    if amenity in basic_amenities:
        return 'basic'
    elif amenity in convenience_amenities:
        return 'convenience'
    elif amenity in special_amenities:
        return 'special'
    elif amenity in other_amenities :
        return 'other'

# Apply function to amenities and save to new column
amenities_cleaned['amenity_classification'] = amenities_cleaned['amenities'].apply(categorize_amenity)

#### Amenity Category counts by listing_id  - **Method 2** for ML
- Categorized by a system of 4 bins, though dictating via common sense might not be considered completely accurate.

In [14]:
# Count Categories
amenities_counts = amenities_cleaned.groupby(['id', 'amenity_classification']).size().unstack(fill_value=0).reset_index()

# Rename Columns
amenities_counts = amenities_counts.rename(columns={
    'basic': 'basic_amenities_count',
    'convenience': 'convenience_amenities_count',
    'special': 'special_amenities_count',
    'other': 'other_amenities_count'
})

# Total Amenity count
# amenities_counts['total_amenities_count'] = amenities_counts['basic_amenities_count'] + amenities_counts['convenience_amenities_count'] + amenities_counts['special_amenities_count'] + amenities_counts['other_amenities_count']

# Merge back into dataframe
# listings_df = pd.merge(listings_df, amenities_counts, on='id', how='inner')

### **Drop Select Features**
- Drop property_type as we have standardized_property_type.
- Drop luxury markers as they're derived from our target columns price and estimated revenue
- Drop bathrooms as we have bathrooms_count.
- Drop Room Type as we hav ea standardized and more descriptive Property Type
- Drop original amenities as we have one hot encoded it
    - Drop basic_amenities_count, convenience_amenities_count, other_amenities_count, special_amenities_count and total_amenities_count as well as we are using Amenity Selection Method 1 (look above)


In [15]:
# Drop ID
listings_price_features = listings_df.drop(columns=[
    'luxury_neighborhood_flag', 'luxury_score', 'amenities']) 

### **Standard Scaler**

- This will Scale the numerical columns that aren't hot encoded/ represent magnitude rather than those that represent membership (boolean)

- Not all models require to be scaled
    - Linear Regression, kmeans and nn Models definitely benefit
    - Decision Trees and Random Forest Models less so

In [16]:
# Should we scale for this next model? 
scale_data_boolean = True

In [17]:
listings_price_features.columns[0:30]

Index(['id', 'latitude', 'longitude', 'downtown_distance',
       'neighbourhood_freq', 'property_type_freq', 'property_type_neigh_freq',
       'accommodates', 'bedrooms', 'beds', 'bathrooms_count',
       'allow_long_term', 'price', 'number_of_reviews_ly', 'host_is_superhost',
       'host_listings_count', 'estimated_occupancy_l365d',
       'estimated_revenue_l365d', 'bed_bath_interaction', 'log_beds',
       'log_reviews_ly', 'bathroom_type_private', 'bathroom_type_shared',
       'min_stay_Long-term', 'min_stay_Short-term',
       'downtown_zone_City Center', 'downtown_zone_Downtown',
       'downtown_zone_Near Downtown', 'downtown_zone_Outer City',
       'neighbourhood_Agincourt North'],
      dtype='object')

In [18]:
# Columns to Scale
price_columns = [
    'latitude', 'longitude', 'accommodates', 'bedrooms', 'beds', 'neighbourhood_freq',
    'property_type_neigh_freq', 'property_type_freq', 'bathrooms_count', 'host_listings_count',
    'number_of_reviews_ly', 'bed_bath_interaction', 'log_beds', 'downtown_distance'
    # , 'beds_x_accommodates', 'bedrooms_x_neigh_freq', 'bathrooms_x_neigh_freq', 'beds_per_bedroom'
    ]
occupancy_columns = price_columns + ['price']

# make a copy of output dataframe
listings_occupancy_features = listings_price_features.copy()

# Create StandardScaler()
scaler = StandardScaler()

# Scale Numerical Columns
if scale_data_boolean:
    scaled_occupancy = scaler.fit_transform(listings_price_features[occupancy_columns])
    scaled_price = scaler.fit_transform(listings_occupancy_features[price_columns])
    
    scaled_occupancy_df = pd.DataFrame(scaled_occupancy, columns=occupancy_columns)
    scaled_price_df = pd.DataFrame(scaled_price, columns=price_columns)
    
    # Drop unscaled numerical columns and replace with scaled
    for col in occupancy_columns:
        listings_occupancy_features[col] = scaled_occupancy_df[col]
    
    for col in price_columns:
        listings_price_features[col] = scaled_price_df[col]

### **Check and Save Cleaned Data**

- keeping id, latitude and longitude for later visualization and demo stages

In [19]:
# Check Output Data
display(listings_occupancy_features)
display(listings_price_features)

Unnamed: 0,id,latitude,longitude,downtown_distance,neighbourhood_freq,property_type_freq,property_type_neigh_freq,accommodates,bedrooms,beds,...,amenities_Single_level_home,amenities_Smart_lock,amenities_Smoke_alarm,amenities_Sound_system,amenities_Stove,amenities_TV,amenities_Toaster,amenities_Washer,amenities_Wifi,amenities_Wine_glasses
0,696407278180533419,-0.249773,0.024375,-0.621581,-0.197747,-0.670719,-0.308531,-0.160269,-0.491918,-0.697614,...,0,0,0,0,0,1,0,1,1,0
1,696457318817239920,-0.554987,-0.039965,-0.855404,-0.308232,0.096774,-0.361087,-0.644755,-0.491918,-0.697614,...,0,0,1,0,1,0,1,0,1,0
2,696602542310304703,-0.693914,-0.057328,-0.930511,-0.308232,0.096774,-0.361087,-0.160269,-0.491918,-0.697614,...,0,0,1,1,0,1,1,1,0,1
3,696973520016945803,-1.429419,-1.947689,0.937046,-0.599654,0.096774,-0.564741,-0.644755,-0.491918,0.179697,...,1,0,1,0,1,0,1,0,1,1
4,697004718610327555,1.050319,-1.141370,0.944992,-0.530801,-1.958153,-0.538463,1.293188,0.619696,1.057008,...,0,0,1,0,1,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9608,1355385238383046824,-0.754676,0.053104,-1.049788,2.111215,0.096774,1.902105,0.324217,-0.491918,0.179697,...,1,0,1,0,1,1,1,1,1,0
9609,1362331480938692703,-0.678626,0.113668,-1.019338,2.111215,0.985043,-0.515470,1.293188,0.619696,1.057008,...,0,0,1,0,1,1,0,1,1,1
9610,1363197896119450651,-0.603807,0.002563,-0.915060,-0.308232,0.985043,-0.292107,-0.644755,-0.491918,-0.697614,...,0,0,1,0,1,0,0,1,1,0
9611,1364280421231205552,0.436208,1.699811,0.741710,-0.577237,0.985043,-0.482622,-0.644755,-0.491918,-0.697614,...,0,0,1,0,0,1,1,0,1,0


Unnamed: 0,id,latitude,longitude,downtown_distance,neighbourhood_freq,property_type_freq,property_type_neigh_freq,accommodates,bedrooms,beds,...,amenities_Single_level_home,amenities_Smart_lock,amenities_Smoke_alarm,amenities_Sound_system,amenities_Stove,amenities_TV,amenities_Toaster,amenities_Washer,amenities_Wifi,amenities_Wine_glasses
0,696407278180533419,-0.249773,0.024375,-0.621581,-0.197747,-0.670719,-0.308531,-0.160269,-0.491918,-0.697614,...,0,0,0,0,0,1,0,1,1,0
1,696457318817239920,-0.554987,-0.039965,-0.855404,-0.308232,0.096774,-0.361087,-0.644755,-0.491918,-0.697614,...,0,0,1,0,1,0,1,0,1,0
2,696602542310304703,-0.693914,-0.057328,-0.930511,-0.308232,0.096774,-0.361087,-0.160269,-0.491918,-0.697614,...,0,0,1,1,0,1,1,1,0,1
3,696973520016945803,-1.429419,-1.947689,0.937046,-0.599654,0.096774,-0.564741,-0.644755,-0.491918,0.179697,...,1,0,1,0,1,0,1,0,1,1
4,697004718610327555,1.050319,-1.141370,0.944992,-0.530801,-1.958153,-0.538463,1.293188,0.619696,1.057008,...,0,0,1,0,1,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9608,1355385238383046824,-0.754676,0.053104,-1.049788,2.111215,0.096774,1.902105,0.324217,-0.491918,0.179697,...,1,0,1,0,1,1,1,1,1,0
9609,1362331480938692703,-0.678626,0.113668,-1.019338,2.111215,0.985043,-0.515470,1.293188,0.619696,1.057008,...,0,0,1,0,1,1,0,1,1,1
9610,1363197896119450651,-0.603807,0.002563,-0.915060,-0.308232,0.985043,-0.292107,-0.644755,-0.491918,-0.697614,...,0,0,1,0,1,0,0,1,1,0
9611,1364280421231205552,0.436208,1.699811,0.741710,-0.577237,0.985043,-0.482622,-0.644755,-0.491918,-0.697614,...,0,0,1,0,0,1,1,0,1,0


In [20]:
# Save Cleaned Data as CSV
if scale_data_boolean:
    listings_occupancy_features.to_csv('../../../data/processed/listings_feature_matrix_scaled_occupancy.csv', index=False)
    listings_price_features.to_csv('../../../data/processed/listings_feature_matrix_scaled_price.csv', index=False)
else:
    listings_price_features.to_csv('../../../data/processed/listings_feature_matrix.csv', index=False)
