In [1]:
# Import dependencies
import pandas as pd
from math import radians, cos, sin, asin, sqrt

### **Import and Display CSV**

In [2]:
# Filepath
csv_filepath =  "../data/raw/listings.csv"

# Read CSV
listings_csv = pd.read_csv(csv_filepath)

### **Removing Unnecessary/ Noisy Data Columns**

In [3]:
# list of Columns to Definitely Remove
definite_removal = ['listing_url', 'scrape_id', 'last_scraped', 'source','picture_url', 'host_id',
                    'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
                    'host_response_time', 'host_thumbnail_url', 'host_picture_url', 
                    'host_neighbourhood', 'host_total_listings_count', 'host_verifications', 
                    'host_has_profile_pic', 'host_identity_verified','minimum_minimum_nights', 
                    'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 
                    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated',
                    'calendar_last_scraped', 'last_review', 'review_scores_accuracy', 
                    'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 
                    'license', 'instant_bookable','reviews_per_month']

# Potential Removal    
potential_removal = ['name', 'description', 'neighborhood_overview', 'host_response_rate', 
                    'host_acceptance_rate', 'neighbourhood', 'neighbourhood_group_cleansed', 'bathrooms',
                    'has_availability', 'availability_30', 'availability_60', 'availability_90',
                    'availability_365', 'availability_eoy', 'number_of_reviews', 'review_scores_rating',
                    'number_of_reviews_ltm', 'number_of_reviews_l30d', 'review_scores_value', 'review_scores_location',
                    'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 
                    'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms']

removal = definite_removal + potential_removal

listings_reduced = listings_csv.drop(removal, axis=1)

### **Drop Null Values/ Inactive Listings**

In [4]:
# Copy dataframe
listings_cleaned = listings_reduced.copy()

# Fill 'na' for specific rows where number_of_reviews == 0 and first_review is null thus selecting all new listing (new listing = no reviews)
filtered_rows = (listings_cleaned['number_of_reviews_ly'] == 0) & (listings_cleaned['first_review'].isnull())

# Replace Empty values in review columns with 'na' only for unreviewed listings not the inactive listings
listings_cleaned.loc[filtered_rows, 'first_review'] = 'na'

# Now drop rows that still have nulls (i.e. truly missing data)
listings_cleaned = listings_cleaned.dropna(how='any')

# Drop first_review as we only needed it for this part
listings_cleaned= listings_cleaned.drop(columns='first_review')

In [5]:
# Remove rows with 0 estimated_occupancy_l365d and 0 estimated_revenue_l365d
listings_cleaned = listings_cleaned[(listings_cleaned['estimated_occupancy_l365d'] != 0) & (listings_cleaned['estimated_revenue_l365d'] != 0)]

# Remove Rows for Room Type Shared, data is much too sparse
listings_cleaned=listings_cleaned[listings_cleaned['room_type'] != 'Shared room']

### **Fix Data Types**

In [6]:
# fix datatypes
listings_cleaned['host_is_superhost'] = listings_cleaned['host_is_superhost'].map({'t':1, 'f':0}).astype('int64')
listings_cleaned['host_listings_count'] = listings_cleaned['host_listings_count'].astype('int64')
listings_cleaned['bedrooms'] = listings_cleaned['bedrooms'].astype('int64')
listings_cleaned['beds'] = listings_cleaned['beds'].astype('int64')
listings_cleaned['price'] = listings_cleaned['price'].str.replace('$', '').str.replace(',', '').astype('float')


### **Limit Price Range removing Outliers**

In [7]:
# Limit Price range to 1 to 2000
listings_cleaned = listings_cleaned[(listings_cleaned['price'] > 1) & (listings_cleaned['price'] < 2000)]

### **Bathrooms Classification/ Grouping** 

In [8]:
# Define function to extract the bathroom count from the bathrooms_text column
def count_bathrooms(text):
    text = text.lower()
    
    if 'half' in text:
        return 0.5
    else:
        return float(text.split()[0])
    
    # Apply Function to new table
listings_cleaned['bathrooms_count'] = listings_cleaned['bathrooms_text'].apply(count_bathrooms)

In [9]:
# Define Function to extract the bathroom type from the bathroom_text column
def bathroom_type(text):
    text = text.lower()
    
    if 'shared' in text:
        return 'shared'
    elif 'private':
        return 'private'
    else:
        return 'regular'

# Apply function to bathroom type column
listings_cleaned['bathroom_type'] = listings_cleaned['bathrooms_text'].apply(bathroom_type)

### **Property Classification/ Grouping**

In [10]:
# Define the categorization function 
def categorize_property(property_type):
    property_type = property_type.lower()
    
    # Group 1: Budget Spaces
    if any(x in property_type for x in ['private room', 'shared room', 'private room in hostel', 'shared room in casa particular',
                                        'shared room in home', 'shared room in hostel', 'private room in casa particular', 
                                        'casa particular', 'private room in barn', 'shared room in townhouse', 'private room in yurt']):
        return 'budget'
    
    # Group 2: Apartments
    elif any(x in property_type for x in ['entire condo', 'entire rental unit', 'entire serviced apartment', 'entire loft', 'entire apartment',
                                        'private room in condo', 'private room in rental unit', 'private room in serviced apartment',
                                        'private room in loft', 'shared room in condo', 'shared room in rental unit', 'shared room in loft']):
        return 'apartment'
    
    # Group 3: Homes
    elif any(x in property_type for x in ['entire home', 'entire house', 'entire townhouse', 'entire bungalow', 'entire cottage', 'entire home/apt',
                                        'private room in home', 'private room in townhouse', 'private room in bungalow', 'private room in cottage']):
        return 'home'
    
    # Group 4: Vacation Stays
    elif any(x in property_type for x in ['entire villa', 'entire vacation home', 'entire guest suite', 'entire guesthouse', 'entire cabin', 'farm stay',
                                        'private room in guest suite', 'private room in guesthouse', 'private room in villa', 'private room in vacation home']):
        return 'vacation'
    
    # Default case if no matches found
    else:
        return 'Other'
    
# Apply the categorization
listings_cleaned['standardized_property_type'] = listings_cleaned['property_type'].apply(categorize_property)

In [11]:
# Function to clean Property Type
def clean_property_type(property_type):
    property_type = property_type.lower()
    
    for text in ['entire', 'private room in', 'shared room in', 'room in']:
        if text in property_type:
            property_type = property_type.replace(text, "")
        
    return property_type.strip()

# Apply Property Type cleaning function
listings_cleaned['property_type'] = listings_cleaned['property_type'].apply(clean_property_type)


### **Stay Duration Classification**

In [12]:
# Function to Classify Stay Duration
def classify_min_stay(min_nights):
    if min_nights >= 30:
        return 'Long-term'
    else:
        return 'Short-term'

# Function to Classify Stay Duration
def allows_long_term(max_nights):
    if max_nights < 30:
        return 1
    else:
        return 0

# Apply the categorization
listings_cleaned['min_stay'] = listings_cleaned['minimum_nights'].apply(classify_min_stay)
listings_cleaned['allow_long_term'] = listings_cleaned['maximum_nights'].apply(allows_long_term)

### **Neighbourhood Frequency Calc**

In [13]:
# Listings per neighbourhood
listings_cleaned['neighbourhood_freq'] = listings_cleaned['neighbourhood_cleansed'].map(listings_cleaned['neighbourhood_cleansed'].value_counts())

# Listings per property type
listings_cleaned['property_type_freq'] = listings_cleaned['property_type'].map(listings_cleaned['property_type'].value_counts())

# Listings per property type per neighbourhood
freq = listings_cleaned.groupby(['neighbourhood_cleansed', 'property_type']).size().reset_index(name='property_type_neigh_freq')
listings_cleaned = listings_cleaned.merge(freq, on=['neighbourhood_cleansed', 'property_type'], how='left')

### **Distance from Downtown**

In [14]:
# Toronto Downtown/ CN Tower Coords
downtown_lat = 43.6426
downtown_lon = -79.3871

In [15]:
# Function to define distance
def distance_from(lat1, lon1, lat2, lon2):
    # Convert to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    # Calculate distance
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    km = 6371 * c  # Earth's radius in km
    return km

# Apply Function and create new column
listings_cleaned['downtown_distance'] = listings_cleaned.apply(lambda row: distance_from(row['latitude'], row['longitude'], downtown_lat, downtown_lon), axis=1)

In [16]:
# Bin Distance Groups
listings_cleaned['downtown_zone'] = pd.cut(
    listings_cleaned['downtown_distance'], 
    bins=[0, 2, 5, 10, 100],
    labels=['City Center', 'Downtown', 'Near Downtown', 'Outer City']
)

### **Rename/ ReOrder cleaned df to make Cohesive Output**

In [17]:
# rename columns
listings_cleaned = listings_cleaned.rename(columns={
    'neighbourhood_cleansed': 'neighbourhood'    
})

# reorder columns
listings_output = listings_cleaned[[
    'id', 'latitude', 'longitude', 'downtown_distance', 'downtown_zone', 'neighbourhood', 'neighbourhood_freq', 
    'room_type', 'property_type', 'standardized_property_type',
    'property_type_freq', 'property_type_neigh_freq', 'accommodates', 
    'bedrooms', 'beds', 'bathroom_type', 'bathrooms_count', 'amenities', 
    'min_stay', 'allow_long_term', 'price', 'number_of_reviews_ly', 'host_is_superhost', 
    'host_listings_count', 'estimated_occupancy_l365d', 'estimated_revenue_l365d'
    ]]


### **Check and Save Cleaned Data**

In [18]:
# Check Output Data
listings_output

Unnamed: 0,id,latitude,longitude,downtown_distance,downtown_zone,neighbourhood,neighbourhood_freq,room_type,property_type,standardized_property_type,...,bathrooms_count,amenities,min_stay,allow_long_term,price,number_of_reviews_ly,host_is_superhost,host_listings_count,estimated_occupancy_l365d,estimated_revenue_l365d
0,696407278180533419,43.670920,-79.395190,3.215592,Downtown,Annex,288,Entire home/apt,condo,apartment,...,1.0,"[""Hair dryer"", ""Central heating"", ""Paid parkin...",Short-term,0,450.0,2,0,7,56,25200.0
1,696457318817239920,43.655704,-79.399910,1.784751,City Center,Kensington-Chinatown,219,Private room,rental unit,budget,...,1.0,"[""Hair dryer"", ""Central heating"", ""Shampoo"", ""...",Short-term,0,78.0,2,1,8,112,8736.0
2,696602542310304703,43.648778,-79.401183,1.325140,City Center,Kensington-Chinatown,219,Entire home/apt,rental unit,apartment,...,1.0,"[""Hair dryer"", ""Room-darkening shades"", ""Keypa...",Short-term,0,132.0,4,1,1,224,29568.0
3,696973520016945803,43.612110,-79.539850,12.753377,Outer City,Alderwood,37,Entire home/apt,rental unit,apartment,...,1.0,"[""Hair dryer"", ""Heating"", ""Shampoo"", ""Body soa...",Short-term,0,81.0,12,0,1,60,4860.0
4,697004718610327555,43.735735,-79.480703,12.802003,Outer City,Downsview-Roding-CFB,80,Entire home/apt,townhouse,home,...,2.5,"[""Room-darkening shades"", ""Keypad"", ""Elevator""...",Short-term,0,236.0,15,0,1,90,21240.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9609,1355385238383046824,43.645749,-79.393083,0.595242,City Center,Waterfront Communities-The Island,1730,Entire home/apt,rental unit,apartment,...,1.0,"[""Hair dryer"", ""Elevator"", ""Central heating"", ...",Short-term,0,170.0,0,1,1,24,4080.0
9610,1362331480938692703,43.649540,-79.388640,0.781578,City Center,Waterfront Communities-The Island,1730,Entire home/apt,home,home,...,1.0,"[""Hair dryer"", ""Cleaning available during stay...",Short-term,0,145.0,0,0,1,6,870.0
9611,1363197896119450651,43.653270,-79.396790,1.419694,City Center,Kensington-Chinatown,219,Entire home/apt,home,home,...,1.0,"[""Hair dryer"", ""Keypad"", ""Mini fridge"", ""Heati...",Short-term,0,114.0,0,1,5,6,684.0
9612,1364280421231205552,43.705119,-79.272289,11.558048,Outer City,Clairlea-Birchmount,51,Private room,home,budget,...,2.0,"[""Hair dryer"", ""Room-darkening shades"", ""Firep...",Short-term,1,55.0,0,0,2,6,330.0


In [19]:
# Save Cleaned Data as CSV
listings_output.to_csv('../data/processed/cleaned_listings.csv', index=False)
