In [1]:
# Import dependencies
import pandas as pd

### **Import and Display CSV**

In [2]:
# Filepath
csv_filepath =  "../data/raw/listings.csv"

# Read CSV
listings_csv = pd.read_csv(csv_filepath)

### **Removing Unnecessary/ Noisy Data Columns**

In [3]:
# list of Columns to Definitely Remove
definite_removal = ['listing_url', 'scrape_id', 'last_scraped', 'source','picture_url', 'host_id',
                    'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
                    'host_response_time', 'host_thumbnail_url', 'host_picture_url', 
                    'host_neighbourhood', 'host_total_listings_count', 'host_verifications', 
                    'host_has_profile_pic', 'host_identity_verified','minimum_minimum_nights', 
                    'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 
                    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated',
                    'calendar_last_scraped', 'last_review', 'review_scores_accuracy', 
                    'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 
                    'license', 'instant_bookable','reviews_per_month']

# Potential Removal    
potential_removal = ['name', 'description', 'neighborhood_overview', 'host_response_rate', 
                    'host_acceptance_rate', 'neighbourhood', 'neighbourhood_group_cleansed', 'bathrooms',
                    'has_availability', 'availability_30', 'availability_60', 'availability_90',
                    'availability_365', 'availability_eoy', 'number_of_reviews', 'review_scores_rating',
                    'number_of_reviews_ltm', 'number_of_reviews_l30d', 'review_scores_value', 'review_scores_location',
                    'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 
                    'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms']

removal = definite_removal + potential_removal

listings_reduced = listings_csv.drop(removal, axis=1)

### **Drop Null Values/ Inactive Listings**

In [4]:
# Copy dataframe
listings_cleaned = listings_reduced.copy()

# Fill 'na' for specific rows where number_of_reviews == 0 and first_review is null thus selecting all new listing (new listing = no reviews)
filtered_rows = (listings_cleaned['number_of_reviews_ly'] == 0) & (listings_cleaned['first_review'].isnull())

# Replace Empty values in review columns with 'na' only for unreviewed listings not the inactive listings
listings_cleaned.loc[filtered_rows, 'first_review'] = 'na'

# Now drop rows that still have nulls (i.e. truly missing data)
listings_cleaned = listings_cleaned.dropna(how='any')

# Drop first_review as we only needed it for this part
listings_cleaned= listings_cleaned.drop(columns='first_review')

In [5]:
# Remove rows with 0 estimated_occupancy_l365d and 0 estimated_revenue_l365d
listings_cleaned = listings_cleaned[(listings_cleaned['estimated_occupancy_l365d'] != 0) & (listings_cleaned['estimated_revenue_l365d'] != 0)]

### **Fix Data Types**

In [6]:
# fix datatypes
listings_cleaned['host_is_superhost'] = listings_cleaned['host_is_superhost'].map({'t':1, 'f':0}).astype('int64')
listings_cleaned['host_listings_count'] = listings_cleaned['host_listings_count'].astype('int64')
listings_cleaned['bedrooms'] = listings_cleaned['bedrooms'].astype('int64')
listings_cleaned['beds'] = listings_cleaned['beds'].astype('int64')
listings_cleaned['price'] = listings_cleaned['price'].str.replace('$', '').str.replace(',', '').astype('float')


### **Limit Price Range removing Outliers**

In [7]:
# Limit Price range to 1 to 2000
listings_cleaned = listings_cleaned[(listings_cleaned['price'] > 1) & (listings_cleaned['price'] < 2000)]

### **Bathrooms Classification/ Grouping** 

In [8]:
# Define function to extract the bathroom count from the bathrooms_text column
def count_bathrooms(text):
    text = text.lower()
    
    if 'half' in text:
        return 0.5
    else:
        return float(text.split()[0])
    
    # Apply Function to new table
listings_cleaned['bathrooms_count'] = listings_cleaned['bathrooms_text'].apply(count_bathrooms)

In [9]:
# Bathroom value bins
no_bath = ['0 baths', '0 shared baths']
one_bath = ['1 bath', 'Half-bath']
one_shared_bath = ['1 shared bath', 'Shared half-bath']
one_private_bath = ['1 private bath', 'Private half-bath']
two_bath = ['2 baths', '2.5 baths', '1.5 baths', ]
two_shared_bath = ['1.5 shared baths', '2 shared baths', '2.5 shared baths',]
three_plus_bath = ['3 baths', '4 baths', '5 baths', '3.5 baths', '4.5 baths', '5.5 baths', '6.5 baths', '6 baths', '20 baths']
three_plus_shared_bath = ['3 shared baths', '3.5 shared baths', '4 shared baths', '4.5 shared baths']


# Function to group bathroom values
def simplify_bathroom_text(text):
    if text in no_bath:
        return 'no bath'
    elif text in one_shared_bath:
        return '1 shared bath'
    elif text in one_private_bath:
        return '1 private bath'
    elif text in one_bath:
        return '1 bath'
    elif text in two_shared_bath:
        return '2 shared bath'
    elif text in two_bath:
        return '2 bath'
    elif text in three_plus_bath:
        return '3 or more baths'
    elif text in three_plus_shared_bath:
        return '3 or more shared baths'
    else:
        return text

# Apply Function and update
listings_cleaned['bathrooms_text'] = listings_cleaned['bathrooms_text'].apply(simplify_bathroom_text)
listings_cleaned.rename(columns={'bathrooms_text': 'bathrooms'}, inplace=True)

### **Property Classification/ Grouping**

In [10]:
# Define the categorization function 
def categorize_property(property_type):
    property_type = property_type.lower()
    
    # Group 1: Budget Spaces
    if any(x in property_type for x in ['private room', 'shared room', 'private room in hostel', 'shared room in casa particular',
                                        'shared room in home', 'shared room in hostel', 'private room in casa particular', 
                                        'casa particular', 'private room in barn', 'shared room in townhouse', 'private room in yurt']):
        return 'budget'
    
    # Group 2: Apartments
    elif any(x in property_type for x in ['entire condo', 'entire rental unit', 'entire serviced apartment', 'entire loft', 'entire apartment',
                                        'private room in condo', 'private room in rental unit', 'private room in serviced apartment',
                                        'private room in loft', 'shared room in condo', 'shared room in rental unit', 'shared room in loft']):
        return 'apartment'
    
    # Group 3: Homes
    elif any(x in property_type for x in ['entire home', 'entire house', 'entire townhouse', 'entire bungalow', 'entire cottage', 'entire home/apt',
                                        'private room in home', 'private room in townhouse', 'private room in bungalow', 'private room in cottage']):
        return 'home'
    
    # Group 4: Vacation Stays
    elif any(x in property_type for x in ['entire villa', 'entire vacation home', 'entire guest suite', 'entire guesthouse', 'entire cabin', 'farm stay',
                                        'private room in guest suite', 'private room in guesthouse', 'private room in villa', 'private room in vacation home']):
        return 'vacation'
    
    # Group 5: Hotels
    elif any(x in property_type for x in ['room in hotel', 'room in aparthotel', 'room in boutique hotel', 'room in aparthotel', 'private room in bed and breakfast']):
        return 'hotels'
    
    # Group 6: Unique stays
    elif any(x in property_type for x in ['tiny home', 'private room in tiny home', 'castle', 'private room in castle', 'camper', 'rv', 'camper/rv',
                                        'boat', 'island', 'shipping container', 'tower', 'cave', 'private room in earthen home']):
        return 'unique'
    
    # Default case if no matches found
    else:
        return 'Other'
    
# Apply the categorization
listings_cleaned['standardized_property_type'] = listings_cleaned['property_type'].apply(categorize_property)

### **Stay Duration Classification**

In [11]:
# Function to Classify Stay Duration
def classify_rental(min_nights, max_nights):
    if min_nights > 31:
        return 'Long-term'
    elif min_nights <= 31 and max_nights <= 180:
        return 'Short-term'
    else:
        return 'Mid-term'

# Apply the categorization
listings_cleaned['stay_duration'] = listings_cleaned[['minimum_nights', 'maximum_nights']].apply(lambda x : classify_rental(x[0],x[1]), axis=1)

  listings_cleaned['stay_duration'] = listings_cleaned[['minimum_nights', 'maximum_nights']].apply(lambda x : classify_rental(x[0],x[1]), axis=1)


### **Rename/ ReOrder cleaned df to make Cohesive Output**

In [12]:
# rename columns
listings_cleaned = listings_cleaned.rename(columns={
    'neighbourhood_cleansed': 'neighbourhood'    
})

# reorder columns
listings_output = listings_cleaned[[
    'id', 'latitude', 'longitude','neighbourhood', 'room_type', 'property_type', 
    'standardized_property_type', 'accommodates', 'bedrooms', 'beds', 'bathrooms',
    'bathrooms_count', 'amenities', 'stay_duration', 'price',
    'number_of_reviews_ly', 'host_is_superhost', 'host_listings_count', 
    'estimated_occupancy_l365d', 'estimated_revenue_l365d'
    ]]


### **Check and Save Cleaned Data**

In [13]:
# Check Output Data
listings_output

Unnamed: 0,id,latitude,longitude,neighbourhood,room_type,property_type,standardized_property_type,accommodates,bedrooms,beds,bathrooms,bathrooms_count,amenities,stay_duration,price,number_of_reviews_ly,host_is_superhost,host_listings_count,estimated_occupancy_l365d,estimated_revenue_l365d
0,696407278180533419,43.670920,-79.395190,Annex,Entire home/apt,Entire condo,apartment,3,1,1,1 bath,1.0,"[""Hair dryer"", ""Central heating"", ""Paid parkin...",Mid-term,450.0,2,0,7,56,25200.0
1,696457318817239920,43.655704,-79.399910,Kensington-Chinatown,Private room,Private room in rental unit,budget,2,1,1,1 private bath,1.0,"[""Hair dryer"", ""Central heating"", ""Shampoo"", ""...",Mid-term,78.0,2,1,8,112,8736.0
4,696602542310304703,43.648778,-79.401183,Kensington-Chinatown,Entire home/apt,Entire rental unit,apartment,3,1,1,1 bath,1.0,"[""Hair dryer"", ""Room-darkening shades"", ""Keypa...",Short-term,132.0,4,1,1,224,29568.0
6,696973520016945803,43.612110,-79.539850,Alderwood,Entire home/apt,Entire rental unit,apartment,2,1,2,1 bath,1.0,"[""Hair dryer"", ""Heating"", ""Shampoo"", ""Body soa...",Short-term,81.0,12,0,1,60,4860.0
7,697004718610327555,43.735735,-79.480703,Downsview-Roding-CFB,Entire home/apt,Entire townhouse,home,6,2,3,2 bath,2.5,"[""Room-darkening shades"", ""Keypad"", ""Elevator""...",Mid-term,236.0,15,0,1,90,21240.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21501,1355385238383046824,43.645749,-79.393083,Waterfront Communities-The Island,Entire home/apt,Entire rental unit,apartment,4,1,2,1 bath,1.0,"[""Hair dryer"", ""Elevator"", ""Central heating"", ...",Mid-term,170.0,0,1,1,24,4080.0
21558,1362331480938692703,43.649540,-79.388640,Waterfront Communities-The Island,Entire home/apt,Entire home,home,6,2,3,1 bath,1.0,"[""Hair dryer"", ""Cleaning available during stay...",Mid-term,145.0,0,0,1,6,870.0
21591,1363197896119450651,43.653270,-79.396790,Kensington-Chinatown,Entire home/apt,Entire home,home,2,1,1,1 bath,1.0,"[""Hair dryer"", ""Keypad"", ""Mini fridge"", ""Heati...",Mid-term,114.0,0,1,5,6,684.0
21609,1364280421231205552,43.705119,-79.272289,Clairlea-Birchmount,Private room,Private room in home,budget,2,1,1,2 shared bath,2.0,"[""Hair dryer"", ""Room-darkening shades"", ""Firep...",Short-term,55.0,0,0,2,6,330.0


In [14]:
# Save Cleaned Data as CSV
listings_output.to_csv('../data/processed/cleaned_listings.csv', index=False)
