In [1]:
# Import dependencies
import re

import pandas as pd

### **Import and Display CSV**

In [2]:
# Location of the raw listings export, relative to this notebook
csv_filepath = "../data/raw/listings.csv"

# Load the raw listings into a DataFrame
listings_csv = pd.read_csv(filepath_or_buffer=csv_filepath)

### **Removing Unnecessary / Noisy Data Columns**

In [3]:
# Columns that are always dropped
definite_removal = [
    'listing_url', 'scrape_id', 'last_scraped', 'source', 'picture_url',
    'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
    'host_response_time', 'host_thumbnail_url', 'host_picture_url',
    'host_neighbourhood', 'host_total_listings_count', 'host_verifications',
    'host_has_profile_pic', 'host_identity_verified',
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    'calendar_updated', 'calendar_last_scraped', 'last_review',
    'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'license', 'instant_bookable', 'reviews_per_month',
]

# Columns flagged as candidates for removal; they are dropped together with the list above
potential_removal = [
    'name', 'description', 'neighborhood_overview',
    'host_response_rate', 'host_acceptance_rate',
    'neighbourhood', 'neighbourhood_group_cleansed', 'bathrooms',
    'has_availability', 'availability_30', 'availability_60', 'availability_90',
    'availability_365', 'availability_eoy',
    'number_of_reviews', 'review_scores_rating',
    'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'review_scores_value', 'review_scores_location',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
]

# Full set of columns to drop, definite ones first
removal = [*definite_removal, *potential_removal]

# New frame without the listed columns; listings_csv itself is left as loaded
listings_reduced = listings_csv.drop(columns=removal)

### **Drop Null Values / Inactive Listings**

In [4]:
# Work on a copy so listings_reduced stays untouched
listings_cleaned = listings_reduced.copy()

# Select rows where number_of_reviews_ly is 0 and first_review is missing
# (treated here as new listings that simply have no reviews yet)
no_reviews_ly = listings_cleaned['number_of_reviews_ly'] == 0
missing_first_review = listings_cleaned['first_review'].isnull()
filtered_rows = no_reviews_ly & missing_first_review

# Give those rows the placeholder 'na' so the dropna below does not discard them
# just because first_review is empty
listings_cleaned.loc[filtered_rows, 'first_review'] = 'na'

# Drop every row that still has a null in any column, then drop first_review,
# which was only needed to protect the unreviewed listings above
listings_cleaned = (
    listings_cleaned
    .dropna(how='any')
    .drop(columns='first_review')
)

In [5]:
# Keep a listing only when both estimated_occupancy_l365d and
# estimated_revenue_l365d are non-zero; a zero in either column removes the row
zero_occupancy = listings_cleaned['estimated_occupancy_l365d'] == 0
zero_revenue = listings_cleaned['estimated_revenue_l365d'] == 0
listings_cleaned = listings_cleaned[~(zero_occupancy | zero_revenue)]

### **Fix Data Types**

In [6]:
# Fix datatypes.
# assign() builds a new frame instead of writing columns into listings_cleaned,
# which at this point is the result of a boolean filter (in-place column writes
# on such a frame trigger pandas' SettingWithCopyWarning).
listings_cleaned = listings_cleaned.assign(
    # 't' -> 1, 'f' -> 0; any other value maps to NaN and makes astype('int64') raise
    host_is_superhost=listings_cleaned['host_is_superhost'].map({'t': 1, 'f': 0}).astype('int64'),
    host_listings_count=listings_cleaned['host_listings_count'].astype('int64'),
    bedrooms=listings_cleaned['bedrooms'].astype('int64'),
    beds=listings_cleaned['beds'].astype('int64'),
    # Strip the currency symbol and thousands separators before converting to float.
    # regex=False makes the literal match explicit: '$' is a regex metacharacter and
    # the default value of `regex` differs between pandas versions.
    price=(
        listings_cleaned['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype('float')
    ),
)


### **Limit Price Range to Remove Outliers**

In [7]:
# Keep listings priced strictly between 1 and 2000 (both bounds are exclusive)
listing_price = listings_cleaned['price']
listings_cleaned = listings_cleaned[(listing_price < 2000) & (listing_price > 1)]

### **Bathrooms Classification / Grouping**

In [8]:
# Raw bathroom descriptions, grouped by the simplified label each one collapses into
no_bath = ['0 baths', '0 shared baths']
one_bath = ['1 bath', 'Half-bath']
one_shared_bath = ['1 shared bath', 'Shared half-bath']
one_private_bath = ['1 private bath', 'Private half-bath']
two_bath = ['2 baths', '2.5 baths', '1.5 baths']
two_shared_bath = ['1.5 shared baths', '2 shared baths', '2.5 shared baths']
three_plus_bath = [
    '3 baths', '4 baths', '5 baths', '3.5 baths', '4.5 baths',
    '5.5 baths', '6.5 baths', '6 baths', '20 baths',
]
three_plus_shared_bath = ['3 shared baths', '3.5 shared baths', '4 shared baths', '4.5 shared baths']


def simplify_bathroom_text(text):
    """Collapse a raw bathroom description into its grouped label.

    Parameters
    ----------
    text : object
        A raw bathroom description such as '1.5 baths' or 'Shared half-bath'.

    Returns
    -------
    object
        The label of the first bin that contains ``text``; a value that appears
        in none of the bins is returned unchanged.
    """
    # (bin, label) pairs in the order they are checked; the first hit wins
    label_by_bin = (
        (no_bath, 'no bath'),
        (one_shared_bath, '1 shared bath'),
        (one_private_bath, '1 private bath'),
        (one_bath, '1 bath'),
        (two_shared_bath, '2 shared bath'),
        (two_bath, '2 bath'),
        (three_plus_bath, '3 or more baths'),
        (three_plus_shared_bath, '3 or more shared baths'),
    )
    for members, label in label_by_bin:
        if text in members:
            return label

    # Not in any bin: pass the value through untouched
    return text

# Replace each bathrooms_text value with its grouped label, then expose the
# column under the name 'bathrooms'.
# assign()/rename() return a new frame, so nothing is written in place into
# listings_cleaned (a boolean-filtered frame at this point, where in-place column
# writes trigger pandas' SettingWithCopyWarning) and no inplace=True is needed.
listings_cleaned = (
    listings_cleaned
    .assign(bathrooms_text=listings_cleaned['bathrooms_text'].apply(simplify_bathroom_text))
    .rename(columns={'bathrooms_text': 'bathrooms'})
)

In [9]:
# Function to group bathroom values by just count
# NOTE: this rebinds the name simplify_bathroom_text used by the label-grouping
# step earlier in the notebook; from here on the name refers to this function.
def simplify_bathroom_text(text):
    """Reduce a bathroom label to a numeric count capped at 3.

    The count is read from the leading number of the label rather than by
    searching for digit characters anywhere in it, so '10 baths' is not
    mistaken for one bathroom and labels such as '4 baths' or '7 baths' no
    longer fall through without a value.

    Parameters
    ----------
    text : object
        A bathroom label, e.g. 'no bath', '1 shared bath', '3 or more baths',
        or a raw description such as '7 baths'.

    Returns
    -------
    int or None
        0 for 'no bath'; 1 for half-bath descriptions; otherwise the integer
        part of the leading number, limited to the range 0..3. None when the
        label does not start with a number.
    """
    if text == 'no bath':
        return 0

    label = str(text).strip()

    # Half-bath descriptions carry no digit; the grouping bins place them with
    # the one-bath labels, so count them as 1 here as well
    if 'half' in label.lower():
        return 1

    try:
        count = int(float(label.split()[0]))
    except (IndexError, ValueError, OverflowError):
        # Empty label, non-numeric first token, or NaN/inf: no count available
        return None

    # Everything from three bathrooms upwards shares the top bucket
    return max(0, min(count, 3))

# Derive a numeric bathrooms_count column from the 'bathrooms' labels
bathroom_labels = listings_cleaned['bathrooms']
listings_cleaned['bathrooms_count'] = bathroom_labels.apply(simplify_bathroom_text)

### **Property Classification / Grouping**

In [10]:
# 'rv' must match as a whole word: as a plain substring it also occurs inside
# 'serviced' (e.g. 'room in serviced apartment')
_RV_WORD = re.compile(r'\brv\b')


# Define the categorization function
def categorize_property(property_type):
    """Map a property type description to one of six broad categories.

    Matching is case-insensitive and substring-based, and the groups are
    checked in the order listed below, so the first group that matches wins
    (e.g. 'Private room in casa particular' is 'Private Spaces').

    Parameters
    ----------
    property_type : str
        Property type text such as 'Entire condo' or 'Private room in home'.

    Returns
    -------
    str
        One of 'Private Spaces', 'Entire Properties', 'Shared Spaces',
        'Unique and Experiential Stays', 'Hotels and Similar' or 'Other'.
    """
    property_type = property_type.lower()

    def mentions(keywords):
        """Return True when any keyword occurs as a substring of the type."""
        return any(keyword in property_type for keyword in keywords)

    # Group 1: Private Spaces
    if mentions(['private room']):
        return 'Private Spaces'

    # Group 2: Entire Properties
    # ('entire home' also covers 'entire home/apt', so no separate entry is needed)
    if mentions(['entire condo', 'entire rental unit', 'entire townhouse', 'entire home',
                 'entire bungalow', 'entire guest suite', 'entire loft',
                 'entire serviced apartment', 'entire guesthouse', 'entire villa',
                 'entire cottage', 'entire vacation home', 'entire cabin']):
        return 'Entire Properties'

    # Group 3: Shared Spaces
    if mentions(['shared room']):
        return 'Shared Spaces'

    # Group 4: Unique and Experiential Stays
    # 'rv' is checked separately as a whole word; the other keywords stay
    # substring matches so that e.g. 'houseboat' still matches 'boat'
    if mentions(['earthen home', 'farm stay', 'cave', 'island', 'boat', 'camper',
                 'shipping container', 'castle', 'tower', 'tiny home',
                 'casa particular', 'room in boutique hotel']) or _RV_WORD.search(property_type):
        return 'Unique and Experiential Stays'

    # Group 5: Hotels and Similar
    if mentions(['room in hotel', 'room in aparthotel']):
        return 'Hotels and Similar'

    # Default case if no matches found
    return 'Other'
# Categorize every listing's property_type into its standardized group
raw_property_types = listings_cleaned['property_type']
listings_cleaned['standardized_property_type'] = raw_property_types.apply(categorize_property)

### **Check and Save Cleaned Data**

In [11]:
# Check Output Data: the bare expression below is the cell's last statement,
# so the cleaned frame is shown as the cell output; nothing is modified here
listings_cleaned

Unnamed: 0,id,host_is_superhost,host_listings_count,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,...,beds,amenities,price,minimum_nights,maximum_nights,number_of_reviews_ly,estimated_occupancy_l365d,estimated_revenue_l365d,bathrooms_count,standardized_property_type
0,696407278180533419,0,7,Annex,43.670920,-79.395190,Entire condo,Entire home/apt,3,1 bath,...,1,"[""Hair dryer"", ""Central heating"", ""Paid parkin...",450.0,28,365,2,56,25200.0,1,Entire Properties
1,696457318817239920,1,8,Kensington-Chinatown,43.655704,-79.399910,Private room in rental unit,Private room,2,1 private bath,...,1,"[""Hair dryer"", ""Central heating"", ""Shampoo"", ""...",78.0,28,365,2,112,8736.0,1,Private Spaces
4,696602542310304703,1,1,Kensington-Chinatown,43.648778,-79.401183,Entire rental unit,Entire home/apt,3,1 bath,...,1,"[""Hair dryer"", ""Room-darkening shades"", ""Keypa...",132.0,28,120,4,224,29568.0,1,Entire Properties
6,696973520016945803,0,1,Alderwood,43.612110,-79.539850,Entire rental unit,Entire home/apt,2,1 bath,...,2,"[""Hair dryer"", ""Heating"", ""Shampoo"", ""Body soa...",81.0,3,90,12,60,4860.0,1,Entire Properties
7,697004718610327555,0,1,Downsview-Roding-CFB,43.735735,-79.480703,Entire townhouse,Entire home/apt,6,2 bath,...,3,"[""Room-darkening shades"", ""Keypad"", ""Elevator""...",236.0,2,365,15,90,21240.0,2,Entire Properties
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21501,1355385238383046824,1,1,Waterfront Communities-The Island,43.645749,-79.393083,Entire rental unit,Entire home/apt,4,1 bath,...,2,"[""Hair dryer"", ""Elevator"", ""Central heating"", ...",170.0,1,365,0,24,4080.0,1,Entire Properties
21558,1362331480938692703,0,1,Waterfront Communities-The Island,43.649540,-79.388640,Entire home,Entire home/apt,6,1 bath,...,3,"[""Hair dryer"", ""Cleaning available during stay...",145.0,2,365,0,6,870.0,1,Entire Properties
21591,1363197896119450651,1,5,Kensington-Chinatown,43.653270,-79.396790,Entire home,Entire home/apt,2,1 bath,...,1,"[""Hair dryer"", ""Keypad"", ""Mini fridge"", ""Heati...",114.0,1,365,0,6,684.0,1,Entire Properties
21609,1364280421231205552,0,2,Clairlea-Birchmount,43.705119,-79.272289,Private room in home,Private room,2,2 shared bath,...,1,"[""Hair dryer"", ""Room-darkening shades"", ""Firep...",55.0,1,7,0,6,330.0,2,Private Spaces


In [12]:
# Count how many cleaned listings carry each raw property_type value
listings_cleaned['property_type'].value_counts()

property_type
Entire rental unit                    2359
Private room in home                  2059
Entire home                           1654
Entire condo                          1609
Entire guest suite                     496
Private room in rental unit            317
Private room in condo                  171
Private room in bungalow               154
Private room in townhouse              152
Entire townhouse                       125
Private room in guest suite            112
Entire bungalow                         91
Entire loft                             90
Entire guesthouse                       74
Room in hotel                           25
Private room in bed and breakfast       23
Private room in villa                   21
Entire serviced apartment               14
Entire place                             9
Room in boutique hotel                   9
Tiny home                                7
Private room in guesthouse               7
Private room                            

In [13]:
# Write the cleaned listings to the processed-data folder as CSV;
# index=False keeps the row index out of the file
listings_cleaned.to_csv(path_or_buf='../data/processed/cleaned_listings.csv', index=False)
