In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import warnings

In [107]:
COLUMNS_TO_DROP = [
    "name",
    "description",
    "listing_url",
    "scrape_id",
    "last_scraped",
    "source",
    "picture_url",
    "host_name",
    "host_about",
    "host_url",
    "host_verifications",
    "host_location",
    "neighborhood_overview",
    "neighbourhood",
    "host_neighbourhood", #neighbourhood_cleansed used instead
    "neighbourhood_group_cleansed",
    "host_thumbnail_url",
    "host_picture_url",
    "property_type", #use room_type instead as it isn't freeform text
    "license",
    "calendar_updated",
    "calendar_last_scraped"
]

COLUMNS_TO_RENAME = {'price': 'price_nok'}
DATE_COLUMNS = ["host_since","first_review","last_review"]


In [108]:
df_listings = pd.read_csv("listings.csv")
df_listings.head(5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,42932,https://www.airbnb.com/rooms/42932,20240629155744,2024-06-29,city scrape,"Charming apartment, Oslo Center, Ro",,(URL HIDDEN),https://a0.muscache.com/pictures/miso/Hosting-...,187463,...,4.96,4.96,4.85,,f,2,2,0,0,0.39
1,71725,https://www.airbnb.com/rooms/71725,20240629155744,2024-06-29,city scrape,Central big room - ap with balcony,A bargain in Oslo!!<br />Room in new and fully...,The apartment is situated in the eastern part ...,https://a0.muscache.com/pictures/468893/9c8190...,368229,...,4.94,4.71,4.8,,f,1,0,1,0,0.34
2,85902,https://www.airbnb.com/rooms/85902,20240629155744,2024-06-29,city scrape,"Stylish design Apt, super central",Discount on longer stays - Super central locat...,A very central location on the east side of to...,https://a0.muscache.com/pictures/fcb4770a-e06f...,250159,...,4.83,4.49,4.71,,f,2,1,1,0,0.5
3,123041,https://www.airbnb.com/rooms/123041,20240629155744,2024-06-29,city scrape,"Cozy room in design apartment, central east side",Welcome to a relaxing stay at a creative desig...,A very central location on the east side of to...,https://a0.muscache.com/pictures/fc30957d-cbbe...,250159,...,4.85,4.61,4.76,,f,2,1,1,0,1.48
4,149776,https://www.airbnb.com/rooms/149776,20240629155744,2024-06-30,city scrape,Bird's nest in the heart of Oslo,"A double room in a spacious, elegant apartment...","Despite its central location, the neighbourhoo...",https://a0.muscache.com/pictures/fc1eb1a5-0ab8...,714768,...,4.95,4.88,4.83,,f,2,0,2,0,0.27


In [109]:
# Drop columns that are defined as generally unnecessary for our problem and rename where necessary
def drop_columns_by_list (df, columns_to_delete):
    column_filter = df.filter(columns_to_delete)
    df.drop(column_filter, inplace=True, axis=1)
    return df

df_listings = drop_columns_by_list(df=df_listings,columns_to_delete=COLUMNS_TO_DROP)
df_listings.rename(columns=COLUMNS_TO_RENAME,inplace=True)

def convert_price_column(df):
    # Convert prive column
    if df['price_nok'].dtype != "float":
        # Remove the dollar sign and commas, then convert to float column
        df['price_nok'] = df['price_nok'].str.replace('$', '').str.replace(',', '').astype(float)
    return df


df_listings = convert_price_column(df_listings)
df_listings["price_nok"].head(5)

  df['price_nok'] = df['price_nok'].str.replace('$', '').str.replace(',', '').astype(float)


0    3400.0
1     572.0
2    1929.0
3     736.0
4    1029.0
Name: price_nok, dtype: float64

In [110]:
# Helper function to check if a value is 't', 'f', or NaN
def is_tf_nan(value):
    return value in ['t', 'f'] or pd.isna(value)

# Convert boolean (t,f) to 0,1
def find_and_convert_boolean_columns(df):
    # Identify columns that contain only 't', 'f', or NaN
    columns_to_convert = df.columns[df.applymap(is_tf_nan).all()]
    print("Boolean columns to be converted:", columns_to_convert)
    df[columns_to_convert] = df[columns_to_convert].replace({'t': 1, 'f': 0}).fillna(0).astype(int)
    return df

# Remove percentage signs and convert to float
def find_and_convert_percentage_columns(df):
    percentage_columns = []

    # Identify columns with percentage values
    for column in df.columns:
        if df[column].dropna().astype(str).str.endswith('%').any():
            percentage_columns.append(column)
    
    print("Percentage columns to be converted:", percentage_columns)
    # Convert identified columns to decimal format
    for column in percentage_columns:
        df[column] = df[column].str.replace('%', '').astype(float) / 100
        df[column] = df[column].fillna(0)  # Handle NaN values if needed

    return df

def find_and_convert_date_columns(df,date_columns_to_convert,date_format='%Y-%m-%d'):
    column_filter = df.filter(date_columns_to_convert)
    
    print("Date columns to be converted:", column_filter.columns)
    # Convert identified columns to datetime
    for column in column_filter:
        df[column] = pd.to_datetime(df[column], format=date_format)
    
    return df


df_listings = find_and_convert_boolean_columns(df_listings)
df_listings = find_and_convert_percentage_columns(df_listings)
df_listings = find_and_convert_date_columns(df_listings, date_columns_to_convert=DATE_COLUMNS)
df_listings.head(5)

Boolean columns to be converted: Index(['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
       'has_availability', 'instant_bookable'],
      dtype='object')
Percentage columns to be converted: ['host_response_rate', 'host_acceptance_rate']
Date columns to be converted: Index(['host_since', 'first_review', 'last_review'], dtype='object')


Unnamed: 0,id,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,42932,187463,2010-08-01,within an hour,1.0,1.0,1,2,2,1,...,4.94,4.96,4.96,4.85,0,2,2,0,0,0.39
1,71725,368229,2011-01-30,within a few hours,1.0,0.67,0,1,1,1,...,4.98,4.94,4.71,4.8,0,1,0,1,0,0.34
2,85902,250159,2010-10-01,within an hour,1.0,0.96,1,3,3,1,...,4.73,4.83,4.49,4.71,0,2,1,1,0,0.5
3,123041,250159,2010-10-01,within an hour,1.0,0.96,1,3,3,1,...,4.85,4.85,4.61,4.76,0,2,1,1,0,1.48
4,149776,714768,2011-06-18,within an hour,1.0,0.67,0,2,2,1,...,4.98,4.95,4.88,4.83,0,2,0,2,0,0.27


In [111]:
# Inspect columns that contain Nan values and decide how to handle them
print(df_listings.isna().sum().sort_values(ascending=False).to_string())

review_scores_value                             2739
review_scores_location                          2738
review_scores_accuracy                          2738
review_scores_cleanliness                       2738
review_scores_checkin                           2738
review_scores_communication                     2738
reviews_per_month                               2714
first_review                                    2714
last_review                                     2714
review_scores_rating                            2714
host_response_time                              2117
price_nok                                       1249
bathrooms                                       1215
beds                                            1210
bedrooms                                         173
bathrooms_text                                     6
number_of_reviews_l30d                             0
availability_60                                    0
availability_90                               

In [112]:
# Fill NaN values of bathrooms (numerical) with 0 or number extracted from the bathroom text
df_listings['bathrooms_text'].fillna('0', inplace=True)
numeric_values_from_bathroom_text = df_listings['bathrooms_text'].str.extract(r'(\d+\.?\d*)')[0].astype(float)
numeric_values_from_bathroom_text.fillna(0.0, inplace=True)
df_listings["bathrooms"].fillna(numeric_values_from_bathroom_text, inplace=True)

# Extract information whether it is a shared bathroom from the bathrooms_text column and encode it
df_listings['is_shared_bathroom'] = df_listings['bathrooms_text'].str.contains('shared').astype(int)

# Drop redundant bathrooms_text column to get rid of textual information
df_listings.drop(columns="bathrooms_text", inplace=True)

df_listings

def transform_bathroom_columns(df):
    df['bathrooms_text'].fillna('0', inplace=True)
    numeric_values_from_bathroom_text = df['bathrooms_text'].str.extract(r'(\d+\.?\d*)')[0].astype(float)
    numeric_values_from_bathroom_text.fillna(0.0, inplace=True)
    df["bathrooms"].fillna(numeric_values_from_bathroom_text, inplace=True)

    # Extract information whether it is a shared bathroom from the bathrooms_text column and encode it
    df['is_shared_bathroom'] = df['bathrooms_text'].str.contains('shared').astype(int)

    # Drop redundant bathrooms_text column to get rid of textual information
    df.drop(columns="bathrooms_text", inplace=True)
    return df

In [113]:
df_listings["room_type"].unique()
df_listings["neighbourhood_cleansed"].nunique()
df_listings["host_response_time"].unique()
df_listings["neighbourhood_cleansed"].unique()



array(['Frogner', 'Gamle Oslo', 'St. Hanshaugen', 'Sagene', 'Nordstrand',
       'Grünerløkka', 'Nordre Aker', 'Sentrum', 'Vestre Aker', 'Ullern',
       'Østensjø', 'Alna', 'Søndre Nordstrand', 'Bjerke', 'Grorud',
       'Stovner', 'Marka'], dtype=object)

In [114]:
# 4 unique values for room type 'Entire home/apt', 'Private room', 'Shared room', 'Hotel room'
df_listings = pd.get_dummies(df_listings, columns=['room_type'], prefix='room_type')

# 5 unique values for host_response_time after cleaning Nan 'within an hour', 'within a few hours',
#  'no information' (previously nan), 'within a day','a few days or more'
df_listings['host_response_time'].fillna('no information',inplace=True)
df_listings = pd.get_dummies(df_listings, columns=['host_response_time'],prefix='response_time')

# 17 unique values for neighbourhood_cleansed
''' 'Frogner', 'Gamle Oslo', 'St. Hanshaugen', 'Sagene', 'Nordstrand',
       'Grünerløkka', 'Nordre Aker', 'Sentrum', 'Vestre Aker', 'Ullern',
       'Østensjø', 'Alna', 'Søndre Nordstrand', 'Bjerke', 'Grorud',
       'Stovner', 'Marka'
'''
df_listings = pd.get_dummies(df_listings, columns=['neighbourhood_cleansed'],prefix='located_in')
df_listings

def encode_listings_data(df):
       df = pd.get_dummies(df, columns=['room_type'], prefix='room_type')
       df['host_response_time'].fillna('no information',inplace=True)
       df = pd.get_dummies(df, columns=['host_response_time'],prefix='response_time')
       df = pd.get_dummies(df, columns=['neighbourhood_cleansed'],prefix='located_in')
       return df


In [115]:
print(df_listings[df_listings["bathrooms"].isna() == True])

Empty DataFrame
Columns: [id, host_id, host_since, host_response_rate, host_acceptance_rate, host_is_superhost, host_listings_count, host_total_listings_count, host_has_profile_pic, host_identity_verified, latitude, longitude, accommodates, bathrooms, bedrooms, beds, amenities, price_nok, minimum_nights, maximum_nights, minimum_minimum_nights, maximum_minimum_nights, minimum_maximum_nights, maximum_maximum_nights, minimum_nights_avg_ntm, maximum_nights_avg_ntm, has_availability, availability_30, availability_60, availability_90, availability_365, number_of_reviews, number_of_reviews_ltm, number_of_reviews_l30d, first_review, last_review, review_scores_rating, review_scores_accuracy, review_scores_cleanliness, review_scores_checkin, review_scores_communication, review_scores_location, review_scores_value, instant_bookable, calculated_host_listings_count, calculated_host_listings_count_entire_homes, calculated_host_listings_count_private_rooms, calculated_host_listings_count_shared_rooms

In [116]:
def replace_nan_beds(row):
    if pd.isna(row['beds']):
        return math.ceil(row['accommodates'] / 2)
    else: return row['beds']

def transform_beds_column(df):
    df['beds'] = df.apply(replace_nan_beds, axis=1)
    return df


In [117]:
def replace_nan_bedrooms(row):
    if pd.isna(row['bedrooms']):
        if row['beds'] > 2:
            return math.ceil(row['beds'] / 2)
        elif row['beds'] == 0:
            return 0
        else: return 1
    else: return row['bedrooms']

def transform_bedroom_column(df):
    df['bedrooms'] = df.apply(replace_nan_bedrooms, axis=1)
    return df

In [118]:
def process_listings(df_listings_raw, save_file = False):
    df_listings_processed = df_listings_raw.copy()

    # Drop unnecessary columns
    df_listings_processed = drop_columns_by_list(df=df_listings_processed,columns_to_delete=COLUMNS_TO_DROP)
    
    # Rename specific columns
    df_listings_processed.rename(columns=COLUMNS_TO_RENAME,inplace=True)

    # Type Conversions
    df_listings_processed = find_and_convert_boolean_columns(df_listings_processed)
    df_listings_processed = find_and_convert_percentage_columns(df_listings_processed)
    df_listings_processed = find_and_convert_date_columns(df_listings_processed, date_columns_to_convert=DATE_COLUMNS)
    df_listings_processed = convert_price_column(df_listings_processed)

    # Transformation and Encoding
    df_listings_processed = transform_bathroom_columns(df_listings_processed)
    df_listings_processed = transform_beds_column(df_listings_processed)
    df_listings_processed = transform_bedroom_column(df_listings_processed)
    df_listings_processed = encode_listings_data(df_listings_processed)

    # Drop NaN values that cannot be transformed nor used
    df_listings_processed = df_listings_processed.dropna(subset=['price_nok'])
    
    if (save_file):
        df_listings_processed.to_csv("csv/listings_processed.csv",index=False)
    
    return df_listings_processed



In [119]:
df_listings_raw = pd.read_csv("listings.csv")
df_listings_processed = process_listings(df_listings_raw, True)
df_listings_processed

Boolean columns to be converted: Index(['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
       'has_availability', 'instant_bookable'],
      dtype='object')
Percentage columns to be converted: ['host_response_rate', 'host_acceptance_rate']
Date columns to be converted: Index(['host_since', 'first_review', 'last_review'], dtype='object')


  df['price_nok'] = df['price_nok'].str.replace('$', '').str.replace(',', '').astype(float)


Unnamed: 0,id,host_id,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,...,located_in_Nordre Aker,located_in_Nordstrand,located_in_Sagene,located_in_Sentrum,located_in_St. Hanshaugen,located_in_Stovner,located_in_Søndre Nordstrand,located_in_Ullern,located_in_Vestre Aker,located_in_Østensjø
0,42932,187463,2010-08-01,1.0,1.00,1,2,2,1,1,...,0,0,0,0,0,0,0,0,0,0
1,71725,368229,2011-01-30,1.0,0.67,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,85902,250159,2010-10-01,1.0,0.96,1,3,3,1,1,...,0,0,0,0,0,0,0,0,0,0
3,123041,250159,2010-10-01,1.0,0.96,1,3,3,1,1,...,0,0,0,0,0,0,0,0,0,0
4,149776,714768,2011-06-18,1.0,0.67,0,2,2,1,1,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10094,1188980800408582541,39576399,2015-07-26,0.0,0.00,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
10095,1188989142274052057,133880219,2017-06-07,0.0,0.00,0,1,2,1,0,...,0,0,0,0,0,0,0,0,0,0
10096,1189002581216594055,100778838,2016-10-22,0.0,0.00,0,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
10097,1189050162273648943,83467404,2016-07-13,0.0,0.00,0,1,1,1,1,...,0,0,1,0,0,0,0,0,0,0


In [120]:
# Inspect columns that contain Nan values and decide how to handle them
print(df_listings_processed.isna().sum().sort_values(ascending=False).to_string())

review_scores_value                             2497
review_scores_cleanliness                       2496
review_scores_location                          2496
review_scores_checkin                           2496
review_scores_accuracy                          2496
review_scores_communication                     2496
reviews_per_month                               2472
review_scores_rating                            2472
last_review                                     2472
first_review                                    2472
located_in_St. Hanshaugen                          0
calculated_host_listings_count_entire_homes        0
room_type_Entire home/apt                          0
is_shared_bathroom                                 0
calculated_host_listings_count_shared_rooms        0
calculated_host_listings_count_private_rooms       0
instant_bookable                                   0
calculated_host_listings_count                     0
room_type_Private room                        

In [121]:
# Iterate over each row in the original dataframe
# Get all unique values from the list_column
unique_amenities = set()

#unique_values = set(val for sublist in df['list_column'] for val in sublist)

all_amenities_df = pd.DataFrame(columns=["id"])
all_amenities_df.loc[0, "id"] = -1
all_amenities_df.set_index('id', inplace=True)
for index, row in df_listings_processed.iterrows():

    amenities_clean = [] 
    # Get the ID and amenities list
    id = row['id']

    amenities = str(row['amenities'])
    amenities = amenities.strip('[]')
    amenities = amenities.split(',')

    for amenity in amenities:
        amenity = amenity.strip()
        amenity = amenity.replace("\"","")
        amenities_clean.append(amenity)
    
    unique_values = set(amenities_clean)
    unique_amenities.update(unique_values)

print(f"Amount of unique amenities present in the listings dataset: {len(unique_amenities)}")

# Initialize the new DataFrame with the id column
amenities_encoded_df = pd.DataFrame(df_listings_processed['id'])
# Add columns for each unique value
for amenity in unique_amenities:
    amenities_encoded_df[amenity] = df_listings_processed['amenities'].apply(lambda x: 1 if amenity in x else 0)
amenities_encoded_df.drop(amenities_encoded_df.columns[1], axis=1, inplace=True)

# Disable when using apply
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)


Amount of unique amenities present in the listings dataset: 2173


  amenities_encoded_df[amenity] = df_listings_processed['amenities'].apply(lambda x: 1 if amenity in x else 0)
  amenities_encoded_df[amenity] = df_listings_processed['amenities'].apply(lambda x: 1 if amenity in x else 0)
  amenities_encoded_df[amenity] = df_listings_processed['amenities'].apply(lambda x: 1 if amenity in x else 0)
  amenities_encoded_df[amenity] = df_listings_processed['amenities'].apply(lambda x: 1 if amenity in x else 0)
  amenities_encoded_df[amenity] = df_listings_processed['amenities'].apply(lambda x: 1 if amenity in x else 0)
  amenities_encoded_df[amenity] = df_listings_processed['amenities'].apply(lambda x: 1 if amenity in x else 0)
  amenities_encoded_df[amenity] = df_listings_processed['amenities'].apply(lambda x: 1 if amenity in x else 0)
  amenities_encoded_df[amenity] = df_listings_processed['amenities'].apply(lambda x: 1 if amenity in x else 0)
  amenities_encoded_df[amenity] = df_listings_processed['amenities'].apply(lambda x: 1 if amenity in x else 0)
 

In [122]:
amenities_encoded_df.to_csv("csv/amenities_raw.csv",index=False)

In [123]:
# Calculate the sum of each column (number of 1s)
column_sums = amenities_encoded_df.sum()

# Sort the columns by their sum in descending order
sorted_columns = column_sums.sort_values(ascending=False)

# Get the top N columns with the most 1s
N = 50  # Adjust N to get the desired number of top columns
top_columns = sorted_columns.head(N)

print(top_columns)
sorted_columns.to_csv("csv/amenities_encoded_ranking.csv")

Kitchen                       8588
Smoke alarm                   8477
Fire extinguisher             7999
Wifi                          7992
TV                            7334
Dishes and silverware         6172
Hot water                     6153
Washer                        5914
Refrigerator                  5713
Cooking basics                5675
Bed linens                    5422
Essentials                    5347
Coffee                        5041
Dishwasher                    5041
Hair dryer                    4921
Dedicated workspace           4782
Iron                          4605
Hangers                       4588
Cleaning products             4389
Wine glasses                  4349
Dining table                  4321
Coffee maker                  4262
Outdoor dining area           4135
Freezer                       3836
Drying rack for clothing      3830
Hot water kettle              3569
Oven                          3439
Room-darkening shades         3395
Heating             

In [124]:
amenities_categorized_df = pd.DataFrame()
amenities_categorized_df["id"] = amenities_encoded_df["id"]
amenities_categorized_df['Has Kitchen'] = amenities_encoded_df.filter(regex='(?i)kitchen').max(axis=1)
amenities_categorized_df['Has Coffee'] = amenities_encoded_df.filter(regex='(?i)(?!table)(coffee|espresso)').max(axis=1)
amenities_categorized_df['Has Smoke Alarm'] = amenities_encoded_df.filter(regex='(?i)smoke detector|smoke alarm|monoxide').max(axis=1)
amenities_categorized_df['Has Fire extinguisher'] = amenities_encoded_df.filter(regex='(?i)Fire extinguisher|fire-extinguisher').max(axis=1)
amenities_categorized_df['Has TV'] = amenities_encoded_df.filter(regex='(?i)TV|television').max(axis=1)
amenities_categorized_df['Has Digital Entertainment'] = amenities_encoded_df.filter(regex='(?i)amazon prime|netflix|chromecast|disney|roku|hbo max|ps2|ps3|xbox|nintendo|video games|console').max(axis=1)
amenities_categorized_df['Has Stove/Oven'] = amenities_encoded_df.filter(regex='(?i)stove|stovetop|oven|cooker|furnace').max(axis=1)
amenities_categorized_df['Has Fridge'] = amenities_encoded_df.filter(regex='(?i)fridge|refrigerator').max(axis=1)
amenities_categorized_df['Has Heating'] = amenities_encoded_df.filter(regex='(?i)heating|heater|heat|fireplace|furnace|wood-burning').max(axis=1)
amenities_categorized_df['Has Hot Water'] = amenities_encoded_df.filter(regex='(?i)(?! kettle)(hot water|water heater|warm water)').max(axis=1)
amenities_categorized_df['Has Internet'] = amenities_encoded_df.filter(regex='(?i)wifi|wlan|internet|mpbs|network|ethernet').max(axis=1)
amenities_categorized_df['Has Sound/Speakers'] = amenities_encoded_df.filter(regex='(?i)sound|soundsystem|speaker|music').max(axis=1)
amenities_categorized_df['Has Bathroom Items'] = amenities_encoded_df.filter(regex='(?i)conditioner|shampoo|soap|lotion|shower gel').max(axis=1)
amenities_categorized_df['Has Washer'] = amenities_encoded_df.filter(regex='(?i)(?!dishwasher)(washer|washing|laundry)').max(axis=1)
amenities_categorized_df['Has Dryer'] = amenities_encoded_df.filter(regex='(?i)(?!hair)(dryer|dry cleaning)').max(axis=1)
amenities_categorized_df['Has Dishwasher'] = amenities_encoded_df.filter(regex='(?i)(dishwasher)').max(axis=1)
amenities_categorized_df['Has Parking'] = amenities_encoded_df.filter(regex='(?i)(?!view)(parking|garage|park)').max(axis=1)
amenities_categorized_df['Has Pool/Hot Tub'] = amenities_encoded_df.filter(regex='(?i)(?!refrigerator)(pool|hot tub|swim)').max(axis=1)
amenities_categorized_df['Has Backyard'] = amenities_encoded_df.filter(regex='(?i)(yard|garden|backyard)').max(axis=1)
amenities_categorized_df['Has Balcony or Similar'] = amenities_encoded_df.filter(regex='(?i)(balcony|patio|rooftop)').max(axis=1)
amenities_categorized_df['Has Fitness'] = amenities_encoded_df.filter(regex='(?i)(gym|fitness|exercise|workout|weights)').max(axis=1)
amenities_categorized_df['Has Housekeeping available'] = amenities_encoded_df.filter(regex='(?i)(housekeeping|house keep)').max(axis=1)
amenities_categorized_df['Has Bed linens'] = amenities_encoded_df.filter(regex='(?i)(bed linen|bed sheet)').max(axis=1)
amenities_categorized_df['Has Room Darkening'] = amenities_encoded_df.filter(regex='(?i)(room-darkening shades)').max(axis=1)
amenities_categorized_df['Has Luggage dropoff'] = amenities_encoded_df.filter(regex='(?i)(Luggage dropoff)').max(axis=1)
amenities_categorized_df['Has Clothing storage'] = amenities_encoded_df.filter(regex='(?i)(clothing storage|closet|dresser|wardrobe)').max(axis=1)
amenities_categorized_df['Has Dishes and silverware'] = amenities_encoded_df.filter(regex='(?i)(dishes and silverware|dishes|plates)').max(axis=1)
amenities_categorized_df['Allows Pets'] = amenities_encoded_df.filter(regex='(?i)(Pets allowed)').max(axis=1)
amenities_categorized_df['Is by water'] = amenities_encoded_df.filter(regex='(?i)(?!essentials)(lake|beach|water front|waterfront|ocean|sea)').max(axis=1)
amenities_categorized_df['Has BBQ/Grill'] = amenities_encoded_df.filter(regex='(?i)(?!utensils)(grill|bbq|water front|waterfront|ocean|sea)').max(axis=1)
amenities_categorized_df['Has Air conditioning'] = amenities_encoded_df.filter(regex='(?i)(air conditioning|ac unit|ac )').max(axis=1)
amenities_categorized_df['Has Sauna'] = amenities_encoded_df.filter(regex='(?i)(sauna)').max(axis=1)
amenities_categorized_df['Has Elevator'] = amenities_encoded_df.filter(regex='(?i)(elevator)').max(axis=1)
amenities_categorized_df['Has EV charger'] = amenities_encoded_df.filter(regex='(?i)(ev charger|electric vehicle charging|ev charging|ev station)').max(axis=1)
amenities_categorized_df['Has a view'] = amenities_encoded_df.filter(regex='(?i)(view)').max(axis=1)
amenities_categorized_df

Unnamed: 0,id,Has Kitchen,Has Coffee,Has Smoke Alarm,Has Fire extinguisher,Has TV,Has Digital Entertainment,Has Stove/Oven,Has Fridge,Has Heating,...,Has Clothing storage,Has Dishes and silverware,Allows Pets,Is by water,Has BBQ/Grill,Has Air conditioning,Has Sauna,Has Elevator,Has EV charger,Has a view
0,42932,1,1,1,1,1,1,1,1,1,...,0,1,0,1,1,0,0,0,0,1
1,71725,1,1,1,1,0,0,1,1,1,...,1,1,1,0,1,0,0,1,0,0
2,85902,1,1,1,1,1,0,1,1,1,...,1,1,0,0,0,0,0,0,0,0
3,123041,1,1,1,1,0,0,1,1,1,...,1,1,0,0,0,0,0,0,0,0
4,149776,0,0,1,1,0,0,1,1,1,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10094,1188980800408582541,1,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10095,1188989142274052057,1,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10096,1189002581216594055,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10097,1189050162273648943,1,1,1,1,1,1,1,1,1,...,1,1,0,0,0,0,0,1,0,0


In [125]:
amenities_categorized_df.to_csv("csv/amenities_encoded_categorized.csv")

In [126]:
# Calculate the sum of each column (number of 1s)
column_sums = amenities_categorized_df.sum()

# Sort the columns by their sum in descending order
sorted_columns = column_sums.sort_values(ascending=False)

# Get the top N columns with the most 1s
N = 35  # Adjust N to get the desired number of top columns
top_columns = sorted_columns.head(N)

print(top_columns)

Has Internet                  8704
Has Kitchen                   8589
Has Smoke Alarm               8489
Has Washer                    8247
Has Fire extinguisher         7999
Has TV                        7334
Has Heating                   6953
Has Parking                   6733
Has Fridge                    6233
Has Dishes and silverware     6172
Has Hot Water                 6153
Has Dryer                     5709
Has Bed linens                5422
Has Stove/Oven                5413
Has Dishwasher                5041
Has Coffee                    5041
Has Bathroom Items            4176
Has Backyard                  3927
Has Balcony or Similar        3605
Has BBQ/Grill                 3446
Has Room Darkening            3395
Has Clothing storage          3215
Has a view                    2065
Allows Pets                   2020
Has Sound/Speakers            1984
Has Elevator                  1848
Is by water                   1647
Has Digital Entertainment     1515
Has Air conditioning