In [2]:
pip install haversine

Collecting haversine
  Using cached haversine-2.8.1-py2.py3-none-any.whl.metadata (5.9 kB)
Using cached haversine-2.8.1-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.8.1
Note: you may need to restart the kernel to use updated packages.


In [344]:
import pandas as pd
import numpy as np
from datetime import datetime
from haversine import haversine
import math

In [346]:
df = pd.read_csv('Airbnb_Data.csv')
df = df[df['city'] == 'NYC']
print(df.columns)

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
      dtype='object')


In [348]:
### CLEANING ###

In [350]:
df = df[['log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'latitude',
       'longitude', 'number_of_reviews', 'review_scores_rating',
        'zipcode', 'bedrooms', 'beds']]

In [353]:
# Desired unique amenities:
desired_unique_amenities = ['Waterfront', 'Washer / Dryer', 'Pets allowed', 'Wireless Internet', 
 'Air conditioning', 'Family/kid friendly', 'Dishwasher', 'Kitchen', 
 'Beachfront', 'Heating', 'TV', 'Pool']

# FUNCTION amenities_one_hot
# Performs one hot on the amenities column
def amenities_one_hot(dataframe, desired_amenities, column='amenities'):
    # Initialize new columns for each desired amenity with a default value of 0
    for amenity in desired_amenities:
        dataframe[amenity] = 0

    # Use apply to set the value of each amenity column to 1 if it's in the list of amenities for that row
    def set_amenity_flags(amenities_list):
        for amenity in amenities_list:
            if amenity in desired_amenities:
                dataframe.loc[dataframe.index, amenity] = 1
    
    dataframe[column].apply(set_amenity_flags)
    
    return dataframe

df = amenities_one_hot(df, desired_unique_amenities)
df = df.drop(columns=['amenities'])

print(df.columns)

Index(['log_price', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bed_type', 'cancellation_policy', 'cleaning_fee',
       'host_identity_verified', 'host_response_rate', 'host_since',
       'instant_bookable', 'latitude', 'longitude', 'number_of_reviews',
       'review_scores_rating', 'zipcode', 'bedrooms', 'beds', 'Waterfront',
       'Washer / Dryer', 'Pets allowed', 'Wireless Internet',
       'Air conditioning', 'Family/kid friendly', 'Dishwasher', 'Kitchen',
       'Beachfront', 'Heating', 'TV', 'Pool'],
      dtype='object')


In [354]:
# FUNCTION boolean_encode
# Encodes boolean with True = 1, False = 0
def boolean_encode(dataframe, column, t = 't'):
    # Drop NaN values
    dataframe = dataframe.dropna(subset=[column])
    
    dataframe[column] = dataframe[column].apply(lambda value: 1 if value == True or value == t else 0)
    
    return dataframe

In [355]:
# FUNCTION fix_host_response_rate
# Fixes the string percentages into doubles
def fix_host_response_rate(dataframe, column = 'host_response_rate'):
    dataframe = dataframe.dropna(subset=[column])
    
    # Remove '%' symbol, convert to float, and divide by 100
    dataframe[column] = dataframe[column].str.replace('%', '', regex=False).astype(float) / 100
    
    # Round to a single decimal place
    dataframe[column] = dataframe[column].round(1)
    
    return dataframe

In [356]:
# FUNCTION find_num_years_hosting
# Finds the number of years a host has been hosting
def find_num_years_hosting(dataframe, column = 'host_since'):
    dataframe[column] = pd.to_datetime(dataframe[column])
    current_year = datetime.now().year
    
    dataframe['num_years_hosting'] = current_year - dataframe[column].dt.year
    
    dataframe.drop(columns=[column], inplace=True)

    return dataframe

In [357]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0  # Radius of the Earth in kilometers
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat / 2) ** 2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c  # Distance in kilometers
    return distance

# FUNCTION find_distance_to_landmarks
# Finds the distance from an air_bnb to each landmark
def find_distance_to_landmarks(dataframe, landmarks, lat_column='latitude', lon_column='longitude'):
    # Iterate through landmarks
    for landmark in landmarks:
        column_name = "distance_to_" + landmark[0]
        
        # Calculate distance to each landmark for each row in the DataFrame
        distances = []
        for index, row in dataframe.iterrows():
            distance = haversine(row[lat_column], row[lon_column], landmark[1], landmark[2])
            distances.append(distance)
        
        # Assign distances to the new column in the DataFrame
        dataframe[column_name] = np.round(distances,0)

    return dataframe

In [358]:
# FUNCTION fix_zip_codes
# Function to remove any rows where the 'zipcode' column has more 
# than five chars
def fix_zip_codes(dataframe, column='zipcode'):
    dataframe = dataframe.dropna(subset=[column])
    dataframe = dataframe[dataframe[column].str.len() <= 5]
    
    return dataframe

In [359]:
#####################################################
# EXECUTION
#####################################################

In [360]:
######################################################
# BOOLEAN ENCODINGS
######################################################
# One-hot-encode 'cleaning_fee'
df = boolean_encode(df, 'cleaning_fee')

# One-hot-encode 'host_identity_verified'
df = boolean_encode(df, 'host_identity_verified')

df = boolean_encode(df, 'instant_bookable')
######################################################

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = dataframe[column].apply(lambda value: 1 if value == True or value == t else 0)


In [361]:
######################################################
# fix host_response rate
######################################################
df = fix_host_response_rate(df)
######################################################

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = dataframe[column].str.replace('%', '', regex=False).astype(float) / 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = dataframe[column].round(1)


In [362]:
######################################################
# Find number of years a host has been hosting
######################################################
df = find_num_years_hosting(df)
######################################################

In [363]:
landmarks = [('Times_Square', 40.758896, -73.985130), 
             ('Empire_State_Building', 40.7484, -73.9857),
             ('Statue_of_Liberty', 40.6892, -74.0445),
             ('Central_Park',40.7826, -73.9656),
             ('JFK_Airport', 40.6446, -73.7797)]

######################################################
# Find the distance of each airBnB to every landmark in
# kilometers
######################################################
df = find_distance_to_landmarks(df, landmarks)
######################################################

df = df.drop(columns=['latitude'])
df = df.drop(columns=['longitude'])

In [364]:
df = fix_zip_codes(df)

######################################################
# Drop all rows where 'zipcode' has more than five chars
######################################################
df = fix_zip_codes(df)
######################################################

In [365]:
# Drop rows with NaN's in bedrooms, beds, accommodates, or log_price columns
df = df.dropna(subset=['bedrooms', 'beds', 'accommodates', 'log_price'])

In [366]:
percentile_33 = df['log_price'].quantile(0.33)
percentile_66 = df['log_price'].quantile(0.66)

# Set values above the 66th percentile to 'High', 
# between the 33rd and 66th percentiles to 'Medium',
# and below the 33rd percentile to 'Low'
df['log_price'] = df['log_price'].apply(lambda x: 'High' if x > percentile_66 else ('Medium' if x > percentile_33 else 'Low'))

In [367]:
# Drop unneccessary columns:
df = df.drop(columns=['number_of_reviews', 'review_scores_rating'])

In [368]:
print(len(df))

16270


In [369]:
print(df.shape)

(16270, 32)


In [370]:
print(df.shape)

(16270, 32)


In [371]:
print(df.columns)

Index(['log_price', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bed_type', 'cancellation_policy', 'cleaning_fee',
       'host_identity_verified', 'host_response_rate', 'instant_bookable',
       'zipcode', 'bedrooms', 'beds', 'Waterfront', 'Washer / Dryer',
       'Pets allowed', 'Wireless Internet', 'Air conditioning',
       'Family/kid friendly', 'Dishwasher', 'Kitchen', 'Beachfront', 'Heating',
       'TV', 'Pool', 'num_years_hosting', 'distance_to_Times_Square',
       'distance_to_Empire_State_Building', 'distance_to_Statue_of_Liberty',
       'distance_to_Central_Park', 'distance_to_JFK_Airport'],
      dtype='object')


In [372]:
split_index = int(len(df) * 0.75)
    
# First 75% of rows
df_train = df.iloc[:split_index]

# Remaining 25% of rows
df_test = df.iloc[split_index:]

In [373]:
print(df_train.shape)
print(df_test.shape)

(12202, 32)
(4068, 32)


In [374]:
df_train.to_csv('data_for_other_models_train.csv', index = False)
df_test.to_csv('data_for_other_models_test.csv', index = False)