#### About
Predicting airbnb prices.
> Dataset link - https://www.kaggle.com/datasets/stevezhenghp/airbnb-price-prediction


In [67]:
#importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [68]:
df = pd.read_csv('/home/suraj/ClickUp/Jan-Feb/data-science-ml-dl-projects/03-case-studies/04-predicting-airbnb-prices/train.csv')

In [69]:
df

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.989040,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.808110,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74106,14549287,4.605170,Apartment,Private room,{},1,1.0,Real Bed,flexible,False,...,40.709025,-73.939405,one room bushwick,Williamsburg,0,,https://a0.muscache.com/im/pictures/55162426/6...,11206.0,1.0,1.0
74107,13281809,5.043425,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,2.0,Real Bed,moderate,True,...,33.871549,-118.396053,Spacious Hermosa 2 BR on PCH,Hermosa Beach,16,93.0,https://a0.muscache.com/im/pictures/2b86560b-a...,90254,2.0,4.0
74108,18688039,5.220356,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",5,1.0,Real Bed,moderate,True,...,40.706749,-73.942377,Modern 2 Bedroom Apartment in Williamsburg,Williamsburg,43,94.0,https://a0.muscache.com/im/pictures/7fbe448c-5...,11206.0,2.0,2.0
74109,17045948,5.273000,Apartment,Entire home/apt,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",2,1.0,Real Bed,strict,True,...,40.738535,-74.000157,Designer's Apartment in HEART of NYC,West Village,0,,https://a0.muscache.com/im/pictures/b3971b63-0...,10011,0.0,2.0


In [70]:
df.columns

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
      dtype='object')

#### 1. Data pre-processing.

In [71]:
# drop unnecessary columns
df.drop(['id', 'amenities', 'description', 'first_review', 'last_review', 'thumbnail_url'], axis=1, inplace=True)


In [72]:
# convert log_price to regular price
df['price'] = np.exp(df['log_price'])
df = df.drop(['log_price'], axis=1)


In [73]:
# handle missing values
df['cleaning_fee'].fillna(value=0, inplace=True)
df.dropna(inplace=True)

df.fillna({'review_scores_rating': df['review_scores_rating'].mean()}, inplace=True)  # fill missing review_scores_rating with mean value

In [74]:
# convert categorical variables to numerical using one-hot encoding
categorical_cols = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city', 'neighbourhood']
df = pd.get_dummies(df, columns=categorical_cols)

In [75]:
df

Unnamed: 0,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,latitude,longitude,...,neighbourhood_Windsor Terrace,neighbourhood_Winnetka,neighbourhood_Woodhaven,neighbourhood_Woodland,neighbourhood_Woodland Hills/Warner Center,neighbourhood_Woodlawn,neighbourhood_Woodley Park,neighbourhood_Woodridge,neighbourhood_Woodside,neighbourhood_Wrigleyville
1,7,1.0,True,t,f,100%,2017-06-19,t,40.766115,-73.989040,...,0,0,0,0,0,0,0,0,0,0
2,5,1.0,True,t,t,100%,2016-10-25,t,40.808110,-73.943756,...,0,0,0,0,0,0,0,0,0,0
4,2,1.0,True,t,t,100%,2015-03-01,t,38.925627,-77.034596,...,0,0,0,0,0,0,0,0,0,0
5,2,1.0,True,t,t,100%,2017-06-07,t,37.753164,-122.429526,...,0,0,0,0,0,0,0,0,0,0
7,2,1.0,True,t,t,100%,2013-05-18,f,34.046737,-118.260439,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74102,10,3.0,False,t,f,100%,2013-07-01,f,34.199671,-118.618070,...,0,0,0,0,0,0,0,0,0,0
74103,2,1.0,True,t,f,100%,2011-09-26,f,37.789989,-122.407384,...,0,0,0,0,0,0,0,0,0,0
74107,4,2.0,True,t,f,100%,2016-05-03,f,33.871549,-118.396053,...,0,0,0,0,0,0,0,0,0,0
74108,5,1.0,True,t,t,100%,2012-01-05,t,40.706749,-73.942377,...,0,0,0,0,0,0,0,0,0,0


In [76]:
#converting string values to numeric
df['host_has_profile_pic'] = df['host_has_profile_pic'].map({'t': 1, 'f': 0})
df['host_identity_verified'] = df['host_identity_verified'].map({'t': 1, 'f': 0})
df['instant_bookable'] = df['instant_bookable'].map({'t': 1, 'f': 0})

# convert boolean variables to numerical
df['host_has_profile_pic'] = df['host_has_profile_pic'].astype(int)
df['host_identity_verified'] = df['host_identity_verified'].astype(int)
df['instant_bookable'] = df['instant_bookable'].astype(int)


In [77]:
df

Unnamed: 0,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,latitude,longitude,...,neighbourhood_Windsor Terrace,neighbourhood_Winnetka,neighbourhood_Woodhaven,neighbourhood_Woodland,neighbourhood_Woodland Hills/Warner Center,neighbourhood_Woodlawn,neighbourhood_Woodley Park,neighbourhood_Woodridge,neighbourhood_Woodside,neighbourhood_Wrigleyville
1,7,1.0,True,1,0,100%,2017-06-19,1,40.766115,-73.989040,...,0,0,0,0,0,0,0,0,0,0
2,5,1.0,True,1,1,100%,2016-10-25,1,40.808110,-73.943756,...,0,0,0,0,0,0,0,0,0,0
4,2,1.0,True,1,1,100%,2015-03-01,1,38.925627,-77.034596,...,0,0,0,0,0,0,0,0,0,0
5,2,1.0,True,1,1,100%,2017-06-07,1,37.753164,-122.429526,...,0,0,0,0,0,0,0,0,0,0
7,2,1.0,True,1,1,100%,2013-05-18,0,34.046737,-118.260439,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74102,10,3.0,False,1,0,100%,2013-07-01,0,34.199671,-118.618070,...,0,0,0,0,0,0,0,0,0,0
74103,2,1.0,True,1,0,100%,2011-09-26,0,37.789989,-122.407384,...,0,0,0,0,0,0,0,0,0,0
74107,4,2.0,True,1,0,100%,2016-05-03,0,33.871549,-118.396053,...,0,0,0,0,0,0,0,0,0,0
74108,5,1.0,True,1,1,100%,2012-01-05,1,40.706749,-73.942377,...,0,0,0,0,0,0,0,0,0,0


In [78]:
df['zipcode']

1          10019
2          10027
4          20009
5          94131
7          90015
          ...   
74102      91307
74103      94108
74107      90254
74108    11206.0
74110      90802
Name: zipcode, Length: 42776, dtype: object

In [79]:
le = LabelEncoder()
df['zipcode'] = le.fit_transform(df['zipcode'])  # encode zipcode numerically
df['cleaning_fee'] = le.fit_transform(df['cleaning_fee'])


In [80]:
#feature scaling
scaler = StandardScaler()
df[['accommodates', 'bathrooms', 'latitude', 'longitude', 'bedrooms', 'beds']] = scaler.fit_transform(df[['accommodates', 'bathrooms', 'latitude', 'longitude', 'bedrooms', 'beds']])


In [81]:
non_numeric_cols = df.select_dtypes(exclude='number').columns.tolist()
print(non_numeric_cols)

['host_response_rate', 'host_since', 'name']


#### 2. Feature Engineering

In [82]:
# Feature engineering
df['host_response_rate'] = df['host_response_rate'].str.rstrip('%').astype('float') / 100.0  # convert host_response_rate to a float between 0 and 1
df['host_since'] = pd.to_datetime(df['host_since'])  # convert host_since to datetime
df['days_since_host'] = (pd.to_datetime('2023-04-07') - df['host_since']).dt.days  # create a new feature representing the number of days since the host joined AirBNB

In [83]:
df

Unnamed: 0,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,latitude,longitude,...,neighbourhood_Winnetka,neighbourhood_Woodhaven,neighbourhood_Woodland,neighbourhood_Woodland Hills/Warner Center,neighbourhood_Woodlawn,neighbourhood_Woodley Park,neighbourhood_Woodridge,neighbourhood_Woodside,neighbourhood_Wrigleyville,days_since_host
1,1.673844,-0.406068,1,1,0,1.0,2017-06-19,1,0.692160,0.810617,...,0,0,0,0,0,0,0,0,0,2118
2,0.767321,-0.406068,1,1,1,1.0,2016-10-25,1,0.706003,0.812723,...,0,0,0,0,0,0,0,0,0,2355
4,-0.592463,-0.406068,1,1,1,1.0,2015-03-01,1,0.085463,0.669026,...,0,0,0,0,0,0,0,0,0,2959
5,-0.592463,-0.406068,1,1,1,1.0,2017-06-07,1,-0.301026,-1.441436,...,0,0,0,0,0,0,0,0,0,2130
7,-0.592463,-0.406068,1,1,1,1.0,2013-05-18,0,-1.522809,-1.247611,...,0,0,0,0,0,0,0,0,0,3611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74102,3.033628,3.060988,0,1,0,1.0,2013-07-01,0,-1.472396,-1.264237,...,0,0,0,0,0,0,0,0,0,3567
74103,-0.592463,-0.406068,1,1,0,1.0,2011-09-26,0,-0.288887,-1.440407,...,0,0,0,0,0,0,0,0,0,4211
74107,0.314060,1.327460,1,1,0,1.0,2016-05-03,0,-1.580558,-1.253916,...,0,0,0,0,0,0,0,0,0,2530
74108,0.767321,-0.406068,1,1,1,1.0,2012-01-05,1,0.672590,0.812787,...,0,0,0,0,0,0,0,0,0,4110


In [84]:
# Create an interaction term between accommodates and bedrooms
df['acc_bed'] = df['accommodates'] * df['bedrooms']

In [85]:
df

Unnamed: 0,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,latitude,longitude,...,neighbourhood_Woodhaven,neighbourhood_Woodland,neighbourhood_Woodland Hills/Warner Center,neighbourhood_Woodlawn,neighbourhood_Woodley Park,neighbourhood_Woodridge,neighbourhood_Woodside,neighbourhood_Wrigleyville,days_since_host,acc_bed
1,1.673844,-0.406068,1,1,0,1.0,2017-06-19,1,0.692160,0.810617,...,0,0,0,0,0,0,0,0,2118,3.335722
2,0.767321,-0.406068,1,1,1,1.0,2016-10-25,1,0.706003,0.812723,...,0,0,0,0,0,0,0,0,2355,-0.247767
4,-0.592463,-0.406068,1,1,1,1.0,2015-03-01,1,0.085463,0.669026,...,0,0,0,0,0,0,0,0,2959,0.877303
5,-0.592463,-0.406068,1,1,1,1.0,2017-06-07,1,-0.301026,-1.441436,...,0,0,0,0,0,0,0,0,2130,0.191305
7,-0.592463,-0.406068,1,1,1,1.0,2013-05-18,0,-1.522809,-1.247611,...,0,0,0,0,0,0,0,0,3611,0.191305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74102,3.033628,3.060988,0,1,0,1.0,2013-07-01,0,-1.472396,-1.264237,...,0,0,0,0,0,0,0,0,3567,-0.979553
74103,-0.592463,-0.406068,1,1,0,1.0,2011-09-26,0,-0.288887,-1.440407,...,0,0,0,0,0,0,0,0,4211,0.191305
74107,0.314060,1.327460,1,1,0,1.0,2016-05-03,0,-1.580558,-1.253916,...,0,0,0,0,0,0,0,0,2530,0.262232
74108,0.767321,-0.406068,1,1,1,1.0,2012-01-05,1,0.672590,0.812787,...,0,0,0,0,0,0,0,0,4110,0.640695


In [86]:
# Convert host_since to datetime
df['host_since'] = pd.to_datetime(df['host_since'])

# Create a feature for the year the host joined
df['host_year'] = df['host_since'].dt.year

# Create a feature for the month the host joined
df['host_month'] = df['host_since'].dt.month

In [87]:
df= df.drop(['host_since','name'],axis=1)

In [88]:
df

Unnamed: 0,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,latitude,longitude,number_of_reviews,...,neighbourhood_Woodland Hills/Warner Center,neighbourhood_Woodlawn,neighbourhood_Woodley Park,neighbourhood_Woodridge,neighbourhood_Woodside,neighbourhood_Wrigleyville,days_since_host,acc_bed,host_year,host_month
1,1.673844,-0.406068,1,1,0,1.0,1,0.692160,0.810617,6,...,0,0,0,0,0,0,2118,3.335722,2017,6
2,0.767321,-0.406068,1,1,1,1.0,1,0.706003,0.812723,10,...,0,0,0,0,0,0,2355,-0.247767,2016,10
4,-0.592463,-0.406068,1,1,1,1.0,1,0.085463,0.669026,4,...,0,0,0,0,0,0,2959,0.877303,2015,3
5,-0.592463,-0.406068,1,1,1,1.0,1,-0.301026,-1.441436,3,...,0,0,0,0,0,0,2130,0.191305,2017,6
7,-0.592463,-0.406068,1,1,1,1.0,0,-1.522809,-1.247611,9,...,0,0,0,0,0,0,3611,0.191305,2013,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74102,3.033628,3.060988,0,1,0,1.0,0,-1.472396,-1.264237,1,...,0,0,0,0,0,0,3567,-0.979553,2013,7
74103,-0.592463,-0.406068,1,1,0,1.0,0,-0.288887,-1.440407,24,...,0,0,0,0,0,0,4211,0.191305,2011,9
74107,0.314060,1.327460,1,1,0,1.0,0,-1.580558,-1.253916,16,...,0,0,0,0,0,0,2530,0.262232,2016,5
74108,0.767321,-0.406068,1,1,1,1.0,1,0.672590,0.812787,43,...,0,0,0,0,0,0,4110,0.640695,2012,1
