## Problem Statement

#### The aim of this case study is to predict the log-transformed price (`log_price`) of Airbnb listings in major U.S. cities.

### Importing Relevant Libraries

In [1]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from skater.core.explanations import Interpretation
from skater.model import InMemoryModel

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

### Data Inspection

In [2]:
# Set the working directory so the relative file names used below ("train.xlsx", "test.xlsx") resolve.
# NOTE(review): this is an absolute, machine-specific Windows path; adjust it before running elsewhere.

%cd "C:\Users\shubh\Desktop\PYTHON\case study"

C:\Users\shubh\Desktop\PYTHON\case study


In [3]:
# Load the training and test workbooks from the current working directory.
train, test = (pd.read_excel(workbook) for workbook in ("train.xlsx", "test.xlsx"))

In [4]:
# Dimensions of the training set as (rows, columns)
train.shape

(49999, 29)

* __We have 49999 rows and 29 columns in Train set whereas Test set has 24111 rows and 28 columns.__

In [5]:
# Preview the first rows of the training set (head() shows five rows by default)

train.head()

Unnamed: 0,id,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,...,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds,log_price
0,6901257,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,NYC,...,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201,1.0,1.0,5.010635
1,6304928,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,NYC,...,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019,3.0,3.0,5.129899
2,7919400,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,NYC,...,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027,1.0,3.0,4.976734
3,13418779,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,SF,...,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117,2.0,2.0,6.620073
4,3808709,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,DC,...,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009,0.0,1.0,4.744932


In [6]:
# Preview the first rows of the test set (head() shows five rows by default)
test.head()

Unnamed: 0,id,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,5979389,Apartment,Private room,"{""Wireless Internet"",""Air conditioning"",Kitche...",2,1.0,Real Bed,strict,True,NYC,...,40.705948,-73.915318,Bedroom in Renovated Apartment with Washer/Dryer,Ridgewood,0,,https://a0.muscache.com/im/pictures/b43127ee-b...,11385,1.0,1.0
1,13488121,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",4,1.0,Real Bed,moderate,True,LA,...,34.118408,-118.317929,Peaceful and Quiet Hollywood Hills,Hollywood Hills,24,100.0,https://a0.muscache.com/im/pictures/894d8ca5-7...,90068,1.0,1.0
2,8121643,Apartment,Private room,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",2,1.0,Real Bed,flexible,False,NYC,...,40.707888,-74.015211,"Luxury, doorman Building- FIDI",Financial District,0,,,10006,1.0,1.0
3,16490010,Apartment,Private room,"{Internet,""Wireless Internet"",""Air conditionin...",2,1.0,Real Bed,flexible,True,NYC,...,40.721388,-73.945642,Cozy room in sunny historic Greenpoint home,Greenpoint,12,98.0,https://a0.muscache.com/im/pictures/6c99615c-4...,11222,1.0,1.0
4,16274069,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",4,1.0,Real Bed,moderate,True,NYC,...,40.682556,-73.943259,B's Suite,Bedford-Stuyvesant,13,92.0,https://a0.muscache.com/im/pictures/56275358/c...,11216,2.0,3.0


In [6]:
# Percentage (0-100) of null values in each column of the test set
test.isnull().sum()/test.shape[0] *100

id                         0.000000
property_type              0.000000
room_type                  0.000000
amenities                  0.000000
accommodates               0.000000
bathrooms                  0.223964
bed_type                   0.000000
cancellation_policy        0.000000
cleaning_fee               0.000000
city                       0.000000
description                0.004147
first_review              21.371988
host_has_profile_pic       0.252997
host_identity_verified     0.252997
host_response_rate        24.723155
host_since                 0.252997
instant_bookable           0.000000
last_review               21.334661
latitude                   0.000000
longitude                  0.000000
name                       0.016590
neighbourhood              9.352578
number_of_reviews          0.000000
review_scores_rating      22.483514
thumbnail_url             10.870557
zipcode                    1.352080
bedrooms                   0.120277
beds                       0

In [7]:
# Percentage (0-100) of null values in each column of the training set
train.isnull().sum()/train.shape[0] *100

id                         0.000000
property_type              0.000000
room_type                  0.000000
amenities                  0.000000
accommodates               0.000000
bathrooms                  0.292006
bed_type                   0.000000
cancellation_policy        0.000000
cleaning_fee               0.000000
city                       0.000000
description                0.010000
first_review              21.422428
host_has_profile_pic       0.254005
host_identity_verified     0.254005
host_response_rate        24.676494
host_since                 0.254005
instant_bookable           0.000000
last_review               21.366427
latitude                   0.000000
longitude                  0.000000
name                       0.012000
neighbourhood              9.234185
number_of_reviews          0.000000
review_scores_rating      22.602452
thumbnail_url             11.190224
zipcode                    1.280026
bedrooms                   0.124002
beds                       0

In [8]:
# Inspect the dtype pandas inferred for each column of the raw training set
train.dtypes

id                          int64
property_type              object
room_type                  object
amenities                  object
accommodates                int64
bathrooms                 float64
bed_type                   object
cancellation_policy        object
cleaning_fee                 bool
city                       object
description                object
first_review               object
host_has_profile_pic       object
host_identity_verified     object
host_response_rate        float64
host_since                 object
instant_bookable           object
last_review                object
latitude                  float64
longitude                 float64
name                       object
neighbourhood              object
number_of_reviews           int64
review_scores_rating      float64
thumbnail_url              object
zipcode                    object
bedrooms                  float64
beds                      float64
log_price                 float64
dtype: object

## Data Cleaning

In [7]:
# Parse the host_since column into datetime values in both frames
for frame in (train, test):
    frame['host_since'] = pd.to_datetime(frame['host_since'])

In [8]:
# Fill missing host_since values with the most frequent date of the same frame.
# The result is assigned back rather than using train['host_since'].fillna(..., inplace=True):
# under pandas Copy-on-Write that chained in-place call only modifies a temporary object
# and leaves the frame unchanged.
train['host_since'] = train['host_since'].fillna(train['host_since'].mode()[0])
test['host_since'] = test['host_since'].fillna(test['host_since'].mode()[0])

In [9]:
# Add a host_since_year column holding only the calendar year of host_since
for frame in (train, test):
    frame['host_since_year'] = frame['host_since'].dt.year

In [10]:
# Replace host_since_year with the number of years between the host's start year and a
# fixed reference year. The value is derived from host_since (not from host_since_year
# itself), so re-running this cell gives the same result instead of flipping it back.
REFERENCE_YEAR = 2021  # year the analysis was carried out; kept fixed for reproducibility
train['host_since_year'] = REFERENCE_YEAR - train['host_since'].dt.year
test['host_since_year'] = REFERENCE_YEAR - test['host_since'].dt.year

In [11]:
# Remove the raw host_since column from both frames
train = train.drop(columns=['host_since'])
test = test.drop(columns=['host_since'])

In [12]:
# Make host_response_rate numeric, stripping a trailing % sign when the column holds text.
# Series.replace('%', '') only replaces cells whose entire value equals '%', so it never
# removed the sign from values such as '95%'. Columns that are already numeric are left
# untouched; unparseable entries become NaN and are imputed in the next step.
for frame in (train, test):
    rate = frame['host_response_rate']
    if not pd.api.types.is_numeric_dtype(rate):
        frame['host_response_rate'] = pd.to_numeric(
            rate.astype(str).str.rstrip('%'), errors='coerce'
        )

In [13]:
# Impute missing values with the column mean of the same frame.
# Results are assigned back instead of using fillna(inplace=True) on a column selection,
# which is a silent no-op under pandas Copy-on-Write.
for frame in (train, test):
    for mean_col in ('host_response_rate', 'review_scores_rating'):
        frame[mean_col] = frame[mean_col].fillna(frame[mean_col].mean())

In [14]:
# Impute missing values with the most frequent value (mode) of the same frame.
# Results are assigned back instead of using fillna(inplace=True) on a column selection,
# which is a silent no-op under pandas Copy-on-Write.
mode_imputed_cols = ['host_has_profile_pic', 'bathrooms', 'host_identity_verified',
                     'bedrooms', 'beds']
for frame in (train, test):
    for mode_col in mode_imputed_cols:
        frame[mode_col] = frame[mode_col].fillna(frame[mode_col].mode()[0])

In [15]:
# Categorical columns that are one-hot encoded in the following cells
dumm_cols = [
    'room_type',
    'bed_type',
    'cleaning_fee',
    'city',
    'host_has_profile_pic',
    'host_identity_verified',
    'instant_bookable',
    'cancellation_policy',
]

In [16]:
# One-hot encode each categorical column of the training set: the dummy columns are
# prepended to the frame and the original column is dropped.
# `axis` is passed by keyword: pandas 2.0 removed the positional form in both
# pd.concat and DataFrame.drop.
for col in dumm_cols:
    temp = pd.get_dummies(train[col], prefix=col, dtype=float)
    train = pd.concat([temp, train], axis=1)
    train = train.drop(columns=[col])

In [17]:
# One-hot encode each categorical column of the test set: the dummy columns are
# prepended to the frame and the original column is dropped.
# `axis` is passed by keyword: pandas 2.0 removed the positional form in both
# pd.concat and DataFrame.drop.
for c in dumm_cols:
    temps = pd.get_dummies(test[c], prefix=c, dtype=float)
    test = pd.concat([temps, test], axis=1)
    test = test.drop(columns=[c])

In [18]:
# Frequency of each property_type value in the training set
train['property_type'].value_counts()


Apartment             33113
House                 11138
Condominium            1759
Townhouse              1146
Loft                    834
Other                   392
Guesthouse              324
Bed & Breakfast         320
Bungalow                264
Villa                   120
Dorm                     99
Guest suite              83
Camper/RV                63
Cabin                    49
In-law                   49
Hostel                   48
Timeshare                46
Boutique hotel           44
Boat                     36
Serviced apartment       16
Tent                     12
Castle                    8
Yurt                      7
Vacation home             7
Treehouse                 6
Hut                       5
Chalet                    3
Earth House               2
Tipi                      1
Casa particular           1
Lighthouse                1
Train                     1
Parking Space             1
Cave                      1
Name: property_type, dtype: int64

In [19]:
# One-hot encode property_type separately for the training and test sets

dum_pt, dum_pts = (
    pd.get_dummies(frame['property_type'], dtype=float) for frame in (train, test)
)

In [None]:
# Drop the property-type dummies that occur fewer than 50 times in the training set
dum_pt = dum_pt.drop(columns=[
    'In-law', 'Cabin', 'Hostel', 'Timeshare', 'Boutique hotel', 'Boat',
    'Serviced apartment', 'Tent', 'Castle', 'Vacation home', 'Yurt', 'Treehouse',
    'Hut', 'Chalet', 'Earth House', 'Train', 'Cave', 'Casa particular',
    'Parking Space', 'Lighthouse', 'Tipi',
])

In [23]:
# Drop the corresponding rare property-type dummies from the test set
dum_pts = dum_pts.drop(columns=[
    'In-law', 'Cabin', 'Hostel', 'Timeshare', 'Boutique hotel', 'Boat',
    'Serviced apartment', 'Tent', 'Castle', 'Vacation home', 'Yurt', 'Treehouse',
    'Hut', 'Chalet', 'Earth House', 'Train', 'Cave', 'Tipi',
])

In [24]:
# Append the property-type dummies and remove the original property_type column
train = pd.concat([train, dum_pt], axis=1).drop(columns=['property_type'])
test = pd.concat([test, dum_pts], axis=1).drop(columns=['property_type'])

In [25]:
# Column dtypes of the training set after one-hot encoding
train.dtypes

cancellation_policy_flexible           float64
cancellation_policy_moderate           float64
cancellation_policy_strict             float64
cancellation_policy_super_strict_30    float64
cancellation_policy_super_strict_60    float64
instant_bookable_f                     float64
instant_bookable_t                     float64
host_identity_verified_f               float64
host_identity_verified_t               float64
host_has_profile_pic_f                 float64
host_has_profile_pic_t                 float64
city_Boston                            float64
city_Chicago                           float64
city_DC                                float64
city_LA                                float64
city_NYC                               float64
city_SF                                float64
cleaning_fee_False                     float64
cleaning_fee_True                      float64
bed_type_Airbed                        float64
bed_type_Couch                         float64
bed_type_Futo

In [30]:
# Column dtypes of the test set after one-hot encoding
test.dtypes

cancellation_policy_flexible                  float64
cancellation_policy_moderate                  float64
cancellation_policy_strict                    float64
cancellation_policy_super_strict_30           float64
cancellation_policy_super_strict_60           float64
instant_bookable_f                            float64
instant_bookable_t                            float64
host_identity_verified_f                      float64
host_identity_verified_t                      float64
host_has_profile_pic_f                        float64
host_has_profile_pic_t                        float64
city_Boston                                   float64
city_Chicago                                  float64
city_DC                                       float64
city_LA                                       float64
city_NYC                                      float64
city_SF                                       float64
cleaning_fee_False                            float64
cleaning_fee_True           

In [27]:
# Names of the columns that still have object dtype in each frame
object_columns = [name for name, kind in train.dtypes.items() if kind == 'object']
object_columns1 = [name for name, kind in test.dtypes.items() if kind == 'object']

In [28]:
# Object-typed columns remaining in the training set
object_columns

['amenities',
 'description',
 'first_review',
 'last_review',
 'name',
 'neighbourhood',
 'thumbnail_url',
 'zipcode']

In [29]:
# Object-typed columns remaining in the test set
object_columns1

['amenities',
 'description',
 'name',
 'neighbourhood',
 'thumbnail_url',
 'zipcode']

In [31]:
# Columns to drop before modelling: the object-typed columns found in the training set
# plus latitude and longitude
object_columns_drop = [
    'amenities', 'description', 'first_review', 'last_review', 'name',
    'neighbourhood', 'thumbnail_url', 'zipcode',
    'latitude', 'longitude',
]

In [32]:
# Drop the unused columns from both frames.
# The columns are named by keyword: pandas 2.0 no longer accepts `axis` as a
# positional argument to DataFrame.drop.
train = train.drop(columns=object_columns_drop)
test = test.drop(columns=object_columns_drop)

In [40]:
# Remove every column that exists only in the test data (e.g. the 'Island' property type),
# because the model is trained without it. Computing the set instead of hard-coding
# the label keeps the cell re-runnable (no KeyError once the column is gone) and also
# covers any other test-only category.
test_only_cols = test.columns.difference(train.columns)
test = test.drop(columns=test_only_cols)

In [41]:
# Column dtypes of the training set after dropping the unused columns
train.dtypes

cancellation_policy_flexible           float64
cancellation_policy_moderate           float64
cancellation_policy_strict             float64
cancellation_policy_super_strict_30    float64
cancellation_policy_super_strict_60    float64
instant_bookable_f                     float64
instant_bookable_t                     float64
host_identity_verified_f               float64
host_identity_verified_t               float64
host_has_profile_pic_f                 float64
host_has_profile_pic_t                 float64
city_Boston                            float64
city_Chicago                           float64
city_DC                                float64
city_LA                                float64
city_NYC                               float64
city_SF                                float64
cleaning_fee_False                     float64
cleaning_fee_True                      float64
bed_type_Airbed                        float64
bed_type_Couch                         float64
bed_type_Futo

In [42]:
# Column dtypes of the test set after dropping the unused columns
test.dtypes

cancellation_policy_flexible           float64
cancellation_policy_moderate           float64
cancellation_policy_strict             float64
cancellation_policy_super_strict_30    float64
cancellation_policy_super_strict_60    float64
instant_bookable_f                     float64
instant_bookable_t                     float64
host_identity_verified_f               float64
host_identity_verified_t               float64
host_has_profile_pic_f                 float64
host_has_profile_pic_t                 float64
city_Boston                            float64
city_Chicago                           float64
city_DC                                float64
city_LA                                float64
city_NYC                               float64
city_SF                                float64
cleaning_fee_False                     float64
cleaning_fee_True                      float64
bed_type_Airbed                        float64
bed_type_Couch                         float64
bed_type_Futo

In [43]:
# Count of remaining missing values per column in the training set
train.isna().sum()

cancellation_policy_flexible           0
cancellation_policy_moderate           0
cancellation_policy_strict             0
cancellation_policy_super_strict_30    0
cancellation_policy_super_strict_60    0
instant_bookable_f                     0
instant_bookable_t                     0
host_identity_verified_f               0
host_identity_verified_t               0
host_has_profile_pic_f                 0
host_has_profile_pic_t                 0
city_Boston                            0
city_Chicago                           0
city_DC                                0
city_LA                                0
city_NYC                               0
city_SF                                0
cleaning_fee_False                     0
cleaning_fee_True                      0
bed_type_Airbed                        0
bed_type_Couch                         0
bed_type_Futon                         0
bed_type_Pull-out Sofa                 0
bed_type_Real Bed                      0
room_type_Entire

In [44]:
# Count of remaining missing values per column in the test set
test.isna().sum()

cancellation_policy_flexible           0
cancellation_policy_moderate           0
cancellation_policy_strict             0
cancellation_policy_super_strict_30    0
cancellation_policy_super_strict_60    0
instant_bookable_f                     0
instant_bookable_t                     0
host_identity_verified_f               0
host_identity_verified_t               0
host_has_profile_pic_f                 0
host_has_profile_pic_t                 0
city_Boston                            0
city_Chicago                           0
city_DC                                0
city_LA                                0
city_NYC                               0
city_SF                                0
cleaning_fee_False                     0
cleaning_fee_True                      0
bed_type_Airbed                        0
bed_type_Couch                         0
bed_type_Futon                         0
bed_type_Pull-out Sofa                 0
bed_type_Real Bed                      0
room_type_Entire

In [45]:
# Final (rows, columns) of both frames; train keeps one extra column, the log_price target
train.shape, test.shape

((49999, 50), (24111, 49))

In [55]:
# Separate the target (log_price) from the features; id is excluded from the features
y = train['log_price']
X = train.drop(columns=['log_price', 'id'])

## Building Final Model

In [62]:
# Gradient-boosted regressor with 1000 estimators and a learning rate of 0.05;
# every other hyper-parameter is left at the library default
xgb = XGBRegressor(n_estimators=1000, learning_rate=.05)

In [63]:
# Fit on the complete training set
# NOTE(review): no hold-out split or validation metric is computed in the visible cells — confirm this is intended
xgb.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [64]:
# Feature matrix for the test set (id is kept in `test` for the submission file).
# The column is named by keyword: pandas 2.0 no longer accepts `axis` as a
# positional argument to DataFrame.drop.
x_test = test.drop(columns=['id'])

In [65]:
# Predict the target for every test listing (the model was fitted on log_price)
pred = xgb.predict(x_test)

In [66]:
# Pair every test id with its predicted log_price and build the submission table
id_prediction_pairs = list(zip(test['id'], list(pred)))
submission = pd.DataFrame(id_prediction_pairs, columns=['id', 'log_price'])

In [67]:
# Write the submission file into the working directory. The %cd cell near the top of the
# notebook already points at the case-study folder, so a relative name targets the same
# location without repeating the machine-specific absolute path.
submission.to_csv("Deloitte Submission.csv", index=False)