In [1]:
import numpy as np
import pandas as pd
import os
#print(os.listdir("../input"))
from re import sub
from decimal import Decimal
from sklearn.preprocessing import MultiLabelBinarizer
from scipy import stats

# Grabbing Airbnb Listings Data

We chose specific data points from the listings data that we thought were easy for us to categorize during our data cleaning process, and, intuitively, seemed most relevant to predicting the pricing.

Data points:
- host_is_superhost
- neighbourhood_group_cleansed
- property_type
- room_type
- latitude
- longitude
- guests_included
- bathrooms
- bedrooms
- beds
- bed_type
- amenities
- price
- cleaning_fee
- instant_bookable
- cancellation_policy

In [2]:
# Read each city's listings export and stamp every row with the city it came from.
listings = pd.read_csv('./seattle-airbnb/listings.csv').assign(city='seattle')
listings1 = pd.read_csv('./boston-airbnb/listings.csv').assign(city='boston')

# Stack both cities into a single frame with a fresh 0..n-1 index;
# sort=True orders the union of the two column sets alphabetically.
listings = pd.concat([listings, listings1], ignore_index=True, sort=True)

In [3]:
listings.count()

access                              2096
accommodates                        7403
amenities                           7403
availability_30                     7403
availability_365                    7403
availability_60                     7403
availability_90                     7403
bathrooms                           7373
bed_type                            7403
bedrooms                            7387
beds                                7393
calculated_host_listings_count      7403
calendar_last_scraped               7403
calendar_updated                    7403
cancellation_policy                 7403
city                                7403
cleaning_fee                        5266
country                             7403
country_code                        7403
description                         7403
experiences_offered                 7403
extra_people                        7403
first_review                        6020
guests_included                     7403
has_availability

In [4]:
# Keep the listing id plus the columns used as modelling features.
ld = listings[[
    'id',
    'host_is_superhost',
    'property_type',
    'room_type',
    'latitude',
    'longitude',
    'guests_included',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'amenities',
    'price',
    'cleaning_fee',
    'instant_bookable',
    'cancellation_policy',
    'city',
    'review_scores_value',
]]

In [5]:
display(listings.head())

Unnamed: 0,access,accommodates,amenities,availability_30,availability_365,availability_60,availability_90,bathrooms,bed_type,bedrooms,...,space,square_feet,state,street,summary,thumbnail_url,transit,weekly_price,xl_picture_url,zipcode
0,,4,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",14,346,41,71,1.0,Real Bed,1.0,...,Make your self at home in this charming one-be...,,WA,"Gilman Dr W, Seattle, WA 98119, United States",,,,,,98119
1,,4,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",13,291,13,16,1.0,Real Bed,1.0,...,"Beautiful, hypoallergenic apartment in an extr...",,WA,"7th Avenue West, Seattle, WA 98119, United States",Chemically sensitive? We've removed the irrita...,https://a0.muscache.com/ac/pictures/14409893/f...,"Convenient bus stops are just down the block, ...","$1,000.00",https://a0.muscache.com/ac/pictures/14409893/f...,98119
2,,11,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1,220,6,17,4.5,Real Bed,5.0,...,"Our house is modern, light and fresh with a wa...",,WA,"West Lee Street, Seattle, WA 98119, United States",New modern house built in 2013. Spectacular s...,,A bus stop is just 2 blocks away. Easy bus a...,,,98119
3,,3,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",0,143,0,0,1.0,Real Bed,0.0,...,,,WA,"8th Avenue West, Seattle, WA 98119, United States",A charming apartment that sits atop Queen Anne...,,,$650.00,,98119
4,,6,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",30,365,60,90,2.0,Real Bed,3.0,...,Cozy family craftman house in beautiful neighb...,,WA,"14th Ave W, Seattle, WA 98119, United States",Cozy family craftman house in beautiful neighb...,,The nearest public transit bus (D Line) is 2 b...,,,98119


In [6]:
ld.count()

id                     7403
host_is_superhost      7401
property_type          7399
room_type              7403
latitude               7403
longitude              7403
guests_included        7403
bathrooms              7373
bedrooms               7387
beds                   7393
bed_type               7403
amenities              7403
price                  7403
cleaning_fee           5266
instant_bookable       7403
cancellation_policy    7403
city                   7403
review_scores_value    5926
dtype: int64

# Removing all the listings with missing values

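In this step, we look for listings that have missing values in any of the required columns and remove them. Missing `cleaning_fee` and `review_scores_value` values are not a reason to drop a listing; they are filled with 0 in the following cells.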

In [7]:
# Columns that must be present for a listing to be kept. cleaning_fee and
# review_scores_value are not in this list, so rows missing only those survive.
required_columns = [
    'host_is_superhost', 'property_type', 'room_type', 'latitude', 'longitude',
    'guests_included', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities',
    'price', 'instant_bookable', 'cancellation_policy', 'city',
]
# .copy() gives an independent frame, so the column assignments in later cells
# do not raise SettingWithCopyWarning on a row-filtered result.
ld = ld.dropna(subset=required_columns).copy()
ld.count()

id                     7347
host_is_superhost      7347
property_type          7347
room_type              7347
latitude               7347
longitude              7347
guests_included        7347
bathrooms              7347
bedrooms               7347
beds                   7347
bed_type               7347
amenities              7347
price                  7347
cleaning_fee           5234
instant_bookable       7347
cancellation_policy    7347
city                   7347
review_scores_value    5884
dtype: int64

In [8]:
ld['cleaning_fee'] = ld['cleaning_fee'].fillna(0)

In [9]:
ld['cleaning_fee'].head()

0          0
1     $40.00
2    $300.00
3          0
4    $125.00
Name: cleaning_fee, dtype: object

In [10]:
ld['review_scores_value'] = ld['review_scores_value'].fillna(0)

In [11]:
ld['review_scores_value'].head()

0    10.0
1    10.0
2    10.0
3     0.0
4     9.0
Name: review_scores_value, dtype: float64

In [12]:
ld.head()

Unnamed: 0,id,host_is_superhost,property_type,room_type,latitude,longitude,guests_included,bathrooms,bedrooms,beds,bed_type,amenities,price,cleaning_fee,instant_bookable,cancellation_policy,city,review_scores_value
0,241032,f,Apartment,Entire home/apt,47.636289,-122.371025,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,f,moderate,seattle,10.0
1,953595,t,Apartment,Entire home/apt,47.639123,-122.365666,1,1.0,1.0,1.0,Real Bed,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,f,strict,seattle,10.0
2,3308979,f,House,Entire home/apt,47.629724,-122.369483,10,4.5,5.0,7.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,f,strict,seattle,10.0
3,7421966,f,Apartment,Entire home/apt,47.638473,-122.369279,1,1.0,0.0,2.0,Real Bed,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,f,flexible,seattle,0.0
4,278830,f,House,Entire home/apt,47.632918,-122.372471,6,2.0,3.0,3.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,f,strict,seattle,9.0


In [13]:
ld[ld['bedrooms'] > 6]

Unnamed: 0,id,host_is_superhost,property_type,room_type,latitude,longitude,guests_included,bathrooms,bedrooms,beds,bed_type,amenities,price,cleaning_fee,instant_bookable,cancellation_policy,city,review_scores_value
3351,5022572,f,House,Entire home/apt,47.673057,-122.351639,6,4.0,7.0,10.0,Real Bed,"{TV,""Cable TV"",""Wireless Internet"",Kitchen,""Pe...",$375.00,$300.00,f,strict,seattle,10.0


## Column 1: host_is_superhost
- Boolean declaring whether host fulfills Airbnb's superhost requirements: https://www.airbnb.ca/help/article/829/how-do-i-become-a-superhost
- Convert `True = 1` and `False = 0`

In [14]:
ld.loc[ld.loc[:, 'host_is_superhost'] == 't', 'host_is_superhost'] = 1
ld.loc[ld.loc[:, 'host_is_superhost'] == 'f', 'host_is_superhost'] = 0

In [15]:
ld.head()

Unnamed: 0,id,host_is_superhost,property_type,room_type,latitude,longitude,guests_included,bathrooms,bedrooms,beds,bed_type,amenities,price,cleaning_fee,instant_bookable,cancellation_policy,city,review_scores_value
0,241032,0,Apartment,Entire home/apt,47.636289,-122.371025,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,f,moderate,seattle,10.0
1,953595,1,Apartment,Entire home/apt,47.639123,-122.365666,1,1.0,1.0,1.0,Real Bed,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,f,strict,seattle,10.0
2,3308979,0,House,Entire home/apt,47.629724,-122.369483,10,4.5,5.0,7.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,f,strict,seattle,10.0
3,7421966,0,Apartment,Entire home/apt,47.638473,-122.369279,1,1.0,0.0,2.0,Real Bed,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,f,flexible,seattle,0.0
4,278830,0,House,Entire home/apt,47.632918,-122.372471,6,2.0,3.0,3.0,Real Bed,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,f,strict,seattle,9.0


In [16]:
ld_1 = ld

 ## Column 2: Property type
 
 - Column indicates the listing's property type (e.g. house, apartment).

In [17]:
property_type = pd.get_dummies(ld_1['property_type'])

In [18]:
property_type.head()

Unnamed: 0,Apartment,Bed & Breakfast,Boat,Bungalow,Cabin,Camper/RV,Chalet,Condominium,Dorm,Entire Floor,Guesthouse,House,Loft,Other,Tent,Townhouse,Treehouse,Villa,Yurt
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [19]:
# Attach the one-hot property-type columns by row index, then remove the
# categorical column they replace.
ld_2 = pd.merge(ld_1, property_type, left_index=True, right_index=True)
# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
ld_2 = ld_2.drop('property_type', axis=1)

In [20]:
ld_2.head()

Unnamed: 0,id,host_is_superhost,room_type,latitude,longitude,guests_included,bathrooms,bedrooms,beds,bed_type,...,Entire Floor,Guesthouse,House,Loft,Other,Tent,Townhouse,Treehouse,Villa,Yurt
0,241032,0,Entire home/apt,47.636289,-122.371025,2,1.0,1.0,1.0,Real Bed,...,0,0,0,0,0,0,0,0,0,0
1,953595,1,Entire home/apt,47.639123,-122.365666,1,1.0,1.0,1.0,Real Bed,...,0,0,0,0,0,0,0,0,0,0
2,3308979,0,Entire home/apt,47.629724,-122.369483,10,4.5,5.0,7.0,Real Bed,...,0,0,1,0,0,0,0,0,0,0
3,7421966,0,Entire home/apt,47.638473,-122.369279,1,1.0,0.0,2.0,Real Bed,...,0,0,0,0,0,0,0,0,0,0
4,278830,0,Entire home/apt,47.632918,-122.372471,6,2.0,3.0,3.0,Real Bed,...,0,0,1,0,0,0,0,0,0,0


## Column 4: Room type
 
 - Column indicates the listing's room type (e.g. Entire home/apt).

In [21]:
room_type = pd.get_dummies(ld_2['room_type'])

In [22]:
room_type.head()

Unnamed: 0,Entire home/apt,Private room,Shared room
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [23]:
# Attach the one-hot room-type columns by row index, then remove the
# categorical column they replace.
ld_3 = pd.merge(ld_2, room_type, left_index=True, right_index=True)
# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
ld_3 = ld_3.drop('room_type', axis=1)

## Column 5: Bed type
 
 - Column states what kind of bed the listing has (e.g. Real Bed, Futon).

In [24]:
bed_type = pd.get_dummies(ld_3['bed_type'])

In [25]:
bed_type.head()

Unnamed: 0,Airbed,Couch,Futon,Pull-out Sofa,Real Bed
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


In [26]:
# Attach the one-hot bed-type columns by row index, then remove the
# categorical column they replace.
ld_4 = pd.merge(ld_3, bed_type, left_index=True, right_index=True)
# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
ld_4 = ld_4.drop('bed_type', axis=1)

## Column 6: Instant Bookable
 
- Boolean declaring whether or not the listing can be instant booked. 
- Convert `True = 1` and `False = 0`

In [27]:
ld_4.loc[ld_4.loc[:, 'instant_bookable'] == 't', 'instant_bookable'] = 1
ld_4.loc[ld_4.loc[:, 'instant_bookable'] == 'f', 'instant_bookable'] = 0

In [28]:
ld_4.head()

Unnamed: 0,id,host_is_superhost,latitude,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,...,Villa,Yurt,Entire home/apt,Private room,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed
0,241032,0,47.636289,-122.371025,2,1.0,1.0,1.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,...,0,0,1,0,0,0,0,0,0,1
1,953595,1,47.639123,-122.365666,1,1.0,1.0,1.0,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,...,0,0,1,0,0,0,0,0,0,1
2,3308979,0,47.629724,-122.369483,10,4.5,5.0,7.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,...,0,0,1,0,0,0,0,0,0,1
3,7421966,0,47.638473,-122.369279,1,1.0,0.0,2.0,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,...,0,0,1,0,0,0,0,0,0,1
4,278830,0,47.632918,-122.372471,6,2.0,3.0,3.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,...,0,0,1,0,0,0,0,0,0,1


## Column 7: Cancellation policy
- Column indicates which kind of standardized cancellation policy the host chooses.
- The main cancellation policies are flexible, moderate and strict; the combined data also contains a fourth value, `super_strict_30`.

In [29]:
cancellation = pd.get_dummies(ld_4['cancellation_policy'])

In [30]:
cancellation.head()

Unnamed: 0,flexible,moderate,strict,super_strict_30
0,0,1,0,0
1,0,0,1,0
2,0,0,1,0
3,1,0,0,0
4,0,0,1,0


In [31]:
ld_5= pd.merge(ld_4, cancellation, left_index=True, right_index=True)
   

In [32]:
# The one-hot policy columns are already merged in, so the original categorical
# column is dropped. `axis` is passed by keyword: the positional form was
# removed in pandas 2.0.
ld_5 = ld_5.drop('cancellation_policy', axis=1)

# Column 8: Guests included 
- column states the number of guests that can be accommodated for each listing
- we need to normalize the value to 0-1

In [33]:
ld_5['guests_included'].max()

15

In [34]:
def normalizing(column):
    """Min-max scale `column` so its values lie in the range 0-1.

    Parameters
    ----------
    column : pandas.Series (or any object with .min()/.max() and arithmetic)
        Numeric values to rescale.

    Returns
    -------
    Same kind of object as `column`, holding (x - min) / (max - min).
    When every value is identical (max == min) the result is all zeros
    instead of the NaNs that a 0/0 division would produce.
    """
    lowest = column.min()
    value_range = column.max() - lowest
    # Only a scalar range can be compared to zero directly; for DataFrames the
    # range is a Series and we fall through to the plain element-wise division.
    if np.ndim(value_range) == 0 and value_range == 0:
        # Multiplying by 0.0 keeps the index, keeps NaNs as NaN and yields floats.
        return (column - lowest) * 0.0
    return (column - lowest) / value_range

In [35]:
ld_5['guests_included'] = normalizing(ld_5['guests_included'])

In [36]:
ld_5.head()

Unnamed: 0,id,host_is_superhost,latitude,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,...,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30
0,241032,0,47.636289,-122.371025,0.133333,1.0,1.0,1.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,...,0,0,0,0,0,1,0,1,0,0
1,953595,1,47.639123,-122.365666,0.066667,1.0,1.0,1.0,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,...,0,0,0,0,0,1,0,0,1,0
2,3308979,0,47.629724,-122.369483,0.666667,4.5,5.0,7.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,...,0,0,0,0,0,1,0,0,1,0
3,7421966,0,47.638473,-122.369279,0.066667,1.0,0.0,2.0,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,...,0,0,0,0,0,1,1,0,0,0
4,278830,0,47.632918,-122.372471,0.4,2.0,3.0,3.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,...,0,0,0,0,0,1,0,0,1,0


# Column 8, 9, 10: bathrooms, bedrooms, beds
- column states the number of bathrooms, bedrooms, and beds in each listing
- normalize the value to 0-1

In [37]:
ld_5['bathrooms'] = normalizing(ld_5['bathrooms'])

In [38]:
ld_5['bedrooms'] = normalizing(ld_5['bedrooms'])

In [39]:
ld_5['beds'] = normalizing(ld_5['beds'])

In [40]:
ld_5.head()

Unnamed: 0,id,host_is_superhost,latitude,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,...,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30
0,241032,0,47.636289,-122.371025,0.133333,0.125,0.142857,0.0625,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,...,0,0,0,0,0,1,0,1,0,0
1,953595,1,47.639123,-122.365666,0.066667,0.125,0.142857,0.0625,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,...,0,0,0,0,0,1,0,0,1,0
2,3308979,0,47.629724,-122.369483,0.666667,0.5625,0.714286,0.4375,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,...,0,0,0,0,0,1,0,0,1,0
3,7421966,0,47.638473,-122.369279,0.066667,0.125,0.0,0.125,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,...,0,0,0,0,0,1,1,0,0,0
4,278830,0,47.632918,-122.372471,0.4,0.25,0.428571,0.1875,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,...,0,0,0,0,0,1,0,0,1,0


# Column 11, 12: Longitude and Latitude 
- column states the longitude and latitude of each listing 
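- one option is to map these two values to x, y and z coordinates, so that listings that are geographically close are also close to each other in 3D space:
- x = cos(lat) * cos(lon)
- y = cos(lat) * sin(lon)
- z = sin(lat)
- in the cells below we instead derive a single feature, `d_airport`: the haversine distance from each listing to the airport of its own city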

In [41]:
# Reference coordinates in decimal degrees (west longitudes are negative).

## boston airport: 42.3656132, -71.0095602
# No trailing comma after the value: `42.3656132,` would bind a 1-tuple, not a float.
airport_boston_lat = 42.3656132
airport_boston_lon = -71.0095602

## seattle airport: 47.4502° N, 122.3088° W
airport_seattle_lat = 47.4502
airport_seattle_lon = -122.3088

## downtown: 47.6050° N, 122.3344° W
dt_lat = 47.6050
dt_lon = -122.3344

## pike place: 47.6101° N, 122.3421° W
pp_lat = 47.6101
pp_lon = -122.3421

## seattle amazon headquarter: 47.6062° N, 122.3321° W
amazon_lat = 47.6062
amazon_lon = -122.3321

## longitude and latitude in datasets
lat_data = ld_5['latitude']
lon_data = ld_5['longitude']

In [42]:
lat_data[1]

47.63912312136253

In [43]:
lon_data[1]

-122.36566646439582

In [44]:
AVG_EARTH_RADIUS = 6371

In [45]:
def haversine_array(lat1, lng1, ld_5):
    """Haversine (great-circle) distance from one fixed point to every row of a frame.

    Parameters
    ----------
    lat1, lng1 : latitude and longitude of the fixed point, in degrees.
    ld_5 : object indexable by 'latitude' and 'longitude' (degrees), e.g. a DataFrame.

    Returns
    -------
    Distances, one per row, in the unit of the notebook-level AVG_EARTH_RADIUS.
    """
    origin_lat = np.radians(lat1)
    origin_lng = np.radians(lng1)
    dest_lat = np.radians(ld_5['latitude'])
    dest_lng = np.radians(ld_5['longitude'])

    delta_lat = dest_lat - origin_lat
    delta_lng = dest_lng - origin_lng

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlng/2)
    hav = (
        np.sin(delta_lat * 0.5) ** 2
        + np.cos(origin_lat) * np.cos(dest_lat) * np.sin(delta_lng * 0.5) ** 2
    )
    return 2 * AVG_EARTH_RADIUS * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

In [46]:
ld_5_1 = ld_5[ld_5['city'] == 'seattle'].copy()

ld_5_2 = ld_5[ld_5['city'] == 'boston'].copy()

In [47]:
ld_5_1['d_airport'] = haversine_array(airport_seattle_lat, airport_seattle_lon, ld_5_1)

ld_5_2['d_airport'] = haversine_array(airport_boston_lat, airport_boston_lon, ld_5_2)

In [48]:
ld_5 = pd.concat([ld_5_1,ld_5_2], ignore_index=True, axis=0)



In [49]:
ld_5.head()

Unnamed: 0,id,host_is_superhost,latitude,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,...,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,d_airport
0,241032,0,47.636289,-122.371025,0.133333,0.125,0.142857,0.0625,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,...,0,0,0,0,1,0,1,0,0,21.212736
1,953595,1,47.639123,-122.365666,0.066667,0.125,0.142857,0.0625,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,...,0,0,0,0,1,0,0,1,0,21.436526
2,3308979,0,47.629724,-122.369483,0.666667,0.5625,0.714286,0.4375,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,...,0,0,0,0,1,0,0,1,0,20.475301
3,7421966,0,47.638473,-122.369279,0.066667,0.125,0.0,0.125,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,...,0,0,0,0,1,1,0,0,0,21.421534
4,278830,0,47.632918,-122.372471,0.4,0.25,0.428571,0.1875,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,...,0,0,0,0,1,0,0,1,0,20.87191


In [50]:
ld_5['d_airport'] = normalizing(ld_5['d_airport'])



In [51]:
ld_5.head()

Unnamed: 0,id,host_is_superhost,latitude,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,...,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,d_airport
0,241032,0,47.636289,-122.371025,0.133333,0.125,0.142857,0.0625,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,...,0,0,0,0,1,0,1,0,0,0.651133
1,953595,1,47.639123,-122.365666,0.066667,0.125,0.142857,0.0625,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,...,0,0,0,0,1,0,0,1,0,0.658564
2,3308979,0,47.629724,-122.369483,0.666667,0.5625,0.714286,0.4375,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,...,0,0,0,0,1,0,0,1,0,0.626647
3,7421966,0,47.638473,-122.369279,0.066667,0.125,0.0,0.125,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,...,0,0,0,0,1,1,0,0,0,0.658066
4,278830,0,47.632918,-122.372471,0.4,0.25,0.428571,0.1875,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,...,0,0,0,0,1,0,0,1,0,0.639816


In [52]:
# Latitude is no longer needed once d_airport has been derived from it.
# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
ld_6 = ld_5.drop('latitude', axis=1)
ld_6.head()

Unnamed: 0,id,host_is_superhost,longitude,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,...,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,d_airport
0,241032,0,-122.371025,0.133333,0.125,0.142857,0.0625,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,...,0,0,0,0,1,0,1,0,0,0.651133
1,953595,1,-122.365666,0.066667,0.125,0.142857,0.0625,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,...,0,0,0,0,1,0,0,1,0,0.658564
2,3308979,0,-122.369483,0.666667,0.5625,0.714286,0.4375,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,...,0,0,0,0,1,0,0,1,0,0.626647
3,7421966,0,-122.369279,0.066667,0.125,0.0,0.125,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,...,0,0,0,0,1,1,0,0,0,0.658066
4,278830,0,-122.372471,0.4,0.25,0.428571,0.1875,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,...,0,0,0,0,1,0,0,1,0,0.639816


In [53]:
# Longitude is dropped as well; `axis` is passed by keyword because the
# positional form was removed in pandas 2.0.
ld_7 = ld_6.drop('longitude', axis=1)

In [54]:
ld_7.head()

Unnamed: 0,id,host_is_superhost,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,instant_bookable,...,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,d_airport
0,241032,0,0.133333,0.125,0.142857,0.0625,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$85.00,0,0,...,0,0,0,0,1,0,1,0,0,0.651133
1,953595,1,0.066667,0.125,0.142857,0.0625,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",$150.00,$40.00,0,...,0,0,0,0,1,0,0,1,0,0.658564
2,3308979,0,0.666667,0.5625,0.714286,0.4375,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",$975.00,$300.00,0,...,0,0,0,0,1,0,0,1,0,0.626647
3,7421966,0,0.066667,0.125,0.0,0.125,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",$100.00,0,0,...,0,0,0,0,1,1,0,0,0,0.658066
4,278830,0,0.4,0.25,0.428571,0.1875,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",$450.00,$125.00,0,...,0,0,0,0,1,0,0,1,0,0.639816


# Column 13: price, cleaning fee
- column includes the average price and cleaning_fee per night for each listing

In [55]:
# Strip the currency formatting ("$" and thousands separators) and convert to float.
# The pattern is a raw string: '\$' inside a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
ld_7['price'] = ld_7['price'].replace(r'[\$,]', '', regex=True).astype(float)
ld_7['cleaning_fee'] = ld_7['cleaning_fee'].replace(r'[\$,]', '', regex=True).astype(float)

# Rescale the monetary columns and the review score to the 0-1 range.
ld_7['price'] = normalizing(ld_7['price'])
ld_7['cleaning_fee'] = normalizing(ld_7['cleaning_fee'])
ld_7['review_scores_value'] = normalizing(ld_7['review_scores_value'])

In [56]:
ld_7.head()

Unnamed: 0,id,host_is_superhost,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,instant_bookable,...,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,d_airport
0,241032,0,0.133333,0.125,0.142857,0.0625,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",0.018797,0.0,0,...,0,0,0,0,1,0,1,0,0,0.651133
1,953595,1,0.066667,0.125,0.142857,0.0625,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",0.035088,0.133333,0,...,0,0,0,0,1,0,0,1,0,0.658564
2,3308979,0,0.666667,0.5625,0.714286,0.4375,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",0.241855,1.0,0,...,0,0,0,0,1,0,0,1,0,0.626647
3,7421966,0,0.066667,0.125,0.0,0.125,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",0.022556,0.0,0,...,0,0,0,0,1,1,0,0,0,0.658066
4,278830,0,0.4,0.25,0.428571,0.1875,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",0.110276,0.416667,0,...,0,0,0,0,1,0,0,1,0,0.639816


In [57]:
ld_8 = ld_7.copy()
ld_8.head()

Unnamed: 0,id,host_is_superhost,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,instant_bookable,...,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,d_airport
0,241032,0,0.133333,0.125,0.142857,0.0625,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",0.018797,0.0,0,...,0,0,0,0,1,0,1,0,0,0.651133
1,953595,1,0.066667,0.125,0.142857,0.0625,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",0.035088,0.133333,0,...,0,0,0,0,1,0,0,1,0,0.658564
2,3308979,0,0.666667,0.5625,0.714286,0.4375,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",0.241855,1.0,0,...,0,0,0,0,1,0,0,1,0,0.626647
3,7421966,0,0.066667,0.125,0.0,0.125,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",0.022556,0.0,0,...,0,0,0,0,1,1,0,0,0,0.658066
4,278830,0,0.4,0.25,0.428571,0.1875,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",0.110276,0.416667,0,...,0,0,0,0,1,0,0,1,0,0.639816


# Column 14: Amenities 
- column includes all the amenities for each listing

In [58]:
# Turn the raw amenities string, e.g. {TV,"Cable TV",Internet}, into a list of
# lower-case snake_case tokens: ['tv', 'cable_tv', 'internet'].
# regex=False makes every replacement a literal one, so the result does not
# depend on the pandas version's default for `regex` ('{' is a regex metacharacter).
ld_8["amenities"] = (
    ld_8["amenities"]
    .str.lower()
    .str.replace('{', '', regex=False)
    .str.replace('}', '', regex=False)
    .str.replace('"', '', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.split(',')
)
ld_8.head()



Unnamed: 0,id,host_is_superhost,guests_included,bathrooms,bedrooms,beds,amenities,price,cleaning_fee,instant_bookable,...,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,d_airport
0,241032,0,0.133333,0.125,0.142857,0.0625,"[tv, cable_tv, internet, wireless_internet, ai...",0.018797,0.0,0,...,0,0,0,0,1,0,1,0,0,0.651133
1,953595,1,0.066667,0.125,0.142857,0.0625,"[tv, internet, wireless_internet, kitchen, fre...",0.035088,0.133333,0,...,0,0,0,0,1,0,0,1,0,0.658564
2,3308979,0,0.666667,0.5625,0.714286,0.4375,"[tv, cable_tv, internet, wireless_internet, ai...",0.241855,1.0,0,...,0,0,0,0,1,0,0,1,0,0.626647
3,7421966,0,0.066667,0.125,0.0,0.125,"[internet, wireless_internet, kitchen, indoor_...",0.022556,0.0,0,...,0,0,0,0,1,1,0,0,0,0.658066
4,278830,0,0.4,0.25,0.428571,0.1875,"[tv, cable_tv, internet, wireless_internet, ki...",0.110276,0.416667,0,...,0,0,0,0,1,0,0,1,0,0.639816


In [59]:
# One-hot encode the amenity lists: one 0/1 column per distinct amenity token.
mlb = MultiLabelBinarizer()
# pop() removes the list column from ld_8 in place, so re-running this cell
# without re-running the previous ones raises a KeyError.
amenity_matrix = mlb.fit_transform(ld_8.pop('amenities'))
amenity_flags = pd.DataFrame(amenity_matrix, columns=mlb.classes_, index=ld_8.index)
final_df = ld_8.join(amenity_flags)
final_df.head()

Unnamed: 0,id,host_is_superhost,guests_included,bathrooms,bedrooms,beds,price,cleaning_fee,instant_bookable,city,...,smoke_detector,smoking_allowed,suitable_for_events,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,tv,washer,washer_/_dryer,wheelchair_accessible,wireless_internet
0,241032,0,0.133333,0.125,0.142857,0.0625,0.018797,0.0,0,seattle,...,0,0,0,0,0,1,1,0,0,1
1,953595,1,0.066667,0.125,0.142857,0.0625,0.035088,0.133333,0,seattle,...,1,0,0,0,0,1,1,0,0,1
2,3308979,0,0.666667,0.5625,0.714286,0.4375,0.241855,1.0,0,seattle,...,1,0,0,0,0,1,1,0,0,1
3,7421966,0,0.066667,0.125,0.0,0.125,0.022556,0.0,0,seattle,...,1,0,0,0,0,0,1,0,0,1
4,278830,0,0.4,0.25,0.428571,0.1875,0.110276,0.416667,0,seattle,...,1,0,0,0,0,1,0,0,0,1


In [60]:
df = final_df.loc[:,['id','guests_included', 'bathrooms', 'bedrooms', 'beds', 'price']] 

In [61]:
z = np.abs(stats.zscore(final_df.loc[:,['guests_included', 'bathrooms', 'bedrooms', 'beds', 'price']] ))
print(z)

[[3.67184221e-01 4.39882728e-01 3.44098959e-01 6.24938804e-01
  5.25494351e-01]
 [4.63389200e-01 4.39882728e-01 3.44098959e-01 6.24938804e-01
  2.65168397e-03]
 [7.01177159e+00 5.92196452e+00 4.50596349e+00 4.91468564e+00
  6.63342833e+00]
 ...
 [4.63389200e-01 4.39882728e-01 3.44098959e-01 6.24938804e-01
  3.83447517e-01]
 [4.63389200e-01 4.39882728e-01 3.44098959e-01 6.24938804e-01
  6.86369018e-01]
 [4.63389200e-01 4.39882728e-01 3.44098959e-01 6.24938804e-01
  6.86369018e-01]]


In [62]:
df.count()

id                 7347
guests_included    7347
bathrooms          7347
bedrooms           7347
beds               7347
price              7347
dtype: int64

In [63]:
exclude_outlier = df[(z < 3).all(axis=1)]

In [64]:
exclude_outlier.head()

Unnamed: 0,id,guests_included,bathrooms,bedrooms,beds,price
0,241032,0.133333,0.125,0.142857,0.0625,0.018797
1,953595,0.066667,0.125,0.142857,0.0625,0.035088
3,7421966,0.066667,0.125,0.0,0.125,0.022556
5,5956968,0.066667,0.125,0.142857,0.0625,0.027569
6,1909058,0.066667,0.125,0.142857,0.0625,0.017544


In [65]:
# Keep only the ids of the listings that passed the outlier filter.
# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
exclude_outlier1 = exclude_outlier.drop(['guests_included', 'bathrooms', 'bedrooms', 'beds', 'price'], axis=1)

In [66]:
exclude_outlier.count()

id                 6872
guests_included    6872
bathrooms          6872
bedrooms           6872
beds               6872
price              6872
dtype: int64

In [67]:
exclude_outlier1.head()

Unnamed: 0,id
0,241032
1,953595
3,7421966
5,5956968
6,1909058


In [68]:
final = pd.merge(final_df, exclude_outlier1, how='right', on='id')

In [69]:
final.head()

Unnamed: 0,id,host_is_superhost,guests_included,bathrooms,bedrooms,beds,price,cleaning_fee,instant_bookable,city,...,smoke_detector,smoking_allowed,suitable_for_events,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,tv,washer,washer_/_dryer,wheelchair_accessible,wireless_internet
0,241032,0,0.133333,0.125,0.142857,0.0625,0.018797,0.0,0,seattle,...,0,0,0,0,0,1,1,0,0,1
1,953595,1,0.066667,0.125,0.142857,0.0625,0.035088,0.133333,0,seattle,...,1,0,0,0,0,1,1,0,0,1
2,7421966,0,0.066667,0.125,0.0,0.125,0.022556,0.0,0,seattle,...,1,0,0,0,0,0,1,0,0,1
3,5956968,0,0.066667,0.125,0.142857,0.0625,0.027569,0.133333,0,seattle,...,1,0,0,0,0,0,0,0,0,1
4,1909058,1,0.066667,0.125,0.142857,0.0625,0.017544,0.0,0,seattle,...,1,0,0,0,0,0,0,0,0,1


In [70]:
final.count()

id                                            6872
host_is_superhost                             6872
guests_included                               6872
bathrooms                                     6872
bedrooms                                      6872
beds                                          6872
price                                         6872
cleaning_fee                                  6872
instant_bookable                              6872
city                                          6872
review_scores_value                           6872
Apartment                                     6872
Bed & Breakfast                               6872
Boat                                          6872
Bungalow                                      6872
Cabin                                         6872
Camper/RV                                     6872
Chalet                                        6872
Condominium                                   6872
Dorm                           

# Clean the review data

- take 10 random reviews for each listing

In [71]:
reviews = pd.read_csv('./seattle-airbnb/reviews.csv')
reviews1 = pd.read_csv('./boston-airbnb/reviews.csv')

reviews = pd.concat([reviews, reviews1], ignore_index=True, sort=True)



In [72]:
reviews.head()

Unnamed: 0,comments,date,id,listing_id,reviewer_id,reviewer_name
0,Cute and cozy place. Perfect location to every...,2015-07-19,38917982,7202016,28943674,Bianca
1,Kelly has a great room in a very central locat...,2015-07-20,39087409,7202016,32440555,Frank
2,"Very spacious apartment, and in a great neighb...",2015-07-26,39820030,7202016,37722850,Ian
3,Close to Seattle Center and all it has to offe...,2015-08-02,40813543,7202016,33671805,George
4,Kelly was a great host and very accommodating ...,2015-08-10,41986501,7202016,34959538,Ming


In [73]:
rv = reviews.loc[:, ['listing_id', 'id','comments']]

In [74]:
rv.head(20)

Unnamed: 0,listing_id,id,comments
0,7202016,38917982,Cute and cozy place. Perfect location to every...
1,7202016,39087409,Kelly has a great room in a very central locat...
2,7202016,39820030,"Very spacious apartment, and in a great neighb..."
3,7202016,40813543,Close to Seattle Center and all it has to offe...
4,7202016,41986501,Kelly was a great host and very accommodating ...
5,7202016,43979139,"Kelly was great, place was great, just what I ..."
6,7202016,45265631,Kelly was great! Very nice and the neighborhoo...
7,7202016,46749120,hola all bnb erz - Just left Seattle where I h...
8,7202016,47783346,Kelly's place is conveniently located on a qui...
9,7202016,48388999,"The place was really nice, clean, and the most..."


In [75]:
listings_and_reviews = pd.merge(rv, final, left_on = "listing_id", right_on = "id")

In [76]:
len(set(listings_and_reviews['listing_id']))

5587

In [77]:
listings_and_reviews.head()

Unnamed: 0,listing_id,id_x,comments,id_y,host_is_superhost,guests_included,bathrooms,bedrooms,beds,price,...,smoke_detector,smoking_allowed,suitable_for_events,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,tv,washer,washer_/_dryer,wheelchair_accessible,wireless_internet
0,7202016,38917982,Cute and cozy place. Perfect location to every...,7202016,0,0.066667,0.125,0.142857,0.0625,0.016291,...,1,0,0,0,0,0,1,0,0,1
1,7202016,39087409,Kelly has a great room in a very central locat...,7202016,0,0.066667,0.125,0.142857,0.0625,0.016291,...,1,0,0,0,0,0,1,0,0,1
2,7202016,39820030,"Very spacious apartment, and in a great neighb...",7202016,0,0.066667,0.125,0.142857,0.0625,0.016291,...,1,0,0,0,0,0,1,0,0,1
3,7202016,40813543,Close to Seattle Center and all it has to offe...,7202016,0,0.066667,0.125,0.142857,0.0625,0.016291,...,1,0,0,0,0,0,1,0,0,1
4,7202016,41986501,Kelly was a great host and very accommodating ...,7202016,0,0.066667,0.125,0.142857,0.0625,0.016291,...,1,0,0,0,0,0,1,0,0,1


In [78]:
listings_and_reviews.iloc[:,1].count()
combined = listings_and_reviews.dropna()
combined.iloc[:,1].count()

144272

In [79]:
combined.groupby('listing_id').count().head()

Unnamed: 0_level_0,id_x,comments,id_y,host_is_superhost,guests_included,bathrooms,bedrooms,beds,price,cleaning_fee,...,smoke_detector,smoking_allowed,suitable_for_events,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,tv,washer,washer_/_dryer,wheelchair_accessible,wireless_internet
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3353,34,34,34,34,34,34,34,34,34,34,...,34,34,34,34,34,34,34,34,34,34
4291,35,35,35,35,35,35,35,35,35,35,...,35,35,35,35,35,35,35,35,35,35
5506,36,36,36,36,36,36,36,36,36,36,...,36,36,36,36,36,36,36,36,36,36
5682,296,296,296,296,296,296,296,296,296,296,...,296,296,296,296,296,296,296,296,296,296
6606,52,52,52,52,52,52,52,52,52,52,...,52,52,52,52,52,52,52,52,52,52


In [80]:
size = 10  # number of reviews drawn per listing
replace = True  # draw with replacement, so listings with fewer than `size` reviews still yield `size` rows
# Dedicated, seeded generator: Restart & Run All reproduces the same sample
# without touching NumPy's global random state.
sample_rng = np.random.RandomState(0)


def fn(obj):
    """Return `size` rows of one listing's reviews, picked at random by index label."""
    return obj.loc[sample_rng.choice(obj.index, size, replace), :]


# Sample within each listing, then drop the two id columns left over from the merge.
combined = combined.groupby('listing_id', as_index=False).apply(fn).drop(['id_x', 'id_y'], axis=1)

In [81]:
# Alias for the review-text column, used for the preview in the next cell.
comments = combined['comments']

In [82]:
# Preview the first ten sampled review texts.
comments.head(10)

0  105551    Giuseppe was a very nice and cordial host, the...
   105564    Although not physically present at the apartme...
   105542    The location is great as it's right next to th...
   105570    Giuseppe es una persona muy atenta, que respon...
   105563    I really enjoyed my stay here. The room is nic...
   105555    Stay exceeded my expectations. Great location,...
   105542    The location is great as it's right next to th...
   105550    Giuseppe is a nice guy. He helped me carry my ...
   105557    Nice room, small ( but that was clear on the a...
   105557    Nice room, small ( but that was clear on the a...
Name: comments, dtype: object

In [83]:
# Same column read straight from the frame; first five entries.
combined['comments'].head()

0  105551    Giuseppe was a very nice and cordial host, the...
   105564    Although not physically present at the apartme...
   105542    The location is great as it's right next to th...
   105570    Giuseppe es una persona muy atenta, que respon...
   105563    I really enjoyed my stay here. The room is nic...
Name: comments, dtype: object

In [84]:
from stop_words import get_stop_words
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize 

In [85]:
def preprocess(sentence):
    """Normalise one review string ahead of tokenisation.

    Steps, in order: lowercase, expand contractions, delete punctuation,
    delete digits. Contractions are expanded before punctuation is removed
    because the expansion rules match on the apostrophe.

    Returns the cleaned string.
    """
    lowered = sentence.lower()
    expanded = replaceContractions(lowered)
    without_punctuation = removePunc(expanded)
    # The remove_non_english step is not applied in this pipeline.
    return removeNumbers(without_punctuation)

In [86]:
def replaceContractions(sentence):
    """Expand common English contractions in ``sentence``.

    Matching is case-sensitive plain substring replacement, so callers are
    expected to pass lowercased text. Typographic apostrophes (U+2019 and
    U+2018) are folded to the ASCII apostrophe first, so that contractions
    typed with "smart quotes" are expanded too.

    Parameters
    ----------
    sentence : str
        Text to expand.

    Returns
    -------
    str
        The text with every listed contraction replaced by its expansion.
    """
    # Fold curly apostrophes to ASCII so the rules below can match them.
    outputSentence = sentence.replace("\u2019", "'").replace("\u2018", "'")

    # Order matters: the irregular forms ("won't", "can't") must run before
    # the generic "n't" rule, and "n't" must run before the bare "'t" rule.
    rules = (
        ("won't", "will not"),
        ("can't", "can not"),
        ("n't", " not"),
        ("'re", " are"),
        ("'s", " is"),
        ("'d", " would"),
        ("'ll", " will"),
        ("'t", " not"),
        ("'ve", " have"),
        ("'m", " am"),
    )
    for contraction, expansion in rules:
        outputSentence = outputSentence.replace(contraction, expansion)
    return outputSentence


In [87]:
def removePunc(sentence):
    """Return ``sentence`` with every ASCII punctuation character deleted.

    Characters listed in ``string.punctuation`` are removed outright rather
    than replaced by a space, so words joined only by punctuation run
    together.
    """
    # Mapping a code point to None tells str.translate to delete it.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return sentence.translate(deletions)

In [88]:
def removeNumbers(sentence):
    """Return ``sentence`` with the ASCII digits 0-9 stripped out."""
    # Mapping a code point to None tells str.translate to delete it.
    digit_deletions = {ord(digit): None for digit in string.digits}
    return sentence.translate(digit_deletions)

def remove_non_english(line):
    """Stem each whitespace-separated token of ``line`` with the English Snowball stemmer.

    Despite the name, no tokens are filtered out: every token is replaced by
    its stem and the stems are re-joined with single spaces.
    """
    english_stemmer = SnowballStemmer("english")
    return ' '.join(english_stemmer.stem(token) for token in line.split())

In [89]:
# Clean every review with preprocess(); the raw text in the column is overwritten.
combined['comments'] = combined['comments'].apply(preprocess)

In [90]:
#import nltk
#nltk.download()

In [91]:
import re
def EngStopword(context, stop_words=None):
    """Split ``context`` into lowercase-letter tokens and drop stop words.

    Parameters
    ----------
    context : str
        Text to tokenise. Only runs of the letters a-z are kept; any other
        character (uppercase letters, digits, punctuation, accented letters)
        acts as a separator.
    stop_words : iterable of str, optional
        Words to discard. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        Tokens in their original order, excluding stop words and
        single-letter tokens.
    """
    if stop_words is None:
        # Load the list once per call instead of once per token.
        stop_words = stopwords.words('english')
    excluded = frozenset(stop_words)
    tokens = re.findall("[a-z]+", context)
    # Compare lengths by value: `is not 1` tests object identity and only
    # works by accident of CPython's small-integer caching.
    return [t for t in tokens if t not in excluded and len(t) != 1]

In [92]:
# Replace each cleaned review string with its list of tokens from EngStopword().
# NOTE: the column changes from str to list here, so running this cell a second
# time fails (EngStopword passes each value to re.findall, which needs a string).
combined['comments'] = combined['comments'].apply(EngStopword)

In [93]:
# Preview the token lists for the first twenty sampled reviews.
combined['comments'].head(20)

0  105551    [giuseppe, nice, cordial, host, room, bit, sma...
   105564    [although, physically, present, apartment, gui...
   105542    [location, great, right, next, green, stop, ma...
   105570    [giuseppe, es, una, persona, muy, atenta, que,...
   105563    [really, enjoyed, stay, room, nice, big, bed, ...
   105555    [stay, exceeded, expectations, great, location...
   105542    [location, great, right, next, green, stop, ma...
   105550    [giuseppe, nice, guy, helped, carry, language,...
   105557    [nice, room, small, clear, ad, clean, conforta...
   105557    [nice, room, small, clear, ad, clean, conforta...
1  51881                     [great, place, would, love, stay]
   51875     [great, experience, seattle, enjoyed, view, ra...
   51886     [jess, welcoming, responsive, email, exchanges...
   51882     [great, room, matched, expectations, clean, co...
   51905     [stay, sunrise, suite, exactly, looking, priva...
   51877     [good, experience, staying, jess, joey, ho

In [94]:
# Preview the full frame now that 'comments' holds token lists.
combined.head()

Unnamed: 0,Unnamed: 1,listing_id,comments,host_is_superhost,guests_included,bathrooms,bedrooms,beds,price,cleaning_fee,instant_bookable,...,smoke_detector,smoking_allowed,suitable_for_events,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,tv,washer,washer_/_dryer,wheelchair_accessible,wireless_internet
0,105551,3353,"[giuseppe, nice, cordial, host, room, bit, sma...",0,0.066667,0.125,0.142857,0.0625,0.007519,0.166667,0,...,1,0,0,1,1,0,1,0,0,1
0,105564,3353,"[although, physically, present, apartment, gui...",0,0.066667,0.125,0.142857,0.0625,0.007519,0.166667,0,...,1,0,0,1,1,0,1,0,0,1
0,105542,3353,"[location, great, right, next, green, stop, ma...",0,0.066667,0.125,0.142857,0.0625,0.007519,0.166667,0,...,1,0,0,1,1,0,1,0,0,1
0,105570,3353,"[giuseppe, es, una, persona, muy, atenta, que,...",0,0.066667,0.125,0.142857,0.0625,0.007519,0.166667,0,...,1,0,0,1,1,0,1,0,0,1
0,105563,3353,"[really, enjoyed, stay, room, nice, big, bed, ...",0,0.066667,0.125,0.142857,0.0625,0.007519,0.166667,0,...,1,0,0,1,1,0,1,0,0,1


In [95]:
# Gather each listing's token lists into one list of lists, indexed by listing_id.
combined1 = combined.groupby('listing_id')['comments'].apply(list)

In [96]:
# Turn the Series into a one-column DataFrame.
combined4 = combined1.to_frame()

In [97]:
# Drop the token column so only the other columns remain. The axis is named
# via `columns=` because pandas 2.0 no longer accepts it as a positional argument.
combined2 = combined.drop(columns='comments')

In [98]:
# Collapse to one row per listing by keeping the first row seen for each listing_id.
combined3 = combined2.drop_duplicates(subset=['listing_id'], keep='first')

In [99]:
# Preview the de-duplicated, one-row-per-listing frame.
combined3.head()

Unnamed: 0,Unnamed: 1,listing_id,host_is_superhost,guests_included,bathrooms,bedrooms,beds,price,cleaning_fee,instant_bookable,city,...,smoke_detector,smoking_allowed,suitable_for_events,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,tv,washer,washer_/_dryer,wheelchair_accessible,wireless_internet
0,105551,3353,0,0.066667,0.125,0.142857,0.0625,0.007519,0.166667,0,boston,...,1,0,0,1,1,0,1,0,0,1
1,51881,4291,0,0.066667,0.125,0.142857,0.0625,0.018045,0.1,0,seattle,...,1,0,0,0,0,1,1,0,0,1
2,87935,5506,0,0.133333,0.125,0.142857,0.0625,0.033835,0.133333,0,boston,...,0,0,0,0,0,1,1,0,0,1
3,57929,5682,0,0.066667,0.125,0.142857,0.0625,0.009524,0.083333,1,seattle,...,1,0,0,0,0,1,0,0,0,1
4,12140,6606,0,0.133333,0.125,0.142857,0.0625,0.02005,0.133333,0,seattle,...,0,0,0,0,0,1,0,0,0,1


In [100]:
# Same preview of combined3 as the previous cell (repeated).
combined3.head()

Unnamed: 0,Unnamed: 1,listing_id,host_is_superhost,guests_included,bathrooms,bedrooms,beds,price,cleaning_fee,instant_bookable,city,...,smoke_detector,smoking_allowed,suitable_for_events,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,tv,washer,washer_/_dryer,wheelchair_accessible,wireless_internet
0,105551,3353,0,0.066667,0.125,0.142857,0.0625,0.007519,0.166667,0,boston,...,1,0,0,1,1,0,1,0,0,1
1,51881,4291,0,0.066667,0.125,0.142857,0.0625,0.018045,0.1,0,seattle,...,1,0,0,0,0,1,1,0,0,1
2,87935,5506,0,0.133333,0.125,0.142857,0.0625,0.033835,0.133333,0,boston,...,0,0,0,0,0,1,1,0,0,1
3,57929,5682,0,0.066667,0.125,0.142857,0.0625,0.009524,0.083333,1,seattle,...,1,0,0,0,0,1,0,0,0,1
4,12140,6606,0,0.133333,0.125,0.142857,0.0625,0.02005,0.133333,0,seattle,...,0,0,0,0,0,1,0,0,0,1


In [101]:
# Preview the per-listing review token lists.
combined4.head()

Unnamed: 0_level_0,comments
listing_id,Unnamed: 1_level_1
3353,"[[giuseppe, nice, cordial, host, room, bit, sm..."
4291,"[[great, place, would, love, stay], [great, ex..."
5506,"[[warm, accommodating, host, beautiful, well, ..."
5682,"[[studio, clean, comfortable, nice, neighborho..."
6606,"[[first, time, seattle, loved, proximity, walk..."


In [102]:
# Attach the per-listing review lists to the one-row-per-listing features.
# 'listing_id' is the index of combined4 and a column of combined3.
final = combined4.merge(combined3, on='listing_id', how='inner')


In [103]:
# Preview the merged listing-level table.
final.head()

Unnamed: 0,listing_id,comments,host_is_superhost,guests_included,bathrooms,bedrooms,beds,price,cleaning_fee,instant_bookable,...,smoke_detector,smoking_allowed,suitable_for_events,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,tv,washer,washer_/_dryer,wheelchair_accessible,wireless_internet
0,3353,"[[giuseppe, nice, cordial, host, room, bit, sm...",0,0.066667,0.125,0.142857,0.0625,0.007519,0.166667,0,...,1,0,0,1,1,0,1,0,0,1
1,4291,"[[great, place, would, love, stay], [great, ex...",0,0.066667,0.125,0.142857,0.0625,0.018045,0.1,0,...,1,0,0,0,0,1,1,0,0,1
2,5506,"[[warm, accommodating, host, beautiful, well, ...",0,0.133333,0.125,0.142857,0.0625,0.033835,0.133333,0,...,0,0,0,0,0,1,1,0,0,1
3,5682,"[[studio, clean, comfortable, nice, neighborho...",0,0.066667,0.125,0.142857,0.0625,0.009524,0.083333,1,...,1,0,0,0,0,1,0,0,0,1
4,6606,"[[first, time, seattle, loved, proximity, walk...",0,0.133333,0.125,0.142857,0.0625,0.02005,0.133333,0,...,0,0,0,0,0,1,0,0,0,1


In [104]:
# One-hot encode the city label. The dtype is pinned so the indicator columns
# are 0/1 integers on every pandas version (pandas 2.x defaults to booleans).
city = pd.get_dummies(final['city'], dtype=np.uint8)

In [105]:
# Preview the one-hot city indicator columns.
city.head()

Unnamed: 0,boston,seattle
0,1,0
1,0,1
2,1,0
3,0,1
4,0,1


In [106]:
# Append the one-hot city columns to the table, matching rows on the index.
final = final.merge(city, left_index=True, right_index=True)

In [107]:
# Write the final table to disk without the positional index.
# NOTE(review): 'comments' holds nested Python lists, which a CSV stores as
# their string representation — confirm whatever reads this file parses it back.
final.to_csv('airbnb_data.csv', index=False)