# Data preparation - Seattle AirBnB

In [38]:
# Importing standard libraries

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
# Read the three AirBnB CSV files into DataFrames.
# The files are read in the order listed, one DataFrame per file.

calendar_df, listing_df, reviews_df = (
    pd.read_csv(csv_name)
    for csv_name in ('calendar.csv', 'listings.csv', 'reviews.csv')
)

As we've seen in the __Data understanding__ step: 

- Some of the attributes from Seattle dataset have missing data
    - Luckily, only a small proportion of the attributes we are going to use in the model have around 10% missing values; the rest of the data is complete.
- We've identified highly correlated attributes from the same categories
    - Those attributes will be excluded from the training model 

It certainly does not make sense to use all categorical and numerical values for our data model, so we will focus on the most reasonable predictors. For instance, for numerical variables we will exclude all highly correlated attributes; this will save us from multicollinearity issues later. (See Data Understanding - correlation heatmap.)

As for the categorical variables, we will apply a bit of common sense and exclude attributes such as state, city, picture_url, name, etc.

In [42]:
# First cleaning pass: remove attributes that are not sensible predictors
# (e.g. state, city, picture_url, name). The raw listing_df is left untouched;
# the result is stored under a new name.

non_predictor_columns = [
    'amenities', 'calendar_updated', 'cancellation_policy', 'city', 'space',
    'neighborhood_overview', 'notes', 'medium_url', 'xl_picture_url',
    'host_url', 'is_location_exact', 'country', 'country_code', 'description',
    'experiences_offered', 'calendar_last_scraped', 'host_has_profile_pic',
    'host_id', 'host_name', 'host_picture_url', 'host_about',
    'host_verifications', 'host_thumbnail_url', 'listing_url', 'market',
    'name', 'neighbourhood_group_cleansed', 'jurisdiction_names',
    'require_guest_profile_picture', 'smart_location', 'picture_url',
    'scrape_id', 'state', 'street', 'summary', 'thumbnail_url', 'zipcode',
]

listings_clean_df = listing_df.drop(columns=non_predictor_columns)

# Display the column labels that survived this pass.
listings_clean_df.columns

Index(['id', 'last_scraped', 'transit', 'host_since', 'host_location',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'bed_type', 'square_feet', 'price', 'weekly_price', 'monthly_price',
       'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
       'minimum_nights', 'maximum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_s

In [43]:
# Second cleaning pass: remove further attributes that will not be used in the model.
# NOTE(review): this list looks like the highly correlated attributes mentioned in
# the narrative (plus the `id` key) rather than "categorical variables" - confirm.

columns_to_exclude = [
    'id', 'host_response_rate', 'accommodates', 'bedrooms',
    'host_listings_count', 'host_total_listings_count',
    'guests_included', 'availability_60', 'availability_90', 'availability_365',
    'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_value',
]

# listings_clean_df is reassigned from itself, so running this cell a second time
# would raise KeyError for the already-removed columns. errors='ignore' makes the
# cell safe to re-run; on the first run the result is unchanged.
listings_clean_df = listings_clean_df.drop(columns=columns_to_exclude, errors='ignore')

# Display the remaining column labels.
listings_clean_df.columns

Index(['last_scraped', 'transit', 'host_since', 'host_location',
       'host_response_time', 'host_acceptance_rate', 'host_is_superhost',
       'host_neighbourhood', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
       'room_type', 'bathrooms', 'beds', 'bed_type', 'square_feet', 'price',
       'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee',
       'extra_people', 'minimum_nights', 'maximum_nights', 'has_availability',
       'availability_30', 'number_of_reviews', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_location', 'requires_license',
       'license', 'instant_bookable', 'require_guest_phone_verification',
       'calculated_host_listings_count', 'reviews_per_month'],
      dtype='object')

In [44]:
# Collect the attributes in which more than 10% of the values are missing.

# Fraction of null entries per column.
missing_share = listings_clean_df.isnull().mean()

missing_10_pct_values = set(missing_share[missing_share > 0.1].index)

missing_10_pct_values

{'cleaning_fee',
 'first_review',
 'host_acceptance_rate',
 'host_response_time',
 'last_review',
 'license',
 'monthly_price',
 'neighbourhood',
 'review_scores_location',
 'review_scores_rating',
 'reviews_per_month',
 'security_deposit',
 'square_feet',
 'transit',
 'weekly_price'}

In [45]:
# Drop columns with more than 10% missing values.
# NOTE(review): host_response_time, review_scores_rating and review_scores_location
# are also reported above the 10% threshold but are not in this list, so they stay
# in the frame - presumably kept on purpose as predictors; confirm.

sparse_columns = [
    'cleaning_fee', 'first_review', 'host_acceptance_rate',
    'last_review', 'license', 'monthly_price', 'neighbourhood',
    'reviews_per_month', 'security_deposit', 'square_feet',
    'transit', 'weekly_price',
]

# listings_clean_df is reassigned from itself, so a second execution of this cell
# would raise KeyError for the already-removed columns. errors='ignore' makes the
# cell safe to re-run; on the first run the result is unchanged.
listings_clean_df = listings_clean_df.drop(columns=sparse_columns, errors='ignore')

listings_clean_df

Unnamed: 0,last_scraped,host_since,host_location,host_response_time,host_is_superhost,host_neighbourhood,host_identity_verified,neighbourhood_cleansed,latitude,longitude,...,maximum_nights,has_availability,availability_30,number_of_reviews,review_scores_rating,review_scores_location,requires_license,instant_bookable,require_guest_phone_verification,calculated_host_listings_count
0,2016-01-04,2011-08-11,"Seattle, Washington, United States",within a few hours,f,Queen Anne,t,West Queen Anne,47.636289,-122.371025,...,365,t,14,207,95.0,9.0,f,f,f,2
1,2016-01-04,2013-02-21,"Seattle, Washington, United States",within an hour,t,Queen Anne,t,West Queen Anne,47.639123,-122.365666,...,90,t,13,43,96.0,10.0,f,f,t,6
2,2016-01-04,2014-06-12,"Seattle, Washington, United States",within a few hours,f,Queen Anne,t,West Queen Anne,47.629724,-122.369483,...,30,t,1,20,97.0,10.0,f,f,f,2
3,2016-01-04,2013-11-06,"Seattle, Washington, United States",,f,Queen Anne,t,West Queen Anne,47.638473,-122.369279,...,1125,t,0,0,,,f,f,f,1
4,2016-01-04,2011-11-29,"Seattle, Washington, United States",within an hour,f,Queen Anne,t,West Queen Anne,47.632918,-122.372471,...,1125,t,30,38,92.0,9.0,f,f,f,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,2016-01-04,2015-04-13,US,within a few hours,f,Holly,t,Fremont,47.664295,-122.359170,...,1125,t,18,1,80.0,10.0,f,f,f,8
3814,2016-01-04,2015-10-14,"Seattle, Washington, United States",within an hour,f,Portage Bay,t,Portage Bay,47.649552,-122.318309,...,29,t,6,2,100.0,10.0,f,f,f,1
3815,2016-01-04,2015-12-30,US,,f,,f,Rainier Beach,47.508453,-122.240607,...,7,t,29,0,,,f,f,f,1
3816,2016-01-04,2015-01-03,"Tacoma, Washington, United States",within an hour,f,,t,Madison Park,47.632335,-122.275530,...,1125,t,30,0,,,f,f,f,1


We can combine the two columns last_scraped and host_since to calculate how long each host has been on the market.

In [46]:
# Show the column labels currently held by listings_clean_df.
listings_clean_df.columns

Index(['last_scraped', 'host_since', 'host_location', 'host_response_time',
       'host_is_superhost', 'host_neighbourhood', 'host_identity_verified',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
       'room_type', 'bathrooms', 'beds', 'bed_type', 'price', 'extra_people',
       'minimum_nights', 'maximum_nights', 'has_availability',
       'availability_30', 'number_of_reviews', 'review_scores_rating',
       'review_scores_location', 'requires_license', 'instant_bookable',
       'require_guest_phone_verification', 'calculated_host_listings_count'],
      dtype='object')