# AirBnB Seattle Project - Additional Data Processing
We'll need to do some additional data processing to reduce the feature set down and convert values into forms usable by modeling algorithms

In [255]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

import sys
sys.path.append("../scripts")

import airbnb_functions as abnb

In [256]:
# Import data
calendar = pickle.load(open('../data/pickles/calendar_munged.pkl', 'rb'))
listings = pickle.load(open('../data/pickles/listings_munged.pkl', 'rb'))
reviews = pickle.load(open('../data/pickles/reviews_munged.pkl', 'rb'))

In [266]:
listings.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 104 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   id                                3818 non-null   int64         
 1   name                              3818 non-null   object        
 2   summary                           3641 non-null   object        
 3   space                             3249 non-null   object        
 4   description                       3818 non-null   object        
 5   experiences_offered               3818 non-null   object        
 6   neighborhood_overview             2786 non-null   object        
 7   notes                             2212 non-null   object        
 8   transit                           2884 non-null   object        
 9   thumbnail_url                     3498 non-null   object        
 10  medium_url                        3498 non-null

In [None]:
def probabilistic_nan_replacement(series):
    """Replace NaNs based on the probability distribution of values present in data
    Parameters
    ----------
    series: pandas series
        Series where NaNs must be replaced

    Returns
    -------
    new_series: pandas series
        Series with nan values replaced
    """

    # Create list of values for the probability distribution and probabilities associated with each one
    values = list(series.value_counts.index)
    probs = list(series.value_counts()/series.value_counts().sum())

    nan_values = np.random.choice(values, size=series.isna().sum(), p=probs)

    nan_replacements = pd.Series(nan_values, index=series.loc[series.isna()].index)
    new_series = pd.concat([series.loc[series.notna()], nan_replacements]).sort_index()

    return new_series

def adjusted_r2(x_train, y_pred, y_test):
    r2 = r2_score(y_pred, y_test)
    n = x_train.shape[0]
    k = x_train.shape[1]
    adj_r2 = 1 - (((1-r2) * (n - 1)) / (n - k -1))
    return adj_r2

# Prediction Target - Occupancy Rate
Our prediction target is yearly occupancy rate for a listing so we'll need to add this rate to the dataset.

In [166]:
occupancy_rate = calendar[['listing_id', 'rented']].groupby('listing_id').mean()
listings = pd.merge(left=listings, right=occupancy_rates, left_on='id', right_index=True)
listings.rename(columns={'rented': 'occupancy_rate'}, inplace=True)
listings

Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,thumbnail_url,...,host_neighbourhood_nan,neighbourhood_nan,square_feet_nan,weekly_price_nan,security_deposit_nan,cleaning_fee_nan,len_summary,len_space,len_description,occupancy_rate
0,241032,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,,,,...,1,1,0,0,0,0,0,1000,1000,0.052055
1,953595,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",What's up with the free pillows? Our home was...,"Convenient bus stops are just down the block, ...",https://a0.muscache.com/ac/pictures/14409893/f...,...,1,1,0,1,1,1,249,1000,1000,0.202740
2,3308979,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,Our house is located just 5 short blocks to To...,A bus stop is just 2 blocks away. Easy bus a...,,...,1,1,0,0,1,1,241,1000,1000,0.397260
3,7421966,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,,,,...,1,1,0,1,0,0,243,0,243,0.608219
4,278830,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,Belltown,The nearest public transit bus (D Line) is 2 b...,,...,1,1,0,0,1,1,184,488,1000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,8101950,3BR Mountain View House in Seattle,Our 3BR/2BA house boasts incredible views of t...,"Our 3BR/2BA house bright, stylish, and wheelch...",Our 3BR/2BA house boasts incredible views of t...,none,We're located near lots of family fun. Woodlan...,,,https://a2.muscache.com/ac/pictures/103217071/...,...,1,1,0,0,0,1,230,1000,1000,0.912329
3814,8902327,Portage Bay View!-One Bedroom Apt,800 square foot 1 bedroom basement apartment w...,This space has a great view of Portage Bay wit...,800 square foot 1 bedroom basement apartment w...,none,The neighborhood is a quiet oasis that is clos...,This is a basement apartment in a newer reside...,Uber and Car2go are good options in Seattle. T...,https://a2.muscache.com/ac/pictures/626d4b1f-6...,...,1,1,0,0,1,1,235,282,1000,0.252055
3815,10267360,Private apartment view of Lake WA,"Very comfortable lower unit. Quiet, charming m...",,"Very comfortable lower unit. Quiet, charming m...",none,,,,https://a2.muscache.com/ac/pictures/a5974f04-2...,...,0,0,0,1,1,1,161,0,161,0.758904
3816,9604740,Amazing View with Modern Comfort!,Cozy studio condo in the heart on Madison Park...,Fully furnished unit to accommodate most needs...,Cozy studio condo in the heart on Madison Park...,none,Madison Park offers a peaceful slow pace upsca...,,Yes,https://a2.muscache.com/ac/pictures/202e4ad6-b...,...,0,0,0,0,1,1,273,662,1000,0.509589


# Convert Numeric & Categorical Columns
Several columns are either numeric values in string form that need conversion to numerical datatypes or categoricals that need to be encoded for use in modeling.

**Numerical Length Conversion**
- 'host_about'

**Numeric Conversion**
- 'host_response_rate'
- 'host_acceptance_rate'
- 'price'
- 'security_deposit'
- 'cleaning_fee'

**Categorical Conversion**
- 'host_response_time'
- 'host_is_superhost'
- 'host_has_profile_pic'
- 'host_identity_verified'
- 'neighbourhood'
- 'is_exact_location'
- 'property_type'
- 'bed_type'
- 'requires_license'
- 'instant_bookable'
- 'cancellation_policy'
- 'require_guest_profile_picture'
- 'require_guest_phone_verification'

## Numerical Length Conversion

In [167]:
# Convert NaN values to ''
listings.fillna({'host_about': ''}, inplace=True)

In [168]:
listings['len_host_about'] = listings['host_about'].apply(len)

In [169]:
list(listings.columns)

['id',
 'name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'guests_include

## Numeric Conversion
Conversion of numeric values in string form:
- 'host_response_rate'
- 'host_acceptance_rate'
- 'price'
- 'security_deposit'
- 'cleaning_fee'

### host_response_rate
Convert string percentages to numerical

#### NaN Conversion to Categorical
We actually missed adding a categorical for the NaNs in this column, so let's do that first.

In [170]:
listings['host_response_rate_nan'] = listings.host_response_rate.isna().replace({True:1, False:0})

#### NaN Handling
We'll have to figure out how to handle the NaNs here. Since there are 523 of them that's a huge amount of the dataset so converting them all to one value could skew the model. 

In [171]:
listings.host_response_rate.value_counts()

100%    2371
90%      165
80%      104
99%       78
88%       66
50%       52
94%       46
67%       41
89%       32
75%       32
86%       30
98%       28
96%       28
70%       25
97%       21
60%       20
83%       20
92%       18
93%       18
33%       14
40%       12
63%       11
71%        8
78%        8
95%        8
91%        5
43%        4
87%        3
25%        3
76%        3
64%        3
57%        2
82%        2
17%        2
58%        2
38%        1
31%        1
81%        1
55%        1
56%        1
69%        1
30%        1
65%        1
53%        1
68%        1
Name: host_response_rate, dtype: int64

- The vast majority of the responses are 100% so I could just make that the default fill in for NaNs, but I think there is a better way to approach this.  
- What we have above is a distribution of the values in the dataset, and because it comprises a vast majority of the set, we can assume that it is an accurate picture of the overall distribution. What we can do is **randomly assign one of these values based on the probability of it being one of these values, based on this distribution**.

In [172]:
percs = list(listings.host_response_rate.value_counts().index)
probs = list(listings.host_response_rate.value_counts()/listings.host_response_rate.value_counts().sum()) 

In [173]:
nan_percs = np.random.choice(percs, size=523, p=probs)

In [174]:
nan_replacements = pd.Series(nan_percs, index=listings.loc[listings.host_response_rate.isna()].index)
host_response_rate_nanfilled = pd.concat([listings.host_response_rate.loc[listings.host_response_rate.notna()], nan_replacements]).sort_index()

In [175]:
listings['host_response_rate_nanfilled'] = host_response_rate_nanfilled
listings

Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,thumbnail_url,...,weekly_price_nan,security_deposit_nan,cleaning_fee_nan,len_summary,len_space,len_description,occupancy_rate,len_host_about,host_response_rate_nan,host_response_rate_nanfilled
0,241032,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,,,,...,0,0,0,0,1000,1000,0.052055,372,0,96%
1,953595,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",What's up with the free pillows? Our home was...,"Convenient bus stops are just down the block, ...",https://a0.muscache.com/ac/pictures/14409893/f...,...,1,1,1,249,1000,1000,0.202740,74,0,98%
2,3308979,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,Our house is located just 5 short blocks to To...,A bus stop is just 2 blocks away. Easy bus a...,,...,0,1,1,241,1000,1000,0.397260,343,0,67%
3,7421966,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,,,,...,1,0,0,243,0,243,0.608219,0,1,100%
4,278830,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,Belltown,The nearest public transit bus (D Line) is 2 b...,,...,0,1,1,184,488,1000,0.000000,354,0,100%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,8101950,3BR Mountain View House in Seattle,Our 3BR/2BA house boasts incredible views of t...,"Our 3BR/2BA house bright, stylish, and wheelch...",Our 3BR/2BA house boasts incredible views of t...,none,We're located near lots of family fun. Woodlan...,,,https://a2.muscache.com/ac/pictures/103217071/...,...,0,0,1,230,1000,1000,0.912329,0,0,99%
3814,8902327,Portage Bay View!-One Bedroom Apt,800 square foot 1 bedroom basement apartment w...,This space has a great view of Portage Bay wit...,800 square foot 1 bedroom basement apartment w...,none,The neighborhood is a quiet oasis that is clos...,This is a basement apartment in a newer reside...,Uber and Car2go are good options in Seattle. T...,https://a2.muscache.com/ac/pictures/626d4b1f-6...,...,0,1,1,235,282,1000,0.252055,374,0,100%
3815,10267360,Private apartment view of Lake WA,"Very comfortable lower unit. Quiet, charming m...",,"Very comfortable lower unit. Quiet, charming m...",none,,,,https://a2.muscache.com/ac/pictures/a5974f04-2...,...,1,1,1,161,0,161,0.758904,0,1,63%
3816,9604740,Amazing View with Modern Comfort!,Cozy studio condo in the heart on Madison Park...,Fully furnished unit to accommodate most needs...,Cozy studio condo in the heart on Madison Park...,none,Madison Park offers a peaceful slow pace upsca...,,Yes,https://a2.muscache.com/ac/pictures/202e4ad6-b...,...,0,1,1,273,662,1000,0.509589,0,0,100%


In [176]:
listings.host_response_rate_nanfilled.isna().sum()

0

#### Convert to Numerical

In [177]:
host_response_rate_int32 = listings.host_response_rate_nanfilled.str.replace('\%', '').astype('int32')

In [178]:
listings['host_response_rate_nanfilled'] = host_response_rate_int32

### host_acceptance_rate

In [179]:
listings.host_acceptance_rate.isna().sum()

773

There are over 700 NaNs in host_acceptance_rate may have to perform similar operations to what we did above. As such it's best to convert this to a function. 

In [180]:
listings.host_acceptance_rate.value_counts()

100%    3044
0%         1
Name: host_acceptance_rate, dtype: int64

Actually we don't have to do anything. We already have a NaN categorical for the NaN vs not-NaN entries, and because ALL of the not-NaN entries are 100% this column really won't have any value for the model. **We'll go ahead and DROP `host_acceptance_rate` from the dataset**

### Drop host_acceptance_rate

In [181]:
listings.drop('host_acceptance_rate', axis=1, inplace=True)

In [183]:
listings.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3818 entries, 0 to 3817
Data columns (total 107 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   id                                3818 non-null   int64         
 1   name                              3818 non-null   object        
 2   summary                           3641 non-null   object        
 3   space                             3249 non-null   object        
 4   description                       3818 non-null   object        
 5   experiences_offered               3818 non-null   object        
 6   neighborhood_overview             2786 non-null   object        
 7   notes                             2212 non-null   object        
 8   transit                           2884 non-null   object        
 9   thumbnail_url                     3498 non-null   object        
 10  medium_url                        3498 non-null

### price, security_deposit & cleaning_fee Conversion to Numeric

In [184]:
# Convert prices
prices = listings.price.apply(lambda x: x[1:]).str.replace(',','').str.replace('.', '').astype('int32')
listings.price = prices

In [185]:
listings.security_deposit.isna().sum()

1952

In [186]:
listings.iloc[:, -20:]

Unnamed: 0,notes_nan,transit_nan,thumbnail_url_nan,medium_url_nan,xl_picture_url_nan,host_about_nan,host_acceptance_rate_nan,host_neighbourhood_nan,neighbourhood_nan,square_feet_nan,weekly_price_nan,security_deposit_nan,cleaning_fee_nan,len_summary,len_space,len_description,occupancy_rate,len_host_about,host_response_rate_nan,host_response_rate_nanfilled
0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,1000,1000,0.052055,372,0,96
1,1,1,1,1,1,1,1,1,1,0,1,1,1,249,1000,1000,0.202740,74,0,98
2,1,1,0,0,0,1,1,1,1,0,0,1,1,241,1000,1000,0.397260,343,0,67
3,0,0,0,0,0,0,0,1,1,0,1,0,0,243,0,243,0.608219,0,1,100
4,1,1,0,0,0,1,0,1,1,0,0,1,1,184,488,1000,0.000000,354,0,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,0,0,1,1,1,0,1,1,1,0,0,0,1,230,1000,1000,0.912329,0,0,99
3814,1,1,1,1,1,1,1,1,1,0,0,1,1,235,282,1000,0.252055,374,0,100
3815,0,0,1,1,1,0,0,0,0,0,1,1,1,161,0,161,0.758904,0,1,63
3816,0,1,1,1,1,0,0,0,0,0,0,1,1,273,662,1000,0.509589,0,0,100


Over half of the observations are missing security_deposit entries so because we've already created a NaN categorical feature for this **we'll drop security deposit from the dataset**.

#### Drop security_deposit

In [187]:
listings.drop('security_deposit', axis=1, inplace=True)

In [188]:
listings.cleaning_fee.isna().sum()

1030

Though it is not as many we are missing over 25\% of cleaning_fee observations too. **We'll drop cleaning fee as well and just keep its NaN categorical feature.**

#### Drop cleaning_fee

In [189]:
listings.drop('cleaning_fee', axis=1, inplace=True)

# Categorical Conversions
We'll convert the following columns to categoricals. Some are binary categoricals where we can just replace the two options with 1 or 0. Other's we'll have to use pandas getDummies function.  
- 'host_response_time'
- 'host_is_superhost'
- 'host_has_profile_pic'
- 'host_identity_verified'
- 'neighbourhood'
- 'is_location_exact'
- 'property_type'
- 'bed_type'
- 'requires_license'
- 'instant_bookable'
- 'cancellation_policy'
- 'require_guest_profile_picture'
- 'require_guest_phone_verification'

In [190]:
listings.loc[:, ['host_response_time', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
         'neighbourhood', 'is_location_exact', 'property_type', 'bed_type', 'requires_license', 
         'instant_bookable', 'cancellation_policy', 'require_guest_profile_picture', 
         'require_guest_phone_verification']]

Unnamed: 0,host_response_time,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood,is_location_exact,property_type,bed_type,requires_license,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification
0,within a few hours,f,t,t,Queen Anne,t,Apartment,Real Bed,f,f,moderate,f,f
1,within an hour,t,t,t,Queen Anne,t,Apartment,Real Bed,f,f,strict,t,t
2,within a few hours,f,t,t,Queen Anne,t,House,Real Bed,f,f,strict,f,f
3,,f,t,t,Queen Anne,t,Apartment,Real Bed,f,f,flexible,f,f
4,within an hour,f,t,t,Queen Anne,t,House,Real Bed,f,f,strict,f,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,within a few hours,f,t,t,Fremont,t,House,Real Bed,f,f,strict,f,f
3814,within an hour,f,t,t,Portage Bay,t,Apartment,Real Bed,f,f,moderate,f,f
3815,,f,t,f,,f,House,Real Bed,f,f,moderate,f,f
3816,within an hour,f,t,t,,f,Condominium,Real Bed,f,f,moderate,f,f


## Binary Categorical Conversion
- 'host_is_superhost'
- 'host_has_profile_pic'
- 'host_identity_verified'
- 'is_location_exact'
- 'requires_license'
- 'instant_bookable'
- 'require_guest_profile_picture'
- 'require_guest_phone_verification'

### Check for NaNs 

In [191]:
listings[['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
         'is_location_exact', 'requires_license', 'instant_bookable', 'require_guest_profile_picture', 
         'require_guest_phone_verification']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3818 entries, 0 to 3817
Data columns (total 8 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   host_is_superhost                 3816 non-null   object
 1   host_has_profile_pic              3816 non-null   object
 2   host_identity_verified            3816 non-null   object
 3   is_location_exact                 3818 non-null   object
 4   requires_license                  3818 non-null   object
 5   instant_bookable                  3818 non-null   object
 6   require_guest_profile_picture     3818 non-null   object
 7   require_guest_phone_verification  3818 non-null   object
dtypes: object(8)
memory usage: 268.5+ KB


In [192]:
binary_features = listings.replace({'t':1, 'f':0})[['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
         'is_location_exact', 'requires_license', 'instant_bookable', 'require_guest_profile_picture', 
         'require_guest_phone_verification']].astype('Int8')

In [193]:
listings.drop(['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
         'is_location_exact', 'requires_license', 'instant_bookable', 'require_guest_profile_picture', 
         'require_guest_phone_verification'], axis=1, inplace=True)

In [194]:
listings = pd.concat([listings, binary_features], axis=1)

## Multiple Categorical Conversion
- 'host_response_time'
- 'neighbourhood'
- 'property_type'
- 'bed_type'
- 'cancellation_policy'

### Investigate Categories
If the number of categories is too large, it may not make sense to add it to the dataset as a categorical.

In [195]:
listings.host_response_time.value_counts()

within an hour        1692
within a few hours     968
within a day           597
a few days or more      38
Name: host_response_time, dtype: int64

In [196]:
listings.neighbourhood.value_counts()

Capitol Hill          351
Ballard               213
Belltown              204
Minor                 192
Queen Anne            187
                     ... 
Pike Market             2
South Beacon Hill       2
North College Park      1
Roxhill                 1
Fairmount Park          1
Name: neighbourhood, Length: 81, dtype: int64

In [197]:
listings.property_type.value_counts()

House              1733
Apartment          1708
Townhouse           118
Condominium          91
Loft                 40
Bed & Breakfast      37
Other                22
Cabin                21
Camper/RV            13
Bungalow             13
Boat                  8
Tent                  5
Treehouse             3
Dorm                  2
Chalet                2
Yurt                  1
Name: property_type, dtype: int64

In [198]:
listings.bed_type.value_counts()

Real Bed         3657
Futon              74
Pull-out Sofa      47
Airbed             27
Couch              13
Name: bed_type, dtype: int64

In [199]:
listings.cancellation_policy.value_counts()

strict      1417
moderate    1251
flexible    1150
Name: cancellation_policy, dtype: int64

Although `neighbourhood` really explodes the feature set, adding 81 columns, we did see that neighborhood seemed to make a difference in terms of occupancy rate, so we'll bear this in mind and keep it for now.

In [200]:
listings = pd.get_dummies(listings, columns=['host_response_time', 'neighbourhood', 'property_type',
                                                    'bed_type', 'cancellation_policy'], drop_first=True)

In [201]:
listings

Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,thumbnail_url,...,property_type_Tent,property_type_Townhouse,property_type_Treehouse,property_type_Yurt,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed,cancellation_policy_moderate,cancellation_policy_strict
0,241032,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,,,,...,0,0,0,0,0,0,0,1,1,0
1,953595,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",What's up with the free pillows? Our home was...,"Convenient bus stops are just down the block, ...",https://a0.muscache.com/ac/pictures/14409893/f...,...,0,0,0,0,0,0,0,1,0,1
2,3308979,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,Our house is located just 5 short blocks to To...,A bus stop is just 2 blocks away. Easy bus a...,,...,0,0,0,0,0,0,0,1,0,1
3,7421966,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,,,,...,0,0,0,0,0,0,0,1,0,0
4,278830,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,Belltown,The nearest public transit bus (D Line) is 2 b...,,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,8101950,3BR Mountain View House in Seattle,Our 3BR/2BA house boasts incredible views of t...,"Our 3BR/2BA house bright, stylish, and wheelch...",Our 3BR/2BA house boasts incredible views of t...,none,We're located near lots of family fun. Woodlan...,,,https://a2.muscache.com/ac/pictures/103217071/...,...,0,0,0,0,0,0,0,1,0,1
3814,8902327,Portage Bay View!-One Bedroom Apt,800 square foot 1 bedroom basement apartment w...,This space has a great view of Portage Bay wit...,800 square foot 1 bedroom basement apartment w...,none,The neighborhood is a quiet oasis that is clos...,This is a basement apartment in a newer reside...,Uber and Car2go are good options in Seattle. T...,https://a2.muscache.com/ac/pictures/626d4b1f-6...,...,0,0,0,0,0,0,0,1,1,0
3815,10267360,Private apartment view of Lake WA,"Very comfortable lower unit. Quiet, charming m...",,"Very comfortable lower unit. Quiet, charming m...",none,,,,https://a2.muscache.com/ac/pictures/a5974f04-2...,...,0,0,0,0,0,0,0,1,1,0
3816,9604740,Amazing View with Modern Comfort!,Cozy studio condo in the heart on Madison Park...,Fully furnished unit to accommodate most needs...,Cozy studio condo in the heart on Madison Park...,none,Madison Park offers a peaceful slow pace upsca...,,Yes,https://a2.muscache.com/ac/pictures/202e4ad6-b...,...,0,0,0,0,0,0,0,1,1,0


# Remove Unnecessary Columns
There are many columns that we can remove by default, as they won't be used in the prediction. Generally the columns to remove fall into these categories:
- Unnecessary ID fields
- Freeform text fields
- Excessive NaNs
- Non-useful features

In [202]:
# Create df of each feature, its data type, and an example of an entry for easier evaluation of features
feature_data_types = list(zip(list(listings.columns), list(listings.dtypes), [y for x, y in list(listings.iloc[1].iteritems())]))
feature_data_types

[('id', dtype('int64'), 953595),
 ('name', dtype('O'), 'Bright & Airy Queen Anne Apartment'),
 ('summary',
  dtype('O'),
  "Chemically sensitive? We've removed the irritants triggering allergy or asthma attacks, like carpeting, forced air & used pillows, all culprits that harbor fungus, mold & bacteria.  No smoking, no pets.  Designed for healthy living, so breathe easy."),
 ('space',
  dtype('O'),
  "Beautiful, hypoallergenic apartment in an extremely safe, quiet and pedestrian-friendly section of Queen Anne.  A leafy-green location that puts the best of Seattle at your doorstep. Free WiFi, free parking, ...even free pillows!  What's special about this place? Hypo-allergenic bedding, mattresses/covers and new pillows with each rental. (So feel free to take your new pillows with you!)  A beautiful restoration mixing period details & modern sensibilities: Hardwood floors throughout, white subway tile, low/no VOCs & non-toxic paints keep this home-away-from-home clean, smart and healthy.

In [203]:
# List of columns to drop
drop_columns = ['name', 'summary', 'space', 'description', 'neighborhood_overview', 'notes', 'transit', 
'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_name', 'host_since', 
 'host_location', 'host_about', 'host_thumbnail_url', 'host_picture_url', 'host_verifications',
 'host_neighbourhood', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code',
 'latitude', 'longitude', 'weekly_price', 'monthly_price', 'extra_people', 'calendar_updated',
 'has_availability', 'availability_30', 'availability_60', 'availability_90', 'availability_365',
 'calendar_last_scraped', 'first_review', 'last_review', 'jurisdiction_names']

In [204]:
listings.drop(drop_columns, axis=1, inplace=True)
listings

Unnamed: 0,id,experiences_offered,host_response_rate,host_listings_count,host_total_listings_count,street,neighbourhood_cleansed,neighbourhood_group_cleansed,country,room_type,...,property_type_Tent,property_type_Townhouse,property_type_Treehouse,property_type_Yurt,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed,cancellation_policy_moderate,cancellation_policy_strict
0,241032,none,96%,3.0,3.0,"Gilman Dr W, Seattle, WA 98119, United States",West Queen Anne,Queen Anne,United States,Entire home/apt,...,0,0,0,0,0,0,0,1,1,0
1,953595,none,98%,6.0,6.0,"7th Avenue West, Seattle, WA 98119, United States",West Queen Anne,Queen Anne,United States,Entire home/apt,...,0,0,0,0,0,0,0,1,0,1
2,3308979,none,67%,2.0,2.0,"West Lee Street, Seattle, WA 98119, United States",West Queen Anne,Queen Anne,United States,Entire home/apt,...,0,0,0,0,0,0,0,1,0,1
3,7421966,none,,1.0,1.0,"8th Avenue West, Seattle, WA 98119, United States",West Queen Anne,Queen Anne,United States,Entire home/apt,...,0,0,0,0,0,0,0,1,0,0
4,278830,none,100%,2.0,2.0,"14th Ave W, Seattle, WA 98119, United States",West Queen Anne,Queen Anne,United States,Entire home/apt,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,8101950,none,99%,354.0,354.0,"Northwest 48th Street, Seattle, WA 98107, Unit...",Fremont,Other neighborhoods,United States,Entire home/apt,...,0,0,0,0,0,0,0,1,0,1
3814,8902327,none,100%,1.0,1.0,"Fuhrman Avenue East, Seattle, WA 98102, United...",Portage Bay,Capitol Hill,United States,Entire home/apt,...,0,0,0,0,0,0,0,1,1,0
3815,10267360,none,,1.0,1.0,"South Laurel Street, Seattle, WA 98178, United...",Rainier Beach,Rainier Valley,United States,Entire home/apt,...,0,0,0,0,0,0,0,1,1,0
3816,9604740,none,100%,1.0,1.0,"43rd Avenue East, Seattle, WA 98112, United St...",Madison Park,Capitol Hill,United States,Entire home/apt,...,0,0,0,0,0,0,0,1,1,0


In [205]:
# Check the variables again to see what was missed
listings.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3818 entries, 0 to 3817
Data columns (total 164 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   id                                       3818 non-null   int64  
 1   experiences_offered                      3818 non-null   object 
 2   host_response_rate                       3295 non-null   object 
 3   host_listings_count                      3816 non-null   float64
 4   host_total_listings_count                3816 non-null   float64
 5   street                                   3818 non-null   object 
 6   neighbourhood_cleansed                   3818 non-null   object 
 7   neighbourhood_group_cleansed             3818 non-null   object 
 8   country                                  3818 non-null   object 
 9   room_type                                3818 non-null   object 
 10  accommodates                             3818 n

In [206]:
feature_data_types = list(zip(list(listings.columns), list(listings.dtypes), [y for x, y in list(listings.iloc[1].iteritems())]))
feature_data_types

[('id', dtype('int64'), 953595),
 ('experiences_offered', dtype('O'), 'none'),
 ('host_response_rate', dtype('O'), '98%'),
 ('host_listings_count', dtype('float64'), 6.0),
 ('host_total_listings_count', dtype('float64'), 6.0),
 ('street', dtype('O'), '7th Avenue West, Seattle, WA 98119, United States'),
 ('neighbourhood_cleansed', dtype('O'), 'West Queen Anne'),
 ('neighbourhood_group_cleansed', dtype('O'), 'Queen Anne'),
 ('country', dtype('O'), 'United States'),
 ('room_type', dtype('O'), 'Entire home/apt'),
 ('accommodates', dtype('int64'), 4),
 ('bathrooms', dtype('float64'), 1.0),
 ('bedrooms', dtype('float64'), 1.0),
 ('beds', dtype('float64'), 1.0),
 ('amenities',
  dtype('O'),
  '{TV,Internet,"Wireless Internet",Kitchen,"Free Parking on Premises","Buzzer/Wireless Intercom",Heating,"Family/Kid Friendly",Washer,Dryer,"Smoke Detector","Carbon Monoxide Detector","First Aid Kit","Safety Card","Fire Extinguisher",Essentials}'),
 ('square_feet', dtype('float64'), nan),
 ('price', dtyp

# Additional columns to process
Looks like there are some columns that were missed.
- experiences_offered - categorical conversion
- host_response_rate - drop
- street - drop
- neighbourhood_cleansed - drop
- neighbourhood_group_cleansed - drop
- country - drop
- room_type - categorical conversion
- square_feet - drop
- amenities - drop
- review_scores_accuracy - drop
- review_scores_cleanliness - drop
- review_scores_location - drop
- review_scores_value - drop
- reviews_per_month - drop
- bathrooms - impute
- bedrooms - impute
- beds - impute

## Categorical Conversions
- experiences_offered
- room_type

In [207]:
listings.experiences_offered.value_counts()

none    3818
Name: experiences_offered, dtype: int64

- Actually it looks like we can just drop experiences_offered, since they are all 'none'

### Drop experiences_offerend

In [208]:
listings.drop('experiences_offered', axis=1, inplace=True)

In [209]:
listings.room_type.value_counts()

Entire home/apt    2541
Private room       1160
Shared room         117
Name: room_type, dtype: int64

In [210]:
listings = pd.get_dummies(listings, columns=['room_type'], drop_first=True)

## Additional Drops
While the additional review scores could potentially be useful, the all feed into the overall review score (review_scores_rating) so I'll used that as a proxy. For host_response_rate I created a separate imputed version of that column so I can drop it. 
- host_response_rate - drop
- street - drop
- neighbourhood_cleansed - drop
- neighbourhood_group_cleansed - drop
- country - drop
- square_feet - drop
- amenities - drop
- review_scores_accuracy - drop
- review_scores_cleanliness - drop
- review_scores_location - drop
- review_scores_value - drop
- reviews_per_month - drop

In [211]:
listings.drop(['host_response_rate', 'street', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
               'country', 'amenities', 'review_scores_accuracy', 'review_scores_cleanliness', 
               'review_scores_location', 'review_scores_value', 'reviews_per_month', 
                'square_feet'], axis=1, inplace=True)

In [231]:
feature_data_types_df.loc[feature_data_types_df.null_entries > 1]

Unnamed: 0,feature_names,data_types,observations,null_entries
1,"(host_listings_count,)",float64,6.0,2
2,"(host_total_listings_count,)",float64,6.0,2
12,"(review_scores_rating,)",float64,96.0,647
13,"(review_scores_checkin,)",float64,10.0,658
14,"(review_scores_communication,)",float64,10.0,651
38,"(host_is_superhost,)",Int8,1.0,2
39,"(host_has_profile_pic,)",Int8,1.0,2
40,"(host_identity_verified,)",Int8,1.0,2


## Impute Values
The features below have NaNs, but a very low number of them, so it's probably safe to simply impute their mean values, but I'll take a closer look before imputing to be sure.
- bathrooms
- bedrooms
- beds

### Investigate bathrooms

In [212]:
listings.bathrooms.value_counts()

1.0    2882
2.0     373
1.5     248
2.5     124
3.0      64
3.5      57
0.5      31
0.0       8
4.0       8
4.5       3
5.0       2
8.0       2
Name: bathrooms, dtype: int64

In [213]:
listings.bathrooms.isna().sum()

16

### bathrooms - Impute with distribution imputation method

In [214]:
bathrooms_impute = probabilistic_nan_replacement(listings.bathrooms)
bathrooms_impute

0       1.0
1       1.0
2       4.5
3       1.0
4       2.0
       ... 
3813    2.0
3814    1.0
3815    1.0
3816    1.0
3817    1.5
Length: 3818, dtype: float64

In [215]:
bathrooms_impute.isna().sum()

0

In [216]:
listings.bathrooms = bathrooms_impute

### Investigate bedrooms

In [217]:
listings.bedrooms.value_counts()

1.0    2417
2.0     640
0.0     372
3.0     283
4.0      69
5.0      24
6.0       6
7.0       1
Name: bedrooms, dtype: int64

In [218]:
listings.bedrooms.isna().sum()

6

In [222]:
bedrooms_impute = probabilistic_nan_replacement(listings.bedrooms)
listings.bedrooms = bedrooms_impute

### Investigate beds

In [220]:
listings.beds.value_counts()

1.0     2201
2.0      912
3.0      433
4.0      152
5.0       73
6.0       21
7.0       14
8.0        4
9.0        4
10.0       2
15.0       1
Name: beds, dtype: int64

In [221]:
listings.beds.isna().sum()

1

In [223]:
beds_impute = probabilistic_nan_replacement(listings.beds)
listings.beds = beds_impute

## Check listings
Check listings to confirm all columns have been handled and have no NaNs

In [238]:
null_columns = [entry for entry in list(listings.columns) if listings[entry].isna().sum() > 0]
listings[null_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3816 entries, 0 to 3817
Data columns (total 3 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   review_scores_rating         3171 non-null   float64
 1   review_scores_checkin        3160 non-null   float64
 2   review_scores_communication  3167 non-null   float64
dtypes: float64(3)
memory usage: 119.2 KB


In [230]:
feature_data_types_df.loc[feature_data_types_df.null_entries > 1]

Unnamed: 0,feature_names,data_types,observations,null_entries
1,"(host_listings_count,)",float64,6.0,2
2,"(host_total_listings_count,)",float64,6.0,2
12,"(review_scores_rating,)",float64,96.0,647
13,"(review_scores_checkin,)",float64,10.0,658
14,"(review_scores_communication,)",float64,10.0,651
38,"(host_is_superhost,)",Int8,1.0,2
39,"(host_has_profile_pic,)",Int8,1.0,2
40,"(host_identity_verified,)",Int8,1.0,2


### Drop NaN columns
For the sake of simplicity I'll just drop the rows where the columns have only 2 missing entries. 

In [234]:
listings.dropna(subset=['host_listings_count', 'host_total_listings_count', 'host_is_superhost',
                       'host_has_profile_pic', 'host_identity_verified'], inplace=True)

In [239]:
# Check results
null_columns = [entry for entry in list(listings.columns) if listings[entry].isna().sum() > 0]
listings[null_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3816 entries, 0 to 3817
Data columns (total 3 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   review_scores_rating         3171 non-null   float64
 1   review_scores_checkin        3160 non-null   float64
 2   review_scores_communication  3167 non-null   float64
dtypes: float64(3)
memory usage: 119.2 KB


### Drop Review Columns
Again, the only review column I'm interested in is review_scores_rating, so we'll drop the other two.

In [241]:
listings.drop(['review_scores_checkin', 'review_scores_communication'], axis=1, inplace=True)
null_columns = [entry for entry in list(listings.columns) if listings[entry].isna().sum() > 0]
listings[null_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3816 entries, 0 to 3817
Data columns (total 1 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   review_scores_rating  3171 non-null   float64
dtypes: float64(1)
memory usage: 59.6 KB


### Impute review_scores_rating
This may be a tricky feature to impute for a couple of reasons:
- One, it is the direct evaluation of a user's feelings on the experience it is one of the values that I want least to tamper with.
- Two, there are A LOT of missing values. Imputing incorrectly could significantly affect the effectiveness of this feature. 

We'll first take a look at the feature and what the distribution is. 

In [242]:
listings.review_scores_rating.value_counts()

100.0    781
98.0     292
96.0     278
97.0     266
95.0     260
94.0     198
93.0     184
99.0     146
90.0     138
80.0     111
92.0     105
91.0      97
87.0      55
89.0      53
88.0      47
85.0      24
84.0      23
86.0      17
83.0      13
60.0      12
82.0      10
70.0       8
73.0       7
76.0       5
72.0       4
78.0       4
77.0       4
40.0       4
75.0       3
71.0       3
81.0       3
68.0       2
74.0       2
67.0       2
79.0       2
66.0       2
64.0       1
20.0       1
53.0       1
65.0       1
55.0       1
57.0       1
Name: review_scores_rating, dtype: int64

Many of the values are clustered in the high 90s range, so an imputation will likely put most values in that range as well. While there is a distinct possibiltiy of a "bad roll" with a random probabilistic imputation, I still think it would be far better than imputing the median or mean.  

**With more time I might try to regress the values, but we need to get moving so I'll use the probabilistic impute approach.**

In [243]:
review_ratings_impute = probabilistic_nan_replacement(listings.review_scores_rating)
listings.review_scores_rating = review_ratings_impute
listings.review_scores_rating.isna().sum()

0

In [244]:
# Check for NaNs
null_columns = [entry for entry in list(listings.columns) if listings[entry].isna().sum() > 0]
listings[null_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3816 entries, 0 to 3817
Empty DataFrame

In [246]:
listings.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3816 entries, 0 to 3817
Data columns (total 150 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   id                                       3816 non-null   int64  
 1   host_listings_count                      3816 non-null   float64
 2   host_total_listings_count                3816 non-null   float64
 3   accommodates                             3816 non-null   int64  
 4   bathrooms                                3816 non-null   float64
 5   bedrooms                                 3816 non-null   float64
 6   beds                                     3816 non-null   float64
 7   price                                    3816 non-null   int32  
 8   guests_included                          3816 non-null   int64  
 9   minimum_nights                           3816 non-null   int64  
 10  maximum_nights                           3816 n

All clear.

# Drop id column
Forgot to drop the id column. It is not needed. 

In [250]:
listings.drop('id', axis=1, inplace=True)

# Pickle Processed listings DataFrame

In [251]:
pickle.dump(listings, open('../data/pickles/listings_munged2.pkl', 'wb'))