In [447]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

In [448]:
# Read in data
dat = pd.read_csv('listings.csv')
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6366 entries, 0 to 6365
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            6366 non-null   int64  
 1   listing_url                                   6366 non-null   object 
 2   scrape_id                                     6366 non-null   float64
 3   last_scraped                                  6366 non-null   object 
 4   name                                          6366 non-null   object 
 5   description                                   6352 non-null   object 
 6   neighborhood_overview                         4663 non-null   object 
 7   picture_url                                   6366 non-null   object 
 8   host_id                                       6366 non-null   int64  
 9   host_url                                      6366 non-null   o

<h3>Unnecessary Columns</h3>

<h4>Drop off irrelevant columns:</h4>

* URLs will not be useful
* Empty columns: 'neighborhood_group_cleansed', 'bathrooms', 'calendar_updated'
* 'neighborhood' column only has blank values or 'Chicago, Illinois, United States' value
* 'scrape_id' is all the same value and not useful for our model
* 'last_scraped', 'calendar_last_scraped' will not be useful

In [449]:
dat = dat.drop(['scrape_id','listing_url','host_thumbnail_url','host_picture_url','picture_url',
               'neighbourhood_group_cleansed','bathrooms','calendar_updated','neighbourhood',
               'last_scraped', 'calendar_last_scraped'], axis = 1)

<h3>Data Cleansing</h3>

First, we will split the bathroom text column into two: one containing a float variable for the number of bathrooms, and the other an additional descriptor of the bathroom (shared/private).

<h4>Bathrooms Column:</h4>

In [451]:
# First let us make all text lowercase to simplify string manipulation

dat['bathrooms_text'] = dat['bathrooms_text'].str.lower()

# Next we must convert and text 'half' to 0.5 so it is included in the subsequent number extraction

dat['bathrooms_text'] = dat['bathrooms_text'].str.replace(r'(half)+','0.5', regex = True)

# Then extract the numbers into the new 'bathrooms' float32 data type column

dat['bathrooms'] = dat['bathrooms_text'].str.extract(r'(\d+\.?\d*)', expand = True).astype(np.float32)

# This leaves us with only float and NaN values

dat['bathrooms'].unique()

array([ 1. ,  2. ,  1.5,  3. ,  2.5,  0. ,  3.5, 11. ,  5. ,  nan,  0.5,
        4. ,  4.5, 11.5,  6.5,  7. ,  5.5,  6. ,  8. , 12.5, 10. ],
      dtype=float32)

<h4>Bathrooms Text Column:</h4>

In [452]:
# Remove the text 'bath', unnecessary symbols and whitespace, then extract the leftover text

dat['bathrooms_text'] = dat['bathrooms_text'].str.replace(r'(bath)s*|(Bath)s*','', regex = True)
dat['bathrooms_text'] = dat['bathrooms_text'].str.replace(r' +|\.+|\-+','', regex = True)
dat['bathrooms_text'] = dat['bathrooms_text'].str.extract(r'(\D+)')

In [459]:
# This leaves only two bathroom descriptors and NaN values

dat['bathrooms_text'].unique()

array(['shared', nan, 'private'], dtype=object)

<h4>DateTime Columns:</h4>

In [455]:
# Convert dates to datetime data type

dat['host_since'] = pd.to_datetime(dat['host_since'])
dat['first_review'] = pd.to_datetime(dat['first_review'])
dat['last_review'] = pd.to_datetime(dat['last_review'])

<h4>Rate Columns:</h4>

In [456]:
# Convert host response rate and acceptance rate columns into float

dat['host_response_rate'] = dat['host_response_rate'].str.replace(r'(\D)','', regex = True).astype(np.float32)/100
dat['host_acceptance_rate'] = dat['host_acceptance_rate'].str.replace(r'(\D)','', regex = True).astype(np.float32)/100

<h4>Boolean Columns:</h4>

In [460]:
# Map superhost column to boolean values

# Need to check this, I think empty values are converted to False:
dat['host_is_superhost'] = dat['host_is_superhost'].map({'t':True,'f':False}).astype('bool')

# These do not convert to boolean, but keeps the empty/nan values:
dat['host_has_profile_pic'] = dat['host_has_profile_pic'].map({'t':True,'f':False})
dat['host_identity_verified'] = dat['host_identity_verified'].map({'t':True,'f':False})

In [458]:
dat['host_is_superhost'].unique()

array([ True, False])