# Imports

In [1]:
#Enable matplotlib to display in jupyter notebook & import it
%matplotlib inline

import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import pandas as pd
import numpy as np
import re
from geopy.geocoders import Nominatim #used in filling missing zipcodes


# Read and Clean Listings.csv

In [2]:
#listings.csv READING

#Path of the raw listings export
LISTINGS = 'data/listings.csv'

#Only these csv columns are loaded (grouped by theme, order unchanged)
listings_cols = [
    #identifiers
    'id', 'host_id',
    #location
    'neighbourhood_cleansed', 'zipcode', 'latitude', 'longitude',
    #what is being rented
    'property_type', 'room_type', 'accommodates', 'bathrooms', 'amenities',
    #pricing
    'price', 'cleaning_fee',
    #reviews
    'number_of_reviews', 'first_review',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    #host activity
    'calculated_host_listings_count', 'reviews_per_month',
    #capacity and booking rules
    'bedrooms', 'beds', 'cancellation_policy', 'instant_bookable',
    'minimum_nights',
]

#csv column name -> name used in the rest of the notebook
rename_dict = dict(id='listing_id', price='listed_price')

#Load the selected columns, apply the renames and index the frame by listing_id
listings = (
    pd.read_csv(LISTINGS, usecols=listings_cols)
      .rename(columns=rename_dict)
      .set_index('listing_id')
)



#############################
#         Cleaning          #
#############################

# 'zipcode' ##########
#Helper used by the zipcode cleaning step below to look up missing zipcodes
#Matches a 5 digit zipcode that is directly followed by ', United' in an address string
_ZIP_IN_ADDRESS = re.compile(r'(\s)([0-9]{5})(,\sUnited)')

def latLonToZip(lat, lon, geolocator=None):
    '''Take in a latitude and longitude and return the zipcode for that location.

    Parameters
    ----------
    lat, lon : latitude and longitude; they are converted with str() and sent
        to the geocoder as the query "lat,lon".
    geolocator : optional object exposing reverse(query). When omitted a
        Nominatim geocoder is created for this call. Passing one in lets many
        lookups share a single geocoder (and makes the function testable
        offline).

    Returns
    -------
    The 5 digit zipcode as a string, or np.nan when the lookup fails or the
    returned address does not contain a zipcode followed by ", United".
    Failures are printed together with the coordinates instead of being raised,
    so one bad row does not abort a whole DataFrame.apply.
    '''
    if geolocator is None:
        #Built outside the try block on purpose: a misconfigured geocoder
        #should fail loudly instead of turning every row into NaN.
        #NOTE(review): recent geopy releases reject Nominatim() without an
        #explicit user_agent - confirm against the installed geopy version.
        geolocator = Nominatim(user_agent='airbnb_listings_zipcode_lookup')

    coords = str(lat) + ',' + str(lon)
    #Bound before the try so the diagnostic print below can never hit an
    #unbound name when reverse() itself raises.
    location = None
    try:
        location = geolocator.reverse(coords)
        #location[0] is expected to be the address text; a None result or an
        #address without a match raises here and falls through to NaN.
        return _ZIP_IN_ADDRESS.findall(location[0])[0][1]
    except Exception:
        print(coords, '-----', location)
        return np.nan
#Find all listings that are missing a zipcode : missing_zipcodes
missing_zipcodes = listings[listings.zipcode.isnull()].copy()

#Look up a zipcode for every listing that is missing one, keyed by listing index.
#Built from a plain list so the step also works when nothing is missing
#(DataFrame.apply(axis=1) on an empty frame returns a DataFrame, not a Series).
looked_up_zipcodes = pd.Series(
    [latLonToZip(lat, lon)
     for lat, lon in zip(missing_zipcodes.latitude, missing_zipcodes.longitude)],
    index=missing_zipcodes.index,
    dtype=object)

#Fill only the missing entries. The result is assigned back to the frame because
#in-place methods called on a selected column (listings.zipcode.update(...)) are
#not guaranteed to reach the frame under pandas Copy-on-Write.
listings['zipcode'] = listings['zipcode'].fillna(looked_up_zipcodes)

#Remove 'zip+4' part of any zipcode. Failed lookups stay NaN (not a string),
#so they are passed through instead of being sliced.
listings['zipcode'] = listings['zipcode'].apply(lambda z: z[:5] if isinstance(z, str) else z)

# 'price' --> 'listed_price' ##########
#Strip everything except digits and the decimal point, then convert to float
listings['listed_price'] = listings['listed_price'].replace(r'[^0-9.]+', '', regex=True).astype(float)

# 'cleaning_fee' ##########
#Same currency clean-up as the price; a missing fee is treated as 0
listings['cleaning_fee'] = (listings['cleaning_fee']
                            .replace(r'[^0-9.]+', '', regex=True)
                            .astype(float)
                            .fillna(0))

# 'first_review' ##########
listings['first_review'] = pd.to_datetime(listings['first_review'])

# 'amenities' ##########
#Drop every character that is not a word character, comma, whitespace or '/',
#then split on commas into a list. A missing value becomes an empty list so the
#column always holds lists.
listings['amenities'] = (listings['amenities']
                         .replace(r'[^\w,\s/]+', '', regex=True)
                         .apply(lambda a: a.split(',') if isinstance(a, str) else []))

# 'instant_bookable' ##########
#Map the 'f'/'t' flags to booleans (assigned back instead of chained inplace)
listings['instant_bookable'] = listings['instant_bookable'].replace({'f':False,'t':True})


# Read and Clean Calendar.csv

In [3]:
#Calendar.csv READING

CALENDAR = 'data/calendar.csv'

#Read in all columns from calendar.csv : listing_id, date, available, price
calendar = pd.read_csv(CALENDAR)


#############################
#         Cleaning          #
#############################

#Every step assigns its result back to the frame: in-place methods called on a
#selected column (calendar.price.fillna(..., inplace=True)) are not guaranteed
#to reach the frame under pandas Copy-on-Write.

# 'date' ##########
calendar['date'] = pd.to_datetime(calendar['date'])

# 'available' ##########
#Map the 'f'/'t' flags to booleans
calendar['available'] = calendar['available'].replace({'f':False,'t':True})

# 'price' ##########
#Strip everything except digits and the decimal point, then convert to float
calendar['price'] = calendar['price'].replace(r'[^0-9.]+', '', regex=True).astype(float)

#############################
#         Augmenting        #
#############################

#create column to represent the day of the week for each date
calendar['day_of_week'] = calendar['date'].dt.dayofweek

#Fill in missing price values for each listing using mean value for day of week from that listing
mean_price_by_weekday = calendar.groupby(['listing_id','day_of_week'])['price'].transform("mean")
calendar['price'] = calendar['price'].fillna(mean_price_by_weekday)

#create column for revenue generated by property: 0 on available days, the
#(possibly mean-filled) price on days that are not available
calendar['day_revenue'] = np.where(calendar['available'], 0.0, calendar['price'])

# Using Calendar to Augment Listings DataFrame

In [4]:

#Create a list of calendars separated into 4 quarters.
#Each quarter is the half-open window [start, end): a boundary date such as
#2016-12-06 belongs only to the quarter it starts, so no day is counted in two
#quarters (an inclusive date_range put every interior boundary in both).
quarter_dates = ['2016-09-06','2016-12-06','2017-03-06','2017-06-06','2017-09-06']
quarter_bounds = pd.to_datetime(quarter_dates)
q_cal = [calendar[(calendar.date >= quarter_bounds[n]) & (calendar.date < quarter_bounds[n+1])]
         for n in range(4)]

#Revenue Per Quarter
#Column assignment aligns on the listing_id index (listings without calendar
#rows get NaN, like a left join) and can be re-run without duplicating columns.
for n, quarter in enumerate(q_cal):
    listings['q'+str(n+1)+'_revenue'] = quarter.groupby('listing_id').day_revenue.sum()

#Occupancy Per Quarter
for n, quarter in enumerate(q_cal):
    #number of calendar days in the half-open quarter window
    q_len = (quarter_bounds[n+1] - quarter_bounds[n]).days
    #share of the quarter's days that were not available; days without a
    #calendar row count as occupied, as in the original formula
    listings['q'+str(n+1)+'_occupancy_rate'] = (q_len - quarter.groupby('listing_id').available.sum())/q_len


# Creating Amenities Dummy Variables

In [5]:
#Entries that show up in the amenities lists but are not real amenities
unwanted_amenities = {'translation missing enhosting_amenity_49',
                      'translation missing enhosting_amenity_50',
                      ''}

#Every distinct amenity across all listings, minus the unwanted entries.
#A set difference does not fail when an unwanted entry is absent from the data,
#and sorting keeps the dummy column order the same from run to run.
amenities = sorted({item for item_list in listings.amenities for item in item_list} - unwanted_amenities)

#One boolean column per amenity: True when the listing's amenity list contains it.
#amn=amn binds the current amenity to each lambda.
amn_frame = pd.DataFrame(
    {amn: listings.amenities.apply(lambda amns, amn=amn: amn in amns) for amn in amenities},
    index=listings.index)

#Store, for every listing, its row of amn_frame in the 'analysis_table' column
#(looked up by listing index).
listings['analysis_table'] = listings.index
listings['analysis_table'] = pd.DataFrame(listings.analysis_table.map(lambda x: amn_frame.loc[x]))


# Write to Pickle

In [6]:
#Persist both cleaned frames next to the raw data, listings first
for cleaned_frame, pickle_path in ((listings, 'data/listings_cleaned.pkl'),
                                   (calendar, 'data/calendar_cleaned.pkl')):
    cleaned_frame.to_pickle(pickle_path)