# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Packages" data-toc-modified-id="Packages-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Packages</a></div><div class="lev1 toc-item"><a href="#Loading-data" data-toc-modified-id="Loading-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Loading data</a></div><div class="lev1 toc-item"><a href="#Location-of-listings" data-toc-modified-id="Location-of-listings-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Location of listings</a></div>

# Packages

In [3]:
import os
import pandas as pd
import numpy as np
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
%matplotlib inline
import cufflinks as cf

pd.set_option('display.max_columns', 500)

# Loading data

In [4]:
% ls data/
datapath='data/'

[1m[36marchives[m[m/         demographics.csv  listings.csv      venues.csv
calendar.csv      econ_state.csv    real_estate.csv


In [5]:
NROWS = 10000

In [6]:
%%time
listings_df = pd.read_csv(datapath + 'listings.csv')
# Reformat t, f columns
for column in ['instant_bookable', 'has_availability']:
    listings_df[column] = listings_df[column].apply(lambda x: True if x == 't' else False)
listings_df.head()
listings_df.info()
listings_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56417 entries, 0 to 56416
Data columns (total 29 columns):
accommodates                   56417 non-null float64
amenities                      56417 non-null object
availability_30                56417 non-null int64
bathrooms                      56238 non-null float64
bed_type                       56417 non-null object
bedrooms                       56363 non-null float64
beds                           56331 non-null float64
cancellation_policy            56417 non-null object
city                           56417 non-null object
has_availability               4632 non-null object
host_id                        56417 non-null int64
id                             56417 non-null int64
instant_bookable               56417 non-null bool
latitude                       56417 non-null float64
longitude                      56417 non-null float64
metropolitan                   56417 non-null object
name                           56410 non-nu


Columns (9) have mixed types. Specify dtype option on import or set low_memory=False.



In [7]:
%%time
calendar_df = pd.read_csv(datapath + 'calendar.csv', parse_dates=[1])
calendar_df['price'] = calendar_df['price'].replace( '[\$,)]','', regex=True).astype(float)
calendar_df.head()
calendar_df.info()
calendar_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20611185 entries, 0 to 20611184
Data columns (total 5 columns):
listing_id    int64
date          datetime64[ns]
available     object
price         float64
metro_area    object
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 786.3+ MB
CPU times: user 48.6 s, sys: 3.53 s, total: 52.1 s
Wall time: 52.8 s


How many prices for one listing_id?

In [11]:
calendar_df.groupby(by='listing_id').count().sort_values('price', ascending=False)

Unnamed: 0_level_0,date,available,price,metro_area
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18332319,730,730,730,730
13941107,730,730,582,730
1003184,365,365,365,365
17285168,365,365,365,365
5334006,365,365,365,365
1004578,365,365,365,365
5333346,365,365,365,365
13006232,365,365,365,365
2466639,365,365,365,365
17285797,365,365,365,365


In [24]:
# Plot price evolution for some listings
for listing_id in [5334006, 18524042, 5352741, 2471731]:
    calendar_df[calendar_df['listing_id'] == listing_id][['date','price']].iplot(x='date', y='price');

# Location of listings

In [25]:
listings_df['city'].value_counts().iplot()

In [21]:
city = 'san francisco'

In [22]:
listings_san_francisco = listings_df[listings_df['city'] == city]

In [28]:
listings_san_francisco[['latitude', 'longitude']]

Unnamed: 0,latitude,longitude
39536,37.754184,-122.406514
39537,37.754166,-122.421534
39538,37.758506,-122.406152
39539,37.756549,-122.422025
39540,37.760051,-122.421352
39541,37.759495,-122.424874
39542,37.760997,-122.413124
39543,37.750543,-122.416559
39544,37.756288,-122.408738
39545,37.757246,-122.409315


In [33]:
listings = []
scale = 5000

for i in range(listings_san_francisco.shape[0]):
    listing = dict(
            type = 'scattergeo',
            locationmode = 'USA-states',
            lon = listings_san_francisco['longitude'],
            lat = listings_san_francisco['latitude'],
#             text = df_sub['text'],
            marker = dict(
#                 size = df_sub['pop']/scale,
#                 color = colors[i],
#                 line = dict(width=0.5, color='rgb(40,40,40)'),
#                 sizemode = 'area'
            ),
#         name = '{0} - {1}'.format(lim[0],lim[1])
    )
    listings.append(listing)

layout = dict(title = 'San Francisco listings',
                showlegend = True,
                geo = dict(
                    scope='usa',
                    projection=dict(type='albers usa'),
                    showland = True,
                    landcolor = 'rgb(217, 217, 217)',
                    subunitwidth=1,
                    countrywidth=1,
                    subunitcolor="rgb(255, 255, 255)",
                    countrycolor="rgb(255, 255, 255)"
        ),
    )

fig = dict(data=listings, layout=layout )
iplot(fig)

KeyboardInterrupt: 