In [1]:
import os
import copy
from collections import OrderedDict
import json
import vincent
import pandas as pd
import numpy as np
import feather
import folium
from folium import plugins
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
vincent.core.initialize_notebook()

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', -1)

In [2]:
DATA_PATH = '../data/'
df =  feather.read_dataframe(DATA_PATH + 'housing_with_location')
boston_geojson_path = os.path.join(DATA_PATH, '12_brook_districts')
boston_neighborhood_geojson = os.path.join(DATA_PATH, 'boston_neighborhood_')
with open(boston_geojson_path) as f:
    boston_geojson = f.read()
with open(boston_neighborhood_geojson) as f:
    neighborhood_geojson = f.read()

In [3]:
featured_cities = ['Brookline',
            'Cambridge',
            'Fitchburg',
             'Fall River',
             'Lowell',
             'Lynn',
             'Malden',
             'Medfold',
             'New Bedford',
            'Quincy',
             'Revere',
             'Somerville',
             'Watertown',
            ]
unis = OrderedDict([('harvard med',['Mission Hill', 'Longwood Medical Area', 'Fenway']),
            ('harvard',['Cambridge', 'Somerville']),
            ('mit',['Back Bay', 'Beacon Hill', 'West End'])])

In [4]:
neighborhoods_parent_dict = dict()
for dist in json.loads(neighborhood_geojson)['features']:
    neighborhood_name = dist['properties']['name']
    neighborhoods_parent_dict[neighborhood_name] = dist['properties']['parent']

In [5]:
neighborhood_list = list()
for dist in json.loads(neighborhood_geojson)['features']:
    neighborhood_list.append(dist['properties']['name'])

In [6]:
demand_df = df[df.supply == False]
print(f'total demand {demand_df.shape[0]}')
print(f'demand with location {demand_df.location.notna().sum()}')

total demand 689
demand with location 290


In [7]:
def get_locs(text):
    locs = list()
    for city in featured_cities:
        if city.lower() in text:
            locs.append(city)
    for neig in neighborhood_list:
        if neig.lower() in text:
            locs.append(neig)
    for uni in unis:
        if uni in text:
            locs += unis[uni]
            break
    if locs:
        return ','.join(locs)
    else:
        return None

def get_city_by_hood(hood):
    hood_list = hood.split(',')
    cities = set()
    for h in hood_list:
        if h in neighborhoods_parent_dict:
            cities.add(neighborhoods_parent_dict[h])
    if cities:
        return ','.join(list(cities))
    else:
        return hood

In [8]:
demand_df['new_location'] = demand_df.text.apply(get_locs)
# merge location & new_location
demand_df[(demand_df.location.notna()) & (demand_df.new_location.notna())]\
                            .apply(lambda doc: ','.join([doc['new_location'], doc['location']]),axis=1) 
# deduplicate
demand_df['location_2_level'] = demand_df[demand_df.new_location.notna()].new_location.apply(lambda x: ','.join(set(x.split(','))))
demand_df = demand_df[demand_df.location_2_level.notna()]
demand_df['location_1_level'] = demand_df['location_2_level'].apply(get_city_by_hood)

In [9]:
print(f'demand with location {demand_df.location_1_level.notna().sum()}')

demand with location 403


In [10]:
supply_df = df[df.supply == True]
print(f'total supply {supply_df.shape[0]}')
print(f'supply with location {supply_df.location.notna().sum()}')

total supply 2477
supply with location 1900


In [11]:
# supply we'll place only in one region (first if many)
supply_df.loc[supply_df.location.isna(), 'location'] = supply_df[supply_df.location.isna()].text.apply(get_locs).apply(lambda x: x.split(',')[0] if x else None)
supply_df = supply_df[supply_df.location.notna()]
supply_df['location_2_level'] = supply_df.location
supply_df['location_1_level'] = supply_df['location_2_level'].apply(get_city_by_hood)

In [12]:
print(f'supply with location {supply_df.location.notna().sum()}')

supply with location 2250


In [13]:
full_df = pd.concat([supply_df[['price', 'location_2_level', 'location_1_level', 'supply']],
                     demand_df[['price', 'location_2_level', 'location_1_level', 'supply']]], axis=0)

In [14]:
def paint(feature):
    if 'parent' in feature['properties']:
        level = 2
    else:
        level = 1
    dist_name = feature['properties']['name']
    level = f'location_{level}_level'
    count = full_df[(full_df[level].str.contains(dist_name))].shape[0]

    if count > 0 and count < 10:
        color = '#bdc9e1'
    elif count > 10 and count < 50:
        color = '#74a9cf'
    elif count > 50 and count < 250:
        color = '#2b8cbe'
    elif count > 250:
        color = '#045a8d'
    else:
        color = '#f1eef6'
    return {"fillColor":color, "fillOpacity":0.8,"opacity":0}

def popup_html(dist_name, level):
    level = f'location_{level}_level'
    html = f"<h4>{dist_name}</h4>"
    supp = supply_df[(supply_df[level].str.contains(dist_name))]
    dema = demand_df[(demand_df[level].str.contains(dist_name))]
    html += f"<b>Supply: {supp.shape[0]}</b><br>"
    html += f"<i>-mean: {np.round(supp.price.mean())}$</i><br>"
    html += f"<i>-median: {np.round(supp.price.median())}$</i><br>"
    
    html += f"<b>Demand: {dema.shape[0]}</b><br>"
    html += f"<i>-mean: {np.round(dema.price.mean())}$</i><br>"
    html += f"<i>-median: {np.round(dema.price.median())}$</i><br>"
    return html

def make_bar_plot(feature, level):
    level = f'location_{level}_level'
    bins_count = 8
    sub_df = full_df[full_df[level] == feature]
    if not sub_df.shape[0]:
        return None
    bins, bins_range = pd.cut(full_df[full_df[level] == feature].price, bins=bins_count, labels=False,retbins=True)
    b_range = list()
    for i in range(len(bins_range)-1):
        b_range.append((bins_range[i],bins_range[i+1]))
    b_range = [(int(b[0]) // 100)*100 for b in b_range]
    
    tmp = pd.DataFrame(full_df[full_df[level] == feature][['price', 'supply']])
    tmp['bin'] = bins
    agg_bins = tmp.groupby(['bin', 'supply']).count()
    agg_bins = agg_bins.unstack().fillna(0)
    agg_bins.reset_index(inplace=True)
    agg_bins['range'] = [f'{b_range[b]}' for b in agg_bins['bin'].values]
    bar = vincent.GroupedBar(agg_bins.set_index('range').price, width=250, height=200,)
    bar.axis_titles(x='price range', y='adverts')
    bar.legend(title='Supply')
#     print(feature)
#     bar.display()
#     print(bar.to_json())
    return bar.to_json()

In [15]:
m = folium.Map(location=[42.34878, -71.10445], zoom_start=10, min_zoom=8, max_zoom=11)
level = 1
for dist in json.loads(boston_geojson)['features']:
    dist_name = dist['properties']['name']
    gj = folium.GeoJson(data=dist, style_function = paint,
                        control=False, tooltip=popup_html(dist_name, level),
                       highlight_function=lambda x:{"fillOpacity":1, "opacity":1})
    gj.add_to(m)
    if full_df[full_df.location_1_level == dist_name].shape[0] < 3:
        continue
    popup = folium.Popup(max_width=500)
    bar_plot_json = make_bar_plot(dist_name, level)
    folium.Vega(bar_plot_json, width=350, height=250).add_to(popup)
    popup.add_to(gj)
    
m

In [16]:
m = folium.Map(location=[42.34878, -71.10445], zoom_start=10, min_zoom=8, max_zoom=11)
level = 2
for dist in json.loads(neighborhood_geojson)['features']:
    dist_name = dist['properties']['name']
    gj = folium.GeoJson(data=dist, style_function = paint,
                        control=False, smooth_factor=0, tooltip=popup_html(dist_name, level),
                       highlight_function=lambda x:{"fillOpacity":1, "opacity":1})
    gj.add_to(m)
    if full_df[full_df.location_2_level == dist_name].shape[0] < 3:
        continue
    popup = folium.Popup(max_width=500)
    bar_plot_json = make_bar_plot(dist_name, level)
    folium.Vega(bar_plot_json, width=350, height=250).add_to(popup)
    popup.add_to(gj)
    
m