# Project 3, Calculating number of customers around BART stations


University of California, Berkeley

Master of Information and Data Science (MIDS) program

W205, Section 01 Annie Cui, Emily Zhou, Shuo Wang

# Overview

Our goal is to increase AGM brand awareness transbay. One of the ways we'll do that is by piloting pick-up points or delivery points at 3 BART stations. In this notebook, we create heat maps whose intensity reflects the number of customers.

# BART Map

![Bart Map](bart_map.png)

# Included Modules and Packages

Code cell containing your includes for modules and packages

In [1]:
import math
import numpy as np
import pandas as pd

import psycopg2

import json

import gmaps
import gmaps.geojson_geometries

from geographiclib.geodesic import Geodesic

# Supporting code

Code cells containing any supporting code, such as connecting to the database, any functions, etc.  

In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    Float columns whose non-null values are all whole numbers are converted
    to the nullable 'Int64' dtype; nulls in those columns are kept as <NA>.

    Args:
        query: SQL text passed to pd.read_sql_query.
        rollback_before_flag: if true, roll back the module-level
            `connection` before running the query.
        rollback_after_flag: if true, roll back the module-level
            `connection` after running the query.

    Returns:
        The query result as a pandas DataFrame.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype == "float64":

            # nulls do not count against a column being integral
            values = df[column].dropna()

            # infinities and magnitudes outside the int64 range cannot be
            # stored as Int64, so such columns are left as float
            fits_in_int64 = (values.abs() < 2.0 ** 63).all()

            if fits_in_int64 and (values == np.floor(values)).all():
                df[column] = df[column].astype('Int64')

    return df
    

In [3]:
# open the postgres connection that my_select_query_pandas reads from
# NOTE(review): the credentials are hard-coded in the notebook; consider loading them from the environment
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [4]:
# cursor on the postgres connection
# NOTE(review): no cell visible in this notebook uses the cursor; confirm it is still needed
cursor = connection.cursor()

# Connect to Google Maps using your api key; edit the file gmap_api_key.txt and put in your api key

In [5]:
# read the Google Maps api key; the with block closes the file even if the read fails
with open('gmap_api_key.txt', 'r') as f:
    # strip surrounding whitespace, such as the trailing newline an editor adds when the key is saved
    my_api_key = f.read().strip()

gmaps.configure(api_key=my_api_key)

# Basic map centered on Sather Gate at UC Berkeley; all points are (latitude, longitude) in decimal; zoom levels go from 1 to 21; 1 is world level; 21 is street level; default type of map is Road Map

In [6]:
# (latitude, longitude) of Sather Gate; reused as the center point of the maps in this notebook
sather_gate_berkeley = (37.870260430419115, -122.25950168579497)

# display a map centered on Sather Gate at zoom level 9
gmaps.figure(center=sather_gate_berkeley, zoom_level=9)

Figure(layout=FigureLayout(height='420px'))

# Find a list of zip codes for all customers of the Berkeley store; distinct removes duplicates

In [7]:
# roll back the connection both before and after the select
rollback_before_flag = rollback_after_flag = True

# one zip_codes row per zip code that has a customer whose closest store is store 1
query = """

select distinct z.*
from customers as cu
     join zip_codes as z
         on cu.zip = z.zip
where cu.closest_store_id = 1
order by 1,2

"""

df = my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)
df

Unnamed: 0,zip,latitude,longitude,city,state,population,area,density,time_zone
0,94002,37.5135,-122.2991,Belmont,CA,27202,5.9244,4591.53,America/Los_Angeles
1,94005,37.6887,-122.4080,Brisbane,CA,4692,4.8168,974.09,America/Los_Angeles
2,94010,37.5693,-122.3653,Burlingame,CA,42730,12.4205,3440.28,America/Los_Angeles
3,94014,37.6909,-122.4475,Daly City,CA,49515,6.6361,7461.50,America/Los_Angeles
4,94015,37.6812,-122.4805,Daly City,CA,64887,6.1247,10594.35,America/Los_Angeles
...,...,...,...,...,...,...,...,...,...
139,94963,38.0138,-122.6703,San Geronimo,CA,498,2.3307,213.67,America/Los_Angeles
140,94964,37.9431,-122.4918,San Quentin,CA,3418,0.2608,13104.83,America/Los_Angeles
141,94965,37.8499,-122.5236,Sausalito,CA,11408,14.2131,802.64,America/Los_Angeles
142,94970,37.9145,-122.6469,Stinson Beach,CA,689,7.0006,98.42,America/Los_Angeles


# List all stores, then find a list of zip codes for customers of the Berkeley store; do not remove duplicates so we can use intensity levels for each zip code


In [8]:
# roll back the connection both before and after the select
rollback_before_flag = rollback_after_flag = True

# every row of the stores table
query = """

select *
from stores

"""

df = my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)
df

Unnamed: 0,store_id,street,city,state,zip,latitude,longitude
0,1,3000 Telegraph Ave,Berkeley,CA,94705,37.8555,-122.2604
1,2,1001 Broadway,Seattle,WA,98122,47.6114,-122.3214
2,3,2510 McKinney Ave,Dallas,TX,75201,32.7958,-96.8015
3,4,299 SE 3rd Ave,Miami,FL,33131,25.772,-80.1891
4,5,1202 Broadway,Nashville,TN,37203,36.1568,-86.7881


In [9]:
# roll back the connection both before and after the select
rollback_before_flag = rollback_after_flag = True

# one (latitude, longitude) row per customer of store 1, taken from the customer's zip code;
# duplicates are kept so that each zip code appears once per customer
query = """

select z.latitude, z.longitude
from customers as cu
     join zip_codes as z
         on cu.zip = z.zip
where cu.closest_store_id = 1
order by 1,2

"""

df = my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)
df

Unnamed: 0,latitude,longitude
0,37.4949,-122.2080
1,37.4949,-122.2080
2,37.4949,-122.2080
3,37.4949,-122.2080
4,37.4949,-122.2080
...,...,...
8133,38.1842,-122.2629
8134,38.1842,-122.2629
8135,38.1842,-122.2629
8136,38.1842,-122.2629


# Heatmap showing intensity as the number of customers in each zip code

In [10]:
# build the layer first: one point per row of df, so repeated coordinates add intensity
heatmap_layer = gmaps.heatmap_layer(df)
heatmap_layer.point_radius = 20

# then place it on a map centered on Sather Gate
fig = gmaps.figure(center=sather_gate_berkeley, zoom_level=8)
fig.add_layer(heatmap_layer)

fig

Figure(layout=FigureLayout(height='420px'))

# Create dataframe df1
Groups BART stations into 8 Bay Area regions by zip code.

Bay area regions: 
Berkeley, Downtown SF, West Oakland, Orinda, Fruitvale, Downtown Oakland, Central SF, Mission SF

In [11]:
# load the zip code grouping file; the output below shows columns zips, count and Group
df1 = pd.read_csv('group_zipcode.csv')
df1

Unnamed: 0,zips,count,Group
0,94102,78,Downtown SF
1,94103,77,Downtown SF
2,94104,5,Downtown SF
3,94108,50,Downtown SF
4,94105,48,Downtown SF
5,94111,36,Downtown SF
6,94133,57,Downtown SF
7,94110,75,Mission SF
8,94114,58,Mission SF
9,94112,40,Central SF


In [12]:
# store zips as strings so they can be used as a join key against string zip codes
df1['zips'] = df1['zips'].astype(str)

In [13]:
# show the column types of df1
df1.dtypes

zips     object
count     int64
Group    object
dtype: object

# Create dataframe df_zip to include zip_codes table

In [14]:
# roll back the connection both before and after the select
rollback_before_flag = rollback_after_flag = True

# every row of the zip_codes table
query = """

select *
from zip_codes

"""

df_zip = my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)
df_zip

Unnamed: 0,zip,latitude,longitude,city,state,population,area,density,time_zone
0,08074,39.7158,-75.1640,Richwood,NJ,15,0.0886,169.39,America/New_York
1,08240,39.4873,-74.5318,Pomona,NJ,2293,1.5196,1508.93,America/New_York
2,08876,40.5880,-74.6874,Somerville,NJ,22059,15.1172,1459.20,America/New_York
3,10001,40.7506,-73.9972,New York,NY,22924,0.6675,34341.44,America/New_York
4,32026,30.0541,-82.1815,Raiford,FL,1907,0.6333,3011.38,America/New_York
...,...,...,...,...,...,...,...,...,...
32718,47367,40.0827,-85.3872,Oakville,IN,23,0.0866,265.47,America/Indiana/Indianapolis
32719,63079,38.2606,-91.0998,Stanton,MO,24,0.3523,68.12,America/Chicago
32720,63738,37.0893,-89.9574,Brownwood,MO,31,0.1171,264.70,America/Chicago
32721,68954,40.6227,-98.2374,Inland,NE,14,1.7437,8.03,America/Chicago


In [15]:
# show the column types of df_zip
df_zip.dtypes

zip            object
latitude      float64
longitude     float64
city           object
state          object
population      Int64
area          float64
density       float64
time_zone      object
dtype: object

# Create dataframe df2
Add the latitude and longitude for each zip code by joining dataframes df1 and df_zip

In [16]:
# left join: keep every row of df1 and attach the coordinates of its zip code from df_zip
df2 = df1.merge(
    df_zip[['zip', 'latitude', 'longitude']],
    left_on='zips',
    right_on='zip',
    how='left',
)
df2

Unnamed: 0,zips,count,Group,zip,latitude,longitude
0,94102,78,Downtown SF,94102,37.7797,-122.4192
1,94103,77,Downtown SF,94103,37.7732,-122.4111
2,94104,5,Downtown SF,94104,37.7915,-122.4021
3,94108,50,Downtown SF,94108,37.792,-122.4086
4,94105,48,Downtown SF,94105,37.7898,-122.3939
5,94111,36,Downtown SF,94111,37.7989,-122.3984
6,94133,57,Downtown SF,94133,37.8038,-122.4107
7,94110,75,Mission SF,94110,37.75,-122.4154
8,94114,58,Mission SF,94114,37.758,-122.4354
9,94112,40,Central SF,94112,37.7203,-122.443


# Create dataframe df_group

- Get average latitude and longitude for each bay area region.
- Sum the number of customers for each bay area region.

In [17]:
# per region: mean of the zip code coordinates and total of the count column
df_group = df2.groupby('Group').agg(
    latitude=('latitude', 'mean'),
    longitude=('longitude', 'mean'),
    count=('count', 'sum'),
)
df_group

Unnamed: 0_level_0,latitude,longitude,count
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Berkeley,37.8689,-122.2717,604
Central SF,37.733833,-122.448433,139
Downtown Oakland,37.8088,-122.2691,161
Downtown SF,37.789843,-122.406286,351
Fruitvale,37.7767,-122.2184,180
Mission SF,37.754,-122.4254,133
Orinda,37.8797,-122.1846,193
West Oakland,37.8073,-122.3002,203


In [18]:
# mean latitude of each region, in df_group row order
lag_list = df_group['latitude'].tolist()
lag_list

[37.868900000000004,
 37.73383333333334,
 37.8088,
 37.78984285714286,
 37.7767,
 37.754000000000005,
 37.8797,
 37.8073]

# Create df3
Includes all the latitudes and longitudes. This is prepared for the heatmap.

In [19]:
# per-region mean coordinates and customer counts, in df_group row order
lag_list = list(df_group.latitude)
log_list = list(df_group.longitude)
freq = list(df_group['count'])

# one small frame per region: its coordinates repeated once per customer,
# so the heatmap intensity follows the customer count
region_frames = [
    pd.DataFrame(data = {'latitude':[lag]*fq,'longitude':[log]*fq})
    for lag, log, fq in zip(lag_list, log_list, freq)
]

# concatenate once; growing df3 with pd.concat inside a loop re-copies
# every earlier row on each iteration
df3 = pd.concat(region_frames) if region_frames else pd.DataFrame()

df3

Unnamed: 0,latitude,longitude
0,37.8689,-122.2717
1,37.8689,-122.2717
2,37.8689,-122.2717
3,37.8689,-122.2717
4,37.8689,-122.2717
...,...,...
198,37.8073,-122.3002
199,37.8073,-122.3002
200,37.8073,-122.3002
201,37.8073,-122.3002


In [20]:
# total of the count column across all regions; df3 should have this many rows
df_group['count'].sum()

1964

In [21]:
# rows per distinct latitude in df3, including any missing values
df3['latitude'].value_counts(dropna = False)

37.868900    604
37.789843    351
37.807300    203
37.879700    193
37.776700    180
37.808800    161
37.733833    139
37.754000    133
Name: latitude, dtype: int64

In [22]:
# rows per distinct longitude in df3, including any missing values
df3['longitude'].value_counts(dropna = False)

-122.271700    604
-122.406286    351
-122.300200    203
-122.184600    193
-122.218400    180
-122.269100    161
-122.448433    139
-122.425400    133
Name: longitude, dtype: int64

# Heatmap showing intensity as the number of customers in each bay area region

In [23]:
# build the layer first: one point per row of df3, so each region's intensity follows its row count
heatmap_layer = gmaps.heatmap_layer(df3)
heatmap_layer.point_radius = 50

# then place it on a closer-in map centered on Sather Gate
fig = gmaps.figure(center=sather_gate_berkeley, zoom_level=12)
fig.add_layer(heatmap_layer)

fig

Figure(layout=FigureLayout(height='420px'))