In [128]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import plot, iplot
from plotly.subplots import make_subplots
import requests
import folium
from folium.plugins import HeatMap
import os

print('libraries imported!')

libraries imported!


In [129]:
# Importing product_a.csv dataset into pandas DataFrame with first column as index

df_product_a = pd.read_csv('product_a.csv', index_col=0)
df_product_a.head()

Unnamed: 0,date_w,price,total_vol,plu1,plu2,plu3,bags_t,bags_s,bags_l,bags_lx,type,year,location
0,2016-12-24,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,A,2015,Albany
1,2016-12-17,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,A,2015,Albany
2,2016-12-10,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,A,2015,Albany
3,2016-12-03,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,A,2015,Albany
4,2016-11-26,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,A,2015,Albany


In [130]:
# Converting date_w field to a suitable datetime data type

# Alternative code - df_product_a.date_w.astype('datetime64[ns]')
df_product_a['date_w'] = pd.to_datetime(df_product_a.date_w, format='%Y-%m-%d')
df_product_a.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18249 entries, 0 to 11
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date_w     18249 non-null  datetime64[ns]
 1   price      18249 non-null  float64       
 2   total_vol  18249 non-null  float64       
 3   plu1       18249 non-null  float64       
 4   plu2       18249 non-null  float64       
 5   plu3       18249 non-null  float64       
 6   bags_t     18249 non-null  float64       
 7   bags_s     18249 non-null  float64       
 8   bags_l     18249 non-null  float64       
 9   bags_lx    18249 non-null  float64       
 10  type       18249 non-null  object        
 11  year       18249 non-null  int64         
 12  location   18249 non-null  object        
dtypes: datetime64[ns](1), float64(9), int64(1), object(2)
memory usage: 1.9+ MB


In [131]:
# Correcting 'year' field values

df_product_a['year'] = df_product_a.date_w.dt.year
df_product_a.head()

Unnamed: 0,date_w,price,total_vol,plu1,plu2,plu3,bags_t,bags_s,bags_l,bags_lx,type,year,location
0,2016-12-24,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,A,2016,Albany
1,2016-12-17,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,A,2016,Albany
2,2016-12-10,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,A,2016,Albany
3,2016-12-03,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,A,2016,Albany
4,2016-11-26,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,A,2016,Albany


In [132]:
# Descriptive statistical analysis of 'df_product_a'

# numeric columns to use
columns_to_include = ['price', 'total_vol', 'plu1', 'plu2', 'plu3', 'bags_t', 'bags_s', 'bags_l', 'bags_lx']

df_stats = df_product_a.describe(percentiles=[0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9],
                                exclude=[np.object, np.int64, np.datetime64]).transpose()
# add columns - variance, median, mode
# NOTE: For mode, there is a tie between multiple values in 'total_vol' column, I am choosing one arbitrarily
df_stats['var'], df_stats['median'], df_stats['mode'] = df_product_a[columns_to_include].var(), df_product_a[columns_to_include].median(), df_product_a[columns_to_include].mode().iloc[0]
# adding IQR, skewness and kurtosis
df_stats['IQR'] = (df_stats['75%'] - df_stats['25%'])
df_stats['skewness'], df_stats['kurtosis'] = df_product_a.skew(), df_product_a.kurt()

# add 'field_name' column to 'df_stats'
df_stats.index.name = 'field_name'
df_stats.reset_index(inplace=True)

df_stats

Unnamed: 0,field_name,count,mean,std,min,10%,20%,25%,30%,40%,...,75%,80%,90%,max,var,median,mode,IQR,skewness,kurtosis
0,price,18249.0,1.405978,0.4026766,0.44,0.93,1.05,1.1,1.15,1.26,...,1.66,1.74,1.93,3.25,0.1621484,1.37,1.15,0.56,0.580303,0.325196
1,total_vol,18249.0,850644.013009,3453545.0,84.56,3896.768,8168.866,10838.58,15181.304,42137.088,...,432962.29,604868.968,1387045.76,62505646.52,11926980000000.0,107376.76,2038.99,422123.71,9.007687,92.104458
2,plu1,18249.0,293008.424531,1264989.0,0.0,94.276,483.362,854.07,1368.118,3261.58,...,111020.2,152679.068,538385.184,22743616.17,1600197000000.0,8645.3,0.0,110166.13,8.64822,86.809113
3,plu2,18249.0,295154.568356,1204120.0,0.0,367.484,1918.532,3008.78,4265.508,10961.994,...,150206.86,222163.62,500784.552,20470572.61,1449906000000.0,29061.02,0.0,147198.08,8.942466,91.949022
4,plu3,18249.0,22839.735993,107464.1,0.0,0.0,0.0,0.0,0.0,42.704,...,6243.42,10972.6,31492.442,2546439.11,11548530000.0,184.99,0.0,6243.42,10.159396,132.563441
5,bags_t,18249.0,239639.20206,986242.4,0.0,1299.208,3347.674,5088.64,7316.634,16643.26,...,110783.37,149306.36,442141.928,19373134.37,972674100000.0,39743.83,0.0,105694.73,9.756072,112.272156
6,bags_s,18249.0,182194.686696,746178.5,0.0,583.11,1686.086,2849.42,4761.328,11502.076,...,83337.67,104537.112,354266.852,13384586.8,556782400000.0,26362.82,0.0,80488.25,9.54066,107.012885
7,bags_l,18249.0,54338.088145,243966.0,0.0,0.0,30.918,127.47,329.944,1105.11,...,22029.25,34350.986,94295.338,5719096.61,59519390000.0,2647.71,0.0,21901.78,9.796455,117.999481
8,bags_lx,18249.0,3106.426507,17692.89,0.0,0.0,0.0,0.0,0.0,0.0,...,132.5,557.114,3688.912,551693.65,313038500.0,0.0,0.0,132.5,13.139751,233.602612


In [133]:
# Creating a Pearson correlation matrix

df_coef_p = df_product_a.drop(['year'], axis=1).corr()
df_coef_p.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,price,total_vol,plu1,plu2,plu3,bags_t,bags_s,bags_l,bags_lx
price,1.0,-0.19,-0.21,-0.17,-0.18,-0.18,-0.17,-0.17,-0.12
total_vol,-0.19,1.0,0.98,0.97,0.87,0.96,0.97,0.88,0.75
plu1,-0.21,0.98,1.0,0.93,0.83,0.92,0.93,0.84,0.7
plu2,-0.17,0.97,0.93,1.0,0.89,0.91,0.92,0.81,0.69
plu3,-0.18,0.87,0.83,0.89,1.0,0.79,0.8,0.7,0.68
bags_t,-0.18,0.96,0.92,0.91,0.79,1.0,0.99,0.94,0.8
bags_s,-0.17,0.97,0.93,0.92,0.8,0.99,1.0,0.9,0.81
bags_l,-0.17,0.88,0.84,0.81,0.7,0.94,0.9,1.0,0.71
bags_lx,-0.12,0.75,0.7,0.69,0.68,0.8,0.81,0.71,1.0


In [134]:
# Creating a Spearman Rank correlation matrix

df_coef_sp = df_product_a.corr(method='spearman')
df_coef_sp.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,price,total_vol,plu1,plu2,plu3,bags_t,bags_s,bags_l,bags_lx,year
price,1.0,-0.61,-0.6,-0.51,-0.53,-0.6,-0.54,-0.51,-0.43,0.11
total_vol,-0.61,1.0,0.9,0.94,0.83,0.95,0.93,0.7,0.66,0.09
plu1,-0.6,0.9,1.0,0.79,0.75,0.84,0.82,0.64,0.62,0.01
plu2,-0.51,0.94,0.79,1.0,0.82,0.86,0.83,0.64,0.63,0.01
plu3,-0.53,0.83,0.75,0.82,1.0,0.78,0.78,0.57,0.65,-0.04
bags_t,-0.6,0.95,0.84,0.86,0.78,1.0,0.96,0.75,0.65,0.21
bags_s,-0.54,0.93,0.82,0.83,0.78,0.96,1.0,0.6,0.64,0.18
bags_l,-0.51,0.7,0.64,0.64,0.57,0.75,0.6,1.0,0.5,0.19
bags_lx,-0.43,0.66,0.62,0.63,0.65,0.65,0.64,0.5,1.0,0.17
year,0.11,0.09,0.01,0.01,-0.04,0.21,0.18,0.19,0.17,1.0


In [135]:
# Create a Plotly scatter matrix plot for df_product_a

# What is 'trace' in plotly? 
# A trace is just the name we give a collection of data and the specifications of which we want that data plotted

fig = px.scatter_matrix(df_product_a.drop(['date_w'], axis=1), 
                        width=1500,
                        height=1500,
                        opacity=0.4, 
                        title='Scatter matrix plot for <i>df_product_a</i>'
                       )
fig.update_layout(title_x=0.5)
# open the figure in another tab as it is large in size
plot(fig, filename='scatter_matrix_plot.html')

'scatter_matrix_plot.html'

In [136]:
# Creating weekly and monthly time-series graphs of the numeric fields

# convert the dataframe into a time-series format
df_product_a_ts = df_product_a.drop(['type', 'year', 'location'], axis=1).set_index('date_w').sort_index()

# downsample the data to weekly data points using average values for each week (weekly)
df_product_a_weekly = df_product_a_ts.resample('W').mean().dropna()
# downsample the data to monthly data points using average values for each month (monthly)
df_product_a_monthly = df_product_a_ts.resample('M').mean().dropna()

# creating traces
fig = make_subplots(rows=2, cols=1, specs=[[{"secondary_y": True}], [{"secondary_y": True}]])

# colors to use
colors = px.colors.qualitative.Plotly[0:7]

# columns 'price' and 'total_vol' are not included as there scales are too small or large (compared to others)
upper_cols = ['plu1', 'plu2', 'bags_t', 'bags_s', 'bags_l'] # upper range columns (in 10^5s)
lower_cols = ['plu3', 'bags_lx'] # lower range columns (in 10^4s)

# upper columns
for col, color in zip(upper_cols, colors[0:5]):
    # weekly
    fig.add_trace(go.Scatter(x=df_product_a_weekly.index, y=df_product_a_weekly[col],
                        mode='lines+markers', line=dict(color=color),
                        name='{} weekly'.format(col)), row=1, col=1, secondary_y=False)
    # reducing marker size
    fig.update_traces(marker=dict(size=4), row=1, col=1)
    
    # monthly
    fig.add_trace(go.Scatter(x=df_product_a_monthly.index, y=df_product_a_monthly[col],
                        mode='lines+markers', line=dict(color=color),
                        name='{} monthly'.format(col)), row=2, col=1, secondary_y=False)

# lower columns
for col, color in zip(lower_cols, colors[5:7]):
    # weekly
    fig.add_trace(go.Scatter(x=df_product_a_weekly.index, y=df_product_a_weekly[col],
                        mode='lines+markers', line=dict(color=color),
                        name='{} weekly'.format(col)), row=1, col=1, secondary_y=True)
    
    # reducing marker size
    fig.update_traces(marker=dict(size=4), row=1, col=1)
    
    # monthly
    fig.add_trace(go.Scatter(x=df_product_a_monthly.index, y=df_product_a_monthly[col],
                        mode='lines+markers', line=dict(color=color),
                        name='{} monthly'.format(col)), row=2, col=1, secondary_y=True)
    

fig.update_layout(height=1000, title_text='Weekly and monthly time-series graphs', title_x=0.5)
# setting x-axis title
fig.update_xaxes(title_text='Week', row=1, col=1)
fig.update_xaxes(title_text='Month', row=2, col=1)
# setting y-axis title
fig.update_yaxes(title_text='Field values')

plot(fig, filename='weekly_monthly_timeseries_graphs.html')
#iplot(fig)

'weekly_monthly_timeseries_graphs.html'

In [137]:
# Creating year based location and type bar charts for the total volume

# years to analyse
years = [2016, 2017, 2018, 2019]

# 4 subplots - one for each year
fig = make_subplots(rows=4, cols=1, vertical_spacing=0.12, subplot_titles=('2016', '2017', '2018', '2019'))

for idx, year in enumerate(years):
    # type 'A'
    df_product_a_A = df_product_a.loc[((df_product_a.year == year) & (df_product_a.type == 'A')), ['total_vol', 'location', 'type']]
    # grouped by locations
    _A = df_product_a_A.groupby('location')['total_vol'].sum()
    # dropping 'TotalUS' to avoid scaling issues
    _A.drop('TotalUS', inplace=True)
    # type 'C'
    df_product_a_C = df_product_a.loc[((df_product_a.year == year) & (df_product_a.type == 'C')), ['total_vol', 'location', 'type']]
    # grouped by locations
    _C = df_product_a_C.groupby('location')['total_vol'].sum()
    # dropping 'TotalUS' to avoid scaling issues
    _C.drop('TotalUS', inplace=True)
    
    fig.add_trace(go.Bar(name=f'Type A - {year}', x=_A.index, y=_A), idx + 1, 1)
    fig.add_trace(go.Bar(name=f'Type C - {year}', x=_C.index, y=_C), idx + 1, 1)
    
    # labelling y-axes
    fig.update_yaxes(title_text='Total count of <i>total_vol</i>', row=idx + 1, col=1)


# plotting grouped bar chart
fig.update_layout(barmode='group', height=1500)    

plot(fig, filename='total_vol_by_location_type.html')

'total_vol_by_location_type.html'

In [138]:
# Geolocations API service credentials

GOOGLE_API_KEY = 'AIzaSyCjTXUHjy32MC7xBZuXRXH3B41lAotZq1g'
HERE_API_KEY = '6vX6DMb3Dg8KtKlZMk-RnkRSzi_E94WCyDtXhuQ9zb4'

In [153]:
# Functions to retrieve geo coordinates (i.e. longitudes and latitudes) - Google, Here and ArcGic API services

def get_geo_coordinates_from_google(address:str, connection_params:dict):
    base_url = 'https://maps.googleapis.com/maps/api/geocode'
    url = '{}/{}?address={}&key={}'.format(base_url,
                          connection_params['output_format'],
                          address,
                          connection_params['api_key']    
                         )
    
    # make the GET request
    results = requests.get(url).json()
    #print(address, results)
    
    # check if codes were successfully obtained or not
    if results['status'] == 'ZERO_RESULTS':
        return None
    
    location = results['results'][0]['geometry']['location']
    return {
        'longitude' : location['lng'],
        'latitude' : location['lat']
    }

def get_geo_coordinates_from_here(address:str, connection_params:dict):
    base_url = 'https://geocoder.ls.hereapi.com/6.2/geocode.'
    url = '{}{}?searchtext={}&gen=9&apiKey={}'.format(base_url,
                                                      connection_params['output_format'],
                                                      address, connection_params['api_key']
                                                     )
    # make the GET request
    results = requests.get(url).json()
    
    # check if codes were successfully obtained or not
    if len(results['Response']['View']) == 0:
        return None
    
    location = results['Response']['View'][0]['Result'][0]['Location']['DisplayPosition']
    return {
        'longitude': location['Longitude'],
        'latitude': location['Latitude']
    }

def get_geo_coordinates_from_arcgis(address:str, connection_params:dict):
    base_url = 'https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/findAddressCandidates'
    url = f'{base_url}?f={connection_params["output_format"]}&singleLine={address}&outFields={connection_params["out_fields"]}'
    results = requests.get(url).json()

    location = results['candidates'][0]['location']
    return {
        'longitude': location['x'],
        'latitude': location['y']
    }


# testing functions
# Google API parameters
connection_params = {
    'output_format': 'json',
    'api_key': GOOGLE_API_KEY
}
address = 'Boise,+US'

lng_lat_google = get_geo_coordinates_from_google(address, connection_params)

# Here API paramters
connection_params = {
    'output_format': 'json',
    'api_key': HERE_API_KEY
}
address = 'Boise,+US'

lng_lat_here = get_geo_coordinates_from_here(address, connection_params)

# ArcGis parameters
connection_params = {
    'output_format': 'json',
    'out_fields': 'Match_addr,Addr_type'
}
address = 'Boise, US'

lng_lat_arcgis = get_geo_coordinates_from_arcgis(address, connection_params)

# printing
lng_lat_google, lng_lat_here, lng_lat_arcgis

({'longitude': -116.2023137, 'latitude': 43.6150186},
 {'longitude': -116.19341, 'latitude': 43.60765},
 {'longitude': -116.19339999999994, 'latitude': 43.60764000000006})

In [154]:
# Getting geocodes for all the unique location values in 'product_a' dataset

connection_params = {
    'output_format': 'json',
    'api_key': GOOGLE_API_KEY
}

location_geocodes = {addr: get_geo_coordinates_from_google('{},+US'.format(addr), connection_params) for addr in df_product_a['location'].unique()}

IndexError: list index out of range

In [None]:
# Using Google Reverse Geocoding API service

def get_state_from_lat_lng(lat, lng):
    lat_lng = '{},{}'.format(lat, lng)
    url = 'https://maps.googleapis.com/maps/api/geocode/json?latlng={}&key=AIzaSyDFAdg1lJ2bjv3LfVtFeaaxUmwqU59nZYs'.format(
        lat_lng,
        GOOGLE_API_KEY
    )
    # make GET request to end-point
    results = requests.get(url).json()
    
    for dict_ in results['results'][0]['address_components']:
        if dict_['types'] == ['administrative_area_level_1', 'political']:
            return dict_['short_name']

In [None]:
a = df_product_a[df_product_a.type == 'A']
mean_price_a = df_product_a.groupby('location')[['price']].mean()
mean_price_a.loc['Albany'].values[0]

In [None]:
# Data used by Folium map

# data - total bags_t
# total values of bags_t per location
total_bags_t = df_product_a.groupby('location')['bags_t'].sum()
# total values of bags_t per location and type
total_bags_t_type = df_product_a.groupby(['location', 'type'])['bags_t'].sum()

# data - mean bags_t
mean_bags_t = df_product_a.groupby('location')[['bags_t']].mean()
mean_bags_t.drop(['TotalUS', 'WestTexNewMexico'], inplace=True)
mean_bags_t['binned color'] = pd.cut(mean_bags_t['bags_t'], bins=3, labels=['pink', 'orange', 'darkred'])
# add 'lat' and 'lon' values
mean_bags_t['latitude'], mean_bags_t['longitude'] = mean_bags_t.apply(lambda row: location_geocodes[row.name]['latitude'], axis=1), mean_bags_t.apply(lambda row: location_geocodes[row.name]['longitude'], axis=1)
# add corresponding state_names
mean_bags_t['state name'] = mean_bags_t.apply(lambda row: get_state_from_lat_lng(row['latitude'], row['longitude']), axis=1)
state_mean_bags_t = mean_bags_t[['state name', 'bags_t']].reset_index(drop=True)
# get average bags_t values per state
state_mean_bags_t = mean_bags_t.groupby('state name')[['bags_t']].mean().reset_index()

# data - mean values of all numerical fields
mean_data = df_product_a.groupby('location').mean()
mean_data = mean_data.drop('year', axis=1)

# data - mean price of type A per location
a = df_product_a[df_product_a.type == 'A']
mean_price_a = df_product_a.groupby('location')[['price']].mean()

In [None]:
# Visualising data with Folium map

# USA coordinates
latitude = 37.2754919
longitude = -104.6582933

# create map of United States using latitude and longitude values
map_us = folium.Map(location=[latitude, longitude], zoom_start=4)

# choropleth map geoJSON file
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
state_geo = f'{url}/us-states.json'

folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=state_mean_bags_t,
    columns=['state name', 'bags_t'],
    key_on='feature.id',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Mean bags_t values'
).add_to(map_us)

folium.LayerControl().add_to(map_us)

# add markers to map
for address in location_geocodes.keys():
    # check if location geocodes are present
    if location_geocodes[address] is not None:
        lat = location_geocodes[address]['latitude']
        lng = location_geocodes[address]['longitude']

        tooltip = '{}<br>Total bags_t: {}<br>Total bags_t (Type \'A\'): {}<br>Total bags_t (Type \'C\'): {}'.format(
            address + ', US',
            round(total_bags_t.loc[total_bags_t.index == address].values[0], 2),
            round(total_bags_t_type.loc[(address)]['A'], 2),
            round(total_bags_t_type.loc[(address)]['C'], 2)
        )
                
        label = 'Mean values:<br>'
        for index, value in mean_data.loc[address].iteritems():
            label += '{}: {}<br>'.format(index, round(value, 2))
        
        # add to the map
        folium.CircleMarker(
            [lat, lng],
            radius=mean_price_a.loc[address].values[0],
            tooltip=folium.Tooltip(tooltip, style='bold', sticky=False),
            popup=folium.Popup(label, max_width=150),
            color='#000000',
            fill=True,
            fill_color=mean_bags_t.loc[address, 'binned color'],
            fill_opacity=0.7,
            parse_html=False).add_to(map_us)
    else:
        print('"{}" geocodes not found :('. format(address+', US'))

map_us

In [None]:
# Temporary
# Visualising data with Folium map - using HeatMap

# USA coordinates
latitude = 37.2754919
longitude = -104.6582933
# create map of United States using latitude and longitude values
map_us = folium.Map(location=[latitude, longitude], zoom_start=4)
data = []

# add markers to map
for address in location_geocodes.keys():
    # check if location geocodes are present
    if location_geocodes[address] is not None:
        lat = location_geocodes[address]['latitude']
        lng = location_geocodes[address]['longitude']

        address = address + ', US'
        label = folium.Popup(address, parse_html=True)
        # add to the map
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=folium.Popup(address),
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(map_us)
    else:
        print('"{}" geocodes not found :('. format(address+', US'))

# average values of bags_t per location
mean_bags_t = df_product_a.groupby('location')['bags_t'].mean()
data = dict(location_geocodes)

# remove null values
if 'TotalUS' in data:
    del data['TotalUS']
if 'WestTexNewMexico' in data:
    del data['WestTexNewMexico']

# get 'weight' (i.e. mean bags_t value) for each location
for key in data.keys():
    data[key]['weight'] = mean_bags_t.loc[mean_bags_t.index == key].values[0]
    
# convert it to list of lists for HeatMap method
data = pd.DataFrame(data).T
heat_data = [[row['latitude'],row['longitude'], row['weight']] for index, row in data.iterrows()]

#map_us.add_child(HeatMap(data[:, 0:2], radius=15))
#HeatMap(heat_data, radius=15).add_to(map_us)

#map_us

In [None]:
# Visualising data with Folium map

# USA coordinates
latitude = 37.2754919
longitude = -104.6582933

# create map of United States using latitude and longitude values
map_us = folium.Map(location=[latitude, longitude], zoom_start=4)

# choropleth map geoJSON file
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
state_geo = f'{url}/us-states.json'

folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=state_mean_bags_t,
    columns=['state name', 'bags_t'],
    key_on='feature.id',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Mean bags_t values'
).add_to(map_us)

folium.LayerControl().add_to(map_us)

# add markers to map
for address in location_geocodes.keys():
    # check if location geocodes are present
    if location_geocodes[address] is not None:
        lat = location_geocodes[address]['latitude']
        lng = location_geocodes[address]['longitude']

        tooltip = '{}<br>Total bags_t: {}<br>Total bags_t (Type \'A\'): {}<br>Total bags_t (Type \'C\'): {}'.format(
            address + ', US',
            round(total_bags_t.loc[total_bags_t.index == address].values[0], 2),
            round(total_bags_t_type.loc[(address)]['A'], 2),
            round(total_bags_t_type.loc[(address)]['C'], 2)
        )
                
        label = 'Mean values:<br>'
        for index, value in mean_data.loc[address].iteritems():
            label += '{}: {}<br>'.format(index, round(value, 2))
        
        # add to the map
        folium.CircleMarker(
            [lat, lng],
            radius=mean_price_a.loc[address].values[0],
            tooltip=folium.Tooltip(tooltip, style='bold', sticky=False),
            popup=folium.Popup(label, max_width=150),
            color='#000000',
            fill=True,
            fill_color=mean_bags_t.loc[address, 'binned color'],
            fill_opacity=0.7,
            parse_html=False).add_to(map_us)
    else:
        print('"{}" geocodes not found :('. format(address+', US'))

map_us

In [None]:
# Temporary
# Visualising data with Folium map - using HeatMap

# USA coordinates
latitude = 37.2754919
longitude = -104.6582933
# create map of United States using latitude and longitude values
map_us = folium.Map(location=[latitude, longitude], zoom_start=4)
data = []

# add markers to map
for address in location_geocodes.keys():
    # check if location geocodes are present
    if location_geocodes[address] is not None:
        lat = location_geocodes[address]['latitude']
        lng = location_geocodes[address]['longitude']

        address = address + ', US'
        label = folium.Popup(address, parse_html=True)
        # add to the map
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=folium.Popup(address),
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(map_us)
    else:
        print('"{}" geocodes not found :('. format(address+', US'))

# average values of bags_t per location
mean_bags_t = df_product_a.groupby('location')['bags_t'].mean()
data = dict(location_geocodes)

# remove null values
if 'TotalUS' in data:
    del data['TotalUS']
if 'WestTexNewMexico' in data:
    del data['WestTexNewMexico']

# get 'weight' (i.e. mean bags_t value) for each location
for key in data.keys():
    data[key]['weight'] = mean_bags_t.loc[mean_bags_t.index == key].values[0]
    
# convert it to list of lists for HeatMap method
data = pd.DataFrame(data).T
heat_data = [[row['latitude'],row['longitude'], row['weight']] for index, row in data.iterrows()]

#map_us.add_child(HeatMap(data[:, 0:2], radius=15))
#HeatMap(heat_data, radius=15).add_to(map_us)

#map_us