# Exploring Trends in Most Populous Cities of the United States

### Introduction

Company X wishes to expand its business chain by opening new locations in ideal US cities. To identify which US cities might be ideal candidates, we explore relationships between the most populous US cities and the types of venues found in each. We then perform agglomerative hierarchical clustering to see which of the top 200 most populous US cities might be worth looking into further. The hierarchical clustering is based on population, population change (percent increase/decrease), and the most common types of venues that exist within each city. Once the hierarchical clustering is completed, a more in-depth study can be performed by Company X; this work is therefore a preliminary investigation step.

##### Data Collection

To start with, we scrape Wikipedia's page for the list of most populous US cities with their corresponding percent-increase/percent-decrease:

https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population

Then, using this initial data set with the corresponding GPS locations (included in the data table), we will obtain venues using the Foursquare API and investigate the trending venues.

I perform the scrape below.

##### Importing Libraries:

In [None]:
# import necessary libraries

# import numpy
import numpy as np

# import pandas
import pandas as pd

# import web scraping tools
from urllib.request import urlopen
from bs4 import BeautifulSoup

# library to handle JSON files
import json

# import geocoder
import geocoder

# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim

# import folium for maps
import folium

# library to handle requests
import requests

# tranform JSON file into a pandas dataframe
from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

In [None]:
def get_html_contents(page_url):
    """Download a web page and return its parsed HTML.

    Parameters
    ----------
    page_url : str
        Address of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the 'html.parser' backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not answer within 30 seconds.
    """
    # a timeout keeps the notebook from hanging forever on a stalled connection
    results = requests.get(page_url, timeout=30)
    # fail loudly on an HTTP error instead of parsing an error page as if it were the article
    results.raise_for_status()
    return BeautifulSoup(results.text, 'html.parser')


cities_url = 'https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population' # define desired url

# parsed page that the following cells search for the population table
cities_contents = get_html_contents(cities_url)

In [None]:
# first table on the page whose class attribute is 'wikitable sortable'
city_pop_table = cities_contents.find('table', {'class': 'wikitable sortable'})

# <th> cells of the table's first row
city_pop_header = city_pop_table.tr.find_all('th')

# column names: stripped header text, with commas swapped for underscores
# so that the names can be written into a comma-separated file
column_names = [th.text.strip().replace(',', '_') for th in city_pop_header]
print(column_names)

In [None]:
import csv

file_name = 'us_cities_data_2020.csv' # give name to file

# positions of the metric-unit cells, which are left out of the file
# (assuming this is an American company so English units are desired)
metric_cells = {7, 9}

table_rows = city_pop_table.find_all('tr') # list of all tr

# `with` guarantees the file is closed even if a row fails to parse;
# utf-8 encoding is used in html; the cell text still needs cleaning up later
with open(file_name, 'w+', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that would otherwise break the row layout
    writer = csv.writer(f, lineterminator='\n')

    # write all column names as the header line
    if column_names:
        writer.writerow(column_names)

    # grab data from each cell and write one line per table row
    for row in table_rows:
        cells = row.find_all('td')
        if len(cells) > 1:
            # the last cell is identified by position, not by comparing cell contents,
            # so an earlier cell that happens to equal the last one cannot end the row early
            last = len(cells) - 1
            writer.writerow([cell.text.strip().replace(',', '')
                             for i, cell in enumerate(cells)
                             if i == last or i not in metric_cells])

In [None]:
us_cities_df = pd.read_csv('us_cities_data_2020.csv') # read csv file to obtain dataframe
us_cities_df.head() # take a peak at dataframe

##### Data Cleaning:

In [None]:
# clean the city names: keep only the text ahead of the first '[',
# which removes bracketed markers appended to the name
cities_cleaned = [us_cities_df.loc[row, 'City'].partition('[')[0]
                  for row in range(len(us_cities_df.index))]

print(cities_cleaned) # take a peek

In [None]:
us_cities_df['City'] = pd.Series(cities_cleaned) # repleace 'City' column with newly cleaned names
us_cities_df.head()

In [None]:
# cleaning location lat/long coords: remove '\ufeff' marks, keep the third '/'-separated
# chunk, cut everything from '(' onwards and delete blanks, leaving 'lat;long'
location_cleaned = []
for row in range(len(us_cities_df.index)):
    raw_location = us_cities_df.loc[row, 'Location']
    third_chunk = raw_location.replace('\ufeff', '').split('/')[2]
    location_cleaned.append(third_chunk.split('(')[0].replace(' ', ''))

# separate lat and long coords into separate lists
coord_pairs = [entry.split(';') for entry in location_cleaned]
lat_coords = [lat_text for lat_text, _ in coord_pairs]
long_coords = [long_text for _, long_text in coord_pairs]

us_cities_df['Lat'] = pd.Series(lat_coords).astype(float) # add column for latitude
us_cities_df['Long'] = pd.Series(long_coords).astype(float) # add column for longitude
us_cities_df.head() # take a peek

In [None]:
us_cities_df.drop('Location',axis=1,inplace=True) # drop messy location column
us_cities_df.head()

In [None]:
# renaming columns for cleaner look and easier use
# NOTE(review): '2020estimate' is relabelled '2018 Estimate' although the source column and the
# csv file name both say 2020; later cells refer to '2018 Estimate', so the label has to be
# changed everywhere at once if it is corrected — confirm which year the figures describe
us_cities_df.rename(columns={'2020rank': '2020 Rank',
                             'State[c]': 'State',
                             '2020estimate': '2018 Estimate',
                             '2010census': '2010 Census',
                             '2016 land area': 'Land Area (sq mi)',
                             '2016 population density': 'Population Density (per sq mi)'},
                    inplace=True) # modifies us_cities_df in place
us_cities_df.head()

In [None]:
us_cities_df.drop('2020 Rank',axis=1,inplace=True) # rank unnecessary
us_cities_df.head()

In [None]:
# clean land area and population density columns
row_labels = range(len(us_cities_df.index))

# land area: remove the ' sq mi' suffix (written with non-breaking spaces)
land_area = [us_cities_df.loc[row, 'Land Area (sq mi)'].replace('\xa0sq\xa0mi', '')
             for row in row_labels]

# population density: keep only the figure ahead of the first '/'
pop_density = [us_cities_df.loc[row, 'Population Density (per sq mi)'].split('/')[0]
               for row in row_labels]

us_cities_df['Land Area (sq mi)'] = pd.Series(land_area).astype(float)
us_cities_df['Population Density (per sq mi)'] = pd.Series(pop_density).astype(float)
us_cities_df.head() # take a peek

In [None]:
# strip the '+' and '%' decorations and turn the typographic minus sign into an ASCII '-';
# regex=False makes sure '+' is treated as a literal character
per_change_text = (us_cities_df['Change'].astype(str)
                   .str.replace('+', '', regex=False)
                   .str.replace('%', '', regex=False)
                   .str.replace('−', '-', regex=False))

# convert to numbers; anything that is not a number (new cities and/or unknown % change)
# becomes NaN instead of being caught by a bare `except`
per_change_float = pd.to_numeric(per_change_text, errors='coerce')

us_cities_df['Change'] = per_change_float

# drop the rows without a usable percent change in a single step
# (rather than dropping rows from the dataframe while looping over it)
us_cities_df.dropna(subset=['Change'], inplace=True)

us_cities_df.rename(columns={'Change': '% Change (Pop)'},inplace=True)
us_cities_df['% Change (Pop)'] = us_cities_df['% Change (Pop)'].astype(float)
us_cities_df.head()

In [None]:
us_cities_df.info() # look at data types within dataframe

In [None]:
high_influx_df = us_cities_df[us_cities_df['% Change (Pop)'] >= 20.0].reset_index(drop=True) # define high
# influx of population dataframe
high_influx_df.head(high_influx_df.shape[0]) # take a peak

In [None]:
high_influx_df.to_csv('High_Influx_US_Cities.csv', index=False)

##### Visualization:

In [None]:
# grabbing the lat/long coords for United States

# reload the high-influx cities written to disk by the previous cell
high_influx_df = pd.read_csv('High_Influx_US_Cities.csv')

address = 'United States'

geolocator = Nominatim(user_agent="us_explorer")
location = geolocator.geocode(address)

# geocode() gives back None when nothing matches; stop with a clear message
# instead of an AttributeError on the next line
if location is None:
    raise RuntimeError('Nominatim returned no result for {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of the United States are {}, {}.'.format(latitude, longitude))

In [None]:
def _influx_color(pct_change):
    """Return the marker color used for a population change given in percent."""
    # separate % ranges by different colors
    if pct_change < 30:
        return 'blue'
    if pct_change < 40:
        return 'orange'
    if pct_change < 50:
        return 'green'
    return 'red'


# create map of United States using latitude and longitude values
map_us_cities = folium.Map(location=[latitude, longitude], zoom_start=4)

# add markers to map; color coded by % change (20%+)
for lat, lng, city, state, inflx in zip(high_influx_df['Lat'],
                                        high_influx_df['Long'],
                                        high_influx_df['City'],
                                        high_influx_df['State'],
                                        high_influx_df['% Change (Pop)']):

    # '%' needs no escaping inside str.format; the former '\%' put a literal
    # backslash into the popup text and is an invalid escape sequence
    label = '{}, {}, {}% increase'.format(city, state, inflx)

    label = folium.Popup(label, parse_html=True)

    marker_color = _influx_color(inflx)

    # adding markers
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
        parse_html=False).add_to(map_us_cities)

map_us_cities

##### Leverage Foursquare API:

In [None]:
# @hidden_cell

import os

# The credentials are read from the environment so that no secret is stored in, or printed by,
# the notebook. Set FOURSQUARE_CLIENT_ID, FOURSQUARE_CLIENT_SECRET and FOURSQUARE_ACCESS_TOKEN
# before running this cell.
# NOTE: keys that were previously committed in this cell should be regenerated, since they
# remain readable in the file's history.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '') # your FourSquare Access Token
VERSION = '20180604'

# report only whether each credential is present, never its value
_missing_credentials = [env_name for env_name, value in (('FOURSQUARE_CLIENT_ID', CLIENT_ID),
                                                         ('FOURSQUARE_CLIENT_SECRET', CLIENT_SECRET),
                                                         ('FOURSQUARE_ACCESS_TOKEN', ACCESS_TOKEN))
                        if not value]
if _missing_credentials:
    print('Missing Foursquare credentials: ' + ', '.join(_missing_credentials))
else:
    print('Foursquare credentials loaded from the environment.')

In [None]:
# reload the high-influx cities from the csv file written earlier
high_influx_df = pd.read_csv('High_Influx_US_Cities.csv')

# columns needed to query venues city by city
city = high_influx_df.City
state = high_influx_df.State
latitude = high_influx_df.Lat
longitude = high_influx_df.Long
# one ('City, State', latitude, longitude) tuple per row
cities_tuples = list(zip(city + ', ' + state, latitude, longitude))
print(cities_tuples)

In [None]:
# function that extracts the category of the venue

def get_category_type(row):
    """Return the name of a venue's first category.

    Parameters
    ----------
    row : mapping-like (e.g. a dataframe row or a dict)
        Must provide the venue's category list under the key 'categories'
        or, failing that, under 'venue.categories'.

    Returns
    -------
    str or None
        The 'name' entry of the first category, or None when the category
        list is empty or missing (None/NaN).

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; other errors are not hidden
        categories_list = row['venue.categories']

    # a missing value (None/NaN) has no length, so it is treated like an empty list
    if not isinstance(categories_list, (list, tuple)) or not categories_list:
        return None
    return categories_list[0]['name']

In [None]:
# start looping through city centers and grabbing max num of venues away from city centers

LIMIT = 100     # maximum number of venues requested per city
radius = 16100  # search radius around each city center in metres (printed below in km)

cities_dfs = {}   # 'City, State' -> dataframe of the venues found for that city
cities_maps = {}  # 'City, State' -> folium map showing those venues

# endpoint of the Foursquare "explore" request; the query string is supplied through `params`
_EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'

for city, latitude, longitude in cities_tuples:

    # describe the request as parameters so the credentials never appear in a printed URL
    query = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'll': '{},{}'.format(latitude, longitude),
        'v': VERSION,
        'radius': radius,
        'limit': LIMIT,
    }

    # report progress without echoing the client secret into the notebook output
    print('Requesting venues around {} ({}, {})'.format(city, latitude, longitude))

    # send get request and examine results for each city;
    # the timeout keeps one stalled request from blocking the whole loop
    response = requests.get(_EXPLORE_URL, params=query, timeout=30)
    results = response.json()

    # define venue items; a payload without venue groups (presumably an error answer)
    # is reported with the city it belongs to instead of a bare KeyError
    try:
        items = results['response']['groups'][0]['items']
    except (KeyError, IndexError, TypeError) as err:
        raise RuntimeError(
            'Foursquare returned no venue groups for {} (HTTP status {})'.format(city, response.status_code)
        ) from err

    print(
        '\n\nFound at least {} venues within {} km (~10 mi) from {}\'s city center.\n'.format(len(items),
                                                                                       radius / 1000,
                                                                                       city))

    # flatten JSON
    dataframe = pd.json_normalize(items)

    # filter columns; .copy() makes the selection independent so the assignment below
    # does not write into a slice of `dataframe`
    filtered_columns = ['venue.name', 'venue.categories'] + [col for col in dataframe.columns if col.startswith('venue.location.')] + ['venue.id']
    dataframe_filtered = dataframe.loc[:, filtered_columns].copy()

    # filter the category for each row
    dataframe_filtered['venue.categories'] = dataframe_filtered.apply(get_category_type, axis=1)

    # clean columns: keep only the part after the last '.'
    dataframe_filtered.columns = [col.split('.')[-1] for col in dataframe_filtered.columns]

    # add current city's venue dataframe to the dict of city dataframes
    cities_dfs.update({city : dataframe_filtered})

    # define current city's map, centred on the city's coordinates
    venues_map = folium.Map(location=[latitude, longitude], zoom_start=12)

    # add current city center as a red circle mark
    folium.CircleMarker(
        [latitude, longitude],
        radius=10,
        popup=city,
        fill=True,
        color='red',
        fill_color='red',
        fill_opacity=0.6
        ).add_to(venues_map)

    # add current city's popular spots to the current map as blue circle markers
    venue_labels = 'Name: ' + dataframe_filtered['name'] + '\n' + 'Category: ' + dataframe_filtered['categories']
    for lat, lng, label in zip(dataframe_filtered['lat'], dataframe_filtered['lng'], venue_labels):
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            min_width=500,
            max_width=500,
            fill=True,
            color='blue',
            fill_color='blue',
            fill_opacity=0.6,
            parse_html=False
            ).add_to(venues_map)

    # add current city's map to the dict of city maps
    cities_maps.update({city : venues_map})

In [None]:
# check cities_maps keys (keys are the same for cities_dfs)
cities_maps.keys()

In [None]:
# take a peak at Austin, Texas venues
cities_maps['Austin, Texas']

In [None]:
# Take a peak at Austin, Texas df
cities_dfs['Austin, Texas'].head()

In [None]:
# Creating venue counts dataframe to be used as features with cities as indices

# number of venues per category for every city; value_counts() leaves out venues
# without a category, so names and counts always stay aligned
venue_counts = {city_key: venues['categories'].value_counts().sort_index(ascending=True)
                for city_key, venues in cities_dfs.items()}

# columns in order of first appearance: city by city, alphabetical within a city
category_order = list(dict.fromkeys(
    category for counts in venue_counts.values() for category in counts.index))

# one row per city, one column per category; categories absent from a city are NaN
features_df = (pd.DataFrame(venue_counts)
               .T
               .reindex(columns=category_order)
               .astype(float))
features_df.index.name = 'City'

features_df.head()

In [None]:
# explore total number of venue types amongst all 48 cities
features_df.shape

In [None]:
# replace all NaN values with 0
features_df.replace(np.nan, 0, inplace=True)

In [None]:
# take a peak
features_df.head()

In [None]:
# update features df so that population and population increase are accounted for
# NOTE(review): the columns are assigned positionally through .values, so this presumably relies
# on features_df having one row per high_influx_df row, in the same order — confirm
features_df['2018 Estimate'] = high_influx_df['2018 Estimate'].values
features_df['% Change (Pop)'] = high_influx_df['% Change (Pop)'].values

In [None]:
# take a peak
features_df.head()

In [None]:
features_df.shape

In [None]:
# for normalizing numerical data
from sklearn.preprocessing import normalize

# NOTE(review): with its default arguments sklearn's normalize presumably rescales each row
# (city) to unit L2 norm rather than each column; if '2018 Estimate' is much larger than the
# venue counts it would dominate every row — confirm this is the intended scaling
data_scaled_w_pop = normalize(features_df)
# rebuild a dataframe with the original column labels (no index is passed, so the rows are
# numbered from 0 instead of carrying the city names)
data_scaled_w_pop = pd.DataFrame(data_scaled_w_pop, columns=features_df.columns)

In [None]:
# take a peak
data_scaled_w_pop.head()

Let's make sure the "2018 Estimate" values (population estimate in 2018) are not all the same after normalization.

In [None]:
data_scaled_w_pop['2018 Estimate'].values

In [None]:
# Perform hierarchical clustering among most populous US cities with highest % increase included in features
import scipy.cluster.hierarchy as shc

def llf(id):
    # leaf label: the features_df index entry (city) at position `id`
    return '{}'.format(features_df.index[id])

# Ward linkage over the normalized features, computed once and then drawn
ward_links_w_pop = shc.linkage(data_scaled_w_pop, method='ward')

plt.figure(figsize=(15, 10))
plt.title('Most Populous US Cities Based on Top-Rated Venue Type and Pop. Increase', fontsize=20)
dend = shc.dendrogram(ward_links_w_pop, leaf_label_func=llf, leaf_font_size=12)

# dashed horizontal reference line at height 0.0006
plt.axhline(y=0.0006, color='k', linestyle='--')

Drawing colored boxes around the clusters:

In [None]:
fig, ax = plt.subplots(1, 1, figsize = (15, 10))
plt.title('Most Populous US Cities Based on Top-Rated Venue Type and Pop. Increase', fontsize=20)
dend = shc.dendrogram(shc.linkage(data_scaled_w_pop, method='ward'), leaf_label_func=llf, leaf_font_size=12)
plt.axhline(y=0.0006, color='k', linestyle='--')

# shade the area covered by each colored cluster of the dendrogram
for coll in ax.collections[:-1]:  # the last collection is the ungrouped level
    paths = coll.get_paths()
    if not paths:
        # nothing drawn for this collection, so there is no extent to shade
        continue

    # bounding box of all link segments belonging to this cluster
    xmin, xmax = np.inf, -np.inf
    ymax = -np.inf
    for p in paths:
        (x0, _), (x1, y1) = p.get_extents().get_points()
        xmin = min(xmin, x0)
        xmax = max(xmax, x1)
        ymax = max(ymax, y1)

    # translucent rectangle in the cluster's own color, padded by 4 on each side
    # and reaching 5% above the cluster's highest link
    rec = plt.Rectangle((xmin - 4, 0), xmax - xmin + 8, ymax*1.05,
                        facecolor=coll.get_color()[0], alpha=0.2, edgecolor="none")
    ax.add_patch(rec)

The right, red-colored cluster contains a sub-cluster just to the left of the lower-right corner of the dendrogram. This sub-cluster consists of five cities: Denver, CO; Seattle, WA; Charlotte, NC; Austin, TX; and Fort Worth, TX. These cities appear to be the most similar to one another, since all of their links have small and similar heights in the dendrogram. For this reason, we explore these five cities further. First, we check how the dendrogram changes when the population demographics are removed, to ensure that the top-rated venues around the respective city centers are fairly similar.

In [None]:
# investigate how the dendrogram changes without population demographics

# venue-count features only: the two population columns are removed
venue_only_features = features_df.drop(columns=['2018 Estimate', '% Change (Pop)'])

# normalize and restore the column labels of the remaining features
data_scaled_no_pop = pd.DataFrame(normalize(venue_only_features),
                                  columns=venue_only_features.columns)

In [None]:
data_scaled_no_pop.head()

In [None]:
# Perform hierarchical clustering among most populous US cities without pop parameters (venue data as features only)

plt.figure(figsize = (15, 10))
plt.title('Most Populous Cities Based on Top-Rated Venue Type Only', fontsize=20)
# Ward linkage on the venue-only features; leaves are labelled through llf
dend = shc.dendrogram(shc.linkage(data_scaled_no_pop, method='ward'), leaf_label_func=llf, leaf_font_size=12)
# dashed reference line at the same height (0.0006) as in the dendrogram that includes population
plt.axhline(y=0.0006, color='k', linestyle='--')

Comparing the linkage heights of the five cities (Seattle, WA; Denver, CO; Charlotte, NC; Austin, TX; and Fort Worth, TX), we see that these cities are fairly similar in top-rated venue type. Therefore, we cluster the top-rated venues for each city to see which venues appear to be most common among them. We cluster venues within each city using k-Means.

### Denver, Colorado

In [None]:
cities_dfs['Denver, Colorado'].head()

In [None]:
cities_maps['Denver, Colorado']

In [None]:
cities_dfs['Denver, Colorado'][cities_dfs['Denver, Colorado']['postalCode'].isnull()]

In [None]:
# import Nominatim to identify zip code of missing venue zips
from geopy.geocoders import Nominatim

# initialize Nominatim API
geolocator = Nominatim(user_agent="denver_explorer")

# assign venue address input and get location info
# NOTE(review): row label 61 is hard-coded; presumably it is the venue listed by the
# missing-zip check in the previous cell — verify after every fresh download
place = cities_dfs['Denver, Colorado'].loc[61, 'address'] + ', Denver, Colorado'
location = geolocator.geocode(place)

# traverse the data: split the display name into whitespace-separated tokens
data = location.raw
loc_data = data['display_name'].split()
print("Full Location")
print(loc_data)
# NOTE(review): assumes the zip code is the third token from the end of the display name — confirm
print("Zip code : ",loc_data[-3])

In [None]:
cities_dfs['Denver, Colorado'].loc[61, 'postalCode'] = loc_data[-3].replace(',', '')
cities_dfs['Denver, Colorado'].loc[61, 'postalCode']

In [None]:
cities_dfs['Denver, Colorado'][cities_dfs['Denver, Colorado']['postalCode'].isnull()]

In [None]:
denver_df = cities_dfs['Denver, Colorado'][['name', 'categories', 'lat', 'lng', 'postalCode']]
denver_df.head()

In [None]:
denver_grouped_zip = denver_df.groupby('postalCode').count()
denver_grouped_zip

In [None]:
print('There are {} unique categories.'.format(len(denver_df['categories'].unique())))

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 10))

ax = denver_grouped_zip['categories'].plot(kind='barh', fontsize=12)
ax.set_title('Distribution of Denver\'s Top-Rated Venues by Zip Code', fontsize=20)
ax.set_xlabel('Frequency', fontsize=15)
ax.set_ylabel('Postal Code', fontsize=15)

In [None]:
denver_grouped_zip = denver_df.groupby('postalCode').count()
denver_grouped_zip = denver_grouped_zip.reset_index()
denver_grouped_zip

In [None]:
# centre the map on Denver; .iloc[0] turns the one-row selection into plain numbers
# instead of handing folium one-element Series (which relies on an implicit float() conversion)
denver_center = high_influx_df.loc[high_influx_df['City'] == 'Denver', ['Lat', 'Long']].iloc[0]
denver_zip_map = folium.Map(location=[denver_center['Lat'], denver_center['Long']],
                        zoom_start=11)

# zip codes are compared as strings with the zip codes stored in the geojson
denver_grouped_zip['postalCode'] = denver_grouped_zip['postalCode'].astype('str')

folium.Choropleth(geo_data='Colorado_ZIP_Code_Tabulation_Areas_(ZCTA).geojson',
             data=denver_grouped_zip, # venue counts per zip code
             columns=['postalCode', 'categories'], # key column first, then the count column that drives the fill color
             key_on='feature.properties.ZCTA5CE10', # geojson property holding the zip code (str) matched against postalCode
             fill_color='BuPu', fill_opacity=0.7, line_opacity=0.2,
             legend_name='categories').add_to(denver_zip_map)

denver_zip_map

### Seattle, Washington

In [None]:
cities_dfs['Seattle, Washington'].head()

In [None]:
cities_maps['Seattle, Washington']

In [None]:
cities_dfs['Seattle, Washington'][cities_dfs['Seattle, Washington']['postalCode'].isnull()]

In [None]:
seattle_df = cities_dfs['Seattle, Washington'][['name', 'categories', 'lat', 'lng', 'postalCode']]
seattle_df.head()

In [None]:
seattle_grouped_zip = seattle_df.groupby('postalCode').count()
seattle_grouped_zip

In [None]:
print('There are {} unique categories.'.format(len(seattle_df['categories'].unique())))

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 10))

ax = seattle_grouped_zip['categories'].plot(kind='barh', fontsize=12)
ax.set_title('Distribution of Seattle\'s Top-Rated Venues by Zip Code', fontsize=20)
ax.set_xlabel('Frequency', fontsize=15)
ax.set_ylabel('Postal Code', fontsize=15)

In [None]:
seattle_grouped_zip = seattle_df.groupby('postalCode').count()
seattle_grouped_zip = seattle_grouped_zip.reset_index()
seattle_grouped_zip

In [None]:
# centre the map on Seattle; .iloc[0] turns the one-row selection into plain numbers
# instead of handing folium one-element Series (which relies on an implicit float() conversion)
seattle_center = high_influx_df.loc[high_influx_df['City'] == 'Seattle', ['Lat', 'Long']].iloc[0]
seattle_zip_map = folium.Map(location=[seattle_center['Lat'], seattle_center['Long']],
                        zoom_start=12)

# zip codes are compared as strings with the zip codes stored in the geojson
seattle_grouped_zip['postalCode'] = seattle_grouped_zip['postalCode'].astype('str')

folium.Choropleth(geo_data='wa_washington_zip_codes_geo.min.json',
             data=seattle_grouped_zip, # venue counts per zip code
             columns=['postalCode', 'categories'], # key column first, then the count column that drives the fill color
             key_on='feature.properties.ZCTA5CE10', # geojson property holding the zip code (str) matched against postalCode
             fill_color='BuPu', fill_opacity=0.7, line_opacity=0.2,
             legend_name='Popular Venues').add_to(seattle_zip_map)

seattle_zip_map

### Charlotte, North Carolina

In [None]:
cities_dfs['Charlotte, North Carolina'].head()

In [None]:
cities_maps['Charlotte, North Carolina']

In [None]:
charlotte_null_zips = cities_dfs['Charlotte, North Carolina'][cities_dfs['Charlotte, North Carolina']['postalCode'].isnull()]
charlotte_null_zips

In [None]:
# initialize Nominatim API
geolocator = Nominatim(user_agent="charlotte_explorer")

# assign venue address input and get location info for every venue without a zip code
for i in charlotte_null_zips.index:
    place = cities_dfs['Charlotte, North Carolina'].loc[i, 'address'] + ', Charlotte, North Carolina'
    location = geolocator.geocode(place)

    # traverse the data: split the display name into whitespace-separated tokens
    data = location.raw
    loc_data = data['display_name'].split()
    print("Full Location")
    print(loc_data)
    # NOTE(review): assumes the zip code is the third token from the end of the display name — confirm
    print("Zip code : ",loc_data[-3])

# after the loop, loc_data holds the tokens of the LAST venue only

In [None]:
# fill in the two missing zip codes by hand
# NOTE(review): row labels 29 and 48 and the literal '28204' are hard-coded; presumably they come
# from the printout of the previous cell — verify after every fresh download
cities_dfs['Charlotte, North Carolina'].loc[29, 'postalCode'] = '28204'
# loc_data still refers to the last venue geocoded in the previous cell
cities_dfs['Charlotte, North Carolina'].loc[48, 'postalCode'] = loc_data[-3].replace(',', '')

In [None]:
charlotte_df = cities_dfs['Charlotte, North Carolina'][['name', 'categories', 'lat', 'lng', 'postalCode']]
charlotte_df.head()

In [None]:
charlotte_grouped_zip = charlotte_df.groupby('postalCode').count()
charlotte_grouped_zip

In [None]:
print('There are {} unique categories.'.format(len(charlotte_df['categories'].unique())))

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 10))

ax = charlotte_grouped_zip['categories'].plot(kind='barh', fontsize=12)
ax.set_title('Distribution of Charlotte\'s Top-Rated Venues by Zip Code', fontsize=20)
ax.set_xlabel('Frequency', fontsize=15)
ax.set_ylabel('Postal Code', fontsize=15)

In [None]:
charlotte_grouped_zip = charlotte_df.groupby('postalCode').count()
charlotte_grouped_zip = charlotte_grouped_zip.reset_index()
charlotte_grouped_zip

In [None]:
# centre the map on Charlotte; .iloc[0] turns the one-row selection into plain numbers
# instead of handing folium one-element Series (which relies on an implicit float() conversion)
charlotte_center = high_influx_df.loc[high_influx_df['City'] == 'Charlotte', ['Lat', 'Long']].iloc[0]
charlotte_zip_map = folium.Map(location=[charlotte_center['Lat'], charlotte_center['Long']],
                        zoom_start=12)

# zip codes are compared as strings with the zip codes stored in the geojson
charlotte_grouped_zip['postalCode'] = charlotte_grouped_zip['postalCode'].astype('str')

folium.Choropleth(geo_data='nc_north_carolina_zip_codes_geo.min.json',
             data=charlotte_grouped_zip, # venue counts per zip code
             columns=['postalCode', 'categories'], # key column first, then the count column that drives the fill color
             key_on='feature.properties.ZCTA5CE10', # geojson property holding the zip code (str) matched against postalCode
             fill_color='BuPu', fill_opacity=0.7, line_opacity=0.2,
             legend_name='Popular Venues').add_to(charlotte_zip_map)

charlotte_zip_map

### Austin, Texas

In [None]:
cities_dfs['Austin, Texas'].head()

In [None]:
cities_maps['Austin, Texas']

In [None]:
austin_null_zips = cities_dfs['Austin, Texas'][cities_dfs['Austin, Texas']['postalCode'].isnull()]
austin_null_zips

In [None]:
# initialize Nominatim API
geolocator = Nominatim(user_agent="austin_explorer")

# look up every venue without a zip code by its coordinates (reverse geocoding)
for i in austin_null_zips.index:
#     place = str(cities_dfs['Austin, Texas'].loc[i, 'address']) + ', Austin, Texas'
    location = geolocator.reverse((austin_null_zips.loc[i, 'lat'], austin_null_zips.loc[i, 'lng']))

    # traverse the data: split the display name into whitespace-separated tokens
    data = location.raw
    loc_data = data['display_name'].split()
    print("Full Location")
    print(loc_data)
    # NOTE(review): assumes the zip code is the third token from the end of the display name — confirm
    print("Zip code : ",loc_data[-3])

# after the loop, loc_data holds the tokens of the LAST venue only

In [None]:
# Look every venue up individually so that each row receives its OWN zip code.
# Reusing loc_data here would give all rows the zip code of the last venue geocoded.
for i in austin_null_zips.index:
    location = geolocator.reverse((austin_null_zips.loc[i, 'lat'], austin_null_zips.loc[i, 'lng']))
    if location is None:
        # nothing found for these coordinates; leave the zip code missing
        continue

    # prefer the structured postcode field of the answer ...
    postcode = location.raw.get('address', {}).get('postcode')
    if postcode is None:
        # ... and fall back to the positional token read from the display name
        postcode = location.raw['display_name'].split()[-3].replace(',', '')

    cities_dfs['Austin, Texas'].loc[i, 'postalCode'] = postcode

In [None]:
austin_null_zips = cities_dfs['Austin, Texas'][cities_dfs['Austin, Texas']['postalCode'].isnull()]
austin_null_zips

In [None]:
austin_df = cities_dfs['Austin, Texas'][['name', 'categories', 'lat', 'lng', 'postalCode']]
austin_df.head()

In [None]:
austin_grouped_zip = austin_df.groupby('postalCode').count()
austin_grouped_zip

In [None]:
print('There are {} unique categories.'.format(len(austin_df['categories'].unique())))

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 10))

ax = austin_grouped_zip['categories'].plot(kind='barh', fontsize=12)
ax.set_title('Distribution of Austin\'s Top-Rated Venues by Zip Code', fontsize=20)
ax.set_xlabel('Frequency', fontsize=15)
ax.set_ylabel('Postal Code', fontsize=15)

In [None]:
austin_grouped_zip = austin_df.groupby('postalCode').count()
austin_grouped_zip = austin_grouped_zip.reset_index()
austin_grouped_zip

In [None]:
# centre the map on Austin; .iloc[0] turns the one-row selection into plain numbers
# instead of handing folium one-element Series (which relies on an implicit float() conversion)
austin_center = high_influx_df.loc[high_influx_df['City'] == 'Austin', ['Lat', 'Long']].iloc[0]
austin_zip_map = folium.Map(location=[austin_center['Lat'], austin_center['Long']],
                        zoom_start=12)

# zip codes are compared as strings with the zip codes stored in the geojson
austin_grouped_zip['postalCode'] = austin_grouped_zip['postalCode'].astype('str')

folium.Choropleth(geo_data='tx_texas_zip_codes_geo.min.json',
             data=austin_grouped_zip, # venue counts per zip code
             columns=['postalCode', 'categories'], # key column first, then the count column that drives the fill color
             key_on='feature.properties.ZCTA5CE10', # geojson property holding the zip code (str) matched against postalCode
             fill_color='BuPu', fill_opacity=0.7, line_opacity=0.2,
             legend_name='Popular Venues').add_to(austin_zip_map)

austin_zip_map

### Fort Worth, Texas

In [None]:
cities_dfs['Fort Worth, Texas'].head()

In [None]:
cities_maps['Fort Worth, Texas']

In [None]:
fort_worth_null_zips = cities_dfs['Fort Worth, Texas'][cities_dfs['Fort Worth, Texas']['postalCode'].isnull()]
fort_worth_null_zips

In [None]:
fort_worth_df = cities_dfs['Fort Worth, Texas'][['name', 'categories', 'lat', 'lng', 'postalCode']]
fort_worth_df.head()

In [None]:
fort_worth_grouped_zip = fort_worth_df.groupby('postalCode').count()
fort_worth_grouped_zip

In [None]:
print('There are {} unique categories.'.format(len(fort_worth_df['categories'].unique())))

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 10))

ax = fort_worth_grouped_zip['categories'].plot(kind='barh', fontsize=12)
ax.set_title('Distribution of Fort Worth\'s Top-Rated Venues by Zip Code', fontsize=20)
ax.set_xlabel('Frequency', fontsize=15)
ax.set_ylabel('Postal Code', fontsize=15)

In [None]:
fort_worth_grouped_zip = fort_worth_df.groupby('postalCode').count()
fort_worth_grouped_zip = fort_worth_grouped_zip.reset_index()
fort_worth_grouped_zip

In [None]:
# centre the map on Fort Worth; .iloc[0] turns the one-row selection into plain numbers
# instead of handing folium one-element Series (which relies on an implicit float() conversion)
fort_worth_center = high_influx_df.loc[high_influx_df['City'] == 'Fort Worth', ['Lat', 'Long']].iloc[0]
fort_worth_zip_map = folium.Map(location=[fort_worth_center['Lat'], fort_worth_center['Long']],
                        zoom_start=12)

# zip codes are compared as strings with the zip codes stored in the geojson
fort_worth_grouped_zip['postalCode'] = fort_worth_grouped_zip['postalCode'].astype('str')

folium.Choropleth(geo_data='tx_texas_zip_codes_geo.min.json',
             data=fort_worth_grouped_zip, # venue counts per zip code
             columns=['postalCode', 'categories'], # key column first, then the count column that drives the fill color
             key_on='feature.properties.ZCTA5CE10', # geojson property holding the zip code (str) matched against postalCode
             fill_color='BuPu', fill_opacity=0.7, line_opacity=0.2,
             legend_name='Popular Venues').add_to(fort_worth_zip_map)

fort_worth_zip_map