In [1]:
import datetime
import json
import time
import unicodedata

import numpy as np
import pandas as pd
import plotly.express as px
import requests
from bs4 import BeautifulSoup

# CrossFit affiliate analysis

Analysing the current landscape of CrossFit affiliates.

**Resources**:

- https://www.crossfit.com/affiliate-list
- https://nominatim.org/release-docs/latest/
- https://plotly.com/python/mapbox-county-choropleth/

# Load and parse CrossFit data

In [2]:
# Read the saved copy of the affiliate list page.
# The encoding is given explicitly so the read does not depend on the
# platform default (assumes the snapshot was saved as UTF-8 -- TODO confirm).
with open('data/cf-affiliates_2020-06-18.html', encoding='utf-8') as f:
    html = f.read()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parse tree can differ between machines.
soup = BeautifulSoup(html, 'html.parser')
# NOTE(review): this is the time the notebook is run, not the date of the
# snapshot encoded in the file name above.
date = datetime.datetime.now()

In [3]:
# Walk the affiliate table body and collect, for every row, the gym name and
# link from the first cell and the location text from the second cell.
affiliate_table = soup.find('table', {'id': 'affiliateTable'})
table_rows = affiliate_table.find('tbody').find_all('tr')

gym_names, gym_urls, gym_places = [], [], []
for row in table_rows:
    cells = row.find_all('td')
    name_cell, location_cell = cells[0], cells[1]
    gym_names.append(name_cell.text)
    gym_urls.append(name_cell.a['href'])
    gym_places.append(location_cell.text)

# Column-oriented dict: one list per future DataFrame column.
cf_locations = {
    'gym_name': gym_names,
    'gym_url': gym_urls,
    'location': gym_places,
}

print('Number of affiliates, numer of data points:')
print(pd.DataFrame(cf_locations).shape)

Number of affiliates, numer of data points:
(8877, 3)


# Clean CrossFit data

In [4]:
def clean_text(text):
    """Helper function to clean text data

    Accents are stripped, the text is lower-cased and surrounding
    whitespace is removed.

    Parameters
    ----------
    text : str
        The text to clean. ``bytes`` are decoded as UTF-8 and other
        non-string values are converted with ``str``. Missing values
        (``None`` or ``NaN``) are returned unchanged so the function can be
        applied to columns that contain gaps.

    Returns
    -------
    str
        The clean text (or the original missing value).

    Examples
    --------
    >>> s = clean_text('    àéêöheLlo')
    >>> print(s)
    aeeohello
    """
    # pass missing values straight through (NaN is the only float != itself)
    if text is None or (isinstance(text, float) and text != text):
        return text

    # make sure we are working with a str
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    elif not isinstance(text, str):
        text = str(text)

    # strip accents: NFD splits an accented letter into base letter + combining
    # mark, and the ASCII round-trip then drops the marks. Any other character
    # that has no ASCII form is dropped as well.
    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode('ascii')

    # lower case and strip
    return text.lower().strip()

In [5]:
# clean the data
cf_locations_df = pd.DataFrame(cf_locations)
cf_locations_df['gym_name'] = cf_locations_df['gym_name'].apply(clean_text)
cf_locations_df['location'] = cf_locations_df['location'].apply(clean_text)
cf_locations_df[['city', 'state_prov_country']] = cf_locations_df['location'].str.split(',', expand=True, n=1)
cf_locations_df['city'] = cf_locations_df['city'].apply(clean_text)
cf_locations_df['state_prov_country'] = cf_locations_df['state_prov_country'].apply(clean_text)

cf_locations_df.head(10)

Unnamed: 0,gym_name,gym_url,location,city,state_prov_country
0,crossfit geldrop,https://www.crossfit-geldrop.nl/,"geldrop, netherlands",geldrop,netherlands
1,benmore crossfit,https://benmorenutritionfitness.com/v2/#page/b...,"ballycastle, united kingdom",ballycastle,united kingdom
2,crossfit movement factory,https://www.movementrehab.hr/crossfitmovementf...,"split, croatia",split,croatia
3,crossfit chapelle-lez-herlaimont,http://www.crossfit-chapelle-lez-herlaimont.com/,"chapelle-lez-herlaimont, belgium",chapelle-lez-herlaimont,belgium
4,crossfit 1848,http://www.crossfit1848.com/,"saint denis, reunion",saint denis,reunion
5,crossfit flipside,http://crossfit-flipside.de/,"paderborn, germany",paderborn,germany
6,crossfit 1825,https://crossfit1825.wixsite.com/cf1825,"washington, il",washington,il
7,crossfit altisea,https://www.crossfitaltisea.com/,"toulouges, france",toulouges,france
8,crossfit bsc,https://www.bengkelsnc.com/crossfit,"jakarta, indonesia",jakarta,indonesia
9,crossfit bg,https://crossfitbg.cat/,"berga, spain",berga,spain


# GeoJSON data

## GeoJSON helper functions

In [6]:
def query_open_maps(query, geometry_types=('Polygon',), timeout=30):
    """Find geoJSON data for a location via the Nominatim search service

    Parameters
    ----------
    query : str
        The location to query
    geometry_types : tuple of str, optional
        GeoJSON geometry types that count as a usable result. The default,
        ``('Polygon',)``, accepts only plain polygons.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    dict or None
        The first geoJSON feature (of at most five requested) whose geometry
        type is in ``geometry_types``, or ``None`` when there is no such
        feature.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status.

    Examples
    --------
    >>> query_open_maps('toronto, on')
    """
    # NOTE(review): Nominatim documents its search endpoint as `/search`;
    # confirm that the bare root URL is still served before relying on it.
    url = 'https://nominatim.openstreetmap.org/'
    params = {
        'format': 'geojson',
        'polygon_geojson': 1,
        'limit': 5,
        'q': query,
    }
    r = requests.get(url, params, timeout=timeout)
    # fail loudly on an error status instead of tripping over a non-geoJSON body
    r.raise_for_status()

    # loop through the results, and take the top result that has
    # one of the accepted geometry types
    for geo_json in r.json().get('features', []):
        if geo_json['geometry']['type'] in geometry_types:
            print('geojson data found!')
            return geo_json
    print(f'no geojson data found for: {query}')
    return None

In [7]:
def search_geojson(x):
    """Look up the ``place_id`` stored for a location in ``geo_json_dict``

    The features of the module-level ``geo_json_dict`` are scanned in order
    and the ``place_id`` of the first one whose ``location_id`` equals ``x``
    is returned.

    Parameters
    ----------
    x : str
        The location to look for.

    Returns
    -------
    The matching feature's ``place_id``, or ``np.nan`` when no feature matches.

    Examples
    --------
    >>> print(search_geojson('houston, tx'))
    >>> print(search_geojson('toronto'))
    """
    matching_place_ids = (
        feature['properties']['place_id']
        for feature in geo_json_dict['features']
        if x == feature['properties']['location_id']
    )
    # first hit wins; fall back to NaN when the generator is empty
    return next(matching_place_ids, np.nan)

## Load existing JSON data

Load any geoJSON data that was saved by a previous run.

In [8]:
# Load the geoJSON features saved by an earlier run; start from an empty
# FeatureCollection when no file has been written yet.
try:
    # `with` closes the file handle as soon as the data has been parsed
    with open('data/geo_json.json') as f:
        geo_json_dict = json.load(f)
except FileNotFoundError:
    geo_json_dict = {
        'type': 'FeatureCollection',
        'features': [],
    }

print(len(geo_json_dict['features']))

365


Get the unique list of locations and check which of them already have geoJSON data.

In [9]:
# One row per distinct location with the number of gyms there, most common first.
# `rename_axis` + `reset_index(name=...)` name both columns explicitly, so the
# result does not depend on how the installed pandas version labels the output
# of `value_counts()`.
city_mapping = (
    cf_locations_df['location']
    .value_counts()
    .rename_axis('location_id')
    .reset_index(name='num_gyms')
)

# check existing geoJSON data (NaN where nothing has been saved yet)
city_mapping['place_id'] = city_mapping['location_id'].apply(search_geojson)

# summarise
num_locations = city_mapping.shape[0]
num_geo_json = int(city_mapping['place_id'].notna().sum())
print(f'geo json data found for {num_geo_json} of {num_locations} locations.')
city_mapping.head(20)

geo json data found for 365 of 5629 locations.


Unnamed: 0,location_id,num_gyms,place_id
0,"sao paulo, brazil",41,235483727.0
1,"houston, tx",38,235682504.0
2,"san antonio, tx",29,
3,"chicago, il",27,235244846.0
4,"seattle, wa",25,235470814.0
5,"denver, co",23,234888713.0
6,"san diego, ca",23,235088160.0
7,"orlando, fl",21,
8,"miami, fl",21,235588862.0
9,"london, united kingdom",21,235147154.0


## Update geoJSON data for missing locations

In [None]:
# Slice of `city_mapping` rows to geocode in this run. Rows are ordered by
# number of gyms, so move the window along to cover further locations.
BATCH_START, BATCH_STOP = 500, 1500
# Pause after every request: the public Nominatim service asks for at most
# one request per second.
REQUEST_PAUSE_SECONDS = 1

try:
    # clamp the window so a short table cannot raise a KeyError
    for i in range(BATCH_START, min(BATCH_STOP, len(city_mapping))):
        location = city_mapping.loc[i, 'location_id']
        print('=' * 64)
        print('#', i, ': ', location, sep='')

        # check if geoJSON data already exists for city
        if pd.notna(city_mapping.loc[i, 'place_id']):
            print('geo_json already exists')
            continue

        # call the open maps api to get data
        geo_json = query_open_maps(location)
        time.sleep(REQUEST_PAUSE_SECONDS)
        if geo_json is None:
            # nothing usable came back; place_id stays NaN
            continue

        # tag the feature with our own key so it can be found again later
        geo_json['properties']['location_id'] = location
        geo_json_dict['features'].append(geo_json)
        city_mapping.loc[i, 'place_id'] = int(geo_json['properties']['place_id'])
finally:
    # save the updated geoJSON data, even when the loop is interrupted or a
    # request fails, so the features fetched so far are not lost
    with open('data/geo_json.json', 'w') as f:
        json.dump(geo_json_dict, f, indent=4)

city_mapping.head(20)

#500: largo, fl
no geojson data found for: largo, fl
#501: longview, tx
geojson data found!
#502: wheat ridge, co
geo_json already exists
#503: recife, brazil
geojson data found!
#504: dayton, oh
no geojson data found for: dayton, oh
#505: greenville, nc
no geojson data found for: greenville, nc
#506: rome, ga
no geojson data found for: rome, ga
#507: new haven, ct
geo_json already exists
#508: melbourne, fl
geo_json already exists
#509: edmond, ok
geo_json already exists
#510: riverside, ca
geo_json already exists
#511: ferrara, italy
geojson data found!
#512: tilburg, netherlands
geo_json already exists
#513: bundaberg, qld
geo_json already exists
#514: cleveland, tn
geojson data found!
#515: bristol, united kingdom
geojson data found!
#516: sterling, va
geo_json already exists
#517: langley, bc
geojson data found!
#518: norwood, ma
geo_json already exists
#519: hamilton, new zealand
no geojson data found for: hamilton, new zealand
#520: alameda, ca
no geojson data found for: alameda

In [None]:
# summarise 
# Count how many locations now carry a place_id, i.e. have geoJSON data.
has_geo_json = city_mapping['place_id'].notnull()
num_geo_json = int(has_geo_json.sum())
num_locations = len(city_mapping)
print(f'geo json data found for {num_geo_json} of {num_locations} locations.')

# Choropleth

In [None]:
# Draw every location outline we have geoJSON for, coloured by its gym count.
# Rows are matched to features through `place_id` on both sides.
choropleth_options = {
    'data_frame': city_mapping,
    'geojson': geo_json_dict,
    'locations': 'place_id',
    'featureidkey': 'properties.place_id',
    'hover_name': 'location_id',
    'color': 'num_gyms',
    'color_continuous_scale': 'Viridis',
    'mapbox_style': "carto-positron",
    'center': {"lat": 40.0, "lon": -90.0},
    'zoom': 2,
}
fig = px.choropleth_mapbox(**choropleth_options)

fig.show()