# Food business Segmentation in the Toluca Valley

### Import libraries

In [214]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import io
import numpy as np
import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### Get the entities conforming the Toluca Valley Zone

Obtain from the Wikipedia

In [215]:
#Get the document
wikiURL='https://es.m.wikipedia.org/wiki/Zona_metropolitana_del_Valle_de_Toluca'
source = requests.get(wikiURL).text
wikiText = BeautifulSoup(source, 'lxml')
#wikiText

### Read the codes HTML table into a dataframe

In [216]:
table = wikiText.find('table', 'wikitable')
table_rows = table.find_all('tr')

l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)

#Clean some trash
l.pop(0)
l.pop(0)
tvData = pd.DataFrame(l, columns=["INEGI_ID", "Municipality","Population"])
#Clean values
tvData['INEGI_ID']= tvData['INEGI_ID'].str.strip()
tvData['Municipality']= tvData['Municipality'].str.strip()
tvData['Population']= tvData['Population'].str.strip()
tvData['Population']= tvData['Population'].str.strip()
tvData['Population']= tvData['Population'].str.replace(',','')
tvData['Population']= pd.to_numeric(tvData['Population'])
tvData = tvData[(tvData.Municipality != 'Total') ]

tvData

Unnamed: 0,INEGI_ID,Municipality,Population
0,106,Toluca,489333
1,54,Metepec,206005
2,118,Zinacantepec,136167
3,5,Almoloya de Juárez,147653
4,51,Lerma de Villada,134799
5,67,Villa Cuauhtémoc,78146
6,76,San Mateo Atenco,72579
7,62,Ocoyoacac,61805
8,18,Calimaya,47033
9,115,Xonacatlán,46331


### Read Municipality Coordinates
The file was obtained from https://www.coordenadas.com.es/mexico/pueblos-de-mexico/ and formatted as a CSV by hand

In [230]:
tvCoords = pd.read_csv('Final/MunicipioCoords.csv')
tvCoords.head()

Unnamed: 0,City,Latitude,Longitude
0,Acambay,19.95389,-99.84306
1,Acolman,19.64146,-98.88145
2,Aculco,20.0982,-99.8281
3,Almoloya de Alquisiras,18.55,-100.30833
4,Almoloya de Juárez,19.370377,-99.758708


#### Obtain only the municipalities for Toluca Valley, not all State of Mexico

In [231]:
#Inner join will be used to use set with coordinates only
municipalityCoords = pd.merge( tvData, tvCoords, left_on='Municipality', right_on='City', how='inner')
municipalityCoords =  municipalityCoords[['Municipality', 'Population','Latitude','Longitude']]
municipalityCoords

Unnamed: 0,Municipality,Population,Latitude,Longitude
0,Toluca,489333,19.288762,-99.655247
1,Metepec,206005,19.2564,-99.6048
2,Zinacantepec,136167,19.286518,-99.727837
3,Almoloya de Juárez,147653,19.370377,-99.758708
4,Lerma de Villada,134799,19.31501,-99.5728
5,San Mateo Atenco,72579,19.275382,-99.536512
6,Ocoyoacac,61805,19.27167,-99.46116
7,Calimaya,47033,19.16564,-99.61778
8,Xonacatlán,46331,19.405233,-99.52339
9,Tenango del Valle,21765,19.108245,-99.589186


### Plot 

In [232]:
address = 'Toluca, MX'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toluca are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toluca are 19.292545, -99.6569007.


In [290]:
map_tolValley = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, municipality in zip(municipalityCoords['Latitude'], municipalityCoords['Longitude'], municipalityCoords['Municipality']):
    label = '{}'.format(municipality)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=50,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tolValley)  
    
map_tolValley

### Read from Foursquare

In [264]:
CLIENT_ID = 'QYHMAGKNC55K0A4XBZV4MZB5FMULHYZPAYXANCSSXXXGQ1JN' # your Foursquare ID
CLIENT_SECRET = 'H0NTY15W3LXYJ0CKV0KDXE3SBYECHFYXDEAZEO2MMXATSBWC' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 1500 # define radius

In [265]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Municipality', 
                  'Municipality Latitude', 
                  'Municipality Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [266]:
TolValley_venues = getNearbyVenues(names=municipalityCoords['Municipality'],
                                   latitudes=municipalityCoords['Latitude'],
                                   longitudes=municipalityCoords['Longitude']
                                  )

Toluca
Metepec
Zinacantepec
Almoloya de Juárez
Lerma de Villada
San Mateo Atenco
Ocoyoacac
Calimaya
Xonacatlán
Tenango del Valle
San Antonio la Isla
Mexicaltzingo
Chapultepec


In [282]:
# Eliminate venues that are not commercial food venues
TolValleyFood = TolValley_venues[
    (TolValley_venues['Venue Category'] != 'Park') & 
    (TolValley_venues['Venue Category'] != 'Amphitheater') &
    (TolValley_venues['Venue Category'] != 'Hotel') &
    (TolValley_venues['Venue Category'] != 'Church') &
    (TolValley_venues['Venue Category'] != 'Spa') &
    (TolValley_venues['Venue Category'] != 'Art Gallery') &
    (TolValley_venues['Venue Category'] != 'Deli / Bodega') &
    (TolValley_venues['Venue Category'] != 'Cosmetics Shop') &
    (TolValley_venues['Venue Category'] != 'Optical Shop') &
    (TolValley_venues['Venue Category'] != 'Record Shop') &
    (TolValley_venues['Venue Category'] != 'Science Museum') &
    (TolValley_venues['Venue Category'] != 'Pharmacy') &
    (TolValley_venues['Venue Category'] != 'Speakeasy') &
    (TolValley_venues['Venue Category'] != 'Bookstore') &
    (TolValley_venues['Venue Category'] != 'Gym') &
    (TolValley_venues['Venue Category'] != 'Bar') &
    (TolValley_venues['Venue Category'] != 'Flea Market') &
    (TolValley_venues['Venue Category'] != 'Historic Site') &
    (TolValley_venues['Venue Category'] != 'Pool Hall') &
    (TolValley_venues['Venue Category'] != 'Boutique') &
    (TolValley_venues['Venue Category'] != 'Garden') &
    (TolValley_venues['Venue Category'] != 'Military Base') &
    (TolValley_venues['Venue Category'] != 'Public Art') &
    (TolValley_venues['Venue Category'] != 'Athletics & Sports') &
    (TolValley_venues['Venue Category'] != 'Soccer Field') &
    (TolValley_venues['Venue Category'] != 'Big Box Store') &
    (TolValley_venues['Venue Category'] != 'Plaza') &
    (TolValley_venues['Venue Category'] != 'Shopping Mall') &
    (TolValley_venues['Venue Category'] != 'Art Museum') &
    (TolValley_venues['Venue Category'] != 'Clothing Store') &
    (TolValley_venues['Venue Category'] != 'Department Store') &
    (TolValley_venues['Venue Category'] != 'Internet Cafe') &
    (TolValley_venues['Venue Category'] != 'Construction & Landscaping') &
    (TolValley_venues['Venue Category'] != 'Farm') &
    (TolValley_venues['Venue Category'] != 'Forest') 
]

print(TolValleyFood.shape)
TolValleyFood.head()


(135, 7)


Unnamed: 0,Municipality,Municipality Latitude,Municipality Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Toluca,19.288762,-99.655247,Bistró Mecha Centro Histórico,19.290751,-99.654963,Bistro
1,Toluca,19.288762,-99.655247,Master chocolatier Turin,19.290175,-99.654965,Chocolate Shop
4,Toluca,19.288762,-99.655247,Café Catalán,19.289049,-99.656387,Café
5,Toluca,19.288762,-99.655247,A Donde Sea,19.287023,-99.657701,Breakfast Spot
6,Toluca,19.288762,-99.655247,The Green Factory,19.289671,-99.657675,Salad Place


In [283]:
# one hot encoding
tolValley_onehot = pd.get_dummies(TolValleyFood[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tolValley_onehot['Municipality'] = TolValleyFood['Municipality'] 

# move neighborhood column to the first column
fixed_columns = [tolValley_onehot.columns[-1]] + list(tolValley_onehot.columns[:-1])
tolValley_onehot = tolValley_onehot[fixed_columns]

tValley_grouped = tolValley_onehot.groupby('Municipality').mean().reset_index()
print(tValley_grouped.shape)
#tValley_grouped



(13, 43)


### Cluster Municipalities

In [284]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Municipality']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
tvFood_sorted = pd.DataFrame(columns=columns)
tvFood_sorted['Municipality'] = tValley_grouped['Municipality']

for ind in np.arange(tValley_grouped.shape[0]):
    tvFood_sorted.iloc[ind, 1:] = return_most_common_venues(tValley_grouped.iloc[ind, :], num_top_venues)

#tvFood_sorted


In [285]:
# set number of clusters
kclusters = 5

tvFood_grouped_clustering = tValley_grouped.drop('Municipality', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tvFood_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print( kmeans.labels_[0:10] )

# add clustering labels
tvFood_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

tvFood_merged = municipalityCoords

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
tvFood_merged = tvFood_merged.join(tvFood_sorted.set_index('Municipality'), on='Municipality')
tvFood_merged = tvFood_merged.fillna(0)
tvFood_merged['Cluster Labels'] = tvFood_merged['Cluster Labels'].astype(int)


[2 1 4 1 2 2 0 3 1 2]


In [286]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tvFood_merged['Latitude'], tvFood_merged['Longitude'], tvFood_merged['Municipality'], tvFood_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=50,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Check the cluster versus the population just for fun

There seems to be not correlation between the cluster and the population

In [287]:
tvFood_merged[['Municipality','Population','Cluster Labels']]

Unnamed: 0,Municipality,Population,Cluster Labels
0,Toluca,489333,2
1,Metepec,206005,2
2,Zinacantepec,136167,0
3,Almoloya de Juárez,147653,2
4,Lerma de Villada,134799,1
5,San Mateo Atenco,72579,1
6,Ocoyoacac,61805,0
7,Calimaya,47033,1
8,Xonacatlán,46331,1
9,Tenango del Valle,21765,2


In [288]:
tvFood_sorted

Unnamed: 0,Cluster Labels,Municipality,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,2,Almoloya de Juárez,Mexican Restaurant,Brewery,Winery,Cocktail Bar,Fish Market,Fast Food Restaurant,Diner,Creperie,Convenience Store,Comfort Food Restaurant
1,1,Calimaya,Taco Place,Winery,Chocolate Shop,Fish Market,Fast Food Restaurant,Diner,Creperie,Convenience Store,Comfort Food Restaurant,Coffee Shop
2,4,Chapultepec,Food Stand,Restaurant,Fish Market,Fast Food Restaurant,Diner,Creperie,Convenience Store,Comfort Food Restaurant,Coffee Shop,Cocktail Bar
3,1,Lerma de Villada,Taco Place,BBQ Joint,Fast Food Restaurant,Comfort Food Restaurant,Chocolate Shop,Fish Market,Diner,Creperie,Convenience Store,Coffee Shop
4,2,Metepec,Mexican Restaurant,Restaurant,Seafood Restaurant,Taco Place,Coffee Shop,Karaoke Bar,Winery,Breakfast Spot,Café,Cocktail Bar
5,2,Mexicaltzingo,Breakfast Spot,Mexican Restaurant,Coffee Shop,Winery,Cocktail Bar,Fish Market,Fast Food Restaurant,Diner,Creperie,Convenience Store
6,0,Ocoyoacac,Mexican Restaurant,Winery,Cocktail Bar,Fish Market,Fast Food Restaurant,Diner,Creperie,Convenience Store,Comfort Food Restaurant,Coffee Shop
7,3,San Antonio la Isla,Pizza Place,Winery,Cocktail Bar,Fish Market,Fast Food Restaurant,Diner,Creperie,Convenience Store,Comfort Food Restaurant,Coffee Shop
8,1,San Mateo Atenco,Taco Place,Beer Garden,Convenience Store,Coffee Shop,Winery,Cocktail Bar,Fish Market,Fast Food Restaurant,Diner,Creperie
9,2,Tenango del Valle,Convenience Store,BBQ Joint,Taco Place,Breakfast Spot,Burger Joint,Burrito Place,Coffee Shop,Fish Market,Fast Food Restaurant,Diner
