In [1]:
import pandas as pd
import numpy as np
import json

from geopy.geocoders import Nominatim # to conver address to lat and long values

import requests # handle requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize # JSON to pandas dataframe

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # for maps

## Scraping data from Wikipedia

In [10]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_neighbourhoods_in_Mumbai').text

In [11]:
soup = BeautifulSoup(data, 'html.parser') # BeautifulSoup object to parse data
area = []
location = []
latitudes = []
longitudes = []

In [12]:
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0): # condition because some rows may be empty && first row is header
        area.append(cells[0].text.rstrip('\n'))
        location.append(cells[1].text.rstrip('\n'))
        latitudes.append(cells[2].text.rstrip('\n'))
        longitudes.append(cells[3].text.rstrip('\n'))

In [13]:
mumbai_df = pd.DataFrame({'Area': area,
                           'Location': location,
                           'Latitude': latitudes,
                           'Longitude': longitudes})
print(mumbai_df.shape)
mumbai_df.head()


(93, 4)


Unnamed: 0,Area,Location,Latitude,Longitude
0,Amboli,"Andheri,Western Suburbs",19.1293,72.8434
1,"Chakala, Andheri",Western Suburbs,19.111388,72.860833
2,D.N. Nagar,"Andheri,Western Suburbs",19.124085,72.831373
3,Four Bungalows,"Andheri,Western Suburbs",19.124714,72.82721
4,Lokhandwala,"Andheri,Western Suburbs",19.130815,72.82927


## Use geopy to get coordinates of Mumbai

In [14]:
address = 'Mumbai'
geolocator = Nominatim(user_agent = 'lol-app') # for the purpose of assignment; give any name to user_agent
location= geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print("Mumbai coordinates : {}, {}".format(latitude, longitude))

Mumbai coordinates : 19.0759899, 72.8773928


## Create a map of Mumbai

In [15]:
map_mumbai = folium.Map(location = [latitude, longitude], zoom_start = 10.3) #creates map
#add markers to map

for lat, long, area, location in zip(mumbai_df['Latitude'], mumbai_df['Longitude'], mumbai_df['Area'], mumbai_df['Location']):
    label = '{}, {}'.format(area, location)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, long],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity=0.7
    ).add_to(map_mumbai)
    
map_mumbai

## Now, to use the FourSquare API to explore neighborhoods

In [16]:
CLIENT_ID = '4OJ2FUPCP0Z2FXECSBCN0AWBFWKIFIIAV3TCWWD3LST2TEHP'
CLIENT_SECRET = 'CKU0YUOGV3RSK331NVFUQPF0SWXDIMXU1N4FSITMFVB5V2DB'
VERSION = '20180604'

## Getting top 50 venues that are within a radius of 250 mts

In [18]:
radius = 250
LIMIT = 50

venues = []

for lat, long, area, location in zip(mumbai_df['Latitude'], mumbai_df['Longitude'], mumbai_df['Area'], mumbai_df['Location']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            area, 
            location,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [19]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

venues_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Amboli,"Andheri,Western Suburbs",19.1293,72.8434,Subway,19.12786,72.844461,Sandwich Place
1,Amboli,"Andheri,Western Suburbs",19.1293,72.8434,Cafe Coffee Day,19.127748,72.844663,Coffee Shop
2,Amboli,"Andheri,Western Suburbs",19.1293,72.8434,Spices & Chillies,19.127765,72.844131,Asian Restaurant
3,Amboli,"Andheri,Western Suburbs",19.1293,72.8434,V33,19.129068,72.84367,Gym
4,"Chakala, Andheri",Western Suburbs,19.111388,72.860833,Cafe Coffee Day,19.112272,72.861106,Café


In [21]:
venues_df.columns = ['Area', 'Location', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
print(venues_df.shape)
venues_df.head()

(495, 8)


Unnamed: 0,Area,Location,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Amboli,"Andheri,Western Suburbs",19.1293,72.8434,Subway,19.12786,72.844461,Sandwich Place
1,Amboli,"Andheri,Western Suburbs",19.1293,72.8434,Cafe Coffee Day,19.127748,72.844663,Coffee Shop
2,Amboli,"Andheri,Western Suburbs",19.1293,72.8434,Spices & Chillies,19.127765,72.844131,Asian Restaurant
3,Amboli,"Andheri,Western Suburbs",19.1293,72.8434,V33,19.129068,72.84367,Gym
4,"Chakala, Andheri",Western Suburbs,19.111388,72.860833,Cafe Coffee Day,19.112272,72.861106,Café


In [23]:
venues_df.groupby(['Area', 'Location']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Area,Location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Altamount Road,South Mumbai,1,1,1,1,1,1
Amboli,"Andheri,Western Suburbs",4,4,4,4,4,4
Amrut Nagar,"Ghatkopar,Eastern Suburbs",1,1,1,1,1,1
Asalfa,"Ghatkopar,Eastern Suburbs",2,2,2,2,2,2
Ballard Estate,"Fort,South Mumbai",4,4,4,4,4,4
...,...,...,...,...,...,...,...
Uttan,"Mira-Bhayandar,Western Suburbs",4,4,4,4,4,4
Vidyavihar,Eastern Suburbs,5,5,5,5,5,5
Vile Parle,Western Suburbs,1,1,1,1,1,1
Walkeshwar,South Mumbai,4,4,4,4,4,4


In [24]:
print("count of unique venue categories : {}".format(len(venues_df["VenueCategory"].unique())))

count of unique venue categories : 120


## Analysing Each Area

In [29]:
# one hot encoding
mumbai_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
mumbai_onehot['Area'] = venues_df['Area'] 
mumbai_onehot['Location'] = venues_df['Location']

fixed_columns = list(mumbai_onehot.columns[-3:]) + list(mumbai_onehot.columns[:-3])
mumbai_onehot = mumbai_onehot[fixed_columns]

print(mumbai_onehot.shape)
mumbai_onehot.head()

(495, 122)


Unnamed: 0,Women's Store,Area,Location,Amphitheater,Arcade,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Auto Workshop,...,Spanish Restaurant,Sports Bar,Steakhouse,Tea Room,Theater,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Whisky Bar,Wine Bar
0,0,Amboli,"Andheri,Western Suburbs",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,Amboli,"Andheri,Western Suburbs",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,Amboli,"Andheri,Western Suburbs",0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,Amboli,"Andheri,Western Suburbs",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,"Chakala, Andheri",Western Suburbs,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Now to group this data by Area and take the mean of occurence of each category in that neighborhood

In [30]:
mumbai_grouped = mumbai_onehot.groupby(['Area', 'Location']).mean().reset_index()

print(mumbai_grouped.shape)
mumbai_grouped

(77, 122)


Unnamed: 0,Area,Location,Women's Store,Amphitheater,Arcade,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Auto Workshop,...,Spanish Restaurant,Sports Bar,Steakhouse,Tea Room,Theater,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Whisky Bar,Wine Bar
0,Altamount Road,South Mumbai,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Amboli,"Andheri,Western Suburbs",0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Amrut Nagar,"Ghatkopar,Eastern Suburbs",0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Asalfa,"Ghatkopar,Eastern Suburbs",0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Ballard Estate,"Fort,South Mumbai",0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Uttan,"Mira-Bhayandar,Western Suburbs",0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73,Vidyavihar,Eastern Suburbs,0.0,0.0,0.0,0.0,0.00,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74,Vile Parle,Western Suburbs,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75,Walkeshwar,South Mumbai,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Top 10 venues by category for each postal code

In [37]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['Area', 'Location']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Area'] = mumbai_grouped['Area']
neighborhoods_venues_sorted['Location'] = mumbai_grouped['Location']

for ind in np.arange(mumbai_grouped.shape[0]):
    row_categories = mumbai_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 2:] = row_categories_sorted.index.values[0:num_top_venues]

# neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)
print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted

(77, 12)


Unnamed: 0,Area,Location,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Altamount Road,South Mumbai,Café,Wine Bar,Convenience Store,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Electronics Store
1,Amboli,"Andheri,Western Suburbs",Sandwich Place,Coffee Shop,Asian Restaurant,Gym,Wine Bar,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner
2,Amrut Nagar,"Ghatkopar,Eastern Suburbs",Shopping Mall,Wine Bar,French Restaurant,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Electronics Store
3,Asalfa,"Ghatkopar,Eastern Suburbs",Playground,Light Rail Station,Wine Bar,Food Truck,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner
4,Ballard Estate,"Fort,South Mumbai",Harbor / Marina,Hotel,Convenience Store,Wine Bar,Food Truck,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop
...,...,...,...,...,...,...,...,...,...,...,...,...
72,Uttan,"Mira-Bhayandar,Western Suburbs",Indian Restaurant,Juice Bar,Resort,Bus Station,Food Truck,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop
73,Vidyavihar,Eastern Suburbs,Restaurant,Athletics & Sports,Fast Food Restaurant,Bus Station,Wine Bar,French Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop
74,Vile Parle,Western Suburbs,Turkish Restaurant,Wine Bar,French Restaurant,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Electronics Store
75,Walkeshwar,South Mumbai,Indian Restaurant,Lighthouse,Coffee Shop,Ice Cream Shop,Gym / Fitness Center,Food & Drink Shop,Cupcake Shop,Hotel,Dance Studio,Deli / Bodega
