# Neighbourhoods in Mumbai

In [1]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize
import folium # plotting library

# import k-means from clustering stage
from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors



In [2]:
colors_old = { 'Western Suburbs':'red','Eastern Suburbs':'blue','Harbour Suburbs':'purple','South Mumbai':'green',}
df=pd.read_excel('Neighbourhoods_in_Mumbai.xlsx')
df

Unnamed: 0,Suburbs,Neighborhood,Suburbs_Zone,Latitude,Longitude
0,Andheri,Andheri,Western Suburbs,19.119,72.847
1,Andheri,Marol,Western Suburbs,19.1173,72.884
2,Bhayandar,Bhayandar,Western Suburbs,19.29,72.85
3,Bandra,Bandra,Western Suburbs,19.054444,72.840556
4,Borivali (West),Borivali (West),Western Suburbs,19.2381,72.8523
5,Borivali (East),Borivali (East),Western Suburbs,19.2298,72.8609
6,Dahisar,Dahisar,Western Suburbs,19.250069,72.859347
7,Goregaon,Goregaon,Western Suburbs,19.155,72.85
8,Jogeshwari,Jogeshwari,Western Suburbs,19.1439,72.8428
9,Juhu,Juhu,Western Suburbs,19.1,72.83


In [3]:
address = 'Mumbai Suburban'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

19.13095765 72.88593095460952


In [4]:
venues_map = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, label, zone in zip(df.Latitude, df.Longitude, df.Neighborhood, df.Suburbs_Zone):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color=colors_old[zone],
        popup=label,
        fill = True,
        fill_color=colors_old[zone],
        fill_opacity=0.6
    ).add_to(venues_map)
    
venues_map

#### Define Foursquare Credentials and Version

In [5]:
CLIENT_ID = 'WZXGGFYR4BVJAFMNNNS2QBSQBPHNDRW2TWAVESILDLYCW552' # your Foursquare ID
CLIENT_SECRET = 'XGNR3QG4HZGSA5QBU0FCKFUJFD5DQXHPWROG14KOQSG1DKKH' # your Foursquare Secret
VERSION = '20180604' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WZXGGFYR4BVJAFMNNNS2QBSQBPHNDRW2TWAVESILDLYCW552
CLIENT_SECRET:XGNR3QG4HZGSA5QBU0FCKFUJFD5DQXHPWROG14KOQSG1DKKH


# Explore Neighborhoods in Mumbai

#### Let's create a function to repeat the same process to all the neighborhoods.

In [6]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    LIMIT = 100 # limit of number of venues returned by Foursquare API
    radius = 500 # define radius
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [None]:
mumbai_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Andheri
Marol
Bhayandar
Bandra
Borivali (West)
Borivali (East)
Dahisar
Goregaon
Jogeshwari
Juhu
Kandivali west
Kandivali east
Khar
Malad
Santacruz
Vasai
Naigaon
Nalasopara
Virar
Vile Parle
Bhandup
Ghatkopar
Kanjurmarg
Kurla
Mulund
Powai
Vidyavihar
Vikhroli
Chembur
Govandi
Mankhurd
Trombay
Agripada
Churchgate
Cotton Green
Cuffe Parade
Cumbala Hill
Dhobitalao
Dongri
Kala Ghoda
Lower Parel
Mahalaxmi
Mahim
Malabar Hill
Marine Lines
Mumbai Central
Nariman Point
Prabhadevi
Sion
Worli
Dharavi
Koliwada
Kumbarwara
Antop Hill
Byculla
Colaba


#### Let's check the size of the resulting dataframe

In [None]:
print(mumbai_venues.shape)
type(mumbai_venues)

Let's check how many venues were returned for each neighborhood

In [None]:
mumbai_venues.groupby('Neighborhood').count()

#### Let's find out how many unique categories can be curated from all the returned venues

In [None]:
print('There are {} uniques categories.'.format(len(mumbai_venues['Venue Category'].unique())))

# Analyze Each Neighborhood

In [None]:
# one hot encoding
mumbai_onehot = pd.get_dummies(mumbai_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
mumbai_onehot['Neighborhood'] = mumbai_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [mumbai_onehot.columns[-1]] + list(mumbai_onehot.columns[:-1])
mumbai_onehot = mumbai_onehot[fixed_columns]

mumbai_onehot.head()

And let's examine the new dataframe size.

In [None]:
mumbai_onehot.shape

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [None]:
mumbai_grouped = mumbai_onehot.groupby('Neighborhood').mean().reset_index()
mumbai_grouped

#### Let's confirm the new size

In [None]:
mumbai_grouped.shape

#### Let's print each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

for hood in mumbai_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = mumbai_grouped[mumbai_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

#### Let's put that into a pandas dataframe

First, let's write a function to sort the venues in descending order.

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = mumbai_grouped['Neighborhood']

for ind in np.arange(mumbai_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(mumbai_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

# Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [None]:
# set number of clusters
kclusters = 5

mumbai_grouped_clustering = mumbai_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(mumbai_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

mumbai_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
mumbai_merged = mumbai_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')



# check the last columns!
mumbai_merged#.head()

Finally, let's visualize the resulting clusters

In [None]:
mumbai_merged=mumbai_merged.dropna()
#int(mumbai_merged['Cluster Labels'][0])

In [None]:
mumbai_merged

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(mumbai_merged['Latitude'], mumbai_merged['Longitude'], mumbai_merged['Neighborhood'], mumbai_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    colorid=int(cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[colorid],
        fill=True,
        fill_color=rainbow[colorid],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

### Cluster 1

In [None]:
mumbai_merged.loc[mumbai_merged['Cluster Labels'] == 0, mumbai_merged.columns[[1] + list(range(5, mumbai_merged.shape[1]))]]

### Cluster 2

In [None]:
mumbai_merged.loc[mumbai_merged['Cluster Labels'] == 1, mumbai_merged.columns[[1] + list(range(5, mumbai_merged.shape[1]))]]

### Cluster 3

In [None]:
mumbai_merged.loc[mumbai_merged['Cluster Labels'] == 2, mumbai_merged.columns[[1] + list(range(5, mumbai_merged.shape[1]))]]

### Cluster 4

In [None]:
mumbai_merged.loc[mumbai_merged['Cluster Labels'] == 3, mumbai_merged.columns[[1] + list(range(5, mumbai_merged.shape[1]))]]

### Cluster 5

In [None]:
mumbai_merged.loc[mumbai_merged['Cluster Labels'] == 4, mumbai_merged.columns[[1] + list(range(5, mumbai_merged.shape[1]))]]