# Coursera Capstone Project

### Importing necessary libraries

In [13]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import csv

### Webscraping using BeautifulSoup

In [14]:
source = requests.get('https://en.wikipedia.org/wiki/Category:Neighbourhoods_in_Bangalore').text 
soup = BeautifulSoup(source, 'lxml')

In [15]:
file = open('bangalore.csv', 'w')
csv_writer = csv.writer(file)
csv_writer.writerow(['Neighbourhood'])

15

In [16]:
mwcg_grps = soup.find_all(class_ = "mw-category-group")

length = len(mwcg_grps) 

for i in range(1, length):  # Iterating through all neighborhoods
    lists = mwcg_grps [i].find_all('a')
    for list in lists:
        nbd = list.get('title') # Name of the neighbourhood
        csv_writer.writerow([nbd]) 
file.close()

In [17]:
df = pd.read_csv('bangalore.csv')

In [18]:
df.shape

(129, 1)

In [26]:
df.head()

Unnamed: 0,Neighbourhood
0,Adugodi
1,"Agara, Bangalore"
2,Ananthnagar
3,Anjanapura
4,Arekere


### Finding the latitude and the longitude of Bangalore using Nominatim

In [28]:
address = 'Bangalore, India'

geolocator = Nominatim(user_agent="ny_explorer")
loc = geolocator.geocode(address)
lat = loc.latitude
long = loc.longitude
print('The geograpical coordinate of Bangalore are {}, {}.'.format(lat, long))

The geograpical coordinate of Bangalore are 12.6201123, 77.4793074.


### Finding the latitudes and longitudes of all the neighborhoods

In [20]:
from geopy.geocoders import Nominatim      #Importing Nominatim API for geocoding
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

In [43]:
latitudes = [] 
longitudes = [] 
neighborhood = []
for nbd in df["Neighbourhood"] : 
    address = nbd
    geolocator = Nominatim(user_agent="blr_explorer")

    location=geolocator.geocode(address)
    print(location)
    if location!= None:                                                         #Excluding neighborhoods that return None as address
        lat = location.latitude 
        lng = location.longitude 
    neighbourhood.append(location)
    latitudes.append(lat) 
    longitudes.append(lng) 

Adugodi, South Zone, Bengaluru, Bangalore South, Bangalore Urban, Karnataka, India
Agara, Kanakapura taluk, Ramanagara district, Karnataka, India
None
Anjanapura, Ramanagara taluk, Ramanagara district, Karnataka, 571511, India
Arekere, Sakaleshapura taluk, Hassan district, Karnataka, India
Austin Town, East Zone, Bengaluru, Bangalore North, Bangalore Urban, Karnataka, - 560095, India
None
Bagaluru, Bangalore North, Bangalore Urban, Karnataka, 562135, India
None
Banashankari, Badami taluku, Bagalkote district, Karnataka, 587200, India
Banaswadi, East Zone, Bengaluru, Bangalore East, Bangalore Urban, Karnataka, 560102, India
Basavanagudi, Bhadravati taluk, Shimoga district, Karnataka, 577300, India
Basaveshwaranagar, West Zone, Bengaluru, Bangalore North, Bangalore Urban, Karnataka, 79, India
Bellandur Lake, Swagger Road, HSR Layout Ward, Bommanahalli Zone, Bengaluru, Bangalore South, Bangalore Urban, Karnataka, 530103, India
Bharathnagar 2nd Stage, Byadarahalli, Herohalli, Rajarajeshwar

Puttenahalli, Bommanahalli Zone, Bengaluru, Bangalore South, Bangalore Urban, Karnataka, 560083, India
Halcyon Technologies, CIT Nagar 1st Main Road, Kannammapet, Ward 141, Zone 10 Kodambakkam, சென்னை - Chennai, Chennai district, Tamil Nadu, 600022, India
Rajajinagar, West Zone, Bengaluru, Bangalore North, Bangalore Urban, Karnataka, 560010, India
Rajarajeshwari Nagar, Rajarajeshwari Nagar Zone, Bengaluru, Bangalore South, Bangalore Urban, Karnataka, India
Ramachandrapura, Gauribidanuru taluk, Chikkaballapura district, Karnataka, 561208, India
Ramagondanahalli, Tumkur taluk, Tumkur district, Karnataka, 572106, India
Ramamurthy Nagar, Mahadevapura Zone, Bengaluru, Bangalore East, Bangalore Urban, Karnataka, India
None
Richmond Town, Ontario County, New York, 14471, United States of America
13th Cross Sadashivanagar, Aramane Nagara Ward, West Zone, Bengaluru, Bangalore North, Bangalore Urban, Karnataka, 560080", India
Sahakara Nagar CQAL Kids Park, Tatanagar, Kodigehalli, Yelahanka Zone,

In [68]:
df['Latitude'] = latitudes
df['Longitude'] = longitudes


In [69]:
df.head()

Unnamed: 0,Neighbourhood,Latitude,Longitude
0,Adugodi,12.942847,77.610416
1,"Agara, Bangalore",12.620112,77.479307
2,Ananthnagar,12.620112,77.479307
3,Anjanapura,12.684704,77.271172
4,Arekere,12.888434,75.779485


In [66]:
df.shape    # Checking the number of neighborhoods obtained

(129, 2)

In [None]:
df['Address'] = neighborhoods

In [72]:
df = df.dropna()    #Dropping neighboorhoods with address None

In [73]:
df.head()

Unnamed: 0,Neighbourhood,Address,Latitude,Longitude
0,Adugodi,"(Adugodi, South Zone, Bengaluru, Bangalore Sou...",12.942847,77.610416
1,"Agara, Bangalore","(Agara, Kanakapura taluk, Ramanagara district,...",12.620112,77.479307
3,Anjanapura,"(Anjanapura, Bommanahalli Zone, Bengaluru, Ban...",12.858081,77.558071
4,Arekere,"(Arekere, Bommanahalli Zone, Bengaluru, Bangal...",12.887209,77.596049
5,Austin Town,"(Austin Town, East Zone, Bengaluru, Bangalore ...",12.961274,77.615294


In [74]:
df.drop(['Address'],axis=1,inplace=True)     #Dropping the address column as it is not needed for further analysis

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [76]:
df.shape   #Number of neighborhoods with accurate geographical locations

(113, 3)

In [77]:
df.to_csv('Bangalore_loc.csv')   #converting to csv file

### Plotting a map of Bangalore

In [79]:
# create map of Bangalore using latitude and longitude values
map_bangalore = folium.Map(location=[lat, long], zoom_start=11)   #Passing location info of Bangalore

# adding markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_bangalore)  
    
map_bangalore

### Obtaining venue details using Foursquare API

In [80]:
def get_category_type(row):                 #function to assign category labels
    categories_list = row['Category']  
    
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']


In [82]:
CLIENT_ID = 'CLIENT_ID'
CLIENT_SECRET = 'CLIENT_SECRET'                #Foursquare credentials
VERSION = 'VERSION'

In [83]:
explore_df_list = []

for i, nbd_name in enumerate(df3['Neighbourhood']):   
    try :
                                         # Getting the data of neighbourhood
        nbd_name = df3.loc[i, 'Neighbourhood']
        nbd_lat = df3.loc[i, 'Latitude']
        nbd_lng = df3.loc[i, 'Longitude']
        radius = 1000                                # Setting the radius as 1000 metres
        LIMIT = 30                              # Getting the top 30 venues
        url = 'https://api.foursquare.com/v2/venues/explore?client_id={} \
        &client_secret={}&ll={},{}&v={}&radius={}&limit={}'\
        .format(CLIENT_ID, CLIENT_SECRET, nbd_lat, nbd_lng, VERSION, radius, LIMIT)

        results = json.loads(requests.get(url).text)
        results = results['response']['groups'][0]['items']
        nearby = json_normalize(results)   # Flattens JSON
         # Filtering the columns
        filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
        nearby = nearby.loc[:, filtered_columns]
        columns = ['Name', 'Category', 'Latitude', 'Longitude']
        nearby.columns = columns
        nearby['Category'] = nearby.apply(get_category_type, axis=1)    

        for i, name in enumerate(nearby['Name']):
            s_list = nearby.loc[i, :].values.tolist()  
            f_list = [nbd_name, nbd_lat, nbd_lng] + s_list
            explore_df_list.append(f_list)
    
    except Exception as e:
        pass

In [84]:
explore_df = pd.DataFrame([item for item in explore_df_list])
explore_df.columns = ['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue Name', 'Venue Category', 'Venue Latitude', 'Venue Longitude']
explore_df.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue Name,Venue Category,Venue Latitude,Venue Longitude
0,Adugodi,12.942847,77.610416,PVR IMAX,Movie Theater,12.934595,77.611321
1,Adugodi,12.942847,77.610416,Lot Like Crêpes,Creperie,12.936421,77.613284
2,Adugodi,12.942847,77.610416,Zingron - Naga Kitchen,Indian Restaurant,12.936271,77.615051
3,Adugodi,12.942847,77.610416,Koramangala Social,Lounge,12.935518,77.614097
4,Adugodi,12.942847,77.610416,Tommy Hilfiger,Clothing Store,12.934552,77.611347


### Performing one-hot encoding to categorize the venues

In [85]:
# One hot encoding
blr_onehot = pd.get_dummies(explore_df[['Venue Category']], prefix="", prefix_sep="")

# Add neighborhood column back to dataframe
blr_onehot['Neighbourhood'] = explore_df['Neighbourhood'] 

# Move neighborhood column to the first column
fixed_columns = [blr_onehot.columns[-1]] + blr_onehot.columns[:-1].values.tolist()
blr_onehot = blr_onehot[fixed_columns]

blr_onehot.head()

Unnamed: 0,Neighbourhood,ATM,Accessories Store,American Restaurant,Andhra Restaurant,Animal Shelter,Arcade,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Tennis Court,Theater,Toy / Game Store,Trail,Train Station,Udupi Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Shop,Women's Store
0,Adugodi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Adugodi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Adugodi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Adugodi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Adugodi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
blr_grouped = blr_onehot.groupby('Neighbourhood').mean().reset_index()
blr_grouped.head()

Unnamed: 0,Neighbourhood,ATM,Accessories Store,American Restaurant,Andhra Restaurant,Animal Shelter,Arcade,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Tennis Court,Theater,Toy / Game Store,Trail,Train Station,Udupi Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Shop,Women's Store
0,Adugodi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Anjanapura,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arekere,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Austin Town,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BTM Layout,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0


In [87]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Finding top 10 venues

In [88]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = blr_grouped['Neighbourhood']

for ind in np.arange(blr_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(blr_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adugodi,Indian Restaurant,Lounge,Dessert Shop,Multiplex,Donut Shop,Coffee Shop,Clothing Store,Juice Bar,Punjabi Restaurant,Gaming Cafe
1,Anjanapura,ATM,Pool,Campground,Electronics Store,Flower Shop,Flea Market,Fish Market,Financial or Legal Service,Field,Fast Food Restaurant
2,Arekere,Indian Restaurant,Pizza Place,Sporting Goods Shop,Department Store,Ice Cream Shop,Chinese Restaurant,Dessert Shop,Liquor Store,Fish Market,Café
3,Austin Town,Tea Room,Indian Restaurant,Hotel,Cocktail Bar,Market,Italian Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Toy / Game Store,Donut Shop
4,BTM Layout,Bakery,Vegetarian / Vegan Restaurant,Snack Place,Ice Cream Shop,Café,Chinese Restaurant,Pizza Place,Indian Restaurant,Coffee Shop,Sandwich Place


In [89]:

blr_grouped_clustering = blr_grouped.drop('Neighbourhood', 1)

### Performing K-means clustering

In [91]:
fin = 20    #max k value

In [93]:
from sklearn.metrics import silhouette_score

index = []
score = []

for kclusters in range(2, fin) :
    bgc = blr_grouped_clustering
    kmeans = KMeans(n_clusters = kclusters, init = 'k-means++', random_state = 0).fit_predict(bgc)
    score = silhouette_score(bgc, kmeans)
    index.append(kclusters)
    score.append(score)

In [95]:
K_best = np.argmax(scores) + 2 # Finds the best K value
K_best

4

In [114]:
K = 4
bgc = blr_grouped_clustering
kmeans = KMeans(n_clusters = K, init = 'k-means++', random_state = 0).fit(bgc)

### Joining initial dataframe with venue categories

In [116]:
blr_merged = df
blr_merged = blr_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
blr_merged.dropna(inplace = True)
blr_merged['Cluster Labels'] = blr_merged['Cluster Labels'].astype(int)
blr_merged.head()

Unnamed: 0,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adugodi,12.942847,77.610416,0,Indian Restaurant,Lounge,Dessert Shop,Multiplex,Donut Shop,Coffee Shop,Clothing Store,Juice Bar,Punjabi Restaurant,Gaming Cafe
3,Anjanapura,12.858081,77.558071,0,ATM,Pool,Campground,Electronics Store,Flower Shop,Flea Market,Fish Market,Financial or Legal Service,Field,Fast Food Restaurant
4,Arekere,12.887209,77.596049,0,Indian Restaurant,Pizza Place,Sporting Goods Shop,Department Store,Ice Cream Shop,Chinese Restaurant,Dessert Shop,Liquor Store,Fish Market,Café
5,Austin Town,12.961274,77.615294,0,Tea Room,Indian Restaurant,Hotel,Cocktail Bar,Market,Italian Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Toy / Game Store,Donut Shop
7,"Bagalur, Bangalore Urban",13.133187,77.668709,0,Food Truck,Memorial Site,Electronics Store,Flower Shop,Flea Market,Fish Market,Financial or Legal Service,Field,Fast Food Restaurant,Farmers Market


### Creating a map of the clusters

In [117]:
map_clusters = folium.Map(location=[blr_lat, blr_long], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(blr_merged['Latitude'], blr_merged['Longitude'], blr_merged['Neighbourhood'], blr_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' (Cluster ' + str(cluster + 1) + ')', parse_html=True)
    map_clusters.add_child(
        folium.features.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7))
       
map_clusters

## Neighbourhoods to open cafes

In [149]:
blr_merged[(blr_merged['Cluster Labels'] == 0 ) & (blr_merged['1st Most Common Venue']== 'Café')]

Unnamed: 0,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
26,Cooke Town,13.002785,77.624747,0,Café,Bakery,Department Store,Fast Food Restaurant,Shopping Mall,Chinese Restaurant,Indian Restaurant,Clothing Store,Coffee Shop,Flea Market
30,Devarachikkanahalli,12.902105,77.60122,0,Café,Multiplex,Pizza Place,Indian Restaurant,Italian Restaurant,Coffee Shop,Clothing Store,Mediterranean Restaurant,Garden,Middle Eastern Restaurant
33,Ejipura,12.945245,77.626914,0,Café,Pizza Place,Indian Restaurant,Gym / Fitness Center,Clothing Store,Ice Cream Shop,Department Store,Fast Food Restaurant,Food Court,Seafood Restaurant
47,HSR Layout,12.911623,77.638862,0,Café,Indian Restaurant,Ice Cream Shop,Pizza Place,Snack Place,Coffee Shop,Kebab Restaurant,Liquor Store,Burger Joint,Farmers Market
48,Hulimavu,12.877349,77.602803,0,Café,Movie Theater,Multiplex,South Indian Restaurant,Department Store,Badminton Court,Lake,Fast Food Restaurant,Bowling Alley,Electronics Store
101,Sadashivanagar,13.007708,77.579589,0,Café,Coffee Shop,Indian Restaurant,Department Store,Dessert Shop,Gym,Ice Cream Shop,Women's Store,Field,Fast Food Restaurant
105,Shikaripalya,12.835985,77.657181,0,Café,Indian Restaurant,Fast Food Restaurant,Hotel,Coffee Shop,Juice Bar,Department Store,Pizza Place,Chinese Restaurant,Asian Restaurant
