Applied Data Science Capstone-Week 5

In [1]:
!pip install bs4
!pip install geopy
import requests
from bs4 import BeautifulSoup
import pandas as pd
from geopy.geocoders import Nominatim
import folium
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors



The following cell contains the code for: 1) Creating a Dataframe via Web Scraping 2) Adding the Latitude and Longitude coordinates of each Capital City to the Dataframe

In [2]:
urlData = requests.get('https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India').text
soup=BeautifulSoup(urlData,'html.parser')
soupTableDataStates=soup.find_all('table')[1].find_all('td')
stateUT=[]
for i in range(1,len(soupTableDataStates),6):
    if '(Summer)' in soupTableDataStates[i].text:
        stateUT.append(soupTableDataStates[i].text.split("(Summer)")[0].strip())
        if ']' in soupTableDataStates[i].text.split("(Summer)")[1].split("(Winter)")[0]:
            stateUT.append(soupTableDataStates[i].text.split("(Summer)")[1].split("(Winter)")[0].split("]")[1].strip())
        else:
            stateUT.append(soupTableDataStates[i].text.split("(Summer)")[1].split("(Winter)")[0].strip())
    elif '(' in soupTableDataStates[i].text:
        stateUT.append(soupTableDataStates[i].text.split("(")[0].strip())
    else:
        stateUT.append(soupTableDataStates[i].text.split("[")[0].strip())
df=pd.DataFrame(data=[stateUT]).transpose()
df.columns=['Capital City']
df['Latitude']=0
df['Longitude']=0
geolocator=Nominatim(user_agent="India")
for index, row in df.iterrows(): 
    df.loc[index, 'Latitude']='None' if geolocator.geocode(row['Capital City'])==None else geolocator.geocode(row['Capital City']).latitude
    df.loc[index, 'Longitude']='None' if geolocator.geocode(row['Capital City'])==None else geolocator.geocode(row['Capital City']).longitude
df.drop_duplicates(subset="Capital City",keep='first',inplace=True) 
df.reset_index(inplace=True,drop=True) 
df

Unnamed: 0,Capital City,Latitude,Longitude
0,Port Blair,11.664535,92.739045
1,Amaravati,16.57442,80.355608
2,Itanagar,27.097966,93.623729
3,Dispur,26.151308,91.79338
4,Patna,25.609324,85.123525
5,Chandigarh,30.719402,76.764655
6,Naya Raipur,21.161027,81.786441
7,Daman,20.420005,72.863763
8,New Delhi,28.614179,77.202266
9,Panaji,15.498995,73.828214


The following cell contains the code for visualizing the capital cities of India 

In [3]:
address='India'
geolocator=Nominatim(user_agent="india_explorer")
location=geolocator.geocode(address)
latitude=location.latitude
longitude=location.longitude
print('The geograpical coordinates of India are {}, {}.'.format(latitude, longitude))
map_india=folium.Map(location=[latitude, longitude], zoom_start=4)
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Capital City']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_india)  
map_india

The geograpical coordinates of India are 22.3511148, 78.6677428.


The following cell contains the code for exploring the capital cities of India

In [4]:
CLIENT_ID='VSFLJD2ZWXQECAKVX35E31LYZHJGVPDLLWT50TBBNDDD1TD5' # your Foursquare ID
CLIENT_SECRET='45LTCSIVWTECKFB010KP5BEQ3HCISK2MHASG2BNKKS2X2IWF' # your Foursquare Secret
VERSION='20180605' # Foursquare API version
LIMIT=100
radius=500
venues_list=[]
for city, lat, lng in zip(df['Capital City'], df['Latitude'], df['Longitude']):
    url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        lng, 
        radius, 
        LIMIT)
    results=requests.get(url).json()["response"]['groups'][0]['items']
    venues_list.append([(
        city, 
        lat, 
        lng, 
        v['venue']['name'], 
        v['venue']['location']['lat'], 
        v['venue']['location']['lng'],  
        v['venue']['categories'][0]['name']) for v in results])
india_venues=pd.DataFrame([item for venue_list in venues_list for item in venue_list])
india_venues.columns=['Capital City', 
              'City Latitude', 
              'City Longitude', 
              'Venue', 
              'Venue Latitude', 
              'Venue Longitude', 
              'Venue Category']
india_venues

Unnamed: 0,Capital City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Port Blair,11.664535,92.739045,MM Tours & Travels,11.664988,92.738922,Tourist Information Center
1,Port Blair,11.664535,92.739045,Sagar Emporium,11.663861,92.735847,Arts & Crafts Store
2,Port Blair,11.664535,92.739045,andaman fried chicken,11.667943,92.737825,Fast Food Restaurant
3,Port Blair,11.664535,92.739045,Wandoor,11.667975,92.737324,Beach
4,Amaravati,16.574420,80.355608,Amaravathi Buddhist Museum,16.575949,80.356569,History Museum
...,...,...,...,...,...,...,...
244,Kolkata,22.545412,88.356775,Shisha Reincarnated,22.546911,88.353341,Nightclub
245,Kolkata,22.545412,88.356775,Cafe Coffee Day,22.545478,88.352501,Café
246,Kolkata,22.545412,88.356775,Pantaloons,22.546955,88.353446,Clothing Store
247,Kolkata,22.545412,88.356775,Urban Desi,22.544248,88.352160,Indian Restaurant


The following cell contains the code for listing the various venues in the capital cities of India

In [5]:
india_venue_categories = india_venues.groupby(['Capital City', 'Venue Category']).count()
india_venue_categories.drop(['City Latitude', 'City Longitude',  'Venue Latitude', 'Venue Longitude'], axis = 1, inplace=True)
india_venue_categories.rename(columns = {'Venue':'Count'}, inplace = True) 
india_venue_categories.drop(['Count'], axis = 1, inplace=True)
india_venue_categories

Capital City,Venue Category
Agartala,Electronics Store
Agartala,Fast Food Restaurant
Agartala,Indian Restaurant
Agartala,Multiplex
Aizawl,Bakery
...,...
Srinagar,Shopping Mall
Thiruvananthapuram,Electronics Store
Thiruvananthapuram,Historic Site
Thiruvananthapuram,History Museum


The following cell contains the code for listing the various types of restaurants in the capital cities of India

In [6]:
india_venue_categories_col=india_venue_categories.reset_index()
india_venue_restaurants=india_venue_categories_col[india_venue_categories_col['Venue Category'].str.contains("Restaurant")]
india_venue_restaurants.rename(columns = {'Venue Category':'Restaurant Category'}, inplace = True) 
india_venue_restaurants_list=india_venue_restaurants.groupby(['Capital City', 'Restaurant Category']).count()
india_venue_restaurants_list

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Capital City,Restaurant Category
Agartala,Fast Food Restaurant
Agartala,Indian Restaurant
Aizawl,Restaurant
Bhubaneswar,Asian Restaurant
Bhubaneswar,Fast Food Restaurant
Chandigarh,American Restaurant
Chandigarh,Asian Restaurant
Chandigarh,Fast Food Restaurant
Chandigarh,Indian Restaurant
Chandigarh,Italian Restaurant


The following cell contains the code for listing out the top five most common restaurant types in each capital city of India

In [7]:
india_restaurants_onehot=pd.get_dummies(india_venue_restaurants[['Restaurant Category']], prefix="", prefix_sep="")
india_restaurants_onehot['Capital City']=india_venue_restaurants['Capital City']
fixed_columns=[india_restaurants_onehot.columns[-1]] + list(india_restaurants_onehot.columns[:-1])
india_restaurants_onehot=india_restaurants_onehot[fixed_columns]
india_restaurants_grouped=india_restaurants_onehot.groupby('Capital City').mean().reset_index()
num_top_venues=5
for hood in india_restaurants_grouped['Capital City']:
    print("----"+hood+"----")
    temp=india_restaurants_grouped[india_restaurants_grouped['Capital City'] == hood].T.reset_index()
    temp.columns=['restaurant','freq']
    temp=temp.iloc[1:]
    temp['freq']=temp['freq'].astype(float)
    temp=temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agartala----
                       restaurant  freq
0            Fast Food Restaurant   0.5
1               Indian Restaurant   0.5
2               Afghan Restaurant   0.0
3  Multicuisine Indian Restaurant   0.0
4              Tibetan Restaurant   0.0


----Aizawl----
                        restaurant  freq
0                       Restaurant   1.0
1                Afghan Restaurant   0.0
2               Mughlai Restaurant   0.0
3               Tibetan Restaurant   0.0
4  Southern / Soul Food Restaurant   0.0


----Bhubaneswar----
                       restaurant  freq
0                Asian Restaurant   0.5
1            Fast Food Restaurant   0.5
2               Afghan Restaurant   0.0
3  Multicuisine Indian Restaurant   0.0
4              Tibetan Restaurant   0.0


----Chandigarh----
             restaurant  freq
0      Asian Restaurant   0.2
1  Fast Food Restaurant   0.2
2     Indian Restaurant   0.2
3    Italian Restaurant   0.2
4   American Restaurant   0.2


----Dehradun---

The following cell contains the code for creating a dataframe that stores the top ten restaurant types of each capital city of India

In [8]:
def return_most_common_venues(row, num_top_venues):
    row_categories=row.iloc[1:]
    row_categories_sorted=row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]
num_top_venues=10
indicators=['st', 'nd', 'rd']
columns=['Capital City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Restaurant'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Restaurant'.format(ind+1))
india_restaurants_sorted=pd.DataFrame(columns=columns)
india_restaurants_sorted['Capital City']=india_restaurants_grouped['Capital City']
for ind in np.arange(india_restaurants_grouped.shape[0]):
    india_restaurants_sorted.iloc[ind, 1:]=return_most_common_venues(india_restaurants_grouped.iloc[ind, :], num_top_venues)
india_restaurants_sorted

Unnamed: 0,Capital City,1st Most Common Restaurant,2nd Most Common Restaurant,3rd Most Common Restaurant,4th Most Common Restaurant,5th Most Common Restaurant,6th Most Common Restaurant,7th Most Common Restaurant,8th Most Common Restaurant,9th Most Common Restaurant,10th Most Common Restaurant
0,Agartala,Fast Food Restaurant,Indian Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,French Restaurant,Himalayan Restaurant
1,Aizawl,Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant
2,Bhubaneswar,Asian Restaurant,Fast Food Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Chinese Restaurant,Dumpling Restaurant,French Restaurant,Himalayan Restaurant,Indian Restaurant
3,Chandigarh,Italian Restaurant,American Restaurant,Asian Restaurant,Fast Food Restaurant,Indian Restaurant,Vegetarian / Vegan Restaurant,Chinese Restaurant,Dumpling Restaurant,French Restaurant,Himalayan Restaurant
4,Dehradun,Fast Food Restaurant,Indian Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,French Restaurant,Himalayan Restaurant
5,Dispur,Northeast Indian Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant
6,Gangtok,Dumpling Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant,Indian Restaurant
7,Hyderabad,Afghan Restaurant,Indian Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant
8,Imphal,Indian Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant
9,Jammu,Asian Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Chinese Restaurant,Dumpling Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant,Indian Restaurant


The following cell contains the code for creating a dataframe that stores the cluster as well as the top ten restaurant types of each capital city of India

In [9]:
kclusters=5
india_restaurants_grouped_clustering=india_restaurants_grouped.drop('Capital City', 1)
kmeans=KMeans(n_init=300, n_clusters=kclusters, random_state=5).fit(india_restaurants_grouped_clustering)
india_restaurants_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
india_restaurants_sorted['Latitude']=0
india_restaurants_sorted['Longitude']=0
geolocator=Nominatim(user_agent="India")
for index, row in india_restaurants_sorted.iterrows(): 
    india_restaurants_sorted.loc[index, 'Latitude']='None' if geolocator.geocode(row['Capital City'])==None else geolocator.geocode(row['Capital City']).latitude
    india_restaurants_sorted.loc[index, 'Longitude']='None' if geolocator.geocode(row['Capital City'])==None else geolocator.geocode(row['Capital City']).longitude
india_restaurants_sorted

Unnamed: 0,Cluster Labels,Capital City,1st Most Common Restaurant,2nd Most Common Restaurant,3rd Most Common Restaurant,4th Most Common Restaurant,5th Most Common Restaurant,6th Most Common Restaurant,7th Most Common Restaurant,8th Most Common Restaurant,9th Most Common Restaurant,10th Most Common Restaurant,Latitude,Longitude
0,1,Agartala,Fast Food Restaurant,Indian Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,French Restaurant,Himalayan Restaurant,23.831238,91.282382
1,2,Aizawl,Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant,23.741409,92.72093
2,4,Bhubaneswar,Asian Restaurant,Fast Food Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Chinese Restaurant,Dumpling Restaurant,French Restaurant,Himalayan Restaurant,Indian Restaurant,20.266777,85.843559
3,3,Chandigarh,Italian Restaurant,American Restaurant,Asian Restaurant,Fast Food Restaurant,Indian Restaurant,Vegetarian / Vegan Restaurant,Chinese Restaurant,Dumpling Restaurant,French Restaurant,Himalayan Restaurant,30.719402,76.764655
4,1,Dehradun,Fast Food Restaurant,Indian Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,French Restaurant,Himalayan Restaurant,30.325565,78.043681
5,3,Dispur,Northeast Indian Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant,26.151308,91.79338
6,3,Gangtok,Dumpling Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant,Indian Restaurant,27.329046,88.612267
7,0,Hyderabad,Afghan Restaurant,Indian Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant,17.388786,78.461065
8,0,Imphal,Indian Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Asian Restaurant,Chinese Restaurant,Dumpling Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant,24.800609,93.937
9,4,Jammu,Asian Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,American Restaurant,Chinese Restaurant,Dumpling Restaurant,Fast Food Restaurant,French Restaurant,Himalayan Restaurant,Indian Restaurant,32.718561,74.858092


The following cell contains the code for visualizing the clusters of capital cities of India based on the cuisines served by the popular restaurants located there

In [10]:
map_clusters=folium.Map(location=[latitude, longitude], zoom_start=4)
x=np.arange(kclusters)
ys=[i + x + (i*x)**2 for i in range(kclusters)]
colors_array=cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow=[colors.rgb2hex(i) for i in colors_array]
markers_colors=[]
for lat, lon, poi, cluster in zip(india_restaurants_sorted['Latitude'], india_restaurants_sorted['Longitude'], india_restaurants_sorted['Capital City'], india_restaurants_sorted['Cluster Labels']):
    label=folium.Popup(str(poi) + ' (Cluster ' + str(cluster) + ')', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)       
map_clusters