# The Battle of Neighbourhood

###  Downloading and Importing Libraries

In [2]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  20.12 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  31.47 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  48.73 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  26.32 MB/s
vincent-0.4.4- 100% |###################

###  Scraping the Website using BeautifulSoup Package

In [33]:
#Getting the table from the website into the Dataframe

from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pandas as pd
import numpy as np
page_link='https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population'
page=requests.get(page_link)
soup=BeautifulSoup(page.content,'html.parser')
table_soup=soup.find_all('table')[4]
table = pd.read_html(str(table_soup))
df=pd.DataFrame(table[0])
df.columns=['Rank','City','State','Estimate','Census','Change','Land Area','Land Area2','population density1','Population Density','Location']
df.drop(columns=['Rank','Estimate','Census','Change','Land Area2','population density1'],inplace=True)
df=df.drop(0).reset_index(drop=True)

In [34]:
df.head()

Unnamed: 0,City,State,Land Area,Population Density,Location
0,New York[6],New York,301.5 sq mi,"10,933/km2",40°39′49″N 73°56′19″W﻿ / ﻿40.6635°N 73.9387°W﻿...
1,Los Angeles,California,468.7 sq mi,"3,276/km2",34°01′10″N 118°24′39″W﻿ / ﻿34.0194°N 118.4108°...
2,Chicago,Illinois,227.3 sq mi,"4,600/km2",41°50′15″N 87°40′54″W﻿ / ﻿41.8376°N 87.6818°W﻿...
3,Houston[7],Texas,637.5 sq mi,"1,395/km2",29°47′12″N 95°23′27″W﻿ / ﻿29.7866°N 95.3909°W﻿...
4,Phoenix,Arizona,517.6 sq mi,"1,200/km2",33°34′20″N 112°05′24″W﻿ / ﻿33.5722°N 112.0901°...


### Seperating the Location Values into Latitudes and Longitudes

In [35]:
df['Location']=df['Location'].str.split("/",n=2,expand=True)[1]
new=df['Location'].str.split(" ",n=0,expand=False)
lat=[]
log=[]
for ind in range(len(new)):
    lat.append(new[ind][1][:-2])
    log.append(new[ind][2][:-3])
df['lat']=lat
df['log']=log
df.drop(columns=['Location'],inplace=True)
df.head()


Unnamed: 0,City,State,Land Area,Population Density,lat,log
0,New York[6],New York,301.5 sq mi,"10,933/km2",﻿40.6635,73.9387
1,Los Angeles,California,468.7 sq mi,"3,276/km2",﻿34.0194,118.4108
2,Chicago,Illinois,227.3 sq mi,"4,600/km2",﻿41.8376,87.6818
3,Houston[7],Texas,637.5 sq mi,"1,395/km2",﻿29.7866,95.3909
4,Phoenix,Arizona,517.6 sq mi,"1,200/km2",﻿33.5722,112.0901


#### Converting the Land Area Column from str to float

In [36]:
import numpy as np
new=df['Land Area'].str.split("s",n=1,expand=True)
df['Land Area']=new[0].str.replace(u'\xa0',u'')
df['Land Area']=df['Land Area'].str.replace(',','').astype(float)
df['lat']=df['lat'].str.replace(u'\ufeff',u'').astype(float)
df['log']=-df['log'].str.replace(u'\ufeff',u'').astype(float)

In [37]:
df['Radius']=np.sqrt(df['Land Area']).round(2).mul(1000)
df.drop(columns=['Land Area'],inplace=True)

#####  FourSquare API Credentials

In [41]:
CLIENT_ID = '33MAOBUFOQV4JD2DRIVJP1DKT5WKB4DOYVRUFQPJDEL3BMPU' # your Foursquare ID
CLIENT_SECRET = 'BWJAUUER4DPCGCTA2GOFY1IB1RAEFCNWHDDFFW0QNUGQ5XGH' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 33MAOBUFOQV4JD2DRIVJP1DKT5WKB4DOYVRUFQPJDEL3BMPU
CLIENT_SECRET:BWJAUUER4DPCGCTA2GOFY1IB1RAEFCNWHDDFFW0QNUGQ5XGH


### Function to get the required attributes from the json file into a DataFrame

In [39]:
def getCityVenues(names,lat,log,radius):
    venues_list=[]
    for name,lat,log,radius in zip(names,lat,log,radius):
        LIMIT=150
        url='https://api.foursquare.com/v2/venues/explore?&client_id={0}&client_secret={1}&v={2}&ll={3},{4}&radius={5}&limit={6}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            log, 
            radius, 
            LIMIT)
        results = requests.get(url).json()['response']['groups'][0]['items']
        venues_list.append([(
            name,
            lat,
            log,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venue=pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venue.columns=['City','Latitude','Logitude','Venue','Venue Latitude','Venue Longitude','Venue Category']
    return(nearby_venue)
        

In [42]:
city_venues=getCityVenues(df['City'],df['lat'],df['log'],df['Radius'])
city_venues.head()

Unnamed: 0,City,Latitude,Logitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,New York[6],40.6635,-73.9387,Barboncino,40.672104,-73.957412,Pizza Place
1,New York[6],40.6635,-73.9387,Brooklyn Botanic Garden,40.667884,-73.963587,Botanical Garden
2,New York[6],40.6635,-73.9387,Covenhoven,40.675143,-73.960203,Beer Garden
3,New York[6],40.6635,-73.9387,Brooklyn Museum,40.671069,-73.963619,Art Museum
4,New York[6],40.6635,-73.9387,Kings Theatre,40.64611,-73.957175,Theater


### Adding the Weightage Column to the selected Venue Category in city_venues dataframe

In [43]:
weightage_dict={'Movie Theater':3,'Beach':3,'Concert Hall':2.5,'Playground':3,'Coffee Shop':3.5,'Food Court':4,'Nightclub':4,'Toy / Game Store':4.5,'Theme Park Ride / Attraction':4,'Pub':4,'Others':0}

def get_weightage(category_dataframe,weights_dict=weightage_dict):
    
    weightage_list=[]
    for cat in category_dataframe:
        if cat in weights_dict.keys():
            weightage_list.append(weights_dict[cat])
        else:
            cat='Others'
            weightage_list.append(weights_dict[cat])
    return weightage_list



In [44]:
city_venues['weightage']=get_weightage(city_venues['Venue Category'],weightage_dict) 
city_venues=city_venues[city_venues['weightage']!=0.0]

In [45]:
df_venues=city_venues[['City','weightage']]
df_venues=df_venues.groupby(['City'],axis=0).mean()
df_venues.reset_index(drop=False,inplace=True)

In [46]:
df_venues.head()

Unnamed: 0,City,weightage
0,Abilene,3.416667
1,Akron,3.6
2,Albuquerque,3.5
3,Alexandria[16],3.5
4,Allen,3.3125


In [47]:
df_data=pd.merge(df_venues,df,on='City')
df_data.drop(columns=['State','Radius','lat','log'],inplace=True)

In [48]:
km_split=df_data['Population Density'].str.split("/",n=1,expand=True)
df_data['Population Density']=km_split[0].str.replace(',','')
df_data['Population Density']=df_data['Population Density'].astype(float)

In [49]:
df_data.head()

Unnamed: 0,City,weightage,Population Density
0,Abilene,3.416667,442.0
1,Akron,3.6,1231.0
2,Albuquerque,3.5,1147.0
3,Alexandria[16],3.5,4010.0
4,Allen,3.3125,1434.0


### Normalizing the selected columns in the DataFrame

In [50]:
from sklearn import preprocessing


normalize_columns=['weightage','Population Density']

x=df_data[normalize_columns].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_data[normalize_columns]=pd.DataFrame(x_scaled)
df_data.head()

Unnamed: 0,City,weightage,Population Density
0,Abilene,0.528455,0.034422
1,Akron,0.707317,0.107041
2,Albuquerque,0.609756,0.09931
3,Alexandria[16],0.609756,0.362816
4,Allen,0.426829,0.125725


#### Calculating the sum of the normalized columns and getting the City having the maximum sum value

In [51]:
df_data['Sum']=df_data['weightage']+df_data['Population Density']

In [52]:
df_data.loc[df_data['Sum'].idxmax()]

City                  Cambridge
weightage              0.853659
Population Density     0.608099
Sum                     1.46176
Name: 35, dtype: object

##### Here the city __"Cambridge"__ has the maximum sum value. So we will select the City Cambridge

### Getting the nearby Venues from the Cambridge City from the FourSquareAPI

In [53]:
city_latitude=df.loc[df['City']=='Cambridge','lat'].item()
city_longitude=df.loc[df['City']=='Cambridge','log'].item()
radius=5000
LIMIT=500

In [54]:
def getCambridgeCityVenues(lat,log):
    venues_list=[]
    radius=5000
    LIMIT=150
    url='https://api.foursquare.com/v2/venues/explore?&client_id={0}&client_secret={1}&v={2}&ll={3},{4}&radius={5}&limit={6}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        log, 
        radius, 
        LIMIT)
    results = requests.get(url).json()['response']['groups'][0]['items']
    venues_list.append([(
        v['venue']['location']['city'],
        v['venue']['name'],
        v['venue']['location']['lat'],
        v['venue']['location']['lng'],
        v['venue']['categories'][0]['name']) for v in results])
    nearby_venue=pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venue.columns=['City','Venue','Venue Latitude','Venue Longitude','Venue Category']
    return(nearby_venue)
        

In [55]:
cambridge_venues=getCambridgeCityVenues(city_latitude,city_longitude)


In [56]:
final_venues=cambridge_venues[cambridge_venues['City']=='Cambridge']

In [57]:
final_venues.reset_index(drop=True)

Unnamed: 0,City,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Cambridge,Harvard Square,42.373501,-71.119098,Plaza
1,Cambridge,Felipe's Mexican Taqueria,42.373431,-71.120515,Mexican Restaurant
2,Cambridge,Harvard Book Store,42.372708,-71.116464,Bookstore
3,Cambridge,Sanders Theatre,42.375835,-71.114786,Theater
4,Cambridge,The Sinclair,42.374094,-71.120757,Rock Club
5,Cambridge,Brattle Theatre,42.373538,-71.121291,Indie Movie Theater
6,Cambridge,Harvard Art Museums,42.374206,-71.114368,Art Museum
7,Cambridge,Flour Bakery + Cafe,42.373117,-71.122349,Bakery
8,Cambridge,L.A. Burdick Chocolate,42.374456,-71.121906,Café
9,Cambridge,Alden & Harlow,42.373572,-71.121319,New American Restaurant


In [58]:
weights={'Plaza':3,'Rock Club':3.5,'Theater':3.5,'Pub':4,'Cocktail Bar':3.8,'Movie Theater':3.5,'Others':0,'Bookstore':1.5,'Indie Movie Theater':2.5,'Café':3,'Pizza Place':2.5,'Sandwich Place':2.5,'Coffee Shop':3.5}


### Plotting the Cambridge City and the City Venues

In [59]:
#visualisation of cabridge City and the Venues of the City
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

map_venue=folium.Map(location=[city_latitude,city_longitude],zoom_start=15)
folium.features.CircleMarker(
   [city_latitude, city_longitude],
   radius=10,
   popup='Cambridge City',
   fill=True,
   color='red',
   fill_color='red',
   fill_opacity=0.6
   ).add_to(map_venue)

for lat,lng,label in zip(cambridge_venues['Venue Latitude'],cambridge_venues['Venue Longitude'],cambridge_venues['Venue']):
    label=folium.Popup(label,parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_venue)
map_venue

#### Adding the Weightage column for the selected categories

In [60]:
final_venues['weightage']=get_weightage(final_venues['Venue Category'],weights_dict=weights) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [233]:
cambridge_final_data=final_venues[final_venues['weightage']!=0.0]


In [234]:
cambridge_final_data.reset_index(drop=True,inplace=True)

In [235]:
cambridge_final_data.head()

Unnamed: 0,City,Venue,Venue Latitude,Venue Longitude,Venue Category,weightage
0,Cambridge,Harvard Square,42.373501,-71.119098,Plaza,3.0
1,Cambridge,Harvard Book Store,42.372708,-71.116464,Bookstore,1.5
2,Cambridge,Sanders Theatre,42.375835,-71.114786,Theater,3.5
3,Cambridge,The Sinclair,42.374094,-71.120757,Rock Club,3.5
4,Cambridge,Brattle Theatre,42.373538,-71.121291,Indie Movie Theater,2.5


### Applying K Means for the Attributes Venue Latitude and Venue Longitude

In [237]:
from sklearn.cluster import KMeans

kclusters = 5
cambridge_grouped_clustering = cambridge_final_data.drop(columns=['City','Venue','Venue Category','weightage'])
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cambridge_grouped_clustering)
kmeans.labels_

array([3, 1, 1, 3, 3, 3, 3, 1, 3, 1, 2, 0, 0, 0, 0, 4, 4], dtype=int32)

In [238]:
cambridge_final_data['Cluster Labels']=kmeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [239]:
cambridge_final_data.head()

Unnamed: 0,City,Venue,Venue Latitude,Venue Longitude,Venue Category,weightage,Cluster Labels
0,Cambridge,Harvard Square,42.373501,-71.119098,Plaza,3.0,3
1,Cambridge,Harvard Book Store,42.372708,-71.116464,Bookstore,1.5,1
2,Cambridge,Sanders Theatre,42.375835,-71.114786,Theater,3.5,1
3,Cambridge,The Sinclair,42.374094,-71.120757,Rock Club,3.5,3
4,Cambridge,Brattle Theatre,42.373538,-71.121291,Indie Movie Theater,2.5,3


### Grouping Based on the Cluster Labels and calculating the weightage Sum

In [241]:
cluster_group=cambridge_final_data.groupby(['Cluster Labels'])['weightage'].sum()

In [242]:
cluster_group

Cluster Labels
0    13.8
1    10.5
2     1.5
3    18.0
4     6.0
Name: weightage, dtype: float64

In [243]:
arcade_location=kmeans.cluster_centers_[3].tolist()
arcade_location

[42.37375037888433, -71.12099559190885]

### The above results shows that the cluster 3 has more weightage so we will choose that coordinates as the Clients First Arcade in that City

In [247]:
#visualisation of cambridge
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

map_venue=folium.Map(location=[arcade_location[0],arcade_location[1]],zoom_start=11)
folium.features.CircleMarker(
   [city_latitude, city_longitude],
   radius=50,
   popup='Our Arcade',
   fill=True,
   color='red',
   fill_color='red',
   fill_opacity=0.6
   ).add_to(map_venue)

for lat,lng,label in zip(cambridge_final_data['Venue Latitude'],cambridge_final_data['Venue Longitude'],cambridge_final_data['Venue Category']):
    label=folium.Popup(label,parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=10,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_venue)
map_venue