# The Battle of Neighborhoods
## Final report : 
**_Opening a new Shopping Mall in Lille, France_**
***

### Import libraries

In [41]:
import json
import os
import time
import warnings

import folium # map rendering library
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

### Scrape the list of Lille suburbs from mapcrow.info into a DataFrame

In [42]:
# Page listing the suburbs of Lille; it is downloaded and parsed in the next cells.
url = 'http://www.mapcrow.info/Lille-FR-suburbs'

In [43]:
# Send the GET request; the timeout keeps the cell from hanging on a dead server
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
response.raise_for_status()
data = response.text
# Parse the HTML so the suburb buttons can be located by tag and class
soup = BeautifulSoup(data, 'html.parser')

In [44]:
# Take the container at index 7 among the "w3-third w3-container" divs and
# keep the text of each <button> it holds.
# NOTE(review): index 7 is tied to the page layout at scrape time; verify it still
# points at the suburb list if the page changes.
suburb_container = soup.find_all("div", class_="w3-third w3-container")[7]
neighborhood_list = [button.text for button in suburb_container.find_all("button")]
neighborhood_list

['Annappes',
 'Ascq',
 'Assebroek',
 'Babylone',
 'Berchem',
 'Brigode',
 'Cité Scientifique',
 'Elst',
 'Faubourg Fernand Duchâteau',
 'Faubourg de Cambrai',
 'Faubourg de Roubaix',
 'Faubourg de Tournai',
 'Flers Bourg',
 'Flers Neuf',
 'Fort-Mardyck',
 'Grimminge',
 'Groenendijk',
 'Hemelveerdegem',
 'Hempempont',
 'Hôtel de Ville',
 'Kluisbergen',
 'La Cousinerie',
 'La Haute-Borne',
 'La Poste',
 'La Résidence',
 'Le Breucq',
 'Le Capreau',
 'Le Château',
 'Le Recueil',
 'Le Sart',
 'Les Près',
 'Malo-les-Bains',
 'Mardyck',
 'Michelbeke',
 'Moerbeke',
 'Nederbrakel',
 'Nieuwenhove',
 'Onkerzele',
 'Opbrakel',
 'Parc Europe',
 'Parike',
 'Petite-Synthe',
 'Pont-de-Bois',
 'Rosendaël',
 'Ruien',
 'Saint-Pol-sur-Mer',
 'Saint-Sauveur',
 'Sint-Andries',
 'Sint-Martens-Lierde',
 'Sint-Pieters',
 'Triolo',
 'Viane',
 'Waarbeke',
 'Zarlardinge',
 'Zegelsem']

In [45]:
# One-column frame holding the scraped neighborhood names
df_n = pd.DataFrame(neighborhood_list, columns=["Neighborhood"])
df_n.head()

Unnamed: 0,Neighborhood
0,Annappes
1,Ascq
2,Assebroek
3,Babylone
4,Berchem


### Get the geographical coordinates

In [46]:
# define a function to get coordinates
def get_latlng(neighborhood, max_attempts=5, retry_delay=1.0, geocode=None):
    """Geocode ``'<neighborhood>, Lille, France'`` and return its coordinates.

    The lookup is retried while the backend reports no coordinates, but only
    up to ``max_attempts`` times, so a name that can never be resolved (or an
    unreachable service) no longer blocks the notebook forever.

    Parameters
    ----------
    neighborhood : str
        Neighborhood name; it is inserted into the query string as-is.
    max_attempts : int, default 5
        Maximum number of lookups before giving up. Must be at least 1.
    retry_delay : float, default 1.0
        Seconds to wait between two failed attempts.
    geocode : callable, optional
        Function that takes the query string and returns an object exposing a
        ``latlng`` attribute. Defaults to ``geocoder.arcgis``.

    Returns
    -------
    The ``latlng`` value reported by the backend (for ``geocoder.arcgis`` a
    ``[latitude, longitude]`` list).

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` lookups.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')
    if geocode is None:
        geocode = geocoder.arcgis

    query = '{}, Lille, France'.format(neighborhood)
    for attempt in range(1, max_attempts + 1):
        lat_lng_coords = geocode(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # pause before retrying, but not after the final attempt
        if attempt < max_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        'No coordinates found for {!r} after {} attempts'.format(query, max_attempts))

In [47]:
# Geocode every neighborhood name, keeping the results in row order
coords = list(map(get_latlng, df_n["Neighborhood"]))

In [48]:
# Show the geocoded coordinate pairs, one per neighborhood in df_n
coords

[[50.62582688175406, 3.144688915913704],
 [50.62820000000005, 3.068810000000042],
 [50.62820000000005, 3.068810000000042],
 [43.743220000000065, -0.6978099999999472],
 [50.62820000000005, 3.068810000000042],
 [50.62532932178564, 3.0434379945905223],
 [50.60827000000006, 3.139340000000061],
 [42.38239000000004, 2.6101300000000265],
 [43.64592000000005, 6.062850000000026],
 [50.62410393784956, 3.074390008546284],
 [50.64152380502193, 3.0837909150657916],
 [50.62204286045128, 3.0889672227722698],
 [50.64271000000008, 2.979060000000061],
 [50.63358000000005, 3.126000000000033],
 [51.028170000000046, 2.304430000000025],
 [50.62820000000005, 3.068810000000042],
 [50.62820000000005, 3.068810000000042],
 [50.62820000000005, 3.068810000000042],
 [50.65384000000006, 3.172990000000027],
 [50.63035000000008, 3.069530000000043],
 [50.62820000000005, 3.068810000000042],
 [48.13685000000004, 0.0744900000000257],
 [47.370340000000056, 1.152590000000032],
 [50.642280029430395, 3.0876399744648233],
 [50

In [49]:
# Turn the list of coordinate pairs into a two-column frame
coordinate_columns = ['Latitude', 'Longitude']
df_coords = pd.DataFrame(data=coords, columns=coordinate_columns)

In [50]:
# Attach the coordinates to the neighborhood frame; rows are matched on the index
df_n = df_n.assign(
    Latitude=df_coords['Latitude'],
    Longitude=df_coords['Longitude'],
)
df_n

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Annappes,50.625827,3.144689
1,Ascq,50.6282,3.06881
2,Assebroek,50.6282,3.06881
3,Babylone,43.74322,-0.69781
4,Berchem,50.6282,3.06881
5,Brigode,50.625329,3.043438
6,Cité Scientifique,50.60827,3.13934
7,Elst,42.38239,2.61013
8,Faubourg Fernand Duchâteau,43.64592,6.06285
9,Faubourg de Cambrai,50.624104,3.07439


In [51]:
# (rows, columns) of the neighborhood frame after adding the coordinates
df_n.shape

(55, 3)

### Create a map with neighborhoods

In [52]:
# get the coordinates of Lille
address = 'Lille, France'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None
if location is None:
    raise RuntimeError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Lille, France {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Lille, France 50.6365654, 3.0635282.


In [53]:
# Base map centred on the coordinates found for Lille
map_n = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every neighborhood marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
)

# One circle per neighborhood, with the neighborhood name as its popup
for row in df_n[['Latitude', 'Longitude', 'Neighborhood']].itertuples(index=False):
    popup = folium.Popup(str(row.Neighborhood), parse_html=True)
    marker = folium.CircleMarker([row.Latitude, row.Longitude], popup=popup, **marker_style)
    marker.add_to(map_n)

map_n

In [54]:
# Write the neighborhood map to 'map_n.html' (path relative to the working directory)
map_n.save('map_n.html')

### Use the Foursquare API to explore the neighborhoods

In [55]:
# Foursquare credentials are read from the environment so that no secret is
# stored in (or leaked through) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been published
# with the notebook and should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare API requests will be rejected.')

In [56]:
radius = 2000   # search radius around each neighborhood, passed as the API 'radius' parameter
LIMIT = 100     # maximum number of venues requested per neighborhood

# Endpoint plus the query parameters that are identical for every request.
# Passing them through `params` lets requests do the URL encoding and keeps the
# credentials out of any hand-built URL string.
explore_endpoint = "https://api.foursquare.com/v2/venues/explore"
common_params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': VERSION,
    'radius': radius,
    'limit': LIMIT,
}

venues = []

for lat, lng, neighborhood in zip(df_n['Latitude'], df_n['Longitude'], df_n['Neighborhood']):

    # make the GET request for the venues around this neighborhood
    params = dict(common_params, ll='{},{}'.format(lat, lng))
    response = requests.get(explore_endpoint, params=params, timeout=30)
    # surface HTTP errors (bad credentials, quota, ...) at the request itself
    response.raise_for_status()

    # a payload without any group simply contributes no venue
    groups = response.json().get('response', {}).get('groups', [])
    results = groups[0]['items'] if groups else []

    # return only relevant information for each nearby venue
    for item in results:
        venue = item['venue']
        categories = venue.get('categories')
        # the analysis is category based, so a venue without one is skipped
        # instead of raising IndexError on categories[0]
        if not categories:
            continue
        venues.append((
            neighborhood,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name']))

In [57]:
# Column names, in the order of the fields of each tuple stored in `venues`
venue_columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

# Convert the venues list into a new DataFrame. Naming the columns at
# construction also works when `venues` is empty, whereas assigning
# `.columns` afterwards raises a length-mismatch ValueError in that case.
venues_df = pd.DataFrame(venues, columns=venue_columns)

print(venues_df.shape)
venues_df.head()

(3239, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Annappes,50.625827,3.144689,La Maison Commune,50.626619,3.149605,French Restaurant
1,Annappes,50.625827,3.144689,Parc du Héron,50.634601,3.148742,Park
2,Annappes,50.625827,3.144689,LAM - Lille Musée d'Art Moderne,50.638436,3.152103,Art Museum
3,Annappes,50.625827,3.144689,Stade Pierre Mauroy,50.611826,3.130395,Soccer Stadium
4,Annappes,50.625827,3.144689,Kiabi V2,50.617097,3.126257,Boutique


In [58]:
# Non-null value count of every column per neighborhood, i.e. how many venues each one returned
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Annappes,76,76,76,76,76,76
Ascq,100,100,100,100,100,100
Assebroek,100,100,100,100,100,100
Babylone,1,1,1,1,1,1
Berchem,100,100,100,100,100,100
Brigode,100,100,100,100,100,100
Cité Scientifique,67,67,67,67,67,67
Elst,2,2,2,2,2,2
Faubourg Fernand Duchâteau,3,3,3,3,3,3
Faubourg de Cambrai,92,92,92,92,92,92


In [59]:
# Number of distinct venue categories across all neighborhoods
category_count = venues_df['VenueCategory'].nunique(dropna=False)
print('There are {} uniques categories.'.format(category_count))

There are 171 uniques categories.


In [60]:
# First 50 distinct venue categories, in order of first appearance
venues_df['VenueCategory'].unique()[:50]

array(['French Restaurant', 'Park', 'Art Museum', 'Soccer Stadium',
       'Boutique', 'Salad Place', 'Brewery', 'Pizza Place',
       'Toy / Game Store', 'Furniture / Home Store', 'University',
       'Burger Joint', 'Multiplex', 'Fast Food Restaurant', 'Supermarket',
       'Train Station', 'Restaurant', 'Bookstore', 'Italian Restaurant',
       'Coffee Shop', 'Mobile Phone Shop', 'Golf Course',
       'Clothing Store', 'Bed & Breakfast', 'Bakery', 'Science Museum',
       'Hotel', 'Shopping Mall', 'Steakhouse', 'Japanese Restaurant',
       'Sandwich Place', 'Sushi Restaurant', 'Bowling Alley',
       'Sporting Goods Shop', 'Lounge', 'Café', 'Lake', 'Wine Shop',
       'Flower Shop', 'Electronics Store', 'Pharmacy', 'Shoe Store',
       'Chinese Restaurant', 'Garden', 'Health Food Store', 'Gym',
       'Beer Garden', 'Tea Room', 'Mediterranean Restaurant', 'Bar'],
      dtype=object)

In [61]:
# One indicator column per venue category, named after the category itself
df_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# Put each venue's neighborhood back next to its indicators
df_onehot['Neighborhoods'] = venues_df['Neighborhood']

# Reorder so that the column added last comes first
last_column = df_onehot.columns[-1]
other_columns = df_onehot.columns[:-1].tolist()
df_onehot = df_onehot.loc[:, [last_column] + other_columns]

print(df_onehot.shape)
df_onehot.head()

(3239, 172)


Unnamed: 0,Neighborhoods,Apres Ski Bar,Aquarium,Art Gallery,Art Museum,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Auto Garage,Auto Workshop,...,Theater,Toy / Game Store,Train Station,Tram Station,University,Vegetarian / Vegan Restaurant,Wine Bar,Wine Shop,Women's Store,Zoo
0,Annappes,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Annappes,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Annappes,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Annappes,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Annappes,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# Mean of every indicator per neighborhood = share of its venues in each category
df_grouped = df_onehot.groupby("Neighborhoods", as_index=False).mean()

print(df_grouped.shape)
df_grouped

(54, 172)


Unnamed: 0,Neighborhoods,Apres Ski Bar,Aquarium,Art Gallery,Art Museum,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Auto Garage,Auto Workshop,...,Theater,Toy / Game Store,Train Station,Tram Station,University,Vegetarian / Vegan Restaurant,Wine Bar,Wine Shop,Women's Store,Zoo
0,Annappes,0.0,0.0,0.0,0.013158,0.0,0.0,0.0,0.0,0.0,...,0.0,0.013158,0.039474,0.0,0.013158,0.0,0.0,0.013158,0.0,0.0
1,Ascq,0.0,0.0,0.01,0.02,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0
2,Assebroek,0.0,0.0,0.01,0.02,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0
3,Babylone,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Berchem,0.0,0.0,0.01,0.02,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0
5,Brigode,0.0,0.0,0.01,0.01,0.0,0.01,0.0,0.0,0.0,...,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01
6,Cité Scientifique,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.014925,0.029851,0.0,0.014925,0.0,0.0,0.0,0.0,0.0
7,Elst,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Faubourg Fernand Duchâteau,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Faubourg de Cambrai,0.0,0.0,0.01087,0.021739,0.0,0.0,0.0,0.0,0.0,...,0.01087,0.01087,0.021739,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# How many neighborhoods have at least one venue categorised as a shopping mall
int((df_grouped["Shopping Mall"] > 0).sum())

33

In [64]:
# Keep only the neighborhood name and its shopping-mall share
df_mall = df_grouped.loc[:, ["Neighborhoods", "Shopping Mall"]]

In [65]:
# Preview the first ten neighborhoods with their shopping-mall share
df_mall.head(10)

Unnamed: 0,Neighborhoods,Shopping Mall
0,Annappes,0.013158
1,Ascq,0.01
2,Assebroek,0.01
3,Babylone,0.0
4,Berchem,0.01
5,Brigode,0.0
6,Cité Scientifique,0.014925
7,Elst,0.0
8,Faubourg Fernand Duchâteau,0.0
9,Faubourg de Cambrai,0.01087


In [66]:
# set number of clusters
kclusters = 3

# Cluster on the shopping-mall share only. `columns=` replaces the positional
# axis argument (`drop([...], 1)`), which pandas 2.0 no longer accepts.
df_clustering = df_mall.drop(columns=["Neighborhoods"])

# run k-means clustering; n_init is pinned to the historical default of 10 so the
# result does not depend on the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(df_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 0, 1, 0, 1, 0, 0, 1], dtype=int32)

In [67]:
# New frame: the shopping-mall share of each neighborhood plus the cluster
# label k-means assigned to it (df_mall itself is left untouched)
df_merged = df_mall.assign(**{"Cluster Labels": kmeans.labels_})

In [68]:
# Use the same key column name as df_n
df_merged = df_merged.rename(columns={"Neighborhoods": "Neighborhood"})
df_merged.head()

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels
0,Annappes,0.013158,1
1,Ascq,0.01,1
2,Assebroek,0.01,1
3,Babylone,0.0,0
4,Berchem,0.01,1


In [69]:
# Look up each clustered neighborhood's latitude/longitude in df_n by name
coords_by_neighborhood = df_n.set_index("Neighborhood")
df_merged = df_merged.join(coords_by_neighborhood, on="Neighborhood")

print(df_merged.shape)
df_merged.head() # the coordinates are the last two columns

(54, 5)


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,Annappes,0.013158,1,50.625827,3.144689
1,Ascq,0.01,1,50.6282,3.06881
2,Assebroek,0.01,1,50.6282,3.06881
3,Babylone,0.0,0,43.74322,-0.69781
4,Berchem,0.01,1,50.6282,3.06881


In [70]:
# Report the frame's dimensions, then order the rows by cluster label
print(df_merged.shape)
df_merged = df_merged.sort_values(by="Cluster Labels")
df_merged

(54, 5)


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
26,Le Capreau,0.0,0,45.78379,4.99574
29,Les Près,0.0,0,45.02508,-0.02698
28,Le Sart,0.0,0,49.69428,4.78779
27,Le Recueil,0.0,0,50.65225,3.15961
25,Le Breucq,0.0,0,50.75002,1.74312
22,La Haute-Borne,0.0,0,47.37034,1.15259
21,La Cousinerie,0.0,0,48.13685,0.07449
18,Hempempont,0.0,0,50.65384,3.17299
39,Parike,0.0,0,48.859569,2.326102
40,Petite-Synthe,0.0,0,51.02298,2.34667


In [71]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one colour per cluster, evenly spaced along the rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # labels run from 0 to kclusters-1, so they index the palette directly
    # (the former `cluster-1` only worked through negative-index wrap-around)
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Examine clusters

#### Cluster 0

In [72]:
# Neighborhoods assigned to cluster 0
df_merged[df_merged['Cluster Labels'].eq(0)]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
26,Le Capreau,0.0,0,45.78379,4.99574
29,Les Près,0.0,0,45.02508,-0.02698
28,Le Sart,0.0,0,49.69428,4.78779
27,Le Recueil,0.0,0,50.65225,3.15961
25,Le Breucq,0.0,0,50.75002,1.74312
22,La Haute-Borne,0.0,0,47.37034,1.15259
21,La Cousinerie,0.0,0,48.13685,0.07449
18,Hempempont,0.0,0,50.65384,3.17299
39,Parike,0.0,0,48.859569,2.326102
40,Petite-Synthe,0.0,0,51.02298,2.34667


#### Cluster 1

In [73]:
# Neighborhoods assigned to cluster 1
df_merged[df_merged['Cluster Labels'].eq(1)]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
51,Waarbeke,0.01,1,50.6282,3.06881
50,Viane,0.01,1,50.6282,3.06881
33,Moerbeke,0.01,1,50.6282,3.06881
49,Triolo,0.014706,1,50.61632,3.14422
43,Ruien,0.01,1,50.6282,3.06881
35,Nieuwenhove,0.01,1,50.6282,3.06881
36,Onkerzele,0.01,1,50.6282,3.06881
37,Opbrakel,0.01,1,50.6282,3.06881
45,Saint-Sauveur,0.016949,1,50.626823,3.108161
34,Nederbrakel,0.01,1,50.6282,3.06881


#### Cluster 2

In [74]:
# Neighborhoods assigned to cluster 2
df_merged[df_merged['Cluster Labels'].eq(2)]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
42,Rosendaël,0.025641,2,51.04202,2.40486
30,Malo-les-Bains,0.025,2,51.04517,2.39972
12,Flers Bourg,0.035714,2,50.64271,2.97906
38,Parc Europe,0.035088,2,50.637597,2.980238


#### Observations:
K-means on the share of venues that are shopping malls splits the neighborhoods into three groups. Cluster 2 has the highest concentration of shopping malls (roughly 2.5% to 3.6% of the venues returned for each of its neighborhoods), cluster 1 has a moderate concentration (about 1% to 1.7%), and cluster 0 contains the neighborhoods for which no shopping mall was returned at all. Neighborhoods in cluster 0 therefore represent the greatest opportunity to open a new shopping mall, as there is little to no competition from existing malls. Property developers with unique selling propositions that let them stand out from the competition can also consider neighborhoods in cluster 1, where competition is moderate. Lastly, property developers are advised to avoid neighborhoods in cluster 2, which already have the highest concentration of shopping malls and are the most likely to suffer from intense competition. One caveat applies to all of these recommendations: several neighborhoods were geocoded either to coordinates far away from Lille (for example Babylone at latitude 43.74) or to one shared point (50.6282, 3.06881), so their venue counts, and therefore their cluster assignment, should be verified before acting on these results.