In [1]:
# Normal stack of pandas, numpy, matplotlib and seaborn
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from tqdm import * #progress bar

import os
import json

%matplotlib inline

## Load Factual categories taxonomy

http://developer.factual.com/working-with-categories/

In [2]:
root = 'data/'
categories = pd.read_json(root + 'factual_en_taxonomy.json', orient='index')
categories["parents"] = categories["parents"].apply(lambda x: ", ".join(x))
categories

Unnamed: 0,abstract,label,parents
1,1,Factual Places,
10,0,Oil and Lube,9
100,0,Psychiatrists,83
101,0,Radiologists,83
102,0,Respiratory,83
103,0,Urologists,83
104,0,Podiatrists,62
105,0,Pregnancy and Sexual Health,62
106,0,Weight Loss and Nutritionists,62
107,0,Landmarks,1


## Load data crawled with Factual API

### Lausanne

In [3]:
lausanne_raw = pd.read_pickle(root + 'lausanne_crawled.pickle')
lausanne_raw_geo = gpd.GeoDataFrame(lausanne_raw, geometry="geometry")
lausanne_raw_geo

Unnamed: 0,address,category_ids,category_labels,country,edge_attributes.address,edge_attributes.address_extended,edge_attributes.locality,edge_attributes.postcode,email,factual_id,...,hours_display,latitude,locality,longitude,name,neighborhood,postcode,region,tel,website
0,Rue du Grand-Chêne 7-9,[436],"[[Travel, Lodging, Hotels and Motels]]",ch,,,,,,7f827ab5-d759-4a4f-b097-7fe5d8cb5e6c,...,Open Daily 00:00-23:59,46.519577,Lausanne,6.631063,Lausanne Palace & Spa,,1002,Vaud,021 331 31 31,http://www.lausanne-palace.ch
1,Place Saint-François 2,"[347, 341, 342]","[[Social, Food and Dining, Restaurants], [Soci...",ch,,,,,contact@cafe-romand.ch,7b4b8a4a-601b-4c20-8250-585c49a4fd5f,...,Mon-Sat 8:00-23:59,46.519874,Lausanne,6.632516,Café Romand,[Centre],1003,Vaud,021 312 63 75,http://www.caferomand.com/
2,Rue Centrale 4,"[347, 341, 312]","[[Social, Food and Dining, Restaurants], [Soci...",ch,,,,,,56c745d9-a4e9-4527-824e-0e1d9af5f097,...,"Mon-Sat 11:30-13:30, 17:30-21:30; Sun 17:30-21:30",46.520609,Lausanne,6.632493,Les Brasseurs,,1003,Vaud,021 351 14 24,http://www.les-brasseurs.ch
3,Avenue Sainte-Luce 1,[436],"[[Travel, Lodging, Hotels and Motels]]",ch,,,,,info@elite-lausanne.ch,bad577fa-419d-47b5-aebf-ac3044123692,...,Open Daily 00:00-23:59,46.518183,Lausanne,6.631169,Hotel Elite,,1003,Vaud,021 320 23 61,http://www.elite-lausanne.ch
4,Rue Centrale 15,[80],"[[Healthcare, Pharmacies]]",ch,,,,,,7cbe2da8-333c-45b8-bb28-50f863f5d20e,...,Mon 10:00-18:30; Tue-Fri 8:00-18:30; Sat 9:00-...,46.521029,Lausanne,6.633765,Pharmacie Nouvelle,,1003,Vaud,021 323 84 84,http://www.pharmacieplus.ch/
5,Place Saint-François 1,"[342, 347]","[[Social, Food and Dining, Cafes, Coffee and T...",ch,,,,,,483c0b49-ddbb-464d-a0ae-1ca2dd23ec86,...,,46.519928,Lausanne,6.632186,Nespresso Boutique,,1003,Vaud,0800 555 253,http://www.nespresso.com
6,Rue Pepinet 3,[347],"[[Social, Food and Dining, Restaurants]]",ch,,Vaud,,,reservations@eat-me.ch,11b2af34-485c-40dc-a58c-d5d1f8feb7a3,...,"Tue-Thu 12:00-23:59; Fri 00:00-1:00, 12:00-23:...",46.520199,Lausanne,6.632264,Eat Me,,1003,Vaud,021 311 76 59,http://eat-me.ch
7,Rue du Grand-Chêne 7,"[356, 341]","[[Social, Food and Dining, Restaurants, French...",ch,,,,,reservation@lausanne-palace.ch,075e7ea9-bd4b-492a-b813-8914dcf86e18,...,"Mon 11:00-15:00, 18:00-22:00; Tue 11:00-14:00,...",46.519592,Lausanne,6.631104,Brasserie du Grand-Chêne,[Centre],1003,Vaud,021 331 32 24,http://www.lausanne-palace.com
8,Place Saint-François 15,[51],"[[Community and Government, Post Offices]]",ch,,,,,,9ce48a39-c76e-41b5-84bd-53b88d91de9c,...,"Mon 7:00-17:00; Tue 7:00-9:00, 11:00-17:00; We...",46.519386,Lausanne,6.633026,La Poste,,1003,Vaud,0848 888 888,http://www.post.ch/
9,Place Pépinet 3,[142],"[[Retail, Fashion]]",ch,,,,,lausanne@natureetdecouvertes.ch,c07e9427-6310-45f2-8981-9ce8f3f7a43f,...,Mon-Fri 9:00-19:00; Sat 9:00-18:00,46.521023,Lausanne,6.632635,Nature et Découvertes,,1003,Vaud,021 331 22 30,http://www.natureetdecouvertes.ch/


In [4]:
#Since we covered area greater than Lausanne we need to limit crawled data to Lausanne only

lausanne_geo = lausanne_raw_geo[lausanne_raw_geo['locality'] == 'Lausanne']
lausanne_geo.shape

(13560, 30)

In [5]:
lausanne_geo

Unnamed: 0,address,category_ids,category_labels,country,edge_attributes.address,edge_attributes.address_extended,edge_attributes.locality,edge_attributes.postcode,email,factual_id,...,hours_display,latitude,locality,longitude,name,neighborhood,postcode,region,tel,website
0,Rue du Grand-Chêne 7-9,[436],"[[Travel, Lodging, Hotels and Motels]]",ch,,,,,,7f827ab5-d759-4a4f-b097-7fe5d8cb5e6c,...,Open Daily 00:00-23:59,46.519577,Lausanne,6.631063,Lausanne Palace & Spa,,1002,Vaud,021 331 31 31,http://www.lausanne-palace.ch
1,Place Saint-François 2,"[347, 341, 342]","[[Social, Food and Dining, Restaurants], [Soci...",ch,,,,,contact@cafe-romand.ch,7b4b8a4a-601b-4c20-8250-585c49a4fd5f,...,Mon-Sat 8:00-23:59,46.519874,Lausanne,6.632516,Café Romand,[Centre],1003,Vaud,021 312 63 75,http://www.caferomand.com/
2,Rue Centrale 4,"[347, 341, 312]","[[Social, Food and Dining, Restaurants], [Soci...",ch,,,,,,56c745d9-a4e9-4527-824e-0e1d9af5f097,...,"Mon-Sat 11:30-13:30, 17:30-21:30; Sun 17:30-21:30",46.520609,Lausanne,6.632493,Les Brasseurs,,1003,Vaud,021 351 14 24,http://www.les-brasseurs.ch
3,Avenue Sainte-Luce 1,[436],"[[Travel, Lodging, Hotels and Motels]]",ch,,,,,info@elite-lausanne.ch,bad577fa-419d-47b5-aebf-ac3044123692,...,Open Daily 00:00-23:59,46.518183,Lausanne,6.631169,Hotel Elite,,1003,Vaud,021 320 23 61,http://www.elite-lausanne.ch
4,Rue Centrale 15,[80],"[[Healthcare, Pharmacies]]",ch,,,,,,7cbe2da8-333c-45b8-bb28-50f863f5d20e,...,Mon 10:00-18:30; Tue-Fri 8:00-18:30; Sat 9:00-...,46.521029,Lausanne,6.633765,Pharmacie Nouvelle,,1003,Vaud,021 323 84 84,http://www.pharmacieplus.ch/
5,Place Saint-François 1,"[342, 347]","[[Social, Food and Dining, Cafes, Coffee and T...",ch,,,,,,483c0b49-ddbb-464d-a0ae-1ca2dd23ec86,...,,46.519928,Lausanne,6.632186,Nespresso Boutique,,1003,Vaud,0800 555 253,http://www.nespresso.com
6,Rue Pepinet 3,[347],"[[Social, Food and Dining, Restaurants]]",ch,,Vaud,,,reservations@eat-me.ch,11b2af34-485c-40dc-a58c-d5d1f8feb7a3,...,"Tue-Thu 12:00-23:59; Fri 00:00-1:00, 12:00-23:...",46.520199,Lausanne,6.632264,Eat Me,,1003,Vaud,021 311 76 59,http://eat-me.ch
7,Rue du Grand-Chêne 7,"[356, 341]","[[Social, Food and Dining, Restaurants, French...",ch,,,,,reservation@lausanne-palace.ch,075e7ea9-bd4b-492a-b813-8914dcf86e18,...,"Mon 11:00-15:00, 18:00-22:00; Tue 11:00-14:00,...",46.519592,Lausanne,6.631104,Brasserie du Grand-Chêne,[Centre],1003,Vaud,021 331 32 24,http://www.lausanne-palace.com
8,Place Saint-François 15,[51],"[[Community and Government, Post Offices]]",ch,,,,,,9ce48a39-c76e-41b5-84bd-53b88d91de9c,...,"Mon 7:00-17:00; Tue 7:00-9:00, 11:00-17:00; We...",46.519386,Lausanne,6.633026,La Poste,,1003,Vaud,0848 888 888,http://www.post.ch/
9,Place Pépinet 3,[142],"[[Retail, Fashion]]",ch,,,,,lausanne@natureetdecouvertes.ch,c07e9427-6310-45f2-8981-9ce8f3f7a43f,...,Mon-Fri 9:00-19:00; Sat 9:00-18:00,46.521023,Lausanne,6.632635,Nature et Découvertes,,1003,Vaud,021 331 22 30,http://www.natureetdecouvertes.ch/


In [6]:
lausanne_geo.columns

Index(['address', 'category_ids', 'category_labels', 'country',
       'edge_attributes.address', 'edge_attributes.address_extended',
       'edge_attributes.locality', 'edge_attributes.postcode', 'email',
       'factual_id', 'fax', 'geometry', 'hours', 'hours.friday',
       'hours.monday', 'hours.saturday', 'hours.sunday', 'hours.thursday',
       'hours.tuesday', 'hours.wednesday', 'hours_display', 'latitude',
       'locality', 'longitude', 'name', 'neighborhood', 'postcode', 'region',
       'tel', 'website'],
      dtype='object')

In [7]:
#Check for NaN in geometry
lausanne_geo['geometry'].isnull().values.any()

False

In [8]:
#lausanne_geo.drop_duplicates(['latitude','longitude'], keep='last')

In [9]:
#Check for NaN in geometry
#lausanne_geo.groupby('country').count()
#lausanne_geo['address'].value_counts()
#lausanne_geo['geometry'].value_counts()

In [10]:
#Clean index
lausanne_geo = lausanne_geo.reset_index(drop=True)

### Zürich

In [11]:
zurich_raw = pd.read_pickle(root + 'zurich_crawled.pickle')
zurich_raw_geo = gpd.GeoDataFrame(zurich_raw, geometry="geometry")
zurich_raw_geo

Unnamed: 0,address,address_extended,category_ids,category_labels,country,edge_attributes.address,edge_attributes.address_extended,edge_attributes.locality,edge_attributes.postcode,email,...,hours_display,latitude,locality,longitude,name,neighborhood,postcode,region,tel,website
0,Bahnhofstrasse 75,,[141],"[[Retail, Department Stores]]",ch,,,,,,...,Mon-Sat 9:00-20:00,47.374826,Zürich,8.538470,Manor,,8001,Zürich,044 229 56 99,http://www.manor.ch/
1,Schweizergasse 11,,"[143, 146]","[[Retail, Fashion, Clothing and Accessories], ...",ch,,,,,office@lunchgate.com,...,Mon-Sat 9:00-20:00,47.375861,Zürich,8.537881,Globus,,8001,Zürich,058 578 11 11,http://www.globus.ch
2,Seidengasse 12,,[221],"[[Businesses and Services, Financial, Banking ...",ch,,,,,,...,Mon-Wed 9:00-17:00; Thu 9:00-18:00; Fri 9:00-1...,47.375084,Zürich,8.537626,Migros Bank,,8001,Zürich,0848 845 400,http://www.migrosbank.ch
3,Gessnerallee 36,,[35],"[[Community and Government, Education, Tutorin...",ch,,,,,zuerich@linguista.ch,...,Mon-Fri 9:00-18:00,47.376677,Zürich,8.536942,Linguista Sprachaufenthalte,,8001,Zürich,044 260 50 90,http://www.linguista.ch/
4,Lintheschergasse 21,,[73],"[[Healthcare, Holistic, Alternative and Naturo...",ch,,,,,,...,"Mon 8:15-12:00, 13:00-18:15; Tue-Wed 8:15-12:0...",47.376770,Zürich,8.538906,Akupunktur - Chinesische Medizin - SinoQi TCM ...,,8001,Zürich,044 210 22 22,http://www.sinoqi.ch
5,Schweizergasse 10,,,,ch,,,,,,...,"Mon-Fri 8:30-12:00, 13:00-17:30",47.376109,Zürich,8.538035,Amplifon AG,,8001,Zürich,044 221 25 53,http://www.amplifon.ch/
6,Löwenstrasse 42,,,,ch,,,,,loewenstrasse@neuroth.ch,...,"Mon-Fri 8:30-12:00, 13:00-17:30",47.375418,Zürich,8.536860,Neuroth Hörcenter AG,,8001,Zürich,043 497 36 60,http://www.neuroth.ch
7,Löwenstrasse 49,,"[340, 347]","[[Social, Food and Dining, Bakeries], [Social,...",ch,,,,,,...,Mon-Fri 7:00-18:30; Sat 8:00-18:00,47.376143,Zürich,8.537224,Confiserie Sprüngli AG,,8001,Zürich,044 211 96 12,http://www.spruengli.ch/
8,Schützengasse 23,,[35],"[[Community and Government, Education, Tutorin...",ch,,,,,zrh@boalingua.ch,...,Mon-Fri 9:00-18:00,47.376610,Zürich,8.538379,Boa Lingua AG,,8001,Zürich,044 211 12 32,http://www.boalingua.ch
9,Seidengasse 6,,[35],"[[Community and Government, Education, Tutorin...",ch,,,,,,...,Mon-Fri 8:30-21:00; Sat 9:00-13:00,47.374397,Zürich,8.537995,The Cambridge Institute (CH) Ltd,,8001,Zürich,044 221 12 12,http://www.thecambridgeinstitute.com


In [12]:
#Since we covered area greater than Lausanne we need to limit crawled data to Zürich only

zurich_geo = zurich_raw_geo[zurich_raw_geo['locality'] == 'Zürich']
zurich_geo.shape

(44758, 31)

In [13]:
#Check for NaN in geometry
zurich_geo['geometry'].isnull().values.any()

False

In [14]:
zurich_geo = zurich_geo.reset_index(drop=True)

# Loading Neighborhood data into Dataframes

In [15]:
file_lausanne = "./data/quartiers_lausanne.geojson"
file_zurich = "./data/quartiers_zurich.geojson"

neighborhood_lausanne = gpd.read_file(file_lausanne)
neighborhood_zurich = gpd.read_file(file_zurich)

print(type(neighborhood_lausanne))

neighborhood_lausanne

<class 'geopandas.geodataframe.GeoDataFrame'>


Unnamed: 0,NOMQUARTIER,NUMQUARTIER,Name,altitudeMode,begin,description,description2,drawOrder,end,extrude,geometry,icon,tessellate,timestamp,visibility
0,,,01 - Centre\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.63867 46.519814 0, 6.638932 46.5...",,-1,,-1
1,,,02 - Maupas / Valency\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.613409 46.527202 0, 6.613878 46....",,-1,,-1
2,Sébeillon/Malley,3\n,03 - Sébeillon / Malley\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.620223 46.525428 0, 6.619146 46....",,-1,,-1
3,Montoie/Bourdonnette,4,04 - Montoie / Bourdonnette\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.586364 46.523865 0, 6.586291 46....",,-1,,-1
4,Montriond/Cour,5,05 - Montriond / Cour\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.604343 46.517098 0, 6.604351 46....",,-1,,-1
5,Sous-Gare/Ouchy,6,06 - Sous-Gare / Ouchy,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.620933 46.507415 0, 6.620936 46....",,-1,,-1
6,Montchoisi,7,07 - Montchoisi\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.635923 46.513462 0, 6.635853 46....",,-1,,-1
7,Florimont/Chissiez,8,08 - Florimont / Chissiez\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.650699 46.514896 0, 6.650457 46....",,-1,,-1
8,Mousquines/Bellevue,9,09 - Mousquines / Bellevue\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.64415 46.517644 0, 6.644686 46.5...",,-1,,-1
9,Vallon/Béthusy,10,10 - Vallon / Béthusy\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.637838 46.524517 0, 6.63813 46.5...",,-1,,-1


In [16]:
neighborhood_zurich

Unnamed: 0,Kreisname,Kreisnummer,Quartiername,Quartiernummer,geometry
0,Kreis 7,7,Hirslanden,73,"POLYGON ((8.55670119166035 47.3640651794388, 8..."
1,Kreis 8,8,Weinegg,83,"POLYGON ((8.556705417978661 47.3640463835363, ..."
2,Kreis 8,8,Mühlebach,82,"POLYGON ((8.554234120938199 47.3645567121916, ..."
3,Kreis 8,8,Seefeld,81,"POLYGON ((8.54794601144437 47.3650049645305, 8..."
4,Kreis 2,2,Wollishofen,21,"POLYGON ((8.5429738675001 47.335431756782, 8.5..."
5,Kreis 2,2,Leimbach,23,"POLYGON ((8.518104137500281 47.3442484810979, ..."
6,Kreis 4,4,Werd,41,"POLYGON ((8.53300513493766 47.3739425854604, 8..."
7,Kreis 3,3,Sihlfeld,34,"POLYGON ((8.519370463949629 47.3747817124699, ..."
8,Kreis 9,9,Albisrieden,91,"POLYGON ((8.501270821270991 47.3796116903516, ..."
9,Kreis 7,7,Hottingen,72,"POLYGON ((8.583462441744061 47.3883106835516, ..."


# Compute the area of each neighborhood

In [17]:
# Lausanne area
neighborhood_lausanne = neighborhood_lausanne.to_crs({'init': 'epsg:32633'})
neighborhood_lausanne["area"] = neighborhood_lausanne['geometry'].area/ 10**6 # km^2
neighborhood_lausanne = neighborhood_lausanne.to_crs({'init': 'epsg:4326'})
neighborhood_lausanne.head()

Unnamed: 0,NOMQUARTIER,NUMQUARTIER,Name,altitudeMode,begin,description,description2,drawOrder,end,extrude,geometry,icon,tessellate,timestamp,visibility,area
0,,,01 - Centre\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.63867 46.51981399999999 0, 6.638...",,-1,,-1,1.254503
1,,,02 - Maupas / Valency\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,POLYGON Z ((6.613409000000003 46.5272019999999...,,-1,,-1,0.810437
2,Sébeillon/Malley,3\n,03 - Sébeillon / Malley\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,POLYGON Z ((6.620222999999999 46.5254279999999...,,-1,,-1,1.04831
3,Montoie/Bourdonnette,4,04 - Montoie / Bourdonnette\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,POLYGON Z ((6.586363999999998 46.5238649999999...,,-1,,-1,1.758095
4,Montriond/Cour,5,05 - Montriond / Cour\n,,,http://www.lausanne.ch/lausanne-officielle/sta...,,,,-1,"POLYGON Z ((6.604343 46.51709799999999 0, 6.60...",,-1,,-1,1.45776


In [18]:
# Zurich area
neighborhood_zurich = neighborhood_zurich.to_crs({'init': 'epsg:32633'})
neighborhood_zurich["area"] = neighborhood_zurich['geometry'].area/ 10**6 # km^2
neighborhood_zurich = neighborhood_zurich.to_crs({'init': 'epsg:4326'})
neighborhood_zurich.head()

Unnamed: 0,Kreisname,Kreisnummer,Quartiername,Quartiernummer,geometry,area
0,Kreis 7,7,Hirslanden,73,"POLYGON ((8.55670119166035 47.3640651794388, 8...",2.197764
1,Kreis 8,8,Weinegg,83,"POLYGON ((8.556705417978661 47.36404638353631,...",1.753914
2,Kreis 8,8,Mühlebach,82,"POLYGON ((8.554234120938201 47.36455671219159,...",0.640998
3,Kreis 8,8,Seefeld,81,"POLYGON ((8.547946011444369 47.3650049645305, ...",2.435484
4,Kreis 2,2,Wollishofen,21,"POLYGON ((8.542973867500102 47.33543175678199,...",5.809536


# Joining places with neighborhoods

In [19]:
from geopandas.tools import sjoin

# Compare CRS metrics
print('CRS places: ', lausanne_geo.crs)
print('CRS neigborhood: ', neighborhood_lausanne.crs)

CRS places:  None
CRS neigborhood:  {'init': 'epsg:4326'}


In [20]:
lausanne_geo.crs = neighborhood_lausanne.crs
print('CRS places: ', lausanne_geo.crs)
print('CRS neigborhood: ', neighborhood_lausanne.crs)

joined_lausanne = sjoin(lausanne_geo, neighborhood_lausanne, how='left')
grouped_lausanne = joined_lausanne.groupby('index_right')
grouped_lausanne.groups

CRS places:  {'init': 'epsg:4326'}
CRS neigborhood:  {'init': 'epsg:4326'}


{0.0: [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,

In [21]:
# Aggregate Lausanne
cat_list_lausanne = grouped_lausanne['category_ids'].aggregate(lambda x: list(x))
cat_list_lausanne = cat_list_lausanne.reset_index(drop=True)
cat_list_lausanne


0     [[436], [347, 341, 342], [347, 341, 312], [436...
1     [[195], [311], nan, [181], [443], [189], [238]...
2     [nan, [225], [457], nan, [440], [357], [2], [3...
3     [[420], [244], [288], [221], [272], [213], [23...
4     [[436], [440], [143], [188], [47], [107, 55], ...
5     [[347, 342, 341], [436], [436], [190], [145], ...
6     [[143, 145], [143], [280], nan, [253], nan, [2...
7     [[47], nan, [310], [144], nan, [20], [236], [2...
8     [[314], [239], nan, [189], [372], [314], [320,...
9     [[71], [47], [415], [20], [188], [47], [239], ...
10    [[272], [239], nan, [308], [179], [292], [193]...
11    [[212], [347], nan, [363], [83], [280], nan, [...
12    [[292], [333], [107, 20], [116], [387], [347],...
13    [[72], [236], nan, [62], [137], [253], [62], [...
14    [[347], [29], [167], [44], [347], [279], [280]...
15    [[51], [66], [280], [221], [20], [34], nan, [3...
16    [[280], [347], [312, 347], [177], [47], nan, [...
17    [[292], [277], nan, nan, [342], [125], nan

In [None]:
# Join Zürich

# Compare CRS metrics
print('CRS places: ', zurich_geo.crs)
print('CRS neigborhood: ', neighborhood_zurich.crs)

CRS places:  None
CRS neigborhood:  {'init': 'epsg:4326'}


In [None]:
zurich_geo.crs = neighborhood_zurich.crs
print('CRS places: ', zurich_geo.crs)
print('CRS neigborhood: ', neighborhood_zurich.crs)

joined_zurich = sjoin(zurich_geo, neighborhood_zurich, how='left')
grouped_zurich = joined_zurich.groupby('index_right')
grouped_zurich.groups

CRS places:  {'init': 'epsg:4326'}
CRS neigborhood:  {'init': 'epsg:4326'}


In [None]:
# Aggregate zurich
cat_list_zurich = grouped_zurich['category_ids'].aggregate(lambda x: list(x))
cat_list_zurich = cat_list_zurich.reset_index(drop=True)
cat_list_zurich


# Creating descriptors for neighborhoods

In [None]:
LABEL_LENGTH = 467

def create_descriptor(nested_array):
    label_freq = np.zeros(LABEL_LENGTH)
    for sublist in nested_array:
        if type(sublist) is not list:
            continue
        for item in sublist:
            label_freq[item] += 1
    return label_freq

freq_lausanne = cat_list_lausanne.apply(create_descriptor)
print(freq_lausanne)

freq_zurich = cat_list_zurich.apply(create_descriptor)
print(freq_zurich)

In [None]:
print(freq_lausanne[0])
print(freq_zurich[0])

### All places

In [None]:
#Normalize descriptor vectors
from sklearn.preprocessing import normalize

def create_descriptor(frequencies, area):
    descriptor = frequencies / area
    descriptor_matrix = descriptor.as_matrix()
    descriptor_matrix = np.vstack(descriptor_matrix)
    descriptor_matrix_normalized = normalize(descriptor_matrix, axis=1, norm='l2')
    
    return descriptor_matrix_normalized

In [None]:
all_lausanne = create_descriptor(freq_lausanne, neighborhood_lausanne['area'].values)

In [None]:
all_zurich = create_descriptor(freq_zurich, neighborhood_zurich['area'].values)

In [None]:
import collections
def flatten(x):
    if isinstance(x, collections.Iterable):
        return [a for i in x for a in flatten(i)]
    else:
        return [x]

def category_and_all_subs(categories_df, category_ids):
    category_list = [] + category_ids
    childs_list = []
    while True:
        childs_list = category_list
        new_childs_list = []
        for cat_id in childs_list:
            childs = list(categories[categories["parents"] == str(cat_id)].index)
            if childs:
                new_childs_list += childs

        childs_list = list(set(new_childs_list) - set(category_list))
        category_list += childs_list
        category_list = flatten(category_list)
            
        if not childs_list:
            break
    return category_list

def selected_frequencies(row, frequencies):
    selected_frequencies = row[frequencies]
    return selected_frequencies

### Landmarks

In [None]:
landmarks_ids = category_and_all_subs(categories, [107])

for cat_id in landmarks_ids:
    print(categories.loc[cat_id]['label'])

In [None]:
landmarks_freq_lausanne = freq_lausanne.apply(lambda x: selected_frequencies(x, landmarks_ids))
landmarks_freq_zurich = freq_zurich.apply(lambda x: selected_frequencies(x, landmarks_ids))

In [None]:
landmarks_freq_lausanne = freq_lausanne.apply(lambda x: selected_frequencies(x, landmarks_ids))
landmarks_freq_zurich = freq_zurich.apply(lambda x: selected_frequencies(x, landmarks_ids))

landmarks_lausanne = create_descriptor(landmarks_freq_lausanne, neighborhood_lausanne['area'].values)
landmarks_zurich = create_descriptor(landmarks_freq_zurich, neighborhood_zurich['area'].values)

### Fast foods

In [None]:
fast_food_ids = category_and_all_subs(categories, [351, 355, 363, 366]) #burgers, fast_food, pizza, sushi

for cat_id in fast_food_ids:
    print(categories.loc[cat_id]['label'])

In [None]:
fastfood_freq_lausanne = freq_lausanne.apply(lambda x: selected_frequencies(x, fast_food_ids))
fastfood_freq_zurich = freq_zurich.apply(lambda x: selected_frequencies(x, fast_food_ids))

fastfood_lausanne = create_descriptor(fastfood_freq_lausanne, neighborhood_lausanne['area'].values)
fastfood_zurich = create_descriptor(fastfood_freq_zurich, neighborhood_zurich['area'].values)

### Travel spots

In [None]:
travel_ids = category_and_all_subs(categories, [430])

for cat_id in travel_ids:
    print(categories.loc[cat_id]['label'])

In [None]:
travel_freq_lausanne = freq_lausanne.apply(lambda x: selected_frequencies(x, travel_ids))
travel_freq_zurich = freq_zurich.apply(lambda x: selected_frequencies(x, travel_ids))

travel_lausanne = create_descriptor(travel_freq_lausanne, neighborhood_lausanne['area'].values)
travel_zurich = create_descriptor(travel_freq_zurich, neighborhood_zurich['area'].values)

### Sports

In [None]:
sports_ids = category_and_all_subs(categories, [372])

for cat_id in sports_ids:
    print(categories.loc[cat_id]['label'])

In [None]:
sports_freq_lausanne = freq_lausanne.apply(lambda x: selected_frequencies(x, sports_ids))
sports_freq_zurich = freq_zurich.apply(lambda x: selected_frequencies(x, sports_ids))

sports_lausanne = create_descriptor(sports_freq_lausanne, neighborhood_lausanne['area'].values)
sports_zurich = create_descriptor(sports_freq_zurich, neighborhood_zurich['area'].values)

# Neighborhood mapping procedure

In order to find a best matching between a freely chosen neighborhood and its corresponding descriptor $\vec n_i^A$ from the origin city $A$ and a corresponding target neighborhood with descriptor $\vec n_j^B$ in the target city $B$ we will be mapping to the closest descriptor in the target city as follows:

$$ \| \vec n_i^A - \vec n_j^B \|^2 \leq  \| \vec n_i^A - \vec n_k^B \|^2  \quad \forall k \neq j $$

By expanding the above formula we can show that maximizing the dot product between neighborhood descriptors will actually result in the closest match for neighborhood $n_i$,

$$ (\vec n_i^A)^2 - 2 (\vec n_i^A)^T \vec n_j^B + (\vec n_j^B)^2  \leq  (\vec n_i^A)^2 - 2 (\vec n_i^A)^T \vec n_k^B + (\vec n_k^B)^2  $$

Since we have normalized the descriptor vectors above we have $ (\vec n_i^A)^2 = (\vec n_j^B)^2 = (\vec n_k^B)^2 = 1 \quad \forall i,j,k$ which leads to

$$ - 2 (\vec n_i^A)^T \vec n_j^B \leq - 2 (\vec n_i^A)^T \vec n_k^B  $$

Dividing the above inequation by $-2$ including the corresponding flip to the inequality sign results in

$$ (\vec n_i^A)^T \vec n_j^B \geq (\vec n_i^A)^T \vec n_k^B  $$

This shows that the closest neighborhood can be found be finding the maximal dot product between the chosen origin neighborhood descriptor and the set of target neighborhood descriptors $\vec n_j^B$.





In [None]:
def map_neighborhoods(lausanne_descriptor_matrix_normalized, zurich_descriptor_matrix_normalized):
    map_cities = lausanne_descriptor_matrix_normalized @ zurich_descriptor_matrix_normalized.T
    zurich_districts_in_lausanne = map_cities.argmax(axis=0)
    lausanne_districts_in_zurich = map_cities.argmax(axis=1)
    
    return lausanne_districts_in_zurich, zurich_districts_in_lausanne

### All places

In [None]:
lausanne_districts_in_zurich_all, zurich_districts_in_lausanne_all = map_neighborhoods(all_lausanne, all_zurich)

In [None]:
lausanne_districts_in_zurich_all

In [None]:
zurich_districts_in_lausanne_all

### Landmarks 

In [None]:
lausanne_districts_in_zurich_landmarks, zurich_districts_in_lausanne_landmarks = map_neighborhoods(landmarks_lausanne, landmarks_zurich)

In [None]:
lausanne_districts_in_zurich_landmarks

In [None]:
zurich_districts_in_lausanne_landmarks

### Fast foods

In [None]:
lausanne_districts_in_zurich_fastfood, zurich_districts_in_lausanne_fastfood = map_neighborhoods(fastfood_lausanne, fastfood_zurich)

In [None]:
lausanne_districts_in_zurich_fastfood

In [None]:
zurich_districts_in_lausanne_fastfood

### Travel spots

In [None]:
lausanne_districts_in_zurich_travel, zurich_districts_in_lausanne_travel = map_neighborhoods(travel_lausanne, travel_zurich)

In [None]:
lausanne_districts_in_zurich_travel

In [None]:
zurich_districts_in_lausanne_travel

### Sports

In [None]:
lausanne_districts_in_zurich_sports, zurich_districts_in_lausanne_sports = map_neighborhoods(sports_lausanne, sports_zurich)

In [None]:
lausanne_districts_in_zurich_sports

In [None]:
zurich_districts_in_lausanne_sports