# Your job will be to:

Scrape data from destinations

Get weather data from each destination

Get hotels' info about each destination

Store all the information above in a data lake

Extract, transform and load cleaned data from your datalake to a data warehouse


In [1]:
import pandas as pd
import requests
import logging
from scrapy.crawler import CrawlerProcess
import os 
import scrapy
import re

In [2]:
destinations = ["Mont Saint Michel",
"St Malo",
"Bayeux",
"Le Havre",
"Rouen",
"Paris",
"Amiens",
"Lille",
"Strasbourg",
"Chateau du Haut Koenigsbourg",
"Colmar",
"Eguisheim",
"Besancon",
"Dijon",
"Annecy",
"Grenoble",
"Lyon",
"Gorges du Verdon",
"Bormes les Mimosas",
"Cassis",
"Marseille",
"Aix en Provence",
"Avignon",
"Uzes",
"Nimes",
"Aigues Mortes",
"Saintes Maries de la mer",
"Collioure",
"Carcassonne",
"Ariege",
"Toulouse",
"Montauban",
"Biarritz",
"Bayonne",
"La Rochelle"]

In [13]:
## Endpoint
# https://nominatim.openstreetmap.org/search?<params>

In [4]:
# # Get the url and check that it returns a success code
# url = 'https://nominatim.openstreetmap.org/search'
# r = requests.get(url)
# r

In [5]:
'''From the API documentation, 'city' should be used to call at the endpoint to search for a specific city, and should return a jsonv2 by default'''

'https://nominatim.org/release-docs/develop/api/Search/#structured-query'

'https://nominatim.org/release-docs/develop/api/Search/#structured-query'

# First, I am testing it for one city

In [6]:
# Test with html url

# paris=requests.get('https://nominatim.openstreetmap.org/ui/search.html?city=Paris')
# paris

In [7]:
# paris.content

In [8]:
# ## Test with manual url
# # here it is working

# paris=requests.get('https://nominatim.openstreetmap.org/search?city=Paris&country=France&format=json')
# paris

In [9]:
# paris.content

In [206]:
# it is working ! Carefull, I have been banned before because I forgot to add headers (User-Agent at least) : Error 403
# to know user-agent, go in dev mode in the browser, network tab and type navigator.userAgent in the console
# can add 'From' : 'email address' in the headers too if needed (for heavy usage)
## Important to add the 'country=France' param because other Paris in the world. ALSO ADD limit = 1 because the request returns several results for the same city
headers = {'User-Agent' : 'Chrome/125.0.0.0'}
payload = {'city':'Paris', 'country':'France', 'format':'json', 'limit' : '1'}

paris=requests.get('https://nominatim.openstreetmap.org/search', headers=headers, params=payload)
print('Code is {}'.format(paris))
print(paris.content)


Code is <Response [200]>
b'[{"place_id":82297359,"licence":"Data \xc2\xa9 OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright","osm_type":"relation","osm_id":71525,"lat":"48.8534951","lon":"2.3483915","class":"boundary","type":"administrative","place_rank":12,"importance":0.8317101715588673,"addresstype":"city","name":"Paris","display_name":"Paris, \xc3\x8ele-de-France, France m\xc3\xa9tropolitaine, France","boundingbox":["48.8155755","48.9021560","2.2241220","2.4697602"]}]'


In [208]:
# transform the output as a list
paris=list(paris.json())

In [193]:
type(paris)

list

In [196]:
#check the output, coordinates are correct
paris

[{'place_id': 83293737,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
  'osm_type': 'relation',
  'osm_id': 71525,
  'lat': '48.8534951',
  'lon': '2.3483915',
  'class': 'boundary',
  'type': 'administrative',
  'place_rank': 12,
  'importance': 0.8317101715588673,
  'addresstype': 'city',
  'name': 'Paris',
  'display_name': 'Paris, Île-de-France, France métropolitaine, France',
  'boundingbox': ['48.8155755', '48.9021560', '2.2241220', '2.4697602']}]

In [198]:
#create a dataframe
paris_df=pd.DataFrame(paris)
paris_df

Unnamed: 0,place_id,licence,osm_type,osm_id,lat,lon,class,type,place_rank,importance,addresstype,name,display_name,boundingbox
0,83293737,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,71525,48.8534951,2.3483915,boundary,administrative,12,0.83171,city,Paris,"Paris, Île-de-France, France métropolitaine, F...","[48.8155755, 48.9021560, 2.2241220, 2.4697602]"


In [204]:
#Trim the dataframe to keep only the data of interest
paris_df_trim=paris_df[['place_id', 'lat', 'lon', 'name']]
paris_df_trim

Unnamed: 0,place_id,lat,lon,name
0,83293737,48.8534951,2.3483915,Paris


Quand on passe à l'iteration, mettre un param qui bloque un peu la boulcvep our laisser l'API se regenerer (time_sleep ? await? un truc du genre)

Essaye vaec une liste navec que 3 villes pour eviter de se faire ban

# Nouvel essai en iterations cette fois-ci, avec seulement 3 villes

In [213]:
destinations_backup=destinations

In [215]:
#space should be replaced by + in params
destinations= [city.replace(' ', '+') for city in destinations]
destinations

['Mont+Saint+Michel',
 'St+Malo',
 'Bayeux',
 'Le+Havre',
 'Rouen',
 'Paris',
 'Amiens',
 'Lille',
 'Strasbourg',
 'Chateau+du+Haut+Koenigsbourg',
 'Colmar',
 'Eguisheim',
 'Besancon',
 'Dijon',
 'Annecy',
 'Grenoble',
 'Lyon',
 'Gorges+du+Verdon',
 'Bormes+les+Mimosas',
 'Cassis',
 'Marseille',
 'Aix+en+Provence',
 'Avignon',
 'Uzes',
 'Nimes',
 'Aigues+Mortes',
 'Saintes+Maries+de+la+mer',
 'Collioure',
 'Carcassonne',
 'Ariege',
 'Toulouse',
 'Montauban',
 'Biarritz',
 'Bayonne',
 'La+Rochelle']

In [297]:
len(destinations)

34

In [287]:
destinations_trim=destinations[:3]
destinations_trim

['Mont+Saint+Michel', 'St+Malo', 'Bayeux']

In [268]:
type(destinations_trim)

list

In [288]:
#Do it with one city to initiate the dataframe
headers = {'User-Agent' : 'Chrome/125.0.0.0'}
payload = {'city':'{}'.format(destinations_trim[0]), 'country':'France', 'format':'json', 'limit' : '1'}

response=requests.get('https://nominatim.openstreetmap.org/search', headers=headers, params=payload)
print('Code is {}'.format(response))
city_trim_df=pd.DataFrame(response.json())
city_trim_df

Code is <Response [200]>


Unnamed: 0,place_id,licence,osm_type,osm_id,lat,lon,class,type,place_rank,importance,addresstype,name,display_name,boundingbox
0,276970680,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,211285890,48.6359541,-1.511459954959514,place,islet,20,0.455437,islet,Mont Saint-Michel,"Mont Saint-Michel, Le Mont-Saint-Michel, Avran...","[48.6349172, 48.6370310, -1.5133292, -1.5094796]"


In [None]:
#Drop the first destination because we already scrapped its data when initiating the dataframe
destinations_trim.pop(0)
destinations_trim

['St+Malo', 'Bayeux']

In [290]:
#Run the loop for the rest of the destinations; ignore_index=true in pd.concat otherwise all index rows are 0
headers = {'User-Agent' : 'Chrome/125.0.0.0'}

for city in range(len(destinations_trim)):
    payload = {'city':'{}'.format(destinations_trim[city]), 'country':'France', 'format':'json', 'limit' : '1'}
    response=requests.get('https://nominatim.openstreetmap.org/search', headers=headers, params=payload)
    city_trim_df=pd.concat([city_trim_df, pd.DataFrame(response.json())], ignore_index=True)
city_trim_df

Unnamed: 0,place_id,licence,osm_type,osm_id,lat,lon,class,type,place_rank,importance,addresstype,name,display_name,boundingbox
0,276970680,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,211285890,48.6359541,-1.511459954959514,place,islet,20,0.455437,islet,Mont Saint-Michel,"Mont Saint-Michel, Le Mont-Saint-Michel, Avran...","[48.6349172, 48.6370310, -1.5133292, -1.5094796]"
1,277106181,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,905534,48.649518,-2.0260409,boundary,administrative,16,0.576467,town,Saint-Malo,"Saint-Malo, Ille-et-Vilaine, Bretagne, France ...","[48.5979853, 48.6949736, -2.0765246, -1.9367259]"
2,276101915,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,145776,49.2764624,-0.7024738,boundary,administrative,16,0.5727,town,Bayeux,"Bayeux, Calvados, Normandie, France métropolit...","[49.2608124, 49.2934736, -0.7275671, -0.6757378]"


In [291]:
#Trim the dataframe to keep only the data of interest
city_df_trim=city_trim_df[['place_id', 'lat', 'lon', 'name']]
city_df_trim

Unnamed: 0,place_id,lat,lon,name
0,276970680,48.6359541,-1.511459954959514,Mont Saint-Michel
1,277106181,48.649518,-2.0260409,Saint-Malo
2,276101915,49.2764624,-0.7024738,Bayeux


In [292]:
#reorder columns order for readability
city_df_trim=city_df_trim[[ 'name', 'place_id', 'lat', 'lon']]
city_df_trim

Unnamed: 0,name,place_id,lat,lon
0,Mont Saint-Michel,276970680,48.6359541,-1.511459954959514
1,Saint-Malo,277106181,48.649518,-2.0260409
2,Bayeux,276101915,49.2764624,-0.7024738


# Final List of all destinations coordinates

In [293]:
#Do it with one city to initiate the dataframe
headers = {'User-Agent' : 'Chrome/125.0.0.0'}
payload = {'city':'{}'.format(destinations[0]), 'country':'France', 'format':'json', 'limit' : '1'}

response=requests.get('https://nominatim.openstreetmap.org/search', headers=headers, params=payload)
print('Code is {}'.format(response))
city_df=pd.DataFrame(response.json())
city_df

Code is <Response [200]>


Unnamed: 0,place_id,licence,osm_type,osm_id,lat,lon,class,type,place_rank,importance,addresstype,name,display_name,boundingbox
0,276970680,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,211285890,48.6359541,-1.511459954959514,place,islet,20,0.455437,islet,Mont Saint-Michel,"Mont Saint-Michel, Le Mont-Saint-Michel, Avran...","[48.6349172, 48.6370310, -1.5133292, -1.5094796]"


In [294]:
#Drop the first destination because we already scrapped its data when initiating the dataframe
destinations.pop(0)
destinations

['St+Malo',
 'Bayeux',
 'Le+Havre',
 'Rouen',
 'Paris',
 'Amiens',
 'Lille',
 'Strasbourg',
 'Chateau+du+Haut+Koenigsbourg',
 'Colmar',
 'Eguisheim',
 'Besancon',
 'Dijon',
 'Annecy',
 'Grenoble',
 'Lyon',
 'Gorges+du+Verdon',
 'Bormes+les+Mimosas',
 'Cassis',
 'Marseille',
 'Aix+en+Provence',
 'Avignon',
 'Uzes',
 'Nimes',
 'Aigues+Mortes',
 'Saintes+Maries+de+la+mer',
 'Collioure',
 'Carcassonne',
 'Ariege',
 'Toulouse',
 'Montauban',
 'Biarritz',
 'Bayonne',
 'La+Rochelle']

In [295]:
#Run the loop for the rest of the destinations; ignore_index=true in pd.concat otherwise all index rows are 0
headers = {'User-Agent' : 'Chrome/125.0.0.0'}

for city in range(len(destinations)):
    payload = {'city':'{}'.format(destinations[city]), 'country':'France', 'format':'json', 'limit' : '1'}
    response=requests.get('https://nominatim.openstreetmap.org/search', headers=headers, params=payload)
    city_df=pd.concat([city_df, pd.DataFrame(response.json())], ignore_index=True)
city_df

Unnamed: 0,place_id,licence,osm_type,osm_id,lat,lon,class,type,place_rank,importance,addresstype,name,display_name,boundingbox
0,276970680,"Data © OpenStreetMap contributors, ODbL 1.0. h...",way,211285890,48.6359541,-1.511459954959514,place,islet,20,0.455437,islet,Mont Saint-Michel,"Mont Saint-Michel, Le Mont-Saint-Michel, Avran...","[48.6349172, 48.6370310, -1.5133292, -1.5094796]"
1,277106181,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,905534,48.649518,-2.0260409,boundary,administrative,16,0.576467,town,Saint-Malo,"Saint-Malo, Ille-et-Vilaine, Bretagne, France ...","[48.5979853, 48.6949736, -2.0765246, -1.9367259]"
2,276101915,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,145776,49.2764624,-0.7024738,boundary,administrative,16,0.5727,town,Bayeux,"Bayeux, Calvados, Normandie, France métropolit...","[49.2608124, 49.2934736, -0.7275671, -0.6757378]"
3,116797886,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,104492,49.4938975,0.1079732,boundary,administrative,16,0.622333,city,Le Havre,"Le Havre, Seine-Maritime, Normandie, France mé...","[49.4516697, 49.5401463, 0.0667992, 0.1955556]"
4,116490697,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,75628,49.4404591,1.0939658,boundary,administrative,16,0.640073,city,Rouen,"Rouen, Seine-Maritime, Normandie, France métro...","[49.4172001, 49.4652601, 1.0300648, 1.1521157]"
5,114827617,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,71525,48.8534951,2.3483915,boundary,administrative,12,0.83171,city,Paris,"Paris, Île-de-France, France métropolitaine, F...","[48.8155755, 48.9021560, 2.2241220, 2.4697602]"
6,116694242,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,114347,49.8941708,2.2956951,boundary,administrative,16,0.624949,city,Amiens,"Amiens, Somme, Hauts-de-France, France métropo...","[49.8468370, 49.9505487, 2.2235574, 2.3457767]"
7,118533424,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,58404,50.6365654,3.0635282,boundary,administrative,16,0.653204,city,Lille,"Lille, Nord, Hauts-de-France, France métropoli...","[50.6008264, 50.6612596, 2.9679677, 3.1257250]"
8,381804960,"Data © OpenStreetMap contributors, ODbL 1.0. h...",relation,71033,48.584614,7.7507127,boundary,administrative,16,0.674805,city,Strasbourg,"Strasbourg, Bas-Rhin, Grand Est, France métrop...","[48.4918610, 48.6461896, 7.6881371, 7.8360646]"
9,136411398,"Data © OpenStreetMap contributors, ODbL 1.0. h...",node,4245068168,48.2495226,7.3454923,place,isolated_dwelling,22,0.106728,isolated_dwelling,Château du Haut-Kœnigsbourg,"Château du Haut-Kœnigsbourg, Orschwiller, Séle...","[48.2494726, 48.2495726, 7.3454423, 7.3455423]"


In [298]:
#Trim the dataframe to keep only the data of interest
city_df_trim=city_df[['place_id', 'lat', 'lon', 'name']]
city_df_trim

Unnamed: 0,place_id,lat,lon,name
0,276970680,48.6359541,-1.511459954959514,Mont Saint-Michel
1,277106181,48.649518,-2.0260409,Saint-Malo
2,276101915,49.2764624,-0.7024738,Bayeux
3,116797886,49.4938975,0.1079732,Le Havre
4,116490697,49.4404591,1.0939658,Rouen
5,114827617,48.8534951,2.3483915,Paris
6,116694242,49.8941708,2.2956951,Amiens
7,118533424,50.6365654,3.0635282,Lille
8,381804960,48.584614,7.7507127,Strasbourg
9,136411398,48.2495226,7.3454923,Château du Haut-Kœnigsbourg


In [299]:
#reorder columns order for readability
city_df_trim=city_df_trim[[ 'name', 'place_id', 'lat', 'lon']]
city_df_trim

Unnamed: 0,name,place_id,lat,lon
0,Mont Saint-Michel,276970680,48.6359541,-1.511459954959514
1,Saint-Malo,277106181,48.649518,-2.0260409
2,Bayeux,276101915,49.2764624,-0.7024738
3,Le Havre,116797886,49.4938975,0.1079732
4,Rouen,116490697,49.4404591,1.0939658
5,Paris,114827617,48.8534951,2.3483915
6,Amiens,116694242,49.8941708,2.2956951
7,Lille,118533424,50.6365654,3.0635282
8,Strasbourg,381804960,48.584614,7.7507127
9,Château du Haut-Kœnigsbourg,136411398,48.2495226,7.3454923


In [300]:
# Save the coordinates dataframe as a csv
city_df_trim.to_csv('cities_coordinates.csv')

# Dump

In [91]:
paris_json ### checked the coordinates on google and it is correct

[{'place_id': 83293737,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
  'osm_type': 'relation',
  'osm_id': 71525,
  'lat': '48.8534951',
  'lon': '2.3483915',
  'class': 'boundary',
  'type': 'administrative',
  'place_rank': 12,
  'importance': 0.8317101715588673,
  'addresstype': 'city',
  'name': 'Paris',
  'display_name': 'Paris, Île-de-France, France métropolitaine, France',
  'boundingbox': ['48.8155755', '48.9021560', '2.2241220', '2.4697602']}]

In [None]:
paris_json=paris.json()
type(paris_json)

list

In [92]:
## make a dict
paris_gps=paris_json[0]
print(type(paris_gps))
paris_gps

<class 'dict'>


{'place_id': 83293737,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright',
 'osm_type': 'relation',
 'osm_id': 71525,
 'lat': '48.8534951',
 'lon': '2.3483915',
 'class': 'boundary',
 'type': 'administrative',
 'place_rank': 12,
 'importance': 0.8317101715588673,
 'addresstype': 'city',
 'name': 'Paris',
 'display_name': 'Paris, Île-de-France, France métropolitaine, France',
 'boundingbox': ['48.8155755', '48.9021560', '2.2241220', '2.4697602']}

In [102]:
paris_gps_backup=paris_gps

In [103]:
# creer un nouveau dictionnaire en filtrant par les categories d'interet
request_list=['place_id', 'lat', 'lon', 'name']
paris_gps_trim = dict((i, paris_gps[i]) for i in request_list)
paris_gps_trim

{'place_id': 83293737,
 'lat': '48.8534951',
 'lon': '2.3483915',
 'name': 'Paris'}

In [104]:
# initiate a dataframe with dictionnary keys as columns
city_keys=paris_gps_trim.keys()  #Not sure why I need to specify the index here (is of type list and not dict so this may be why)
city_df=pd.DataFrame(columns=city_keys)
city_df

Unnamed: 0,place_id,lat,lon,name


In [187]:
paris_df=pd.DataFrame(paris_gps_trim)

ValueError: If using all scalar values, you must pass an index

In [183]:
paris_df=pd.json_normalize(paris_gps_trim).reset_index(drop=True)
paris_df

Unnamed: 0,place_id,lat,lon,name
0,83293737,48.8534951,2.3483915,Paris


In [185]:
city_df.index.is_unique

True

In [205]:
paris_df = paris_df.loc[~paris_df.index.duplicated(keep='first')]

In [186]:
# add Paris values to the dataframe
city_df=pd.concat([city_df, paris_df], ignore_index=True)
city_df

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [127]:
# add Paris values to the dataframe
city_df=pd.concat([city_df, pd.DataFrame(paris_gps_trim.values())])
city_df

InvalidIndexError: Reindexing only valid with uniquely valued Index objects