As the project has just started, your team doesn't have any data that can be used to create this application. Therefore, your job will be to:

Scrape data from destinations
Get weather data from each destination
Get hotels' info about each destination
Store all the information above in a data lake
Extract, transform and load cleaned data from your data lake to a data warehouse

In [1]:
# import libraries
import json
import time
from statistics import mean

import pandas as pd
import requests

The marketing team wants to focus first on the best cities to travel to in France. According to One Week In.com, here are the top 35 cities to visit in France:

In [2]:
# list of the cities under study (top-35 destinations to visit in France)

top_35_cities = [
    "Mont Saint Michel",
    "St Malo",
    "Bayeux",
    "Le Havre",
    "Rouen",
    "Paris",
    "Amiens",
    "Lille",
    "Strasbourg",
    "Chateau du Haut Koenigsbourg",
    "Colmar",
    "Eguisheim",
    "Besancon",
    "Dijon",
    "Annecy",
    "Grenoble",
    "Lyon",
    "Gorges du Verdon",
    "Bormes les Mimosas",
    "Cassis",
    "Marseille",
    "Aix en Provence",
    "Avignon",
    "Uzes",
    "Nimes",
    "Aigues Mortes",
    "Saintes Maries de la mer",
    "Collioure",
    "Carcassonne",
    "Ariege",
    "Toulouse",
    "Montauban",
    "Biarritz",
    "Bayonne",
    "La Rochelle",
]

In [3]:
# Geocode the cities with Nominatim (https://nominatim.org/).
# Search URL shape: https://nominatim.openstreetmap.org/search?<params>
endpoint = "https://nominatim.openstreetmap.org"

# Smoke test: check that the service answers before querying it for real.
# The timeout keeps the cell from hanging forever if the server never replies.
response = requests.get(endpoint, timeout=10)
print("Response code:",response,"\n \n")

Response code: <Response [200]> 
 



In [7]:
# Sample query: search Nominatim for "Ariege" and ask for a JSON answer
sample_url = "https://nominatim.openstreetmap.org/search?q=Ariege&format=json"
response = requests.get(sample_url)

# Status first, then the raw (undecoded) body as the cell output
print("Response code:", response, "\n \n")
print("Response data:\n")
response.content

Response code: <Response [200]> 
 

Response data:



b'[{"place_id":297389050,"licence":"Data \xc2\xa9 OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright","osm_type":"relation","osm_id":7439,"boundingbox":["42.5732416","43.3162514","0.8267506","2.1758135"],"lat":"42.9455368","lon":"1.4065544156065486","display_name":"Ari\xc3\xa8ge, Occitanie, France m\xc3\xa9tropolitaine, France","class":"boundary","type":"administrative","importance":0.6009114788084189,"icon":"https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png"},{"place_id":299422341,"licence":"Data \xc2\xa9 OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright","osm_type":"relation","osm_id":13625918,"boundingbox":["51.1624195","51.199616","-56.0111241","-55.9723346"],"lat":"51.18111155","lon":"-55.98447423947262","display_name":"Ariege (Belvy) Bay, Main Brook, Newfoundland, Newfoundland and Labrador, Canada","class":"natural","type":"bay","importance":0.29999999999999993},{"place_id":135201531,"licence":"Data \xc2\xa9 OpenStreet

In [8]:
# Decode the JSON body of the last response into Python objects
response.json()

[{'place_id': 297389050,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
  'osm_type': 'relation',
  'osm_id': 7439,
  'boundingbox': ['42.5732416', '43.3162514', '0.8267506', '2.1758135'],
  'lat': '42.9455368',
  'lon': '1.4065544156065486',
  'display_name': 'Ariège, Occitanie, France métropolitaine, France',
  'class': 'boundary',
  'type': 'administrative',
  'importance': 0.6009114788084189,
  'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png'},
 {'place_id': 299422341,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
  'osm_type': 'relation',
  'osm_id': 13625918,
  'boundingbox': ['51.1624195', '51.199616', '-56.0111241', '-55.9723346'],
  'lat': '51.18111155',
  'lon': '-55.98447423947262',
  'display_name': 'Ariege (Belvy) Bay, Main Brook, Newfoundland, Newfoundland and Labrador, Canada',
  'class': 'natural',
  'type': 'bay',
  'importance': 0.299999999999

In [9]:
# empty DataFrame that will receive one row per geocoded city
city_columns = ['Id', 'City', 'Latitude', 'Longitude']
df_cities = pd.DataFrame(columns=city_columns)

In [10]:
# Geocode each of the 35 cities with Nominatim and keep the first match.
# Nominatim's usage policy asks for an identifying User-Agent and at most
# one request per second, hence the custom header and the pause below.
nominatim_search = "https://nominatim.openstreetmap.org/search"
nominatim_headers = {"User-Agent": "top35-french-cities-notebook/1.0"}

for counter, city in enumerate(top_35_cities):
    # Let requests build the query string so city names with spaces are encoded
    response = requests.get(nominatim_search,
                            params={"q": city, "format": "json"},
                            headers=nominatim_headers,
                            timeout=10)
    # Stop on an HTTP error instead of trying to parse an error page
    response.raise_for_status()

    # Parse the body once instead of once per field
    matches = response.json()
    if not matches:
        raise ValueError(f"Nominatim returned no result for {city!r}")
    best_match = matches[0]

    # Fill the DataFrame: one row per city, labelled by its position in the list.
    # 'lat' and 'lon' are stored exactly as returned by the API (strings).
    df_cities.loc[counter] = [best_match['place_id'],
                              city,
                              best_match['lat'],
                              best_match['lon']]

    # Stay under the one-request-per-second limit
    time.sleep(1)

In [11]:
# Display the geocoded cities
df_cities

Unnamed: 0,Id,City,Latitude,Longitude
0,156094680,Mont Saint Michel,48.6359541,-1.511459954959514
1,297756747,St Malo,48.649518,-2.0260409
2,297981358,Bayeux,49.2764624,-0.7024738
3,298137491,Le Havre,49.4938975,0.1079732
4,297518815,Rouen,49.4404591,1.0939658
5,297417241,Paris,48.8588897,2.3200410217200766
6,297534793,Amiens,49.8941708,2.2956951
7,297472400,Lille,50.6365654,3.0635282
8,297508568,Strasbourg,48.584614,7.7507127
9,120791766,Chateau du Haut Koenigsbourg,48.249489800000006,7.34429620253195


In [13]:
# Number of rows in df_cities
len(df_cities)

35

In [14]:
# Column dtypes and non-null counts of df_cities
df_cities.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 0 to 34
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         35 non-null     int64 
 1   City       35 non-null     object
 2   Latitude   35 non-null     object
 3   Longitude  35 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.4+ KB


In [15]:
# Weather scraping with OpenWeatherMap (API key: https://openweathermap.org/appid)
#
# One Call request template used for the 35 cities:
# https://api.openweathermap.org/data/2.5/onecall?lat={lat}&lon={lon}&exclude={part}&appid={API key}

# The key is imported from the local `credentials` module rather than written in the notebook
from credentials import APIKEY

# Response sections to leave out (value sent as the `exclude` parameter)
EXCLUDE = ','.join(['current', 'minutely', 'hourly'])

# API key, under the name used for the `appid` parameter
APPID = APIKEY

In [16]:
# OpenWeatherMap One Call endpoint
# https://api.openweathermap.org/data/2.5/onecall?<params>
endpoint = "https://api.openweathermap.org/data/2.5/onecall?"

# example: request for a single point, in metric units, without the excluded sections
params = {'exclude' : EXCLUDE ,
          'appid' : APPID ,
          'lat' : '48.6359541' ,
          'lon' : -1.511459954959514,
          'units' : 'metric'}

# timeout so the cell cannot hang indefinitely on an unresponsive server
response =  requests.get(endpoint, params = params, timeout = 10)
print("Response code:",response,"\n \n")
response.json()

Response code: <Response [200]> 
 



{'lat': 48.636,
 'lon': -1.5115,
 'timezone': 'Europe/Paris',
 'timezone_offset': 7200,
 'daily': [{'dt': 1665313200,
   'sunrise': 1665296200,
   'sunset': 1665336585,
   'moonrise': 1665336960,
   'moonset': 1665293340,
   'moon_phase': 0.5,
   'temp': {'day': 18.09,
    'min': 7.95,
    'max': 19.49,
    'night': 13.91,
    'eve': 17.83,
    'morn': 7.97},
   'feels_like': {'day': 17.21, 'night': 12.92, 'eve': 16.97, 'morn': 5.42},
   'pressure': 1015,
   'humidity': 48,
   'dew_point': 6.92,
   'wind_speed': 6.89,
   'wind_deg': 139,
   'wind_gust': 12.08,
   'weather': [{'id': 801,
     'main': 'Clouds',
     'description': 'few clouds',
     'icon': '02d'}],
   'clouds': 20,
   'pop': 0,
   'uvi': 2.71},
  {'dt': 1665399600,
   'sunrise': 1665382688,
   'sunset': 1665422864,
   'moonrise': 1665424380,
   'moonset': 1665384300,
   'moon_phase': 0.52,
   'temp': {'day': 17.46,
    'min': 9.92,
    'max': 17.7,
    'night': 9.92,
    'eve': 14.17,
    'morn': 14.75},
   'feels_like'

In [17]:
import datetime

# Unix timestamp ('dt') of the first daily entry in the last weather response
date = response.json()['daily'][0]['dt']

# fromtimestamp() without a tz argument yields naive local time, so the
# resulting string depends on the time zone of the machine running the cell
as_datetime = datetime.datetime.fromtimestamp(date)
readable = as_datetime.isoformat()
readable

'2022-10-09T13:00:00'

In [74]:
# One Call endpoint, named locally so this cell does not depend on whatever
# the shared `endpoint` variable currently points to
weather_endpoint = "https://api.openweathermap.org/data/2.5/onecall"

# query parameters shared by every request
params = {'exclude' : EXCLUDE ,
          'appid' : APPID ,
          'units' : 'metric'}

# one record per city, turned into a DataFrame in a single step at the end
weather_rows = []

# browse the list of 35 cities by latitude and longitude
# (`city_id` rather than `id`, which would shadow the built-in)
for city_id, city, lat, lon in zip(df_cities['Id'],
                                   df_cities['City'],
                                   df_cities['Latitude'],
                                   df_cities['Longitude']):
    # latitude and longitude go through the query parameters so requests encodes them
    response = requests.get(weather_endpoint,
                            params = {**params, 'lat' : lat, 'lon' : lon},
                            timeout = 10)
    # fail on an HTTP error (e.g. a rejected API key) rather than on a KeyError below
    response.raise_for_status()

    # parse the body once; the first daily entry is skipped and the remaining ones are averaged
    forecasts = response.json()['daily'][1:]

    # Day temperature
    temp = [day['temp']['day'] for day in forecasts]
    # clear weather: 100 - Cloudiness, %
    clear_weather = [100 - day['clouds'] for day in forecasts]

    weather_rows.append([city_id, city, float(lat), float(lon), mean(temp), mean(clear_weather)])

# same columns and 0..n-1 row labels as before, built in one go
df_weather = pd.DataFrame(weather_rows,
                          columns = ['Id', 'City','lat','lon','temperature', 'clear_weather'])

In [75]:
# Display the per-city weather averages
df_weather

Unnamed: 0,Id,City,lat,lon,temperature,clear_weather
0,156094680,Mont Saint Michel,48.635954,-1.51146,16.735714,16.857143
1,297756747,St Malo,48.649518,-2.026041,16.392857,16.0
2,297981358,Bayeux,49.276462,-0.702474,15.268571,15.285714
3,298137491,Le Havre,49.493898,0.107973,14.871429,11.142857
4,297518815,Rouen,49.440459,1.093966,15.597143,7.285714
5,297417241,Paris,48.85889,2.320041,17.527143,12.285714
6,297534793,Amiens,49.894171,2.295695,15.315714,6.0
7,297472400,Lille,50.636565,3.063528,14.641429,3.142857
8,297508568,Strasbourg,48.584614,7.750713,17.552857,23.142857
9,120791766,Chateau du Haut Koenigsbourg,48.24949,7.344296,15.498571,23.857143


In [78]:
import plotly.express as px
import plotly.io as pio

# pio.renderers.default = "svg"

# Map of the cities: marker colour follows `temperature`, marker size follows `clear_weather`
map_options = dict(
    lat="lat",
    lon="lon",
    color="temperature",
    hover_name="City",
    size="clear_weather",
    zoom=3.9,
    mapbox_style="carto-positron",
    width=800,
)
fig = px.scatter_mapbox(df_weather, **map_options)
fig.show()

In [20]:
# Save the per-city weather table to CSV (the row index is written as the first, unnamed column)
df_weather.to_csv("cities_weather.csv")

## Scrape Booking.com

Since Booking Holdings doesn't have aggregated databases, it will be much faster to scrape data directly from booking.com.

You can scrape as much information as you want, but we suggest that you get at least:

hotel name,
Url to its booking.com page,
Its coordinates: latitude and longitude
Score given by the website users
Text description of the hotel

In [29]:
# Run the Scrapy script in a shell
# (presumably it produces src/booking_hotel.json, which is loaded further down — confirm)
!python bookingHotel.py

2022-10-09 17:07:07 [scrapy.utils.log] INFO: Scrapy 2.6.3 started (bot: scrapybot)
2022-10-09 17:07:07 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 2.0.1, Twisted 22.8.0, Python 3.10.4 (tags/v3.10.4:9d38120, Mar 23 2022, 23:13:41) [MSC v.1929 64 bit (AMD64)], pyOpenSSL 22.1.0 (OpenSSL 3.0.5 5 Jul 2022), cryptography 38.0.1, Platform Windows-10-10.0.19043-SP0
2022-10-09 17:07:07 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 20, 'USER_AGENT': 'Chrome/97.0'}
2022-10-09 17:07:07 [scrapy.extensions.telnet] INFO: Telnet Password: f54b6bb7d79ccc0f
2022-10-09 17:07:07 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2022-10-09 17:07:07 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy

In [30]:
# Load the hotel records from the JSON file into a DataFrame
df_booking = pd.read_json("src/booking_hotel.json", encoding='utf-8')

In [39]:
# NOTE(review): this writes df_booking (hotels only) under the name "booking_weather.csv";
# if the merged hotels + weather table was intended, that would be df_booking_weather — confirm
df_booking.to_csv("booking_weather.csv")

Join the columns of df_booking with those of df_weather on the city id.

In [33]:
# Attach each city's weather columns to its hotels
# (outer join: rows without a match on either side are kept)
df_booking_weather = pd.merge(df_booking, df_weather,
                              how='outer',
                              left_on='city_id',
                              right_on='Id')
df_booking_weather.head()

Unnamed: 0,url,city_id,name_hotel,score,coordinates,text_description,Id,City,lat,lon,temp,pop,clouds,weather_main
0,https://www.booking.com/hotel/fr/hotel-celine....,297518815,Hôtel Céline - Hôtel de la Gare,80,"1.06252587352748,49.4310680988023,1.1178954788...",[L'Hôtel Céline - Hôtel de la Gare se trouve d...,297518815,Rouen,49.440459,1.093966,15.597143,0.595714,92.714286,"[Rain, Clouds, Clouds, Rain, Rain, Rain, Rain]"
1,https://www.booking.com/hotel/fr/the-first.fr....,297518815,"Le First, centre ville terrasse et parking",91,"1.05554739606558,49.4280597834269,1.1109136039...",[Vous pouvez bénéficier d'une réduction Genius...,297518815,Rouen,49.440459,1.093966,15.597143,0.595714,92.714286,"[Rain, Clouds, Clouds, Rain, Rain, Rain, Rain]"
2,https://www.booking.com/hotel/fr/industriel-ch...,297518815,"L'INDUSTRIEL CHIC, 2 PIECES AVEC VU IMPRENABLE...",90,"1.04465806489105,49.4328606384189,1.1000296949...",[Vous pouvez bénéficier d'une réduction Genius...,297518815,Rouen,49.440459,1.093966,15.597143,0.595714,92.714286,"[Rain, Clouds, Clouds, Rain, Rain, Rain, Rain]"
3,https://www.booking.com/hotel/fr/de-la-cathedr...,297518815,Hôtel De La Cathédrale,78,"1.06925386466515,49.4221513972113,1.1246134015...",[Arborant une façade à colombages datant du XV...,297518815,Rouen,49.440459,1.093966,15.597143,0.595714,92.714286,"[Rain, Clouds, Clouds, Rain, Rain, Rain, Rain]"
4,https://www.booking.com/hotel/fr/le-palm-duple...,297518815,Le Palm ✧ Duplex centre historique Rouen,93,"1.05837701832101,49.4260720834269,1.1137409816...",[Vous pouvez bénéficier d'une réduction Genius...,297518815,Rouen,49.440459,1.093966,15.597143,0.595714,92.714286,"[Rain, Clouds, Clouds, Rain, Rain, Rain, Rain]"


In [40]:
# Number of distinct values in the City column after the merge
len(df_booking_weather['City'].unique())

35

In [34]:
# Summary statistics for every column, numeric or not
df_booking_weather.describe(include="all")

Unnamed: 0,url,city_id,name_hotel,score,coordinates,text_description,Id,City,lat,lon,temp,pop,clouds,weather_main
count,875,875.0,874,867.0,875,875,875.0,875,875.0,875.0,875.0,875.0,875.0,875
unique,875,,873,40.0,873,875,,35,,,,,,21
top,https://www.booking.com/hotel/fr/hotel-celine....,,Le Valmer,80.0,"5.68637707337778,45.1749075834269,5.7374569266...",[L'Hôtel Céline - Hôtel de la Gare se trouve d...,,Rouen,,,,,,"[Rain, Clouds, Clouds, Clouds, Rain, Rain, Rain]"
freq,1,,2,50.0,2,1,,25,,,,,,100
mean,,282377900.0,,,,,282377900.0,,45.840986,3.395928,19.771347,0.346531,67.608163,
std,,51374160.0,,,,,51374160.0,,2.554274,2.913222,2.933438,0.188661,15.718124,
min,,76036310.0,,,,,76036310.0,,42.52505,-2.026041,14.641429,0.122857,43.714286,
25%,,297504700.0,,,,,297504700.0,,43.494514,1.354999,17.527143,0.171429,53.0,
50%,,297749100.0,,,,,297749100.0,,45.18756,4.360069,20.0,0.288571,58.857143,
75%,,298011300.0,,,,,298011300.0,,48.584614,5.735782,22.808571,0.544286,83.142857,


In [None]:
# Split the comma-separated coordinate string into a list of its parts.
# Only strings are split: missing values (possible after the outer merge) and
# already-split lists pass through unchanged, so the cell can be re-run safely.
df_booking_weather['coordinates'] = df_booking_weather['coordinates'].apply(
    lambda x : x.split(",") if isinstance(x, str) else x)

In [64]:
# Hotel position: mean of the 1st and 3rd coordinate values (longitude)
# and of the 2nd and 4th values (latitude)
coords = df_booking_weather['coordinates']
df_booking_weather['long_hotel'] = coords.apply(
    lambda parts: mean([float(parts[0]), float(parts[2])]))
df_booking_weather['lat_hotel'] = coords.apply(
    lambda parts: mean([float(parts[1]), float(parts[3])]))

# Remove the raw coordinate lists and the 'Id' merge key from the frame, in place
df_booking_weather.drop(columns=['coordinates', 'Id'], inplace=True)