As the project has just started, your team doesn't have any data that can be used to create this application. Therefore, your job will be to:

Scrape data from destinations
Get weather data from each destination
Get hotels' info about each destination
Store all the information above in a data lake
Extract, transform and load cleaned data from your datalake to a data warehouse

In [1]:
# import libraries
import requests
import json
import pandas as pd
from statistics import mean
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go


Marketing team wants to focus first on the best cities to travel to in France. According One Week In.com here are the top-35 cities to visit in France:

In [2]:
# liste des villes étudiées

top_35_cities = ["Mont Saint Michel",
"St Malo",
"Bayeux",
"Le Havre",
"Rouen",
"Paris",
"Amiens",
"Lille",
"Strasbourg",
"Chateau du Haut Koenigsbourg",
"Colmar",
"Eguisheim",
"Besancon",
"Dijon",
"Annecy",
"Grenoble",
"Lyon",
"Gorges du Verdon",
"Bormes les Mimosas",
"Cassis",
"Marseille",
"Aix en Provence",
"Avignon",
"Uzes",
"Nimes",
"Aigues Mortes",
"Saintes Maries de la mer",
"Collioure",
"Carcassonne",
"Ariege",
"Toulouse",
"Montauban",
"Biarritz",
"Bayonne",
"La Rochelle"]

In [3]:
# scrapper la geolocalisation des villes
# Use https://nominatim.org/ 

#  https://nominatim.openstreetmap.org/search?<params>
endpoint = "https://nominatim.openstreetmap.org"

response =  requests.get(endpoint)
print("Response code:",response,"\n \n")

Response code: <Response [200]> 
 



In [4]:
response =  requests.get("https://nominatim.openstreetmap.org/search?q=Ariege&format=json")
print("Response code:",response,"\n \n")
print("Response data:\n")
response.content

Response code: <Response [200]> 
 

Response data:



b'[{"place_id":297389050,"licence":"Data \xc2\xa9 OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright","osm_type":"relation","osm_id":7439,"boundingbox":["42.5732416","43.3162514","0.8267506","2.1758135"],"lat":"42.9455368","lon":"1.4065544156065486","display_name":"Ari\xc3\xa8ge, Occitanie, France m\xc3\xa9tropolitaine, France","class":"boundary","type":"administrative","importance":0.6009114788084189,"icon":"https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png"},{"place_id":299422341,"licence":"Data \xc2\xa9 OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright","osm_type":"relation","osm_id":13625918,"boundingbox":["51.1624195","51.199616","-56.0111241","-55.9723346"],"lat":"51.18111155","lon":"-55.98447423947262","display_name":"Ariege (Belvy) Bay, Main Brook, Newfoundland, Newfoundland and Labrador, Canada","class":"natural","type":"bay","importance":0.29999999999999993},{"place_id":135201531,"licence":"Data \xc2\xa9 OpenStreet

In [5]:
response.json()

[{'place_id': 297389050,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
  'osm_type': 'relation',
  'osm_id': 7439,
  'boundingbox': ['42.5732416', '43.3162514', '0.8267506', '2.1758135'],
  'lat': '42.9455368',
  'lon': '1.4065544156065486',
  'display_name': 'Ariège, Occitanie, France métropolitaine, France',
  'class': 'boundary',
  'type': 'administrative',
  'importance': 0.6009114788084189,
  'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png'},
 {'place_id': 299422341,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
  'osm_type': 'relation',
  'osm_id': 13625918,
  'boundingbox': ['51.1624195', '51.199616', '-56.0111241', '-55.9723346'],
  'lat': '51.18111155',
  'lon': '-55.98447423947262',
  'display_name': 'Ariege (Belvy) Bay, Main Brook, Newfoundland, Newfoundland and Labrador, Canada',
  'class': 'natural',
  'type': 'bay',
  'importance': 0.299999999999

In [6]:
# intialisation dataframe vide
df_cities = pd.DataFrame(columns = ['Id', 'City' , 'Latitude', 'Longitude'])

In [7]:
# browse the list of 35 cities
for counter, city in enumerate(top_35_cities):
    response =  requests.get(f"https://nominatim.openstreetmap.org/search?q={city}&format=json")
    # Complete the dataframe
    df_cities.loc[counter] = [response.json()[0]['place_id'],
                              city,
                              response.json()[0]['lat'],
                              response.json()[0]['lon']]

In [8]:
df_cities

Unnamed: 0,Id,City,Latitude,Longitude
0,156094680,Mont Saint Michel,48.6359541,-1.511459954959514
1,297756747,St Malo,48.649518,-2.0260409
2,297981358,Bayeux,49.2764624,-0.7024738
3,298137491,Le Havre,49.4938975,0.1079732
4,297518815,Rouen,49.4404591,1.0939658
5,297417241,Paris,48.8588897,2.3200410217200766
6,297534793,Amiens,49.8941708,2.2956951
7,297472400,Lille,50.6365654,3.0635282
8,297508568,Strasbourg,48.584614,7.7507127
9,120791766,Chateau du Haut Koenigsbourg,48.249489800000006,7.34429620253195


In [9]:
len(df_cities)

35

In [10]:
df_cities.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 0 to 34
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         35 non-null     int64 
 1   City       35 non-null     object
 2   Latitude   35 non-null     object
 3   Longitude  35 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.4+ KB


In [11]:
# scrap the weather
# Use https://openweathermap.org/appid 

# some information about the weather for the 35 cities
# https://api.openweathermap.org/data/2.5/onecall?lat={lat}&lon={lon}&exclude={part}&appid={API key}

# import libraries

from credentials import APIKEY 

# API key
APPID = APIKEY

EXCLUDE = 'current,minutely,hourly'

In [12]:
#  https://nominatim.openstreetmap.org/search?<params>
endpoint = "https://api.openweathermap.org/data/2.5/onecall?"

# example
params = {'exclude' : EXCLUDE ,
          'appid' : APPID ,
          'lat' : '48.6359541' ,
          'lon' : -1.511459954959514,
          'units' : 'metric'}

response =  requests.get(endpoint, params = params)
print("Response code:",response,"\n \n")
response.json()

Response code: <Response [200]> 
 



{'lat': 48.636,
 'lon': -1.5115,
 'timezone': 'Europe/Paris',
 'timezone_offset': 7200,
 'daily': [{'dt': 1665486000,
   'sunrise': 1665469177,
   'sunset': 1665509143,
   'moonrise': 1665511860,
   'moonset': 1665475320,
   'moon_phase': 0.55,
   'temp': {'day': 15.65,
    'min': 9.03,
    'max': 17.5,
    'night': 11.15,
    'eve': 15.31,
    'morn': 9.31},
   'feels_like': {'day': 14.63, 'night': 10.02, 'eve': 14.62, 'morn': 7.12},
   'pressure': 1025,
   'humidity': 52,
   'dew_point': 5.84,
   'wind_speed': 4.25,
   'wind_deg': 46,
   'wind_gust': 10.13,
   'weather': [{'id': 803,
     'main': 'Clouds',
     'description': 'broken clouds',
     'icon': '04d'}],
   'clouds': 61,
   'pop': 0,
   'uvi': 2.63},
  {'dt': 1665572400,
   'sunrise': 1665555666,
   'sunset': 1665595422,
   'moonrise': 1665599520,
   'moonset': 1665566220,
   'moon_phase': 0.59,
   'temp': {'day': 16.83,
    'min': 9.76,
    'max': 18.84,
    'night': 12.84,
    'eve': 15.2,
    'morn': 10.27},
   'feels_li

In [13]:
import datetime

# date of extraction
date = response.json()['daily'][0]['dt']
readable = datetime.datetime.fromtimestamp(date).isoformat()
readable

'2022-10-11T13:00:00'

In [14]:
# params
params = {'exclude' : EXCLUDE ,
          'appid' : APPID ,
          'units' : 'metric'}

# initialization of an empty dataframe
df_weather = pd.DataFrame(columns = ['Id', 'City','lat','lon','temperature', 'clear_weather'])

# browse the list of 35 cities by latitude and longitude
for counter, (id, city, lat, lon) in enumerate(zip(df_cities['Id'],
                                                   df_cities['City'],
                                                   df_cities['Latitude'],
                                                   df_cities['Longitude'])):
    # latitude and longitude of each city
    localpoint = endpoint + f"lat={lat}&lon={lon}"
    response =  requests.get(localpoint, params = params)
    
    # Retrieves weather information for the next 7 days
    temp = []
    clear_weather = []
    for i in range(1, len(response.json()['daily'])):
        # Day temperature
        temp.append((response.json()['daily'][i]['temp']['day']))
        # clear weather: 100 - Cloudiness, %
        clear_weather.append(100 - (response.json()['daily'][i]['clouds']))
        
    # Complete the dataframe
    df_weather.loc[counter] = [id, city, float(lat), float(lon), mean(temp), mean(clear_weather)]

In [15]:
df_weather

Unnamed: 0,Id,City,lat,lon,temperature,clear_weather
0,156094680,Mont Saint Michel,48.635954,-1.51146,16.17,1.0
1,297756747,St Malo,48.649518,-2.026041,16.191429,0.714286
2,297981358,Bayeux,49.276462,-0.702474,16.74,1.285714
3,298137491,Le Havre,49.493898,0.107973,16.251429,4.142857
4,297518815,Rouen,49.440459,1.093966,17.388571,3.857143
5,297417241,Paris,48.85889,2.320041,17.36,3.0
6,297534793,Amiens,49.894171,2.295695,16.26,0.142857
7,297472400,Lille,50.636565,3.063528,16.537143,1.714286
8,297508568,Strasbourg,48.584614,7.750713,17.812857,27.857143
9,120791766,Chateau du Haut Koenigsbourg,48.24949,7.344296,15.484286,35.428571


In [104]:
fig = px.scatter_mapbox(
    df_weather,
    lat = "lat",
    lon = "lon",
    color = "temperature",
    hover_name = "City",
    size = "clear_weather",
    zoom = 3.9,
    mapbox_style = "carto-positron",
    width = 800,
    title = "destination depending on the weather and temperature")
fig.show()

In [17]:
df_weather.to_csv("cities_weather.csv")

## Scrape Booking.com

Since BookingHoldings doesn't have aggregated databases, it will be much faster to scrape data directly from booking.com

You can scrap as many information asyou want, but we suggest that you get at least:

hotel name,
Url to its booking.com page,
Its coordinates: latitude and longitude
Score given by the website users
Text description of the hotel

In [18]:
!python bookingHotel.py

2022-10-11 13:47:24 [scrapy.utils.log] INFO: Scrapy 2.6.3 started (bot: scrapybot)
2022-10-11 13:47:24 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 2.0.1, Twisted 22.8.0, Python 3.10.4 (tags/v3.10.4:9d38120, Mar 23 2022, 23:13:41) [MSC v.1929 64 bit (AMD64)], pyOpenSSL 22.1.0 (OpenSSL 3.0.5 5 Jul 2022), cryptography 38.0.1, Platform Windows-10-10.0.19043-SP0
2022-10-11 13:47:24 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 20, 'USER_AGENT': 'Chrome/97.0'}
2022-10-11 13:47:24 [scrapy.extensions.telnet] INFO: Telnet Password: a42cfe2060f3f16e
2022-10-11 13:47:24 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2022-10-11 13:47:24 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy

In [19]:
df_booking = pd.read_json("src/booking_hotel.json", encoding='utf-8')

In [20]:
df_booking.to_csv("booking_weather.csv")

Join columns from df_booking with df_weather by the id city

In [21]:
df_booking_weather = df_booking.merge(df_weather, left_on='city_id', right_on='Id', how='outer')
df_booking_weather.head()

Unnamed: 0,url,city_id,name_hotel,score,coordinates,text_description,Id,City,lat,lon,temperature,clear_weather
0,https://www.booking.com/hotel/fr/gites-bellevu...,156094680,Gites Bellevue,92,"-1.54444441066926,48.5899146496683,-1.49000302...",[Vous pouvez bénéficier d'une réduction Genius...,156094680,Mont Saint Michel,48.635954,-1.51146,16.17,1.0
1,https://www.booking.com/hotel/fr/maison-au-pie...,156094680,Maison au pied du Mont Saint Michel,92,"-1.5127539244992,48.5947081636619,-1.458307366...","[Offrant une vue sur le jardin, la Maison au p...",156094680,Mont Saint Michel,48.635954,-1.51146,16.17,1.0
2,https://www.booking.com/hotel/fr/26-route-de-l...,156094680,Au Mont Chez Nous Jaccuzi et Sauna,92,"-1.51309600058896,48.5954898834269,-1.45864859...","[Situé à Pontorson, à seulement 8,6 km de l'ab...",156094680,Mont Saint Michel,48.635954,-1.51146,16.17,1.0
3,https://www.booking.com/hotel/fr/chambres-d-ha...,156094680,Chambres d'Hôtes Les Vieilles Digues,93,"-1.53881782815031,48.5862220878419,-1.48438041...","[Occupant une maison bretonne restaurée, l'éta...",156094680,Mont Saint Michel,48.635954,-1.51146,16.17,1.0
4,https://www.booking.com/hotel/fr/mon-saint-mic...,156094680,Mon Saint Michel,88,"-1.51301479316993,48.5956615834269,-1.45856720...",[Vous pouvez bénéficier d'une réduction Genius...,156094680,Mont Saint Michel,48.635954,-1.51146,16.17,1.0


In [22]:
df_booking_weather.describe(include="all")

Unnamed: 0,url,city_id,name_hotel,score,coordinates,text_description,Id,City,lat,lon,temperature,clear_weather
count,875,875.0,875,857.0,875,875,875.0,875,875.0,875.0,875.0,875.0
unique,875,,874,41.0,871,875,,35,,,,
top,https://www.booking.com/hotel/fr/gites-bellevu...,,Le Valmer,84.0,"2.32619403797062,43.1931925834269,2.3755839620...",[Vous pouvez bénéficier d'une réduction Genius...,,Mont Saint Michel,,,,
freq,1,,2,51.0,2,1,,25,,,,
mean,,282377900.0,,,,,282377900.0,,45.840986,3.395928,19.899265,23.840816
std,,51374160.0,,,,,51374160.0,,2.554274,2.913222,2.723061,16.565576
min,,76036310.0,,,,,76036310.0,,42.52505,-2.026041,15.484286,0.142857
25%,,297504700.0,,,,,297504700.0,,43.494514,1.354999,17.36,4.142857
50%,,297749100.0,,,,,297749100.0,,45.18756,4.360069,20.325714,24.142857
75%,,298011300.0,,,,,298011300.0,,48.584614,5.735782,22.81,37.285714


In [86]:
df_booking_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 875 entries, 0 to 874
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   url               875 non-null    object 
 1   city_id           875 non-null    int64  
 2   name_hotel        875 non-null    object 
 3   score             857 non-null    float64
 4   text_description  875 non-null    object 
 5   City              875 non-null    object 
 6   lat               875 non-null    float64
 7   lon               875 non-null    float64
 8   temperature       875 non-null    float64
 9   clear_weather     875 non-null    float64
 10  long_hotel        875 non-null    float64
 11  lat_hotel         875 non-null    float64
dtypes: float64(7), int64(1), object(4)
memory usage: 88.9+ KB


In [23]:
# transforms the score type to integer instead of characters
df_booking_weather['score'] = df_booking_weather['score'].str.replace(",", ".")
df_booking_weather['score'] = df_booking_weather['score'].astype(float)
# calculate latitude and longitude coordinates
df_booking_weather['coordinates'] = df_booking_weather['coordinates'].apply(lambda x : x.split(","))
df_booking_weather['long_hotel'] = df_booking_weather['coordinates'].apply(lambda x : mean([float(x[0]),float(x[2])]))
df_booking_weather['lat_hotel'] = df_booking_weather['coordinates'].apply(lambda x : mean([float(x[1]),float(x[3])]))
# drop the old column
df_booking_weather.drop(columns=['coordinates','Id'], inplace = True)

Top-5 destinations

In [57]:
top_5_cities = df_weather.sort_values(by = 'clear_weather', ascending = False).head(5)[["Id","City"]]
top_5_cities

Unnamed: 0,Id,City
21,297906924,Aix en Provence
20,297763730,Marseille
19,298072685,Cassis
17,76036307,Gorges du Verdon
32,298516909,Biarritz


In [107]:
for Id , city in top_5_cities.values:
     mask = df_booking_weather["city_id"]== Id
     df = df_booking_weather.loc[mask,:]
     fig = px.scatter_mapbox(
          df,
          lat ="lat_hotel",
          lon = "long_hotel",
          color = "score",
          hover_name = "name_hotel",
          zoom = 6,
          mapbox_style = "open-street-map",
          width = 800,
          title = f'{city}')
     fig.show()    
     