![Kayak](https://seekvectorlogo.com/wp-content/uploads/2018/01/kayak-vector-logo.png)

# Plan your trip with Kayak

# 1. Import libraries

In [1]:
import pandas as pd 
import numpy as np
import plotly.express as px
from geopy.geocoders import Nominatim
from meteofrance_api import MeteoFranceClient
from datetime import datetime
import json
import boto3
import requests
from sqlalchemy import create_engine
import os

from dotenv import load_dotenv
import dotenv

# 2. Data : list of place 

In [4]:
holy_place = [
"Mont Saint Michel",
"St Malo",
"Bayeux",
"Le Havre",
"Rouen",
"Paris",
"Amiens",
"Lille",
"Strasbourg",
"Chateau du Haut Koenigsbourg",
"Colmar",
"Eguisheim",
"Besancon",
"Dijon",
"Annecy",
"Grenoble",
"Lyon",
"Gorges du Verdon",
"Bormes les Mimosas",
"Cassis",
"Marseille",
"Aix en Provence",
"Avignon",
"Uzes",
"Nimes",
"Aigues Mortes",
"Saintes Maries de la mer",
"Collioure",
"Carcassonne",
"Ariege",
"Toulouse",
"Montauban",
"Biarritz",
"Bayonne",
"La Rochelle"
]

# 3. API call for cities coordonates & weather

## 3.1. API call

In [None]:
# Initialize geocoder based on OpenStreetMap + weather client
geolocator = Nominatim(timeout=10, user_agent="meteo_app")
client = MeteoFranceClient()

In [None]:
# Creation of function to fetch coordinates & weather on 7 days
def get_forecast_by_city_fordf(city_name):
    """Returns the D→D+7 forecasts for a given city"""
    # Geocoding
    location = geolocator.geocode(f'{city_name}, France')
    if not location:
        return city_name, None

    # Retrieve weather forecast + id
    forecast = client.get_forecast(location.latitude, location.longitude)
    
    # Extract the next 7 days
    daily = forecast.daily_forecast[:7]
    id_place = forecast.position['insee']
    return city_name, daily, id_place, location.latitude, location.longitude

In [None]:
# Creation of empty variable to put futur data
data_weather = []

# loop to fetch infos on each location
for place in holy_place:
    city, daily_forecast, id_place, lat, lon = get_forecast_by_city_fordf(place)
    if daily_forecast:
        for day in daily_forecast:
            row = {
                "insee": id_place,                
                "ville": city,
                "lat": lat,
                "lon": lon,
                "date": datetime.fromtimestamp(day["dt"]).strftime("%Y-%m-%d"),
                "tmin": day["T"]["min"],
                "tmax": day["T"]["max"],
                "pluie": day['precipitation']['24h']
            }
            data_weather.append(row)

# Creation of df with all data
df_weather = pd.DataFrame(data_weather)
df_weather

Unnamed: 0,insee,ville,lat,lon,date,tmin,tmax,pluie
0,5035351,Mont Saint Michel,48.635954,-1.511460,2025-10-20,12.6,17.4,1.7
1,5035351,Mont Saint Michel,48.635954,-1.511460,2025-10-21,11.4,16.9,0.8
2,5035351,Mont Saint Michel,48.635954,-1.511460,2025-10-22,12.1,17.9,15.0
3,5035351,Mont Saint Michel,48.635954,-1.511460,2025-10-23,10.6,13.5,21.6
4,5035351,Mont Saint Michel,48.635954,-1.511460,2025-10-24,7.6,13.6,13.4
...,...,...,...,...,...,...,...,...
240,173000,La Rochelle,46.159732,-1.151595,2025-10-22,15.2,18.2,23.2
241,173000,La Rochelle,46.159732,-1.151595,2025-10-23,12.6,16.0,9.7
242,173000,La Rochelle,46.159732,-1.151595,2025-10-24,12.0,15.5,4.4
243,173000,La Rochelle,46.159732,-1.151595,2025-10-25,11.1,15.7,2.1


In [None]:
# Save df as csv file
pd.DataFrame.to_csv(df_weather, "data/weather.csv", index=False)

## 3.2. Clean data

In [5]:
try:
    # try to use an already existing df
    df_weather
    print("✅ DataFrame déjà disponible en mémoire")

except NameError:
    # if df does not exist → we read it from the CSV
    print("⚠️ DataFrame introuvable → import depuis CSV")
    df_weather = pd.read_csv("data/weather.csv")
    print(df_weather.info())

⚠️ DataFrame introuvable → import depuis CSV
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   insee   245 non-null    int64  
 1   ville   245 non-null    object 
 2   lat     245 non-null    float64
 3   lon     245 non-null    float64
 4   date    245 non-null    object 
 5   tmin    245 non-null    float64
 6   tmax    245 non-null    float64
 7   pluie   245 non-null    float64
dtypes: float64(5), int64(1), object(2)
memory usage: 15.4+ KB
None


In [6]:
# Calculate mean temperature for each day and place
df_weather['td_mean'] = np.mean(df_weather[['tmin', 'tmax']], axis=1)
df_weather.head()

Unnamed: 0,insee,ville,lat,lon,date,tmin,tmax,pluie,td_mean
0,5035351,Mont Saint Michel,48.635954,-1.51146,2025-10-20,12.6,17.4,1.7,15.0
1,5035351,Mont Saint Michel,48.635954,-1.51146,2025-10-21,11.4,16.9,0.8,14.15
2,5035351,Mont Saint Michel,48.635954,-1.51146,2025-10-22,12.1,17.9,15.0,15.0
3,5035351,Mont Saint Michel,48.635954,-1.51146,2025-10-23,10.6,13.5,21.6,12.05
4,5035351,Mont Saint Michel,48.635954,-1.51146,2025-10-24,7.6,13.6,13.4,10.6


In [7]:
# Copy to work without deleting data
df_w = df_weather.copy(deep=True)

# Calculate mean precipitation on the 7 days for each place 
df_w['pluiew_mean'] = df_w.groupby('insee')['pluie'].transform("mean")

# Calculate mean temperature on the 7 days for each place
df_w['tw_mean'] = df_w.groupby('insee')['td_mean'].transform("mean")

df_w.head()

Unnamed: 0,insee,ville,lat,lon,date,tmin,tmax,pluie,td_mean,pluiew_mean,tw_mean
0,5035351,Mont Saint Michel,48.635954,-1.51146,2025-10-20,12.6,17.4,1.7,15.0,10.457143,12.385714
1,5035351,Mont Saint Michel,48.635954,-1.51146,2025-10-21,11.4,16.9,0.8,14.15,10.457143,12.385714
2,5035351,Mont Saint Michel,48.635954,-1.51146,2025-10-22,12.1,17.9,15.0,15.0,10.457143,12.385714
3,5035351,Mont Saint Michel,48.635954,-1.51146,2025-10-23,10.6,13.5,21.6,12.05,10.457143,12.385714
4,5035351,Mont Saint Michel,48.635954,-1.51146,2025-10-24,7.6,13.6,13.4,10.6,10.457143,12.385714


In [8]:
# Keep 1 line for each city
df_w1 = df_w.groupby(by='insee').last().reset_index()

# Drop useless columns
df_w2 = df_w1.drop(columns=['tmin', 'tmax', 'pluie', 'td_mean'])

df_w2.head()

Unnamed: 0,insee,ville,lat,lon,date,pluiew_mean,tw_mean
0,41440,Gorges du Verdon,43.749656,6.328562,2025-10-26,10.042857,9.528571
1,91960,Ariege,42.945537,1.406554,2025-10-26,4.857143,11.45
2,110690,Carcassonne,43.213036,2.349107,2025-10-26,2.728571,15.478571
3,130010,Aix en Provence,43.529842,5.447474,2025-10-26,5.257143,14.292857
4,130220,Cassis,43.214036,5.539632,2025-10-26,3.357143,16.971429


In [None]:
# Save to csv file
pd.DataFrame.to_csv(df_w2, "data/cities_weather.csv", index=False)

In [9]:
# Keep only cities with mean precipitations less than 3mm
df_w3 = df_w2[df_w2['pluiew_mean'] <= 3 ]
df_w3.head(10)

Unnamed: 0,insee,ville,lat,lon,date,pluiew_mean,tw_mean
2,110690,Carcassonne,43.213036,2.349107,2025-10-26,2.728571,15.478571
5,130550,Marseille,43.296174,5.369953,2025-10-26,1.785714,17.071429
10,300030,Aigues Mortes,43.566152,4.19154,2025-10-26,1.328571,16.921429
11,301890,Nimes,43.837425,4.360069,2025-10-26,1.785714,16.264286
17,660530,Collioure,42.52505,3.083155,2025-10-26,0.742857,18.0
31,1309651,Saintes Maries de la mer,43.451592,4.42772,2025-10-26,0.771429,17.014286


In [10]:
# Extract in a new df the 5 cities with the best mean temperature
df_w4 = df_w3.sort_values(by='tw_mean', ascending=False)
df_w5 = df_w4.head(5)
df_w5

Unnamed: 0,insee,ville,lat,lon,date,pluiew_mean,tw_mean
17,660530,Collioure,42.52505,3.083155,2025-10-26,0.742857,18.0
5,130550,Marseille,43.296174,5.369953,2025-10-26,1.785714,17.071429
31,1309651,Saintes Maries de la mer,43.451592,4.42772,2025-10-26,0.771429,17.014286
10,300030,Aigues Mortes,43.566152,4.19154,2025-10-26,1.328571,16.921429
11,301890,Nimes,43.837425,4.360069,2025-10-26,1.785714,16.264286


## 3.3. Visualization

In [11]:
# Map visualization
fig = px.scatter_map(df_w5, lat="lat", lon="lon", color="tw_mean",
                     hover_name="ville", hover_data=["pluiew_mean", "tw_mean"], 
                     zoom=5, size='tw_mean', height=500, width=600)
fig.update_geos(fitbounds="locations")
fig.update_layout(map_style="basic")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

# 4. Scrapping Booking.com

## 4.1. Retrieve URL

Activation of a spider that goes directly to booking.com and searches the URL of 20 hotels for each city,   
on anaconda prompt to ensure the spider is launched in the correct environment.
* Activate the environment:
-> ``` (base) C:\Users\Username> conda activate scrap_py311 ```
* Retrieve the script from its path:
-> ``` (scrap_py311) C:\Users\Username> cd "C:\Users\Username\projet_kayak\src" ```
* Run the script:
-> ``` (scrap_py311) C:\Users\Username\projet_kayak\src> python booking_url_hotel.py ```

## 4.2. Retrieve hotel's info via scrapy 

Activation of a spider that goes directly to booking.com and searches the informations for each hotel,   
on anaconda prompt to ensure the spider is launched in the correct environment.
* Activate the environment:
-> ``` (base) C:\Users\Username> conda activate scrap_py311 ```
* Retrieve the script from its path:
-> ``` (scrap_py311) C:\Users\Username> cd "C:\Users\Username\projet_kayak\src" ```
* Run the script:
-> ``` (scrap_py311) C:\Users\Username\projet_kayak\src> python booking_info_hotel.py ```

## 4.3. Merge Data Weather and Hotels Scrapping

In [12]:
try:
    # try to use an already existing df
    df_w2
    print("✅ DataFrame déjà disponible en mémoire")

except NameError:
    # if df does not exist → we read it from the CSV
    print("⚠️ DataFrame introuvable → import depuis CSV")
    df_w2 = pd.read_csv('data/cities_weather.csv')
    print(df_w2.info())


✅ DataFrame déjà disponible en mémoire


In [13]:
df_w2.head()

Unnamed: 0,insee,ville,lat,lon,date,pluiew_mean,tw_mean
0,41440,Gorges du Verdon,43.749656,6.328562,2025-10-26,10.042857,9.528571
1,91960,Ariege,42.945537,1.406554,2025-10-26,4.857143,11.45
2,110690,Carcassonne,43.213036,2.349107,2025-10-26,2.728571,15.478571
3,130010,Aix en Provence,43.529842,5.447474,2025-10-26,5.257143,14.292857
4,130220,Cassis,43.214036,5.539632,2025-10-26,3.357143,16.971429


In [14]:
# Charger le JSON
with open("data/hotels_details.json", "r", encoding="utf-8") as f:
    hotels = json.load(f)

# Convertir en DataFrame
df_hotels = pd.DataFrame(hotels)

# Merge avec df_top5_insee
df_merged = pd.merge(df_hotels, df_w2, on='ville', how="right")

df_merged.head()

Unnamed: 0,ville,url,nom,note,adresse,description,insee,lat,lon,date,pluiew_mean,tw_mean
0,Gorges du Verdon,https://www.booking.com/hotel/fr/charming-stud...,CHARMING studio proche centre,Avec une note de 10,"243 Route des Gorges du Verdon, 04120 Castella...","Situé à Castellane, l’hébergement CHARMING stu...",41440,43.749656,6.328562,2025-10-26,10.042857,9.528571
1,Gorges du Verdon,https://www.booking.com/hotel/fr/la-bastide-de...,La Bastide des Marcassins,Avec une note de 10,"81 All. du Rossignol, 83630 Moissac-Bellevue, ...","Situé à Moissac-Bellevue, l’hébergement La Bas...",41440,43.749656,6.328562,2025-10-26,10.042857,9.528571
2,Gorges du Verdon,https://www.booking.com/hotel/fr/chambres-d-ho...,Gorges du Verdon - Chambres d'Hotes des arches...,Avec une note de 9.6,"Chambre des arches 2388 Rte de Chasteuil, 0412...","Situé à Castellane, l’hébergement Gorges du Ve...",41440,43.749656,6.328562,2025-10-26,10.042857,9.528571
3,Gorges du Verdon,https://www.booking.com/hotel/fr/montagnac-vil...,Montagnac Village House - Verdon,Avec une note de 9.5,"53 Route Nouvelle, 04500 Montagnac, France",L’hébergement Montagnac Village House - Verdon...,41440,43.749656,6.328562,2025-10-26,10.042857,9.528571
4,Gorges du Verdon,https://www.booking.com/hotel/fr/le-petit-para...,Le Petit Paradis Provençal,Avec une note de 9.6,"261C Imp. du Bosquet, 04500 Allemagne-en-Prove...",Possédant une piscine extérieure ouverte en sa...,41440,43.749656,6.328562,2025-10-26,10.042857,9.528571


In [None]:
# Reconvertir en liste de dicts
hotels_enriched = df_merged.to_dict(orient="records")

# Sauvegarder en JSON enrichi
with open("data/all_hotels_details_insee.json", "w", encoding="utf-8") as f:
    json.dump(hotels_enriched, f, indent=4, ensure_ascii=False)

# 5. Hotels recommendations for best cities destinations

## 5.1. Merge all informations into 1 df
Some hotels are in duplicate because recommended for several cities.   
To not miss then we need to use the 1rst dataset all_cities_urls_hotels.json to complete data/all_hotels_details_insee.json.

### 5.1. Import data

In [15]:
# Import 
with open("data/all_cities_urls_hotels.json", "r") as f:
    all_hotels_url = json.load(f)
df_all_hotels_url = pd.DataFrame(all_hotels_url)

df_all_hotels_url.head()

Unnamed: 0,city,url
0,Gorges du Verdon,https://www.booking.com/hotel/fr/charming-stud...
1,Gorges du Verdon,https://www.booking.com/hotel/fr/la-bastide-de...
2,Gorges du Verdon,https://www.booking.com/hotel/fr/chambres-d-ho...
3,Gorges du Verdon,https://www.booking.com/hotel/fr/montagnac-vil...
4,Gorges du Verdon,https://www.booking.com/hotel/fr/le-petit-para...


In [16]:
df_all_hotels_url.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   city    700 non-null    object
 1   url     700 non-null    object
dtypes: object(2)
memory usage: 11.1+ KB


In [40]:
with open("data/all_hotels_details_insee.json", "r") as f:
    all_hotels_details = json.load(f)
df_all_hotels_details = pd.DataFrame(all_hotels_details)

df_all_hotels_details.head()

Unnamed: 0,ville,url,nom,note,adresse,description,insee,lat,lon,date,pluiew_mean,tw_mean
0,Gorges du Verdon,https://www.booking.com/hotel/fr/charming-stud...,CHARMING studio proche centre,Avec une note de 10,"243 Route des Gorges du Verdon, 04120 Castella...","Situé à Castellane, l’hébergement CHARMING stu...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429
1,Gorges du Verdon,https://www.booking.com/hotel/fr/la-bastide-de...,La Bastide des Marcassins,Avec une note de 10,"81 All. du Rossignol, 83630 Moissac-Bellevue, ...","Situé à Moissac-Bellevue, l’hébergement La Bas...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429
2,Gorges du Verdon,https://www.booking.com/hotel/fr/chambres-d-ho...,Gorges du Verdon - Chambres d'Hotes des arches...,Avec une note de 9.6,"Chambre des arches 2388 Rte de Chasteuil, 0412...","Situé à Castellane, l’hébergement Gorges du Ve...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429
3,Gorges du Verdon,https://www.booking.com/hotel/fr/montagnac-vil...,Montagnac Village House - Verdon,Avec une note de 9.5,"53 Route Nouvelle, 04500 Montagnac, France",L’hébergement Montagnac Village House - Verdon...,41440,43.749656,6.328562,2025-10-06,0.4,10.871429
4,Gorges du Verdon,https://www.booking.com/hotel/fr/le-petit-para...,Le Petit Paradis Provençal,Avec une note de 9.6,"261C Imp. du Bosquet, 04500 Allemagne-en-Prove...",Possédant une piscine extérieure ouverte en sa...,41440,43.749656,6.328562,2025-10-06,0.4,10.871429


In [41]:
df_all_hotels_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ville        689 non-null    object 
 1   url          689 non-null    object 
 2   nom          689 non-null    object 
 3   note         689 non-null    object 
 4   adresse      689 non-null    object 
 5   description  689 non-null    object 
 6   insee        689 non-null    int64  
 7   lat          689 non-null    float64
 8   lon          689 non-null    float64
 9   date         689 non-null    object 
 10  pluiew_mean  689 non-null    float64
 11  tw_mean      689 non-null    float64
dtypes: float64(4), int64(1), object(7)
memory usage: 64.7+ KB


### 5.1.2. Merge all df

In [42]:
df = pd.merge(df_all_hotels_url, df_all_hotels_details, on='url', how="right")

In [43]:
# Check if we keeped all the hotels
(df['ville']).isna().value_counts(ascending=False)

ville
False    700
Name: count, dtype: int64

In [44]:
df.head()

Unnamed: 0,city,url,ville,nom,note,adresse,description,insee,lat,lon,date,pluiew_mean,tw_mean
0,Gorges du Verdon,https://www.booking.com/hotel/fr/charming-stud...,Gorges du Verdon,CHARMING studio proche centre,Avec une note de 10,"243 Route des Gorges du Verdon, 04120 Castella...","Situé à Castellane, l’hébergement CHARMING stu...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429
1,Gorges du Verdon,https://www.booking.com/hotel/fr/la-bastide-de...,Gorges du Verdon,La Bastide des Marcassins,Avec une note de 10,"81 All. du Rossignol, 83630 Moissac-Bellevue, ...","Situé à Moissac-Bellevue, l’hébergement La Bas...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429
2,Gorges du Verdon,https://www.booking.com/hotel/fr/chambres-d-ho...,Gorges du Verdon,Gorges du Verdon - Chambres d'Hotes des arches...,Avec une note de 9.6,"Chambre des arches 2388 Rte de Chasteuil, 0412...","Situé à Castellane, l’hébergement Gorges du Ve...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429
3,Gorges du Verdon,https://www.booking.com/hotel/fr/montagnac-vil...,Gorges du Verdon,Montagnac Village House - Verdon,Avec une note de 9.5,"53 Route Nouvelle, 04500 Montagnac, France",L’hébergement Montagnac Village House - Verdon...,41440,43.749656,6.328562,2025-10-06,0.4,10.871429
4,Gorges du Verdon,https://www.booking.com/hotel/fr/le-petit-para...,Gorges du Verdon,Le Petit Paradis Provençal,Avec une note de 9.6,"261C Imp. du Bosquet, 04500 Allemagne-en-Prove...",Possédant une piscine extérieure ouverte en sa...,41440,43.749656,6.328562,2025-10-06,0.4,10.871429


### 5.1.2. Clean data

In [45]:
# Delete column city
df = df.drop(columns='city')

In [46]:
# Check for strange in 'note' column
df['note'].value_counts(ascending=False)

note
Avec une note de 9      67
Avec une note de 10     58
Avec une note de 8.5    44
Avec une note de 9.4    41
Avec une note de 8.6    39
Avec une note de 9.5    39
Avec une note de 9.1    39
Avec une note de 9.6    34
Avec une note de 9.2    33
Avec une note de 9.3    32
Avec une note de 8.2    28
Avec une note de 8.7    27
Avec une note de 8.8    26
Avec une note de 8      25
Avec une note de 8.9    20
Avec une note de 8.3    19
Avec une note de 9.7    17
Avec une note de 9.8    17
Avec une note de 8.1    16
Non disponible          14
Avec une note de 8.4    14
Avec une note de 7.9     6
Avec une note de 7.3     5
Avec une note de 7.8     4
Avec une note de 7.5     4
Avec une note de 9.9     3
Avec une note de 6       2
Avec une note de 7.7     2
Avec une note de 7.6     2
Avec une note de 6.6     2
Avec une note de 6.3     2
Avec une note de 5.5     1
Avec une note de 7.1     1
Avec une note de 7       1
Avec une note de 6.1     1
Avec une note de 5.3     1
Avec une note de 7.4   

In [47]:
# Keep only the note
df['note'] = df['note'].str.split('de ').str[-1]
df['note'] = pd.to_numeric(df['note'], errors="coerce") # 'coerce' for invalid parsing to be set as NaN.
df['note'].head()

0    10.0
1    10.0
2     9.6
3     9.5
4     9.6
Name: note, dtype: float64

In [48]:
df.head()

Unnamed: 0,url,ville,nom,note,adresse,description,insee,lat,lon,date,pluiew_mean,tw_mean
0,https://www.booking.com/hotel/fr/charming-stud...,Gorges du Verdon,CHARMING studio proche centre,10.0,"243 Route des Gorges du Verdon, 04120 Castella...","Situé à Castellane, l’hébergement CHARMING stu...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429
1,https://www.booking.com/hotel/fr/la-bastide-de...,Gorges du Verdon,La Bastide des Marcassins,10.0,"81 All. du Rossignol, 83630 Moissac-Bellevue, ...","Situé à Moissac-Bellevue, l’hébergement La Bas...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429
2,https://www.booking.com/hotel/fr/chambres-d-ho...,Gorges du Verdon,Gorges du Verdon - Chambres d'Hotes des arches...,9.6,"Chambre des arches 2388 Rte de Chasteuil, 0412...","Situé à Castellane, l’hébergement Gorges du Ve...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429
3,https://www.booking.com/hotel/fr/montagnac-vil...,Gorges du Verdon,Montagnac Village House - Verdon,9.5,"53 Route Nouvelle, 04500 Montagnac, France",L’hébergement Montagnac Village House - Verdon...,41440,43.749656,6.328562,2025-10-06,0.4,10.871429
4,https://www.booking.com/hotel/fr/le-petit-para...,Gorges du Verdon,Le Petit Paradis Provençal,9.6,"261C Imp. du Bosquet, 04500 Allemagne-en-Prove...",Possédant une piscine extérieure ouverte en sa...,41440,43.749656,6.328562,2025-10-06,0.4,10.871429


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   url          700 non-null    object 
 1   ville        700 non-null    object 
 2   nom          700 non-null    object 
 3   note         686 non-null    float64
 4   adresse      700 non-null    object 
 5   description  700 non-null    object 
 6   insee        700 non-null    int64  
 7   lat          700 non-null    float64
 8   lon          700 non-null    float64
 9   date         700 non-null    object 
 10  pluiew_mean  700 non-null    float64
 11  tw_mean      700 non-null    float64
dtypes: float64(5), int64(1), object(6)
memory usage: 65.8+ KB


## 5.2. Get coordinates for each hotel

In [None]:
# Read .env file to get all credentials
dotenv.load_dotenv()

#Set up environmental variables for HERE API
HERE_api_key = os.getenv("HERE_api_key")
HERE_api_URL = os.getenv("HERE_api_URL")


# Verification (optionnel)
print(f"API Key chargée : {HERE_api_key[:10]}..." if HERE_api_key else "❌ Erreur : clé non trouvée")

In [None]:
# Function to find coordinates with address
def geocode_here(address):
    params = {"q": address, "apiKey": HERE_api_key}
    r = requests.get(HERE_api_URL, params=params).json()
    if "items" in r and len(r["items"]) > 0:
        lat_h = r["items"][0]["position"]["lat"]
        lon_h = r["items"][0]["position"]["lng"]
        return lat_h, lon_h
    else:
        return None, None

df[["lat_h", "lon_h"]] = df["adresse"].apply(lambda x: pd.Series(geocode_here(x)))

df.head()

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   url          700 non-null    object 
 1   ville        700 non-null    object 
 2   nom          700 non-null    object 
 3   note         660 non-null    float64
 4   adresse      700 non-null    object 
 5   description  700 non-null    object 
 6   insee        700 non-null    int64  
 7   lat          700 non-null    float64
 8   lon          700 non-null    float64
 9   date         700 non-null    object 
 10  pluiew_mean  700 non-null    float64
 11  tw_mean      700 non-null    float64
 12  lat_h        0 non-null      object 
 13  lon_h        0 non-null      object 
dtypes: float64(5), int64(1), object(8)
memory usage: 76.7+ KB


In [None]:
pd.DataFrame.to_csv(df, "data/hotels_info.csv", index=False)

## 5.3. Visualization of best hotels in best cities

In [19]:
df = pd.read_csv("data/hotels_info.csv")
df.head()

Unnamed: 0,url,ville,nom,note,adresse,description,insee,lat,lon,date,pluiew_mean,tw_mean,lat_h,lon_h
0,https://www.booking.com/hotel/fr/charming-stud...,Gorges du Verdon,CHARMING studio proche centre,10.0,"243 Route des Gorges du Verdon, 04120 Castella...","Situé à Castellane, l’hébergement CHARMING stu...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429,43.84507,6.50694
1,https://www.booking.com/hotel/fr/la-bastide-de...,Gorges du Verdon,La Bastide des Marcassins,10.0,"81 All. du Rossignol, 83630 Moissac-Bellevue, ...","Situé à Moissac-Bellevue, l’hébergement La Bas...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429,43.65037,6.18103
2,https://www.booking.com/hotel/fr/chambres-d-ho...,Gorges du Verdon,Gorges du Verdon - Chambres d'Hotes des arches...,9.6,"Chambre des arches 2388 Rte de Chasteuil, 0412...","Situé à Castellane, l’hébergement Gorges du Ve...",41440,43.749656,6.328562,2025-10-06,0.4,10.871429,43.83677,6.42174
3,https://www.booking.com/hotel/fr/montagnac-vil...,Gorges du Verdon,Montagnac Village House - Verdon,9.5,"53 Route Nouvelle, 04500 Montagnac, France",L’hébergement Montagnac Village House - Verdon...,41440,43.749656,6.328562,2025-10-06,0.4,10.871429,43.77882,6.09671
4,https://www.booking.com/hotel/fr/le-petit-para...,Gorges du Verdon,Le Petit Paradis Provençal,9.6,"261C Imp. du Bosquet, 04500 Allemagne-en-Prove...",Possédant une piscine extérieure ouverte en sa...,41440,43.749656,6.328562,2025-10-06,0.4,10.871429,43.78818,6.01919


In [20]:
fig = px.scatter_map(df, lat="lat_h", lon="lon_h", color="note",
                     color_continuous_scale="Turbo",
                     hover_name="nom", hover_data=["url", "adresse"],
                     zoom=4, height=500, width=600)
fig.update_traces(marker_size=10)
fig.update_layout(map_style="basic")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

# 6. Online storing 

## 6.1. AWS S3 Bucket

Storing dataset into S3 as a csv file.

In [None]:
# Load environment variables from .env file
dotenv.load_dotenv()
AWS_access_key=os.getenv('AWS_access_key')
AWS_secret_access_key=os.getenv('AWS_secret_access_key')

# Verification
print(f"AWS key : {AWS_access_key[:10]}..." if AWS_access_key else "❌ Erreur : clé non trouvée")

In [None]:
# Create a client which is going to interact with our S3 Bucket
s3_client = boto3.client(
    's3',
    aws_access_key_id=os.getenv('AWS_access_key'),
    aws_secret_access_key=os.getenv('AWS_secret_access_key'),
    region_name=os.getenv('AWS_REGION', 'eu-west-3')
)
bucket_name=os.getenv('S3_BUCKET_NAME')

print(f"AWS S3 Bucket name : {bucket_name}")

In [None]:
# Ask S3_client to upload hotels_info.csv file into our S3 Bucket with name "hotels_info.csv"
s3_client.upload_file('data/hotels_info.csv', bucket_name, 'hotels_info.csv')
print(f"hotels_info.csv has been uploaded to {bucket_name} AWS S3 Bucket.")

## 6.2. NEON Database
Creating a SQL Database using NEONcb to extract clean data from AWS S3 Bucket and store it in NEON Database.

In [2]:
s3_client = boto3.client(
    's3',
    aws_access_key_id=os.getenv('AWS_access_key'),
    aws_secret_access_key=os.getenv('AWS_secret_access_key'),
    region_name=os.getenv('AWS_REGION', 'eu-west-3')
)
bucket_name = os.getenv('S3_BUCKET_NAME')

NEON_db = os.getenv('NEON_URI')

print(f"Début de NEON_db : {NEON_db[:10]}...")

Début de NEON_db : postgresql...


In [3]:
# Get s3 object from bucket
Object_s3 = s3_client.get_object(Bucket=bucket_name, Key='hotels_info.csv')

# Extract from s3 as csv
df_test = pd.read_csv(Object_s3['Body'])

# SQL method to extract & load
engine = create_engine(NEON_db, echo=True)
print(f"Type de test_engine: {type(engine)}")

# Send csv to neon db with engine
df_test.to_sql("hotels_info", engine, if_exists="replace", index=True)

Type de test_engine: <class 'sqlalchemy.engine.base.Engine'>
2025-10-23 15:59:42,667 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-10-23 15:59:42,668 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-23 15:59:42,728 INFO sqlalchemy.engine.Engine select current_schema()
2025-10-23 15:59:42,730 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-23 15:59:42,790 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-10-23 15:59:42,791 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-23 15:59:42,848 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-10-23 15:59:42,861 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.p

700