# Projet KAYAK

In [1]:
#installing scrappy
!pip install Scrapy -q

In [2]:
#importing libraries 
import os
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "iframe_connected"
import requests
from datetime import datetime

In [3]:
#list of cities in France to recommand 
cities = ["Mont Saint Michel", "Saint Malo", "Bayeux", "Le Havre", "Rouen", "Paris", "Amiens", "Lille", "Strasbourg",
            "Chateau du Haut Koenigsbourg", "Colmar", "Eguisheim", "Besancon", "Dijon", "Annecy", "Grenoble", "Lyon",
            "Gorges du Verdon", "Bormes les Mimosas", "Cassis", "Marseille", "Aix en Provence", "Avignon", "Uzes", "Nimes",
            "Aigues Mortes", "Saintes Maries de la mer", "Collioure", "Carcassonne", "Ariege", "Toulouse", "Montauban",
            "Biarritz", "Bayonne", "La Rochelle"]

## I- Getting gps coordinates and weather forecast with api query 

In [4]:
# First i will develop the code for one city, Paris
paris_gps = requests.get("https://nominatim.openstreetmap.org/search?q=Paris+France&format=geojson").json()['features'][0]['geometry']['coordinates']
display(paris_gps)

[2.3200410217200766, 48.8588897]

In [5]:
# getting weather info from api.openweathermap with the gps coordinates 
key = "274c0a55230e8d4d1cb650d6dfd9b351"
r = requests.get("https://api.openweathermap.org/data/2.5/onecall?lat={}&lon={}&exclude=current,minutely,hourly,alerts&units=metric&appid={}".format(paris_gps[1],paris_gps[0], key)).json()

In [6]:
display(r)

{'lat': 48.8589,
 'lon': 2.32,
 'timezone': 'Europe/Paris',
 'timezone_offset': 7200,
 'daily': [{'dt': 1652785200,
   'sunrise': 1652760404,
   'sunset': 1652815662,
   'moonrise': 1652823840,
   'moonset': 1652762640,
   'moon_phase': 0.55,
   'temp': {'day': 24.52,
    'min': 17.77,
    'max': 27.54,
    'night': 20.82,
    'eve': 26.61,
    'morn': 18.93},
   'feels_like': {'day': 24.36, 'night': 20.92, 'eve': 26.61, 'morn': 18.76},
   'pressure': 1019,
   'humidity': 51,
   'dew_point': 13.73,
   'wind_speed': 3.43,
   'wind_deg': 133,
   'wind_gust': 7.46,
   'weather': [{'id': 500,
     'main': 'Rain',
     'description': 'light rain',
     'icon': '10d'}],
   'clouds': 37,
   'pop': 0.47,
   'rain': 0.3,
   'uvi': 6.94},
  {'dt': 1652871600,
   'sunrise': 1652846730,
   'sunset': 1652902142,
   'moonrise': 0,
   'moonset': 1652851800,
   'moon_phase': 0.59,
   'temp': {'day': 25,
    'min': 16.63,
    'max': 26.73,
    'night': 20.7,
    'eve': 26.19,
    'morn': 16.66},
   'fe

In [7]:
#these are the parmaters chosen to calculate average forcasted weather for each city over the next 7 days
weather_params = ['sunrise','sunset','temp','humidity']

#create a function that request from openweathermap weather forecast data for each city in France given its gps position
def weather_forecast(city):
    key = "274c0a55230e8d4d1cb650d6dfd9b351"
    city_gps = requests.get("https://nominatim.openstreetmap.org/search?q={}+France&format=geojson".format(city)).json()['features'][0]['geometry']['coordinates']
    df = pd.DataFrame(columns = ['temp','humidity'])
    r = requests.get("https://api.openweathermap.org/data/2.5/onecall?lat={}&lon={}&exclude=current,minutely,hourly,alerts&units=metric&appid={}".format(city_gps[1],city_gps[0], key)).json()
    for i in range(8):
        r['daily'][i] # get forcast weather of day i
        newDict = {key: value for (key, value) in r['daily'][i].items() if key in weather_params} #keeping only weather parameters needed
        #creating a new weather forcast variable = duration of sunlight in the day as number of hours
        newDict['sunlight duration']=round((newDict['sunset']-newDict['sunrise'])/3600,2)
        newDict['temp'] = newDict['temp']['day'] #we choose to keep the avg temperature among all the temperature data scraped
        del newDict['sunrise']
        del newDict['sunset']
        df.loc[i,'temp'] = newDict['temp']
        df.loc[i,'humidity'] = newDict['humidity']
        df.loc[i,'sunlight duration'] = newDict['sunlight duration']
    #print("weather forecast in : {}".format(city))
    return {'latitude' : r['lat'], \
            'longitude' : r['lon'], \
            'avg day temp' :round(df['temp'].mean(),2), \
            'avg humidity' : round(df['humidity'].mean(),2), \
            'avg sunlight in hours' : round(df['sunlight duration'].mean(),2)}
#checking results for a city
weather_forecast('Mont Saint Michel')

{'latitude': 48.636,
 'longitude': -1.5115,
 'avg day temp': 20.24,
 'avg humidity': 63.12,
 'avg sunlight in hours': 15.46}

In [8]:
#getting the data for all french cities with a loop and putting it into a dataframe that will be saved 

In [9]:
cities_weather = pd.DataFrame(columns = ['city name', 'latitude', 'longitude', 'avg day temp','avg humidity', 'avg sunlight in hours'])
for i in range(len(cities)):
    cities_weather.loc[i,'city name'] = cities[i]
    cities_weather.loc[i,'latitude'] = weather_forecast(cities[i])['latitude']
    cities_weather.loc[i,'longitude'] = weather_forecast(cities[i])['longitude']
    cities_weather.loc[i,'avg day temp'] = weather_forecast(cities[i])['avg day temp']
    cities_weather.loc[i,'avg humidity'] = weather_forecast(cities[i])['avg humidity']
    cities_weather.loc[i,'avg sunlight in hours'] = weather_forecast(cities[i])['avg sunlight in hours']

In [12]:
#saving data into csv file
cities_weather.to_csv('cities_weather.csv')

## II- Ranking the 5 best destinations

In [10]:
#Use plotly to display the temperature map 
fig = px.scatter_mapbox(cities_weather, lat="latitude", lon="longitude",
                        size=cities_weather["avg day temp"].to_list(), 
                        hover_name = "city name", mapbox_style="carto-positron",
                        height=700, zoom=4.7, color_continuous_scale="avg day temp")
fig.show()

In [11]:
#Using plotly to display the sunlight duration map ordered by day temperature fo each city
fig = px.scatter_mapbox(cities_weather, lat="latitude", lon="longitude",
                        size=cities_weather["avg day temp"].to_list(), 
                        hover_name = "city name", mapbox_style="carto-positron", color="avg sunlight in hours",
                        size_max=15, zoom=4, color_continuous_scale=px.colors.cyclical.IceFire)
fig.show()

In [12]:
#ranking the 5 best cities (warmest weather)
cities_weather.sort_values("avg day temp", ascending=False).head(5)

Unnamed: 0,city name,latitude,longitude,avg day temp,avg humidity,avg sunlight in hours
22,Avignon,43.9492,4.8059,30.65,33.88,14.92
15,Grenoble,45.1876,5.7358,30.6,37.25,15.05
23,Uzes,44.0121,4.4197,30.17,32.88,14.93
24,Nimes,43.8374,4.3601,29.58,38.25,14.91
21,Aix en Provence,43.5298,5.4475,29.06,32.88,14.88


In [13]:
#best 5 cities to keep in term of weather 
index_city = list(cities_weather["avg day temp"].sort_values(ascending=False)[:5].index)
warmest_cities = [cities_weather["city name"][i] for i in index_city]
print(warmest_cities)

['Avignon', 'Grenoble', 'Uzes', 'Nimes', 'Aix en Provence']


## III- Getting hotels info by scarping booking website

In [14]:
cities_to_keep = warmest_cities

In [15]:
#define url by city where to scrap hotels data
url_list = [f'https://www.booking.com/city/fr/{cities_to_keep[i].replace(" ", "-").lower()}.fr.html' for i in range(len(cities_to_keep))]

In [16]:
print(url_list)

['https://www.booking.com/city/fr/avignon.fr.html', 'https://www.booking.com/city/fr/grenoble.fr.html', 'https://www.booking.com/city/fr/uzes.fr.html', 'https://www.booking.com/city/fr/nimes.fr.html', 'https://www.booking.com/city/fr/aix-en-provence.fr.html']


In [17]:
#defining a scrapping class with scrapy.spider 
class BookingSpider(scrapy.Spider):
    # Name of spider
    name = "BookingSpider"
    RESULTS_PER_PAGE = 20 #20 results per page  
    
    def start_requests(self):
        urls = url_list #defined when calling crawl
        for index, url in enumerate(urls):
            yield scrapy.Request(url, meta={'index':index}) #to keep in memory the city index we are scrapping data for
    
    # Callback function that will be called when starting the spider
    def parse(self, response):
        for link in response.css('header.bui-spacer--medium a::attr(href)'):
            hotel_link = 'https://www.booking.com' + link.get()
            yield response.follow(hotel_link, callback=self.hotel_infos, meta={'index': response.meta['index']})
        
    def hotel_infos(self, response):
        yield {
        
        'Hotel_name': response.css('h2.hp__hotel-name::text').getall()[1].strip(),
        'Rating': response.css('div.b5cd09854e.d10a6220b4::text').get(1),
        'Coordinates':response.css('a.jq_tooltip.loc_block_link_underline_fix.bui-link.show_on_map_hp_link::attr(data-atlas-latlng)').get(),
        'Description':response.css('div.hp_desc_main_content p::text').getall(),
        'Link': str(response),
        'City':response.meta['index']
        
        }  

In [18]:
# Name of the file where the results will be saved
filename = "booking_data.json"

# If file already exists, delete it before crawling 
if filename in os.listdir('./src/'):
        os.remove('./src/' + filename)

# Declare a new CrawlerProcess with some settings
process = CrawlerProcess(settings = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36', #changer user 
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        'src/' + filename : {"format": "json"},
    },
    "AUTOTHROTTLE_ENABLED": True  # AutoThrottle Here!
})

# Start the crawling using the spider defined above
process.crawl(BookingSpider, start_urls=url_list)
process.start()


2022-05-17 07:48:39 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2022-05-17 07:48:39 [scrapy.utils.log] INFO: Versions: lxml 4.8.0.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.4.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:46) - [GCC 9.4.0], pyOpenSSL 22.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 36.0.1, Platform Linux-5.4.170+-x86_64-with-glibc2.31
2022-05-17 07:48:39 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
2022-05-17 07:48:39 [scrapy.extensions.telnet] INFO: Telnet Password: 5ebab5bec6e6d65f
2022-05-17 07:48:39 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.f

In [48]:
#chechking scraped data
df = pd.read_json("./src/booking_data.json")
df.head()

Unnamed: 0,Hotel_name,Rating,Coordinates,Description,Link,City
0,Best Western Hôtel Le Paradou Avignon Sud.,79,"43.90489250,4.89553034","[Situé dans un parc à Avignon Sud-Montfavet, l...",<200 https://www.booking.com/hotel/fr/paradoua...,0
1,Hotel Restaurant la Ferme,85,"43.97548368,4.82743710",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/logis-de...,0
2,Hotel Boquier,86,"43.94469660,4.80499312",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/boquier....,0
3,Garlande Hôtel Avignon Centre,89,"43.94761833,4.80680354",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/de-garla...,0
4,La Mirande,89,"43.95007660,4.80770677",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/la-miran...,0


In [49]:
df.shape

(111, 6)

In [50]:
df.dtypes

Hotel_name     object
Rating         object
Coordinates    object
Description    object
Link           object
City            int64
dtype: object

In [51]:
# Saving data into csv file
df.to_csv('booking_data.csv')

In [52]:
#cleaning and processing data

#replacing city index with city name
def func(index):
    for i in range(len(cities_to_keep)):
        if index == i:
            return cities_to_keep[i]
df['City_name'] = df['City'].apply(func)

In [53]:
del df["City"]
df.head(20)

Unnamed: 0,Hotel_name,Rating,Coordinates,Description,Link,City_name
0,Best Western Hôtel Le Paradou Avignon Sud.,79,"43.90489250,4.89553034","[Situé dans un parc à Avignon Sud-Montfavet, l...",<200 https://www.booking.com/hotel/fr/paradoua...,Avignon
1,Hotel Restaurant la Ferme,85,"43.97548368,4.82743710",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/logis-de...,Avignon
2,Hotel Boquier,86,"43.94469660,4.80499312",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/boquier....,Avignon
3,Garlande Hôtel Avignon Centre,89,"43.94761833,4.80680354",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/de-garla...,Avignon
4,La Mirande,89,"43.95007660,4.80770677",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/la-miran...,Avignon
5,Campanile Avignon Sud - Montfavet la Cristole,72,"43.92592594,4.85080719",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/campanil...,Avignon
6,hotelF1 Avignon Centre Courtine gare TGV,66,"43.92903200,4.78818900",[L'hotelF1 Avignon Centre Courtine gare TGV se...,<200 https://www.booking.com/hotel/fr/hotelf1-...,Avignon
7,Au Saint Roch - Hôtel et Jardin,81,"43.94290138,4.79804277",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/au-saint...,Avignon
8,Hôtel Le Colbert,88,"43.94411137,4.80579557",[L'Hôtel Le Colbert est idéalement situé dans ...,<200 https://www.booking.com/hotel/fr/le-colbe...,Avignon
9,Hôtel-Restaurant Kyriad Cap Sud,80,"43.92744820,4.84788895",[Vous pouvez bénéficier d'une réduction Genius...,<200 https://www.booking.com/hotel/fr/kyriad-a...,Avignon


In [54]:
#transforming string columns into float type when relevant
df['Rating'] = df['Rating'].str.replace(',','.')
df["Rating"] = pd.to_numeric(df["Rating"], downcast="float")

In [55]:
df.iloc[0,3]

["Situé dans un parc à Avignon Sud-Montfavet, le Best Western Hôtel Le Paradou Avignon Sud. propose une connexion Wi-Fi gratuite disponible dans l'ensemble de l'établissement. Vous séjournerez à 15 minutes en voiture du centre-ville d'Avignon.",
 "Offrant une vue sur la piscine ou le jardin, les chambres insonorisées et climatisées possèdent une télévision par câble. Leur salle de bains privative est pourvue d'une baignoire, d'articles de toilette mis gracieusement à votre disposition et d'un sèche-cheveux.",
 "L'hôtel comporte un restaurant préparant des plats provençaux variés et un bar.",
 "La région est appréciée pour le golf. L'aéroport d'Avignon-Provence est installé à proximité. Vous bénéficierez gratuitement d'un parking privé sur place. ",
 '\n',
 '\nChaîne hôtelière/marque:\nBest Western\n']

In [56]:
#creating gps coordinates columns from Coordinates 
df[['Hotel_latitude', 'Hotel_longitude']] = df['Coordinates'].str.split(',', expand=True)
df["Hotel_latitude"] = pd.to_numeric(df["Hotel_latitude"], downcast="float")
df["Hotel_longitude"] = pd.to_numeric(df["Hotel_longitude"], downcast="float")

In [57]:
#cleaning text columns
df["Description"]= df["Description"].str.join(" ").str.strip().str.replace("\n","")

In [58]:
df.iloc[0,3]

"Situé dans un parc à Avignon Sud-Montfavet, le Best Western Hôtel Le Paradou Avignon Sud. propose une connexion Wi-Fi gratuite disponible dans l'ensemble de l'établissement. Vous séjournerez à 15 minutes en voiture du centre-ville d'Avignon. Offrant une vue sur la piscine ou le jardin, les chambres insonorisées et climatisées possèdent une télévision par câble. Leur salle de bains privative est pourvue d'une baignoire, d'articles de toilette mis gracieusement à votre disposition et d'un sèche-cheveux. L'hôtel comporte un restaurant préparant des plats provençaux variés et un bar. La région est appréciée pour le golf. L'aéroport d'Avignon-Provence est installé à proximité. Vous bénéficierez gratuitement d'un parking privé sur place.   Chaîne hôtelière/marque:Best Western"

In [59]:
df.iloc[0,4]

'<200 https://www.booking.com/hotel/fr/paradouavignon.fr.html?label=gen173nr-1FCAMoTUIHYXZpZ25vbkgNWARoFYgBAZgBDbgBGMgBDNgBAegBAfgBAogCAagCBLgC16aNlAbAAgHSAiRlMTQxMjQzMi02NWNmLTQ2NGYtOWNlNS04Mjc0ZDg2ZjJlNjHYAgXgAgE&sid=ef4c1fc0086d64b80e070d87330090f5>'

In [60]:
df["Link"]= df["Link"].str.replace("<200", "").str.replace(">", "")

In [61]:
df.iloc[0,4]

' https://www.booking.com/hotel/fr/paradouavignon.fr.html?label=gen173nr-1FCAMoTUIHYXZpZ25vbkgNWARoFYgBAZgBDbgBGMgBDNgBAegBAfgBAogCAagCBLgC16aNlAbAAgHSAiRlMTQxMjQzMi02NWNmLTQ2NGYtOWNlNS04Mjc0ZDg2ZjJlNjHYAgXgAgE&sid=ef4c1fc0086d64b80e070d87330090f5'

In [62]:
 #deleting useless columns
del df['Coordinates']

In [63]:
df.head()

Unnamed: 0,Hotel_name,Rating,Description,Link,City_name,Hotel_latitude,Hotel_longitude
0,Best Western Hôtel Le Paradou Avignon Sud.,7.9,"Situé dans un parc à Avignon Sud-Montfavet, le...",https://www.booking.com/hotel/fr/paradouavign...,Avignon,43.904892,4.89553
1,Hotel Restaurant la Ferme,8.5,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/logis-de-fra...,Avignon,43.975483,4.827437
2,Hotel Boquier,8.6,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/boquier.fr.h...,Avignon,43.944698,4.804993
3,Garlande Hôtel Avignon Centre,8.9,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/de-garlande....,Avignon,43.947617,4.806804
4,La Mirande,8.9,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/la-mirande.f...,Avignon,43.950077,4.807707


In [64]:
#re arranging columns in dataframe
df = df.reindex(['City_name','Hotel_name','Rating', 'Hotel_latitude', 'Hotel_longitude', 'Description', 'Link'], axis=1)

In [65]:
df.head()

Unnamed: 0,City_name,Hotel_name,Rating,Hotel_latitude,Hotel_longitude,Description,Link
0,Avignon,Best Western Hôtel Le Paradou Avignon Sud.,7.9,43.904892,4.89553,"Situé dans un parc à Avignon Sud-Montfavet, le...",https://www.booking.com/hotel/fr/paradouavign...
1,Avignon,Hotel Restaurant la Ferme,8.5,43.975483,4.827437,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/logis-de-fra...
2,Avignon,Hotel Boquier,8.6,43.944698,4.804993,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/boquier.fr.h...
3,Avignon,Garlande Hôtel Avignon Centre,8.9,43.947617,4.806804,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/de-garlande....
4,Avignon,La Mirande,8.9,43.950077,4.807707,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/la-mirande.f...


## IV- ETL process

In [62]:
# Install boto3 using pip 
!pip install Boto3
import boto3

Collecting Boto3
  Downloading boto3-1.23.0-py3-none-any.whl (132 kB)
     |████████████████████████████████| 132 kB 9.4 MB/s            
[?25hCollecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)
     |████████████████████████████████| 79 kB 8.4 MB/s             
Collecting botocore<1.27.0,>=1.26.0
  Downloading botocore-1.26.0-py3-none-any.whl (8.7 MB)
     |████████████████████████████████| 8.7 MB 51.3 MB/s            
Installing collected packages: botocore, s3transfer, Boto3
Successfully installed Boto3-1.23.0 botocore-1.26.0 s3transfer-0.5.2


In [63]:
#set up a AWS session with Boto3
session = boto3.Session(aws_access_key_id="xxxxxx", 
                        aws_secret_access_key="xxxxxx")

In [65]:
#set up an S3 ressource
s3 = session.resource("s3")

In [68]:
#create a bucket to store data 
BUCKET_NAME = "kayak-datalake-an22"
bucket = s3.create_bucket(Bucket=BUCKET_NAME)

In [69]:
put_object1 = bucket.put_object(Key="cities_weather.csv", Body="cities_weather.csv")

In [70]:
put_object2 = bucket.put_object(Key="booking_data.csv", Body="booking_data.csv")

## V- Visulization

In [73]:
#visualizing for each city in the top 5 cities the best ranked hotels

fig = px.scatter_mapbox(df, lat="Hotel_latitude", lon="Hotel_longitude", color="Rating", zoom =10,
                        mapbox_style="open-street-map", color_continuous_scale = 'Reds', range_color = [5.0,10.0],
                       animation_frame = 'City_name')
fig.show()