# Project: planning my next holidays ☀️

Let's create a script that retrieves information about all the hotels in a given city from <a href="https://www.booking.com" target="_blank">www.booking.com</a> 🧙

**We strongly recommend that you use Scrapy, it will be much easier!**

You can scrape as much information as you want, but we suggest that you collect at least:

* The hotel name, 
* The url to its booking.com page, 
* Its coordinates: latitude and longitude,
* The score given by the website users,
* The text description of the hotel.

Then, you can execute this script for several cities from yesterday's list. Make sure you save the results in different files for each city and that the name of the city is stored in the filename (for later purposes 😉).

In [4]:
import os
import logging

import scrapy
from scrapy.crawler import CrawlerProcess

import pandas as pd

In [2]:
# Destination submitted to the booking.com search form by the spider.
destination_name = "Paris"

# Candidate destinations; multi-word names use hyphens as separators.
top_35_cities = [
    "Mont-Saint-Michel",
    "St-Malo",
    "Bayeux",
    "Le-Havre",
    "Rouen",
    "Paris",
    "Amiens",
    "Lille",
    "Strasbourg",
    "Chateau-du-Haut-Koenigsbourg",
    "Colmar",
    "Eguisheim",
    "Besancon",
    "Dijon",
    "Annecy",
    "Grenoble",
    "Lyon",
    "Gorges-du-Verdon",
    "Bormes-les-Mimosas",
    "Cassis",
    "Marseille",
    "Aix-en-Provence",
    "Avignon",
    "Uzes",
    "Nimes",
    "Aigues-Mortes",
    "Saintes-Maries-de-la-mer",
    "Collioure",
    "Carcassonne",
    "Ariege",
    "Toulouse",
    "Montauban",
    "Biarritz",
    "Bayonne",
    "La-Rochelle",
]
# Sanity check: the cell output shows how many destinations are listed.
len(top_35_cities)

35

In [3]:
class Hotels(scrapy.Spider):
    """Scrape the hotel cards returned by a booking.com destination search.

    The destination is read from the optional ``destination`` spider argument
    (e.g. ``process.crawl(Hotels, destination="Lyon")``). When the argument is
    not supplied, the notebook-level ``destination_name`` is used.
    """

    # Name of your spider
    name = "hotels"

    # Starting URL
    start_urls = ['https://www.booking.com/index.fr.html']

    def parse(self, response):
        """Submit the search form of the landing page for the destination.

        Returns:
            A ``scrapy.FormRequest`` whose response is handled by
            ``after_search``.
        """
        # Spider keyword arguments become instance attributes; fall back to the
        # notebook global so existing `process.crawl(Hotels)` calls still work.
        destination = getattr(self, "destination", None) or destination_name
        return scrapy.FormRequest.from_response(
            response,
            formdata={'ss': destination},
            callback=self.after_search
        )

    def after_search(self, response):
        """Yield one item per hotel card, then follow the "next page" link.

        Each item is a dict with the keys ``name``, ``url``, ``coords``,
        ``score`` and ``description``. Fields missing from a card are ``None``.
        """
        for h in response.css('.sr_item'):
            href = h.css('.hotel_name_link').attrib.get("href")
            if href is None:
                # Indexing attrib["href"] raised KeyError here, which ended the
                # generator: the rest of the page AND the pagination request
                # below were lost. Skip the malformed card instead.
                logging.warning('Skipping a result card without a hotel link.')
                continue

            yield {
                'name': h.css('.sr-hotel__name::text').get(),
                'url': "https://www.booking.com" + href,
                # .get() keeps the item (coords=None) when the address link is absent.
                'coords': h.css('.sr_card_address_line a').attrib.get("data-coords"),
                'score': h.css('.bui-review-score__badge::text').get(),
                'description': h.css('.hotel_desc::text').get()
                }

        # Follow the NEXT button when there is one; otherwise the crawl ends.
        next_page = response.css('a.paging-next').attrib.get("href")
        if next_page is None:
            logging.info('No next page. Terminating crawling process.')
        else:
            yield response.follow(next_page, callback=self.after_search)

In [4]:
# NOTE(review): this cell re-defines the `Hotels` spider from the previous cell;
# being the later definition, it is the one the crawl actually uses.
class Hotels(scrapy.Spider):
    """Scrape the hotel cards returned by a booking.com destination search.

    The destination is read from the optional ``destination`` spider argument
    (e.g. ``process.crawl(Hotels, destination="Lyon")``). When the argument is
    not supplied, the notebook-level ``destination_name`` is used.
    """

    # Name of your spider
    name = "hotels"

    # Starting URL
    start_urls = ['https://www.booking.com/index.fr.html']

    def parse(self, response):
        """Submit the search form of the landing page for the destination.

        Returns:
            A ``scrapy.FormRequest`` whose response is handled by
            ``after_search``.
        """
        # Spider keyword arguments become instance attributes; fall back to the
        # notebook global so existing `process.crawl(Hotels)` calls still work.
        destination = getattr(self, "destination", None) or destination_name
        return scrapy.FormRequest.from_response(
            response,
            formdata={'ss': destination},
            callback=self.after_search
        )

    def after_search(self, response):
        """Yield one item per hotel card, then follow the "next page" link.

        Each item is a dict with the keys ``name``, ``url``, ``coords``,
        ``score`` and ``description``. Fields missing from a card are ``None``.
        """
        for h in response.css('.sr_item'):
            href = h.css('.hotel_name_link').attrib.get("href")
            if href is None:
                # Indexing attrib["href"] raised KeyError here, which ended the
                # generator: the rest of the page AND the pagination request
                # below were lost. Skip the malformed card instead.
                logging.warning('Skipping a result card without a hotel link.')
                continue

            yield {
                'name': h.css('.sr-hotel__name::text').get(),
                'url': "https://www.booking.com" + href,
                # .get() keeps the item (coords=None) when the address link is absent.
                'coords': h.css('.sr_card_address_line a').attrib.get("data-coords"),
                'score': h.css('.bui-review-score__badge::text').get(),
                'description': h.css('.hotel_desc::text').get()
                }

        # Follow the NEXT button when there is one; otherwise the crawl ends.
        next_page = response.css('a.paging-next').attrib.get("href")
        if next_page is None:
            logging.info('No next page. Terminating crawling process.')
        else:
            yield response.follow(next_page, callback=self.after_search)

In [5]:

# Feed file for the current destination, e.g. "hotels_Paris.json".
filename = "hotels_" + destination_name.replace(" ", "-") + ".json"

# NOTE(review): absolute, user-specific output folder; the loading cell further
# down reads the feeds from a different folder — confirm how files get there.
output_dir = '/Users/saas1/Desktop/'
feed_path = output_dir + filename

# Remove the feed left by a previous run before crawling again.
if filename in os.listdir(output_dir):
    os.remove(feed_path)

crawler_settings = {
    'USER_AGENT': 'Chrome/84.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': logging.ERROR,
    "FEEDS": {
        feed_path: {"format": "json"},
    },
}
process = CrawlerProcess(settings=crawler_settings)

# Register the spider, then run the crawl.
process.crawl(Hotels)
process.start()

In [5]:
# Common prefix of the per-city feed files; each file is "<prefix><city>.json".
path_to_file="/Users/saas1/OneDrive/Escritorio/introduction_to_ python_for_data_science/Projects/Project_3/Project/hotels_"

top_35_cities=["Mont-Saint-Michel", "St-Malo", "Bayeux", "Le-Havre", "Rouen", "Paris", "Amiens", "Lille", "Strasbourg",
"Chateau-du-Haut-Koenigsbourg","Colmar", "Eguisheim", "Besancon", "Dijon","Annecy", "Grenoble", "Lyon", "Gorges-du-Verdon",
"Bormes-les-Mimosas", "Cassis", "Marseille", "Aix-en-Provence", "Avignon", "Uzes", "Nimes", "Aigues-Mortes",
"Saintes-Maries-de-la-mer", "Collioure", "Carcassonne", "Ariege", "Toulouse", "Montauban", "Biarritz", "Bayonne",
"La-Rochelle"]

# Read each city's feed, tag its rows with the city, and concatenate once.
# DataFrame.append was removed in pandas 2.0 and copied the whole accumulated
# frame on every iteration; collecting the frames and calling pd.concat a
# single time works on every pandas version and is linear.
city_frames = []
for city in top_35_cities:
    data = pd.read_json(path_to_file + city + ".json")
    data["city"] = city
    city_frames.append(data)

# Per-file row indices are kept as-is (no ignore_index), as before.
data_frame = pd.concat(city_frames)

In [6]:
# Remove newline characters from the scraped text columns.
for text_column in ("name", "url", "description"):
    data_frame[text_column] = data_frame[text_column].replace('\n', '', regex=True)


In [7]:
# Split the comma-separated "coords" string into two columns, stored as
# "lon" and "lat" (in that order), then discard the combined column.
coord_parts = data_frame["coords"].str.split(pat=",", expand=True)
data_frame[["lon", "lat"]] = coord_parts
data_frame = data_frame.drop("coords", axis="columns")

In [8]:
# Hyphenated city names as stored in the "city" column ...
cities_=["Mont-Saint-Michel", "St-Malo", "Le-Havre","Chateau-du-Haut-Koenigsbourg","Gorges-du-Verdon",
"Bormes-les-Mimosas", "Aix-en-Provence", "Aigues-Mortes","Saintes-Maries-de-la-mer","La-Rochelle"]
# ... and the space-separated name each one is replaced with (same order).
cities=["Mont Saint Michel", "St Malo", "Le Havre","Chateau du Haut Koenigsbourg","Gorges du Verdon",
"Bormes les Mimosas", "Aix en Provence", "Aigues Mortes","Saintes Maries de la mer","La Rochelle"]

# One exact-match pass over the column. The previous loop ran ten regex
# replacements, treating every city name as a pattern and matching substrings.
data_frame['city'] = data_frame['city'].replace(dict(zip(cities_, cities)))

# Coordinates were produced by a string split, so convert them to numbers.
data_frame['lat'] = data_frame['lat'].astype(float)
data_frame['lon'] = data_frame['lon'].astype(float)

# Replace the repeated per-file row labels with a single 0..n-1 index.
data_frame=data_frame.reset_index(drop=True)

In [9]:
# Preview the first rows of the combined, cleaned frame.
data_frame.head()

Unnamed: 0,name,url,score,description,city,lon,lat
0,Hôtel Vert,https://www.booking.com/hotel/fr/vert.fr.html?...,81,"Situé à 2 km du Mont-Saint-Michel, sur la côte...",Mont Saint Michel,-1.509617,48.6147
1,Mercure Mont Saint Michel,https://www.booking.com/hotel/fr/mont-saint-mi...,82,Installé dans des espaces verts à seulement 2 ...,Mont Saint Michel,-1.510545,48.614247
2,Hotel De La Digue,https://www.booking.com/hotel/fr/de-la-digue.f...,71,L'hôtel De La Digue est un établissement tradi...,Mont Saint Michel,-1.510918,48.616882
3,Le Saint Aubert,https://www.booking.com/hotel/fr/hotel-saint-a...,73,"Niché dans un écrin de verdure, à seulement 2 ...",Mont Saint Michel,-1.510105,48.612938
4,Les Terrasses Poulard,https://www.booking.com/hotel/fr/les-terrasses...,73,Occupant 2 bâtiments différents au cœur du Mon...,Mont Saint Michel,-1.510379,48.635349


In [10]:
# Export the final table as Excel and CSV files in the current working directory.
# No `index` argument is passed, so the pandas default applies and the row index
# is written as the first column of both files.
data_frame.to_excel("hotel.xlsx")
data_frame.to_csv("hotel.csv")