<p style = 'color:#E6E4E7 ; background-color:#00561B ; text-align:center ; font-size: 300%'> Kayak project - Get hotel data with Scrapy Spider (first page only)

<p style = 'color:#00561B ; background-color:#E6E4E7 ; text-align:center ; font-size: 150%'> Project prepared by Stephanie Cotineau - #dsmpt-Paris-08

In [1]:
!pip install Scrapy -q

In [2]:
import os # For manipulate file 
import logging # For display logs
import scrapy # Import scrapy & scrapy.crawler
from scrapy.crawler import CrawlerProcess

In [3]:
# Name of the file where the results will be saved
filename = "Booking.json"

# If file already exists, delete it before crawling (because Scrapy will 
# concatenate the last and new results otherwise)
if filename in os.listdir('results_hotels/'):
        os.remove('results_hotels/' + filename)

# Declare a new CrawlerProcess with some settings
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
process = CrawlerProcess(settings = {
    'USER_AGENT': user_agent,
    'LOG_LEVEL': logging.ERROR,
    "FEEDS": {
        'results_hotels/' + filename : {"format": "json"},
    }
})

In [4]:
class HotelBookingSpider(scrapy.Spider) :
    # Name of spider
    name = 'hotelbooking'
    
    # Starting URL
    start_urls = ['https://www.booking.com/']
    
    # Scope of the project: list of top French cities
    cities = ["Mont Saint Michel", "St Malo", "Bayeux", "Le Havre", "Rouen", "Paris", "Amiens", "Lille", 
              "Strasbourg", "Chateau du Haut Koenigsbourg", "Colmar", "Eguisheim", "Besancon", "Dijon", 
              "Annecy", "Grenoble", "Lyon", "Bormes les Mimosas", "Cassis", "Marseille", "Aix en Provence", 
              "Avignon", "Uzes", "Nimes", "Aigues Mortes", "Saintes Maries de la mer", "Collioure", 
              "Carcassonne", "Toulouse", "Montauban", "Biarritz", "Bayonne", "La Rochelle"]
    
    # Parse function for 'search'
    def parse(self, response) : 
        for location in self.cities :
            # FormRequest used to search per cities
            yield scrapy.FormRequest.from_response(
                response,
                formdata = {'ss' : location}, # id = 'ss' 
                # Function to be called once search launched
                callback = self.search,
                cb_kwargs = {'location':location})
    
    # Callback used after search launched
    def search(self, response, location):
        hotels = response.css('div.a826ba81c4.fe821aea6c.fa2f36ad22.afd256fc79.d08f526e0d.ed11e24d01.da89aeb942')
        for hotel in hotels:
            yield {
                'location' : location,
                'hotel_url' : hotel.css('a.e13098a59f').attrib['href'],
                'hotel_name' : hotel.css('div.fcab3ed991.a23c043802::text').get(),
                'hotel_rating' : hotel.css('div.b5cd09854e.d10a6220b4::text').get(),
                'hotel_description_short' : hotel.css('div.d8eab2cf7f::text').get()
            }

In [5]:
# Start the crawling using the spider defined above
process.crawl(HotelBookingSpider)
process.start()

In [6]:
import pandas as pd

In [7]:
df = pd.read_json('results_hotels/Booking.json')

In [8]:
df.head(55)

Unnamed: 0,location,hotel_url,hotel_name,hotel_rating,hotel_description_short
0,Bayeux,https://www.booking.com/hotel/fr/gite-6-47-9-p...,Le Chat Qui Veille,9.9,Managed by a private host
1,Bayeux,https://www.booking.com/hotel/fr/etap-bayeux.e...,ibis budget Bayeux,8.2,"Between Caen and Cherbourg, near the exit 36 o..."
2,Bayeux,https://www.booking.com/hotel/fr/domaine-de-ba...,Domaine de Bayeux,9.4,"Located in the centre of Bayeux, the Domaine d..."
3,Bayeux,https://www.booking.com/hotel/fr/hoteldebrunvi...,Hôtel De Brunville et La Table du Grand Luxemb...,7.9,"Just 400 metres from Bayeux Cathedral, this ho..."
4,Bayeux,https://www.booking.com/hotel/fr/reine-mathild...,Hotel Reine Mathilde,8.6,Hotel Reine Mathilde is situated in the mediev...
5,Bayeux,https://www.booking.com/hotel/fr/chateau-saint...,Château Saint Gilles,9.7,"Situated in Bayeux, 4 km from Baron Gerard Mus..."
6,Bayeux,https://www.booking.com/hotel/fr/logis-du-gran...,LOGIS DU GRAND PIN,9.5,"LOGIS DU GRAND PIN has garden views, free WiFi..."
7,Bayeux,https://www.booking.com/hotel/fr/le-lion-d-or-...,Hotel Le Lion D'Or et Restaurant La Table Du Lion,8.7,This former 18th-century post office is now a ...
8,Bayeux,https://www.booking.com/hotel/fr/largo.en-gb.h...,largo,9.0,"Largo is situated in the centre of Bayeux, 200..."
9,Bayeux,https://www.booking.com/hotel/fr/le-castel-nob...,Le Castel Guesthouse,8.7,Le Castel is a guesthouse located in the centr...


In [9]:
df.shape

(825, 5)

In [10]:
from bs4 import BeautifulSoup 
import requests

In [11]:
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1'}
result_ll =[]
for link in df['hotel_url']:
    page = requests.get(link, headers = headers)
    soup_link = BeautifulSoup(page.content, "lxml")
    lat_lon = soup_link.find_all(id = "hotel_address")
    for ll in lat_lon : 
        #print(ll['data-atlas-latlng'])
        result_ll.append(ll['data-atlas-latlng'])

In [12]:
result_ll

['49.28464200,-0.71455100',
 '49.25424209,-0.64648747',
 '49.27232560,-0.69851010',
 '49.27815769,-0.70351392',
 '49.27603159,-0.70172489',
 '49.24936600,-0.71246900',
 '49.25200360,-0.66984770',
 '49.27627832,-0.69836944',
 '49.27631225,-0.70026888',
 '49.27368314,-0.70322692',
 '49.28256300,-0.67736300',
 '49.27643410,-0.69997250',
 '49.27415914,-0.70450634',
 '49.27646775,-0.70556582',
 '49.27971200,-0.70978000',
 '49.27842191,-0.70329666',
 '49.27990374,-0.70877506',
 '49.27600709,-0.69828361',
 '49.27905535,-0.70856206',
 '49.27454414,-0.70200115',
 '49.28082623,-0.69080722',
 '49.27913671,-0.72042494',
 '49.27399410,-0.69870470',
 '49.27660730,-0.70057154',
 '49.28176922,-0.71848869',
 '48.61470049,-1.50961697',
 '48.61758727,-1.51039615',
 '48.63534943,-1.51037872',
 '48.63508532,-1.51053965',
 '48.61424653,-1.51054502',
 '48.61688155,-1.51091784',
 '48.61538141,-1.51070997',
 '48.63602298,-1.50989592',
 '48.61293783,-1.51010513',
 '48.63568798,-1.50988251',
 '48.63606300,-1.511

In [13]:
len(result_ll)

825

In [14]:
df['Latitude_Longitude'] = result_ll

In [15]:
df

Unnamed: 0,location,hotel_url,hotel_name,hotel_rating,hotel_description_short,Latitude_Longitude
0,Bayeux,https://www.booking.com/hotel/fr/gite-6-47-9-p...,Le Chat Qui Veille,9.9,Managed by a private host,"49.28464200,-0.71455100"
1,Bayeux,https://www.booking.com/hotel/fr/etap-bayeux.e...,ibis budget Bayeux,8.2,"Between Caen and Cherbourg, near the exit 36 o...","49.25424209,-0.64648747"
2,Bayeux,https://www.booking.com/hotel/fr/domaine-de-ba...,Domaine de Bayeux,9.4,"Located in the centre of Bayeux, the Domaine d...","49.27232560,-0.69851010"
3,Bayeux,https://www.booking.com/hotel/fr/hoteldebrunvi...,Hôtel De Brunville et La Table du Grand Luxemb...,7.9,"Just 400 metres from Bayeux Cathedral, this ho...","49.27815769,-0.70351392"
4,Bayeux,https://www.booking.com/hotel/fr/reine-mathild...,Hotel Reine Mathilde,8.6,Hotel Reine Mathilde is situated in the mediev...,"49.27603159,-0.70172489"
...,...,...,...,...,...,...
820,Marseille,https://www.booking.com/hotel/fr/kyriad-marsei...,Kyriad Marseille Centre Paradis-Préfecture,6.8,Kyriad Marseille Centre Paradis-Préfecture is ...,"43.28872870,5.37895238"
821,Marseille,https://www.booking.com/hotel/fr/marseille-die...,"InterContinental Marseille - Hotel Dieu, an IH...",8.5,InterContinental Marseille - Hotel Dieu is set...,"43.29858167,5.36974624"
822,Marseille,https://www.booking.com/hotel/fr/les-o.en-gb.h...,Les Ô du Panier,8.5,Managed by a private host,"43.29929325,5.36862872"
823,Marseille,https://www.booking.com/hotel/fr/les-villages-...,Villages Clubs du Soleil - Marseille,8.2,"Offering a seasonal outdoor pool, a restaurant...","43.31284784,5.39198309"


In [16]:
lat = []
lon = []

# For each row in a varible,
for row in df['Latitude_Longitude']:
    # Try to,
    try:
        # Split the row by comma and append
        # everything before the comma to lat
        lat.append(row.split(',')[0])
        # Split the row by comma and append
        # everything after the comma to lon
        lon.append(row.split(',')[1])
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
df['latitude'] = lat
df['longitude'] = lon

In [17]:
df

Unnamed: 0,location,hotel_url,hotel_name,hotel_rating,hotel_description_short,Latitude_Longitude,latitude,longitude
0,Bayeux,https://www.booking.com/hotel/fr/gite-6-47-9-p...,Le Chat Qui Veille,9.9,Managed by a private host,"49.28464200,-0.71455100",49.28464200,-0.71455100
1,Bayeux,https://www.booking.com/hotel/fr/etap-bayeux.e...,ibis budget Bayeux,8.2,"Between Caen and Cherbourg, near the exit 36 o...","49.25424209,-0.64648747",49.25424209,-0.64648747
2,Bayeux,https://www.booking.com/hotel/fr/domaine-de-ba...,Domaine de Bayeux,9.4,"Located in the centre of Bayeux, the Domaine d...","49.27232560,-0.69851010",49.27232560,-0.69851010
3,Bayeux,https://www.booking.com/hotel/fr/hoteldebrunvi...,Hôtel De Brunville et La Table du Grand Luxemb...,7.9,"Just 400 metres from Bayeux Cathedral, this ho...","49.27815769,-0.70351392",49.27815769,-0.70351392
4,Bayeux,https://www.booking.com/hotel/fr/reine-mathild...,Hotel Reine Mathilde,8.6,Hotel Reine Mathilde is situated in the mediev...,"49.27603159,-0.70172489",49.27603159,-0.70172489
...,...,...,...,...,...,...,...,...
820,Marseille,https://www.booking.com/hotel/fr/kyriad-marsei...,Kyriad Marseille Centre Paradis-Préfecture,6.8,Kyriad Marseille Centre Paradis-Préfecture is ...,"43.28872870,5.37895238",43.28872870,5.37895238
821,Marseille,https://www.booking.com/hotel/fr/marseille-die...,"InterContinental Marseille - Hotel Dieu, an IH...",8.5,InterContinental Marseille - Hotel Dieu is set...,"43.29858167,5.36974624",43.29858167,5.36974624
822,Marseille,https://www.booking.com/hotel/fr/les-o.en-gb.h...,Les Ô du Panier,8.5,Managed by a private host,"43.29929325,5.36862872",43.29929325,5.36862872
823,Marseille,https://www.booking.com/hotel/fr/les-villages-...,Villages Clubs du Soleil - Marseille,8.2,"Offering a seasonal outdoor pool, a restaurant...","43.31284784,5.39198309",43.31284784,5.39198309


In [18]:
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1'}
result_long_description =[]
for link in df['hotel_url']:
    page = requests.get(link, headers = headers)
    soup_link = BeautifulSoup(page.content, "lxml")
    long_description = soup_link.find_all(id = "property_description_content") 
    for ld in long_description : 
        result_long_description.append(ld.text)

In [19]:
len(result_long_description)

825

In [20]:
df['hotel_description_long'] = result_long_description

In [21]:
df.head(55)

Unnamed: 0,location,hotel_url,hotel_name,hotel_rating,hotel_description_short,Latitude_Longitude,latitude,longitude,hotel_description_long
0,Bayeux,https://www.booking.com/hotel/fr/gite-6-47-9-p...,Le Chat Qui Veille,9.9,Managed by a private host,"49.28464200,-0.71455100",49.284642,-0.714551,\nYou're eligible for a Genius discount at Le ...
1,Bayeux,https://www.booking.com/hotel/fr/etap-bayeux.e...,ibis budget Bayeux,8.2,"Between Caen and Cherbourg, near the exit 36 o...","49.25424209,-0.64648747",49.25424209,-0.64648747,"\nBetween Caen and Cherbourg, near the exit 36..."
2,Bayeux,https://www.booking.com/hotel/fr/domaine-de-ba...,Domaine de Bayeux,9.4,"Located in the centre of Bayeux, the Domaine d...","49.27232560,-0.69851010",49.2723256,-0.6985101,"\nLocated in the centre of Bayeux, the Domaine..."
3,Bayeux,https://www.booking.com/hotel/fr/hoteldebrunvi...,Hôtel De Brunville et La Table du Grand Luxemb...,7.9,"Just 400 metres from Bayeux Cathedral, this ho...","49.27815769,-0.70351392",49.27815769,-0.70351392,\nYou're eligible for a Genius discount at Hôt...
4,Bayeux,https://www.booking.com/hotel/fr/reine-mathild...,Hotel Reine Mathilde,8.6,Hotel Reine Mathilde is situated in the mediev...,"49.27603159,-0.70172489",49.27603159,-0.70172489,\nHotel Reine Mathilde is situated in the medi...
5,Bayeux,https://www.booking.com/hotel/fr/chateau-saint...,Château Saint Gilles,9.7,"Situated in Bayeux, 4 km from Baron Gerard Mus...","49.24936600,-0.71246900",49.249366,-0.712469,"\nSituated in Bayeux, 4 km from Baron Gerard M..."
6,Bayeux,https://www.booking.com/hotel/fr/logis-du-gran...,LOGIS DU GRAND PIN,9.5,"LOGIS DU GRAND PIN has garden views, free WiFi...","49.25200360,-0.66984770",49.2520036,-0.6698477,"\nLOGIS DU GRAND PIN has garden views, free Wi..."
7,Bayeux,https://www.booking.com/hotel/fr/le-lion-d-or-...,Hotel Le Lion D'Or et Restaurant La Table Du Lion,8.7,This former 18th-century post office is now a ...,"49.27627832,-0.69836944",49.27627832,-0.69836944,\nThis former 18th-century post office is now ...
8,Bayeux,https://www.booking.com/hotel/fr/largo.en-gb.h...,largo,9.0,"Largo is situated in the centre of Bayeux, 200...","49.27631225,-0.70026888",49.27631225,-0.70026888,\nYou're eligible for a Genius discount at lar...
9,Bayeux,https://www.booking.com/hotel/fr/le-castel-nob...,Le Castel Guesthouse,8.7,Le Castel is a guesthouse located in the centr...,"49.27368314,-0.70322692",49.27368314,-0.70322692,\nLe Castel is a guesthouse located in the cen...


In [22]:
df['hotel_description_long'] = df['hotel_description_long'].replace('\n','', regex=True)

In [23]:
df['hotel_description_long'][0]

"You're eligible for a Genius discount at Le Chat Qui Veille! To save at this property, all you have to do is sign in.Situated 1 km from Cathedrale Notre Dame de Bayeux, Le Chat Qui Veille is in Bayeux and has barbecue facilities and a garden. Featuring garden views, this holiday home also offers free WiFi.The holiday home features 3 separate bedrooms, 2 bathrooms, a fully equipped kitchen with a dining area, and a living room with a flat-screen TV.The holiday home offers a children's playground. A terrace is available on site and hiking can be enjoyed within close proximity of Le Chat Qui Veille.Baron Gerard Museum is 1.4 km from the accommodation. Caen-Carpiquet Airport is located 27 km from the property and is accessible by a shuttle, free of charge. "

In [24]:
df['hotel_description_short'][0]

'Managed by a private host'

In [25]:
df.to_csv("Booking_First_page_ALL_cities.csv")