# Hotels.ng Web Scraping

### Import Libraries

In [1]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

### Startup the webdriver
Steps:
- startup the webdriver
- navigate to the target website
- use the .get method of the driver and pass in the url as the argument

In [2]:
# Raw string: "\W" and "\c" are not valid escape sequences, so a plain literal
# only works by accident and triggers an invalid-escape warning on newer Pythons.
driver = webdriver.Chrome(r"C:\Webdrivers\chromedriver.exe")

In [3]:
url = "https://hotels.ng/"
driver.get(url)

### Create a function that will insert a search term using string formatting

Steps: 
- Assign the url link to a template variable 
- Delete the keyword search term in the url and replace it with curly braces {}. That is where we are going to insert our search term. 
- Also replace every space in the search term with a plus(+) to conform with the url convention.
- Return the template with the search term inserted with string formatting


In [4]:
def get_url(search_term):
    """Build the hotels.ng listing URL for a location.

    Every space in ``search_term`` is swapped for ``+`` before the term is
    placed into the ``hotels-in-<term>/`` path segment.
    """
    # Splitting and re-joining on a literal space swaps each space for "+".
    plus_joined = "+".join(search_term.split(" "))
    return "https://hotels.ng/hotels-in-" + plus_joined + "/"

- Now we have a function that will generate a url based on the search term provided. Let's try it out.

In [5]:
url = get_url("oyo")
print(url)

https://hotels.ng/hotels-in-oyo/


In [6]:
driver.get(url)

### Making the soup

- This will give us access to the HTML elements of the website and represent it as a nested data structure.
- Beautiful Soup supports the HTML parser included in Python’s standard library, but it also supports a number of third-party Python parsers. One is the lxml parser. We use the lxml parser because it is faster than the built-in HTML parser.
- driver.page_source retrieves the HTML of the url we passed to driver.get(url). It is the same code we see in our browser when we click Inspect Element.
- find_all() returns all the tags and strings that match our filters.

In [7]:
soup = BeautifulSoup(driver.page_source, "lxml")

In [8]:
listings = soup.find_all("div", class_ = "col-xs-12 col-sm-8 row listing-hotels-details-box")

In [9]:
len(listings)

10

### Data we need from the website

- Name
- Address
- Facilities
- Prices
- Rating

### Create a prototype model to extract the hotel data

- Prototype the extraction of a single record. And then, apply the model to the entire record set. 
- Assign the first result to the variable card. 

In [10]:
card = listings[0]

In [11]:
name = card.find("div", class_ = "listing-hotels-details-property").text.replace("\n", "")

In [12]:
name

' Royal Cedars Hotels And Apartments'

In [13]:
address= card.find("p", class_ = "listing-hotels-address color-dark hidden-md hidden-lg").text.replace("\n", "")

In [14]:
facilities= card.find("div", class_ = "listing-hotels-facilities hidden-xs").text.replace("\n", "")

In [15]:
avg_price_per_night = card.find("p", class_ = "listing-hotels-prices-discount").text.replace("\n", "")

In [16]:
rating = card.find("div", class_ ="listing-hotels-rating-box").text.replace("\n", "")

### Put the prototype together
Steps to follow:

- Generalize the prototype model into a function and apply it to all the hotel listings on the page.
- Define a function called get_hotel_listings that accepts a card argument.
- Copy and paste all the code we've created to get the card data.
- Add error handling to the function.
- Organize the variables into a tuple and then assign it to a hotel_listings variable.
- And then return the hotel_listings variable

In [17]:
def get_hotel_listings(card):
    """Extract one hotel's details from a listing card.

    Args:
        card: A parsed listing element exposing ``find(tag, class_=...)``
            (a BeautifulSoup tag in this notebook).

    Returns:
        A ``(name, address, facilities, avg_price_per_night, rating)`` tuple
        of strings with newlines removed, or ``None`` when the card has no
        price element. Missing facilities or rating are reported as ``" "``.

    Raises:
        AttributeError: If the name or address element is missing.
    """

    def _text(tag, css_class):
        # find() yields None for a missing element, so .text raises
        # AttributeError; callers below decide how to handle that.
        return card.find(tag, class_=css_class).text.replace("\n", "")

    name = _text("div", "listing-hotels-details-property")
    address = _text("p", "listing-hotels-address color-dark hidden-md hidden-lg")

    # Facilities are optional: fall back to a blank instead of dropping the
    # hotel (the fallback used to be unreachable behind an early return).
    try:
        facilities = _text("div", "listing-hotels-facilities hidden-xs")
    except AttributeError:
        facilities = " "

    # A card without a price is skipped; the caller filters out None.
    try:
        avg_price_per_night = _text("p", "listing-hotels-prices-discount")
    except AttributeError:
        return None

    # Rating is optional as well; unrated hotels keep a blank rating.
    try:
        rating = _text("div", "listing-hotels-rating-box")
    except AttributeError:
        rating = " "

    return (name, address, facilities, avg_price_per_night, rating)

### Apply the prototype model to all listings on the page using a for loop
- Create an empty hotels [] list that would contain all of our extracted hotel data.
- Use the listings pattern we did above to collect all of the hotel listings on the page to iterate through. 

In [18]:
# Collect one record per listing card found on the current page.
hotels = []
listings = soup.find_all("div", class_="col-xs-12 col-sm-8 row listing-hotels-details-box")

for card in listings:
    hotel_lists = get_hotel_listings(card)
    # get_hotel_listings returns nothing for cards it skips; keep only real records.
    if not hotel_lists:
        continue
    hotels.append(hotel_lists)

### Print the first 5 listings

In [19]:
hotels[:5]

[(' Royal Cedars Hotels And Apartments',
  'Ibadan, Oyo - Valley View Inn; Plot 7, Block7, Alalubosa G.R.A I...',
  'RestaurantPoolBarGym',
  '₦23,000avg/night',
  '10.0 ExcellentFrom 1 review'),
 ('The Orchard Hotel',
  'Ibadan, Oyo - Opposite NDLEA By Golf Club, Onireke G.R.A',
  'InternetRestaurantBar',
  '₦12,000avg/night',
  '7.0 Very GoodFrom 37 reviews'),
 ('Joybam Hotel',
  'Ibadan, Oyo - Off Ring Road Ososami Behind Liberty Stadium Compl...',
  'Bar',
  '₦6,300avg/night',
  '7.7 Very GoodFrom 11 reviews'),
 ('Ayotoz Hotel',
  'Ibadan, Oyo - 3 Adegbite Street, Opposite Kenny Gee Plaza, UI Ga...',
  'RestaurantBar',
  '₦3,500avg/night',
  '6.2 Very GoodFrom 13 reviews'),
 ('Live Oak Suites',
  'Ibadan, Oyo - NO 8 Alabiamo Street Off Awolowo Avenue,Old Bodija...',
  'RestaurantBar',
  '₦6,000avg/night',
  '6.6 Very GoodFrom 26 reviews')]

### Getting to the next page

Steps: 

- Add the page query to the url using string formatting
- Then request the next page until we have extracted from all the pages in the website.
- For ease, we need to modify the get url function which we have already defined.
- Assign to a url variable template.format. And then pass in our search term as we did previously.
- Append a pair of curly braces {} to the url as a placeholder for the page number. String formatting will later fill in the number of the page we want.
- Finally, we would return the url.

In [20]:
def get_url(search_term):
    """Build a paginated hotels.ng listing URL template for a location.

    The returned string ends with an unfilled ``{}`` placeholder; the caller
    supplies the page number through ``str.format``.
    """
    # Spaces become "+" before the term goes into the path.
    slug = "+".join(search_term.split(" "))

    # The trailing "{}" is left in place for the page number.
    return "https://hotels.ng/hotels-in-" + slug + "/{}"

### Put everything together/ Scraping Multiple Pages

In [21]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """Return the hotels.ng results URL for a location, minus the page number.

    Spaces in ``search_term`` are replaced with ``+``. The result ends with a
    ``{}`` placeholder that the caller fills with a page number via
    ``str.format``.
    """
    # Map each space character to "+" in a single pass.
    encoded_term = search_term.translate({ord(" "): "+"})

    # Base listing path followed by the page placeholder.
    pieces = ("https://hotels.ng/hotels-in-", encoded_term, "/", "{}")
    return "".join(pieces)

def get_hotel_listings(card):
    """Extract one hotel's details from a listing card.

    Args:
        card: A parsed listing element exposing ``find(tag, class_=...)``
            (a BeautifulSoup tag in this script).

    Returns:
        A ``(name, address, facilities, avg_price_per_night, rating)`` tuple
        of strings with newlines removed, or ``None`` when the card has no
        price element. Missing facilities or rating are reported as ``" "``.

    Raises:
        AttributeError: If the name or address element is missing.
    """

    def _text(tag, css_class):
        # find() yields None for a missing element, so .text raises
        # AttributeError; callers below decide how to handle that.
        return card.find(tag, class_=css_class).text.replace("\n", "")

    name = _text("div", "listing-hotels-details-property")
    address = _text("p", "listing-hotels-address color-dark hidden-md hidden-lg")

    # Facilities are optional: fall back to a blank instead of dropping the
    # hotel (the fallback used to be unreachable behind an early return).
    try:
        facilities = _text("div", "listing-hotels-facilities hidden-xs")
    except AttributeError:
        facilities = " "

    # A card without a price is skipped; the caller filters out None.
    try:
        avg_price_per_night = _text("p", "listing-hotels-prices-discount")
    except AttributeError:
        return None

    # Rating is optional as well; unrated hotels keep a blank rating.
    try:
        rating = _text("div", "listing-hotels-rating-box")
    except AttributeError:
        rating = " "

    return (name, address, facilities, avg_price_per_night, rating)

def main(search_term, *, max_pages=9, output_path="hotels_in_rivers.csv"):
    """Scrape hotels.ng listings for a location and save them to a CSV file.

    Args:
        search_term: Location handed to ``get_url`` to build the page URL.
        max_pages: Number of result pages to request, starting at page 1.
        output_path: CSV file to write; it is opened in ``"w"`` mode, so an
            existing file is overwritten.
    """
    # Build the URL template first so a bad search term fails before a
    # browser has been launched.
    url = get_url(search_term)

    # Raw string: "\W" and "\c" are not valid escape sequences.
    # NOTE(review): recent Selenium releases may no longer accept the driver
    # path as a positional argument - confirm against the installed version.
    driver = webdriver.Chrome(r"C:\Webdrivers\chromedriver.exe")

    # Local result list; the loop below must append to this same name.
    hotels = []

    try:
        for page in range(1, max_pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, "lxml")
            listings = soup.find_all("div", class_="col-xs-12 col-sm-8 row listing-hotels-details-box")

            for card in listings:
                hotel_lists = get_hotel_listings(card)
                # Cards the extractor skipped come back empty; keep the rest.
                if hotel_lists:
                    hotels.append(hotel_lists)
    finally:
        # Always shut the browser session down, even if a page load or
        # parse step raised.
        driver.quit()

    # save the data to a csv file
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['name', 'address', 'facilities', 'avg_price_per_night', 'rating'])
        writer.writerows(hotels)

Now that everything is created, we can now run the main program. 

In [22]:
main('rivers')