# Scraping Hotel Ratings on Booking # 

In this homework we will practice web scraping on the following [site](https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Cancún&rows=15). Let's get some basic information for each hotel in Boston.
On each hotel page, scrape the following information: 
1. Hotel Name
2. Class of Rating (Wonderful/Excellent/Very Good/Good)
3. Rating Score
4. Number of Reviews


** Save the data in "traveler_ratings.csv" in the following format: hotel_name, class_of_rating, rating, num_reviews **

**(10 pts)**

You can see an overview of the information as displayed:





![Information to be scraped](booking_sample.png)

In [2]:
from bs4 import BeautifulSoup as soup
import requests
import time 
import pandas as pd
import csv

# https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Canc%C3%BAn&rows=15
# http://www.booking.com/Boston

#url = "https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Canc%C3%BAn&rows=15"

#response = requests.get(url)
#html = response.text.encode('utf-8')



def get_hotellist_page(city_url):
    """ Get the hotel list page given the url returned by
	get_city_page().
    """

    # Sleep 2 sec before starting a new http request
    time.sleep(0.5)
    # Request page
    response = requests.get(city_url)
    html = response.text.encode('utf-8')
    # Save the
    #with open(os.path.join(args.datadir, args.city + '-hotelist-' + str(count) + '.html'), "w") as h:
     #   h.write(str(html))
    return html



def parse_hotellist_page(html):
    
    page_soup = soup(html, 'lxml')

    hotel_boxes = page_soup.findAll('div', {'class' : "sr_item_default"})

    #filename = "traveler_ratings.csv"
    
    #f = open(filename, "w")

    #headers = "Hotel_name,Class_of_rating,Rating_score,Num_reviews\n"
    
    #f.write(headers)
    
    Class_of_rating_list = ["Wonderful", "Excellent", "Very Good", "Good"]
    
    for hotel_box in hotel_boxes:
        try:
            Hotel_name = hotel_box.find('span', {'class' : 'sr-hotel__name'}).find(text=True).strip()
    
            
            Class_of_rating = hotel_box.find('span', {'class' : 'review-score-widget__text'}).find(text=True).strip()
            if Class_of_rating not in Class_of_rating_list:
                Class_of_rating = "N/A"
    
            Rating_score = hotel_box.find('span', {'class' : 'review-score-badge'}).find(text=True).strip()
    
            Num_reviews = hotel_box.find('span', {'class' :'review-score-widget__subtext'}).find(text=True).strip().split()[0]
    
        except Exception as e:
            Hotel_name = "N/A"
            Class_of_rating = "N/A"
            Rating_score = "N/A"
            Num_reviews = "N/A"
            
        data_list.append([Hotel_name,Class_of_rating,Rating_score,Num_reviews])
        
       # f.write(Hotel_name.replace(","," ") + "," + Class_of_rating + "," + Rating_score + "," + Num_reviews + "\n")
        
    #f.close()
    
    # Get next URL page if exists, else exit
    div = page_soup.find("div", {"class" : "results-paging"})

    # check if last page
    if div.find('span', {'class' : 'paging-end'}):
        print("We reached last page")
        return False
    
    # If it is not last page there must be the Next URL
    hrefs = div.findAll('a', href= True)

    for href in hrefs:
        
        if href.find(text = True) == 'Next page':
            print("Next url is %s" % href['href'])
            return href['href']

 
data_list = [] 
city_url = "https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Canc%C3%BAn&rows=15"
c=0
while(city_url):    
    c +=1  
    print(c)
    html = get_hotellist_page(city_url)
    city_url = parse_hotellist_page(html)

df = pd.DataFrame(data_list)
df.columns = ['Hotel_name', 'Class_of_rating', 'Rating_score', 'Num_reviews']
print(df)

with open("traveler_ratings.csv","w") as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(df.columns)
    writer.writerows(data_list)

1
Next url is https://www.booking.com/searchresults.html?dest_id=20061717;dest_type=city;ss=Boston;offset=15
2
Next url is https://www.booking.com/searchresults.html?dest_id=20061717;dest_type=city;ss=Boston;offset=30
3
Next url is https://www.booking.com/searchresults.html?dest_id=20061717;dest_type=city;ss=Boston;offset=45
4
Next url is https://www.booking.com/searchresults.html?dest_id=20061717;dest_type=city;ss=Boston;offset=60
5
Next url is https://www.booking.com/searchresults.html?dest_id=20061717;dest_type=city;ss=Boston;offset=75
6
Next url is https://www.booking.com/searchresults.html?dest_id=20061717;dest_type=city;ss=Boston;offset=90
7
Next url is https://www.booking.com/searchresults.html?dest_id=20061717;dest_type=city;ss=Boston;offset=105
8
Next url is https://www.booking.com/searchresults.html?dest_id=20061717;dest_type=city;ss=Boston;offset=120
9
Next url is https://www.booking.com/searchresults.html?dest_id=20061717;dest_type=city;ss=Boston;offset=135
10
Next url is h

Now let's scrape some reviews. For each review of each each hotel in Boston you are to scrape the following attributes: 
1. Reviewer name
2. Reviewer ethnicity
3. Number of reviews 
4. Number of helpful votes
5. Date
6. Rating
7. Negative Review
8. Positive Review

Note that you will also need the hotel's name!! Also, some reviews may not have all attributes. 

** Save the data in "review_ratings.csv" in the following format: hotel_name, reviewer_name, ethnicity, num_reviews, num_help_votes, date, rating, neg_review, pos_review **

**(25 pts)**

You can see an overview of the information as displayed:
![Information to be scraped](review_sample.png)

In [3]:
import re

def get_hotellist_link_page(city_url):
    """ Get the hotel list page given the url returned by
	get_city_page(). Return the html after saving
	it to the datadir 
    """

    # Sleep 2 sec before starting a new http request
    time.sleep(0.5)
    # Request page
    response = requests.get(city_url)
    html = response.text.encode('utf-8')
    review_soup = soup(html, "lxml")
    
    li = review_soup.findAll('a', {'class' :'hotel_name_link url'})
    output_list = [] 

    for el in li:
        link = el['href'].strip()
        if len(link.split('/')) > 2:
            hotel_link = link.split('#')[0]
            output_list.append(hotel_link)
            
    print(output_list)
    return output_list
            
        


def get_hotellist_review_page(city_url):
    """ Get the hotel list page given the url returned by
	get_city_page(). Return the html after saving
	it to the datadir 
    """

    # Sleep 2 sec before starting a new http request
    time.sleep(0.5)
    # Request page
    url = "http://www.booking.com" + city_url
    print(url)
    response = requests.get(url)
    html = response.text.encode('utf-8')
    review_soup = soup(html, "lxml")
    li = review_soup.findAll('a', href=True)
    print(city_url)
    for el in li:
        if el.find(text=re.compile('See all guest reviews for')):
            return el['href']



def parse_hotellist_review_page(url):
    url = "http://www.booking.com" + url
    #print("------------")
    print(url)
    response = requests.get(url)
    html = response.text.encode('utf-8')
    page_soup = soup(html, "lxml")
    
    review_lists = page_soup.findAll('li', {'class' : "review_item"})
    for review_list in review_lists:
        try:
            
            reviewer_name = review_list.find('h4').find(text=True).strip()
            
            ethnicity = review_list.find('span', {'class' : 'reviewer_country'}).text.strip()
            
            num_reviews = review_list.find('div', {'class' :'review_item_user_review_count'})
            
            num_help_votes = review_list.find('div', {'class' :'review_item_user_helpful_count'}).find(text=True).strip().split()[0]
            
            date = review_list.find('p', {'class' :'review_item_date'}).find(text=True).strip()
            
            rating = review_list.find('span', {'class' :'review-score-badge'}).find(text=True).strip()
            
            neg_review = review_list.find('p', {'class' :'review_neg'}).contents[1]
           # if neg_review == "":
            #    neg_review = "N/A"
            
            pos_review = review_list.find('p', {'class' :'review_pos'}).contents[1]
            #if pos_review == "":
             #   pos_review = "N/A"
        
        except Exception as e:
            reviewer_name = "N/A"
            ethnicity = "N/A"
            num_reviews = "N/A"
            num_help_votes = "N/A"
            date = "N/A"
            rating = "N/A"
            neg_review = "N/A"
            pos_review = "N/A"
            
        review_data_list.append([reviewer_name, ethnicity, num_reviews, num_help_votes, date, rating, neg_review, pos_review])
        
            # Get next URL page if exists, else exit
    div = page_soup.find('div', {'class' : 'review_list_pagination'})
    
        # check if last page
    if div.find('p', {'class' : 'page_link review_next_page'}).text.strip() == "":
        print("We reached last page")
        return False
        
        # If it is not last page there must be the Next URL
    hrefs = div.findAll('a', href= True)
    
    for href in hrefs:
            
        if href.find(text = True) == 'Next page':
            print("Next url is %s" % href['href'])
            return href['href']


review_data_list = [] 
city_url = "https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Canc%C3%BAn&rows=15"
c=0

#get_hotellist_review_page(city_url)

while(city_url):    
    c +=1  
    print(city_url)
    url_list = get_hotellist_link_page(city_url)
    for url_hotel in url_list:
        html = get_hotellist_review_page(url_hotel)
        city_url = parse_hotellist_review_page(html)

df_review = pd.DataFrame(data_list)
df_review.columns = ['reviewer_name', 'ethnicity', 'num_reviews', 'num_help_votes', 'date', 'rating', 'neg_review', 'pos_review']

with open("review_ratings.csv","w") as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(df_review.columns)
    writer.writerows(review_data_list)
    

https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Canc%C3%BAn&rows=15
['/hotel/us/14-gloucester-st-2b-by-lyon-apartments-boston.html\n', '/hotel/us/seaport.html\n', '/hotel/us/clearway-street-by-boston-furnished-rooms.html\n', '/hotel/us/14-gloucester-st-2a-by-lyon-apartments-boston.html\n', '/hotel/us/oakwood-boston.html\n', '/hotel/us/luxury-apartments-steps-away-from-china-town-and-theater-district.html\n', '/hotel/us/the-colonnade.html\n', '/hotel/us/the-c-house-boston-massachusettes.html\n', '/hotel/us/longfellow-by-stay-alfred.html\n', '/hotel/us/battery-wharf-hotel.html\n', '/hotel/us/the-envoy-autograph-collection.html\n', '/hotel/u

TypeError: must be str, not NoneType