# Scraping Hotel Ratings on Tripadvisor

In this homework we will practice web scraping. Let's get some basic information for each hotel in Boston.

On each hotel page, scrape the Traveler ratings. **(10 pts)**

![Information to be scraped](traveler_ratings.png)

Save the data in "traverler_ratings.csv" in the following format:

hotel_name, rating, count

In [1]:
from bs4 import BeautifulSoup
import sys
import requests
import time
# Site root and the browser User-Agent string sent with every request.
base_url = "http://www.tripadvisor.com"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36"

# Fetch the Boston landing page once and keep its body as UTF-8 bytes.
url = "/".join([base_url, "Boston"])
headers = dict([('User-Agent', user_agent)])
response = requests.get(url, headers=headers)
html = response.text.encode('utf-8')

In [2]:
# TripAdvisor serves each page in several languages; here we return the URL of the English version
def get_city_url(city):
    """Return the URL of the English-language page for *city*.

    Requests ``base_url + "/" + city``, saves the raw response body to
    ``<city>-search-page.json`` in the working directory, and returns the
    ``href`` of the page's ``<link hreflang="en">`` element.

    Args:
        city: City name as it appears in the URL path (e.g. ``"Boston"``).

    Returns:
        The ``href`` attribute of the ``<link hreflang="en">`` element.

    Raises:
        ValueError: If the page has no ``<link hreflang="en">`` element.
    """
    url = base_url + "/" + city

    # Given the url, request the HTML page
    response = requests.get(url, headers={'User-Agent': user_agent})
    html = response.text.encode('utf-8')

    # Save a copy of the raw page, named after the requested city.
    # The ".json" suffix is kept from the original naming even though the
    # content is parsed as HTML below.
    with open(city + '-search-page.json', "wb") as h:
        h.write(html)

    soup = BeautifulSoup(html, 'lxml')
    li = soup.find("link", {"hreflang": "en"})
    if li is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of "'NoneType' object is not subscriptable".
        raise ValueError("no English-language link found on " + url)
    return li['href']

In [3]:
# Get Hotel Url
def get_hotel_url(city_url):
    """Return the ``href`` of the hotels link found on the city page.

    Downloads ``city_url``, writes the raw response to
    ``Boston-en-page.html`` and returns the ``href`` of the first ``<a>``
    inside the ``<li class="hotels twoLines">`` element.

    Args:
        city_url: Full URL of the city page to download.

    Returns:
        The ``href`` attribute of the hotels anchor.
    """
    # Given the url, request the HTML page
    page = requests.get(city_url, headers={'User-Agent': user_agent})
    html = page.text.encode('utf-8')

    # Save to file
    with open('Boston-en-page.html', "wb") as out_file:
        out_file.write(html)

    hotels_item = BeautifulSoup(html, 'lxml').find(
        "li", {"class": "hotels twoLines"})
    return hotels_item.find('a', href=True)['href']

In [12]:
def get_hotel_list_page(city_url):
    """Download one page of the hotel listing and return it as UTF-8 bytes.

    Args:
        city_url: Site-relative path of the listing page; it is appended to
            ``base_url`` to form the request URL.

    Returns:
        The response body encoded as UTF-8 bytes. The same bytes are also
        written to ``Boston-hotel_list.html`` (overwritten on every call).
    """
    url = base_url + city_url
    print(url)

    # Sleep 2 sec before starting a new http request
    time.sleep(2)

    page = requests.get(url, headers={'User-Agent': user_agent})
    html = page.text.encode('utf-8')

    # Save the most recently fetched listing page
    with open('Boston-hotel_list.html', "wb") as out_file:
        out_file.write(html)
    return html

In [5]:
def get_hotel_list(html, result):
    """Append ``[name, href]`` for every hotel on a listing page to *result*.

    Args:
        html: Markup of one hotel listing page.
        result: List that is extended in place with one ``[name, href]``
            entry per ``div.listing.easyClear.p13n_imperfect`` element.
    """
    soup = BeautifulSoup(html, 'lxml')
    for box in soup.select('div.listing.easyClear.p13n_imperfect'):
        # The title div holds both the display name (its first text node)
        # and the link to the hotel's detail page.
        title = box.find('div', {'class': 'listing_title'})
        name = title.find(text=True)
        link = title.find('a', href=True)
        result.append([name, link['href']])

In [6]:
def get_next_page(html):
    """Return the href of the listing's "Next" page, or ``False`` if none.

    Args:
        html: Markup of one hotel listing page.

    Returns:
        The ``href`` of the pagination link whose text is ``'Next'``, or
        ``False`` when the pagination block is missing, the "next" button is
        disabled, or no such link exists. Callers compare the result with
        ``is False``, so every "no next page" case must return exactly
        ``False`` (never ``None``).
    """
    soup = BeautifulSoup(html, 'lxml')

    div = soup.find("div", {"class": "unified pagination standard_pagination"})
    if div is None:
        # No pagination block on the page: there is nothing to follow.
        return False

    # A disabled "next" button marks the last page.
    if div.find('span', {'class': 'nav next ui_button disabled'}):
        return False

    for link in div.find_all('a', href=True):
        if link.find(text=True) == 'Next':
            return link['href']

    # No "Next" anchor matched; report "no next page" explicitly instead of
    # falling off the end and returning None.
    return False

In [7]:
def get_hotel_detail_page(hotel_url):
    """Download a page given its site-relative path and return UTF-8 bytes.

    Args:
        hotel_url: Site-relative path; it is appended to ``base_url``.

    Returns:
        The response body encoded as UTF-8 bytes. The same bytes are also
        written to ``Boston-hotel_detail.html`` (overwritten on every call).
    """
    target = base_url + hotel_url

    # Sleep 2 sec before starting a new http request
    time.sleep(2)

    page = requests.get(target, headers={'User-Agent': user_agent})
    html = page.text.encode('utf-8')

    # Save the most recently fetched page
    with open('Boston-hotel_detail.html', "wb") as out_file:
        out_file.write(html)
    return html

In [14]:
# city_url = get_city_url("Boston")
# hotel_url = get_hotel_url(city_url)
# Start from the first page of the Boston hotel listing and follow the
# "Next" links, collecting [name, href] pairs into `result`.
hotel_url = "/Hotels-g60745-Boston_Massachusetts-Hotels.html"

result = []
while hotel_url is not False:
    html = get_hotel_list_page(hotel_url)
    get_hotel_list(html, result)
    # get_next_page returns False once the last page has been reached.
    hotel_url = get_next_page(html)

http://www.tripadvisor.com/Hotels-g60745-Boston_Massachusetts-Hotels.html
http://www.tripadvisor.com/Hotels-g60745-oa30-Boston_Massachusetts-Hotels.html#ACCOM_OVERVIEW
http://www.tripadvisor.com/Hotels-g60745-oa60-Boston_Massachusetts-Hotels.html#ACCOM_OVERVIEW


In [15]:
def parse_hotel_list(html, traveler_ratings, hotel_name):
    """Extract the rating-filter values of one hotel page.

    Appends a single list ``[hotel_name, v1, v2, ...]`` to
    *traveler_ratings*, where each ``v`` is the right-stripped text of the
    third ``<span>`` inside the ``<label>`` of an ``<li>`` under
    ``div#ratingFilter`` (presumably the review count for that rating level
    -- confirm against the page markup).

    Args:
        html: Markup of a hotel detail page.
        traveler_ratings: List that receives the new row (mutated in place).
        hotel_name: Value stored as the first element of the row.
    """
    soup = BeautifulSoup(html, 'lxml')
    rating_filter = soup.find('div', {'id': 'ratingFilter'})
    row = [hotel_name] + [
        item.find('label').find_all('span')[2].text.rstrip()
        for item in rating_filter.select('li')
    ]
    traveler_ratings.append(row)

In [16]:
# Visit every hotel found by the listing crawl and collect one ratings row
# per hotel.
traveler_ratings = []
for entry in result:
    hotel_name, hotel_url = entry[0], entry[1]
    html = get_hotel_detail_page(hotel_url)
    parse_hotel_list(html, traveler_ratings, hotel_name)


In [19]:
import csv

# Write one (hotel_name, rating, count) row per rating level and hotel.
# Positions 1-5 of each entry map, in order, to the five labels below.
with open('traverler_ratings.csv', 'w', newline='') as fp:
    wr = csv.writer(fp)
    wr.writerow(['hotel_name', 'rating', 'count'])
    for entry in traveler_ratings:
        name = entry[0]
        for position, rates in enumerate(
                ('excellent', 'very good', 'average', 'poor', 'terrible'),
                start=1):
            wr.writerow([name, rates, entry[position]])

-------

Next, scrape all the reviews of each hotel for the star ratings of the following attributes: Value, Location, Sleep Quality, Rooms, Cleanliness, Service. Note that some reviews may not have attribute ratings and some may only have some of the attributes. **(25 pts)**

![Information to be scraped](attribute_ratings.png)

Save the data in "attribute_ratings.csv" in the following format:

hotel_name, review_id, attribute, star_value

In [24]:
def get_next_review_page(html):
    """Return the href of the next review page, or ``False`` if none.

    Args:
        html: Markup of one review page.

    Returns:
        The ``href`` of the pagination link whose text is ``'Next'``, or
        ``False`` when the pagination block is missing, the "next" button is
        disabled, or no such link exists. Callers compare the result with
        ``is False``, so every "no next page" case must return exactly
        ``False`` (never ``None``).
    """
    soup = BeautifulSoup(html, 'lxml')

    # The trailing space in the class string is kept exactly as the original
    # lookup had it.
    div = soup.find("div", {"class": "unified pagination "})
    if div is None:
        # No pagination block on the page: there is nothing to follow.
        return False

    # A disabled "next" button marks the last page.
    if div.find('span', {'class': 'nav next disabled'}):
        return False

    for link in div.find_all('a', href=True):
        if link.find(text=True) == 'Next':
            return link['href']

    # No "Next" anchor matched; report "no next page" explicitly instead of
    # falling off the end and returning None.
    return False

In [25]:
def redirect_review_page(html):
    """Follow the first review's link and return the page it points to.

    Looks at the first ``div.reviewSelector`` on the page, finds the first
    ``<a href>`` inside its ``div.innerBubble`` and downloads that target
    with ``get_hotel_detail_page``.

    Args:
        html: Markup of a hotel detail page.

    Returns:
        The downloaded page as returned by ``get_hotel_detail_page``, or
        ``None`` when the page has no review box, the first box has no
        ``innerBubble`` div, or that div contains no link.
    """
    soup = BeautifulSoup(html, 'lxml')

    # select() returns a (possibly empty) list, never None, so test for
    # emptiness; indexing [0] on an empty list would raise IndexError.
    review_boxes = soup.select('div.reviewSelector')
    if not review_boxes:
        return None

    div = review_boxes[0].find('div', {'class': 'innerBubble'})
    if div is None:
        return None

    url = div.find('a', href=True)
    if url is None:
        return None

    return get_hotel_detail_page(url['href'])

In [38]:
def parse_review_page(html, review_list, hotel_name):
    """Collect the per-attribute star ratings of every review on a page.

    For each ``div.reviewSelector`` that contains a ``div.rating-list`` with
    at least one ``div.recommend-description`` label, appends a dict to
    *review_list* with the keys ``'hotel_name'`` and ``'review_id'`` plus
    one ``{label text: star value}`` entry per rated attribute. Reviews
    without attribute ratings are skipped.

    Args:
        html: Markup of one review page.
        review_list: List that receives the dicts (mutated in place).
        hotel_name: Value stored under ``'hotel_name'`` in every dict.
    """
    soup = BeautifulSoup(html, 'lxml')
    for review_box in soup.select('div.reviewSelector'):
        # Drop the first 7 characters of the element id (presumably a
        # "review_" prefix -- confirm against the page markup).
        review_id = review_box.get('id')[7:]
        data = {'hotel_name': hotel_name, 'review_id': review_id}

        rating_list = review_box.find('div', {'class': 'rating-list'})
        if rating_list is None:
            continue

        # select()/find_all() return lists (possibly empty), never None.
        spans = rating_list.select('span.rate.sprite-rating_ss.rating_ss')
        keys = rating_list.find_all('div', {'class': 'recommend-description'})
        if not keys:
            continue

        # Pair labels with rating spans positionally; zip stops at the
        # shorter sequence, so a mismatch cannot raise IndexError.
        for key_div, span in zip(keys, spans):
            img = span.find('img')
            if img is None:
                # No image to read the rating from; skip this attribute.
                continue
            # The first whitespace-separated token of the alt text is
            # stored as the star value.
            data[key_div.getText()] = img['alt'].split()[0]

        review_list.append(data)


In [39]:
import collections

def write_csv(review_list, c):
    """Write the attribute ratings of one hotel to ``review_ratings_<c>.csv``.

    Each review dict produces one row
    ``hotel_name, review_id, attribute, star_value`` for every attribute it
    contains, in the fixed order Value, Sleep Quality, Service, Cleanliness,
    Location, Rooms. No header row is written.

    Args:
        review_list: Dicts with the keys ``'hotel_name'`` and
            ``'review_id'`` plus any subset of the attribute names above.
        c: Value used to build the output file name.
    """
    attributes = ('Value', 'Sleep Quality', 'Service',
                  'Cleanliness', 'Location', 'Rooms')

    with open('review_ratings_' + str(c) + '.csv', 'w', newline='') as fp:
        w = csv.writer(fp)
        for x in review_list:
            for attribute in attributes:
                # The membership test and the lookup use the same key, so
                # 'Cleanliness' is no longer dropped by a lowercase check.
                if attribute in x:
                    w.writerow([x['hotel_name'], x['review_id'],
                                attribute, x[attribute]])

In [43]:
# Crawl all review pages of every hotel and write one CSV per hotel.
# `c` is the running hotel index used in the CSV file name; hotels whose
# review page cannot be located are recorded in `error_list`.
c = 0
error_list = []
for x in result:
    review_list = []
    hotel_name, hotel_url = x[0], x[1]
    html = get_hotel_detail_page(hotel_url)
    html = redirect_review_page(html)
    print("Start capture reviews " + hotel_name + " from " + hotel_url)
    if html is not None:
        # Parse the first review page, then keep following "Next" links
        # until get_next_review_page reports False.
        parse_review_page(html, review_list, hotel_name)
        hotel_url = get_next_review_page(html)
        while hotel_url is not False:
            html = get_hotel_detail_page(hotel_url)
            parse_review_page(html, review_list, hotel_name)
            hotel_url = get_next_review_page(html)

        write_csv(review_list, c)
        print("Success capture Reviews for " + x[0] + " from " + x[1])
    else:
        print("Skip" + str(c))
        error_list.append(c)
    c += 1
    

Start capture reviewsMarriott Vacation Club Pulse at Custom House, Boston from /Hotel_Review-g60745-d94344-Reviews-Marriott_Vacation_Club_Pulse_at_Custom_House_Boston-Boston_Massachusetts.html
Success capture Reviews for Marriott Vacation Club Pulse at Custom House, Boston from /Hotel_Review-g60745-d94344-Reviews-Marriott_Vacation_Club_Pulse_at_Custom_House_Boston-Boston_Massachusetts.html
Start capture reviewsBoston Harbor Hotel from /Hotel_Review-g60745-d89575-Reviews-Boston_Harbor_Hotel-Boston_Massachusetts.html
Success capture Reviews for Boston Harbor Hotel from /Hotel_Review-g60745-d89575-Reviews-Boston_Harbor_Hotel-Boston_Massachusetts.html
Start capture reviewsSeaport Boston Hotel from /Hotel_Review-g60745-d94330-Reviews-Seaport_Boston_Hotel-Boston_Massachusetts.html
Success capture Reviews for Seaport Boston Hotel from /Hotel_Review-g60745-d94330-Reviews-Seaport_Boston_Hotel-Boston_Massachusetts.html
Start capture reviewsFour Seasons Hotel Boston from /Hotel_Review-g60745-d895

In [50]:
def redirect_err_review_page(html):
    """Follow the first review's link without any missing-element checks.

    Performs the same lookup chain as ``redirect_review_page`` (first
    ``div.reviewSelector`` -> ``div.innerBubble`` -> first ``<a href>``) but
    lets a missing element raise instead of returning ``None``.

    Args:
        html: Markup of a hotel detail page.

    Returns:
        The page downloaded by ``get_hotel_detail_page`` for the link target.
    """
    first_review = BeautifulSoup(html, 'lxml').select('div.reviewSelector')[0]
    bubble = first_review.find('div', {'class': 'innerBubble'})
    link = bubble.find('a', href=True)
    return get_hotel_detail_page(link['href'])

In [51]:
# Re-run the review crawl for the single hotel at index 76, using the
# unchecked redirect helper so that a missing element raises instead of
# being skipped.
review_list = []
c = 76
x = result[c]
hotel_name, hotel_url = x[0], x[1]
html = get_hotel_detail_page(hotel_url)
html = redirect_err_review_page(html)
print("Start capture reviews " + hotel_name + " from " + hotel_url)

# Parse the first review page, then follow "Next" links until none is left.
parse_review_page(html, review_list, hotel_name)
hotel_url = get_next_review_page(html)
while hotel_url is not False:
    html = get_hotel_detail_page(hotel_url)
    parse_review_page(html, review_list, hotel_name)
    hotel_url = get_next_review_page(html)

write_csv(review_list, c)
print("Success capture Reviews for " + x[0] + " from " + x[1])
c += 1

Start capture reviews Ramada Boston from /Hotel_Review-g60745-d217148-Reviews-Ramada_Boston-Boston_Massachusetts.html
Success capture Reviews for Ramada Boston from /Hotel_Review-g60745-d217148-Reviews-Ramada_Boston-Boston_Massachusetts.html


In [55]:
# Concatenate the per-hotel files review_ratings_0.csv .. review_ratings_81.csv
# into attribute_ratings.csv under a single header row.
# NOTE(review): 82 is hard-coded -- presumably the number of hotels in
# `result`; confirm before re-running on a fresh crawl.
with open('attribute_ratings.csv', 'w', newline='') as fp:
    w = csv.writer(fp)
    w.writerow(['hotel_name', 'review_id', 'attribute', 'star_value'])
    for i in range(82):
        with open('review_ratings_' + str(i) + '.csv', newline='') as fd:
            w.writerows(csv.reader(fd))
        

-------