In [1]:
from bs4 import BeautifulSoup
import requests
import os
import re
import pymongo
from pymongo import MongoClient

In [2]:
cluster = MongoClient('localhost',27017)
db = cluster['hotel']
collection = db['decriptions_reviews']

In [3]:
class hotel():
    # Returns a dictionary of description and reviews using a hotel's html file
    
    def __init__(self,html_link):
        self.hotel_data = {}
        self.link = html_link
        self.hotel_data["link"] = self.link
        
        # Declare the parser for the file
        html_file = requests.get(html_link).text
        soup = BeautifulSoup(html_file,'lxml')
        
        # Find all description like objects
        desc_like_class = soup.find_all("div", class_="ssr-init-26f")

        # Find all description in these objects
        for des_class in desc_like_class:
            desc = des_class.find("div", class_="cPQsENeY")
            if desc!=None:
                self.hotel_data["description"]=desc.text
        
        # Find all review like objects
        review_like_class = soup.find_all("div", class_="_3hDPbqWO")

        # Find all reviews in these objects
        for review_class in review_like_class:
            review = review_class.find("div", class_="cPQsENeY")
            if review!=None:
                if "review" not in self.hotel_data.keys():
                    self.hotel_data["review"]=[]
                self.hotel_data["review"].append([review.text]) 

In [4]:
# class hotel_list():
#     # Stores a list of all hotel link in a search page. Grabs the descriptions and reviews of the hotels in that page
#     # ______
#     # Attributes:
#     # html_search_file: A page with list of 30 hotel listed
#     # hotel_links: A list of all hotels in that search page
#     # hotel_dr: List of all descriptions and reviews of hotels found in that page.
        
#     def __init__(self,html_search_files):
        
#         self.html_search_file = html_search_file
#         self.hotel_links = []
#         self.hotel_dr = []
#         self.prefix = "https://www.tripadvisor.in/"
        
#         for search_file in html_search_files:
#             self.get_hotel_links()
#             self.get_hotel_data()

#     def get_hotel_links(self):  
#         soup = BeautifulSoup(self.html_search_file,'lxml')
#         hotel_page = soup.find_all("div", class_="listing_title")
#         for h in hotel_page:
            
#             link = self.prefix+h.h2.a["href"]
#             self.hotel_links.append(link)
        
#     def get_hotel_data(self):
#         for i,h in enumerate(self.hotel_links):
#             html_file = requests.get(h).text
#             ho = hotel(html_file)
#             self.hotel_dr.append(ho)
#             print("Completed ",i+1," hotels")

In [5]:
# class search_files():
    
#     def __init__(self,master_page):
#         self.orig_page = master_page
#         self.pages = []
#         self.get_all_page()

#     def get_all_page(self):
#         locs = [m.start() for m in re.finditer('-', self.orig_page)]
#         self.pages.append(self.orig_page)
        
#         for i in range(1,10):
#             insert_string = "-oa"+str(30*i)
#             new_page = self.orig_page[:locs[1]] + insert_string + self.orig_page[locs[1]:]
#             self.pages.append(new_page)

In [19]:
class hotel_scrapper():
    
    def __init__(self):
        
        self.pages = []
        self.n_similar_pages = 0       
        self.hotel_links = []
        self.hotel_dr = []
        self.last_hotel_index = 0
        self.prefix = "https://www.tripadvisor.in/"
        
    def get_template_pages(self,master_page,n_pages=2):
        self.orig_page = master_page
        self.n_similar_pages = n_pages
        
        self.__get_pages()
        self.get_page_hotel_list()
        
    def __get_pages(self):
        # Creates a list of pages where each page has 30 list of hotels
        
        # Find locations of hypherns. Select the second location to add prefix to links for next pages
        locs = [m.start() for m in re.finditer('-', self.orig_page)]
        
        # Add the first page to page list
        self.pages.append(self.orig_page)
        
        # Add the next N pages to page list
        for i in range(1,self.n_similar_pages):
            insert_string = "-oa"+str(30*i)
            new_page = self.orig_page[:locs[1]] + insert_string + self.orig_page[locs[1]:]
            self.pages.append(new_page) 
            
    def get_page_hotel_list(self):
        # For each listed hotel page containing 30 hotels, gets the list of links to each hotel. Then goes to each of them and gets the data
        
        for search_file in self.pages[-self.n_similar_pages:]:
            self.get_hotel_links(search_file)
            self.get_hotel_data()
            
            
    def get_hotel_links(self,search_file)  : 
        # Retrieves the list of 30 hotel's page from a listed hotel page containing 30 hotels
        html_search_file = requests.get(search_file).text
        soup = BeautifulSoup(html_search_file,'lxml')
        
        hotel_page = soup.find_all("div", class_="listing_title")
        
        for h in hotel_page:
            hotel_name = h.find('a')
            link = self.prefix+hotel_name["href"]
#             link = self.prefix+h.h2.a["href"]
            self.hotel_links.append(link)
    
    def get_hotel_data(self):
        # For each hotel page, gets description and reviews
        for i,h_link in enumerate(self.hotel_links[self.last_hotel_index:]):
            
            ho = hotel(h_link)
            self.hotel_dr.append(ho.hotel_data)
            # Add to collection on MongoDB
            collection.insert_one(ho.hotel_data)
            print("Adding Hotel Number ",i+1," in page")
        self.last_hotel_index = self.last_hotel_index + len(self.hotel_links[self.last_hotel_index:])
        print("Total Hotels reviewed: ",len(self.hotel_links))

In [20]:
scrapped = hotel_scrapper()

In [21]:
orig_links = ['https://www.tripadvisor.in/Hotels-g297586-Hyderabad_Hyderabad_District_Telangana-Hotels.html',
              'https://www.tripadvisor.in/Hotels-g304556-Chennai_Madras_Chennai_District_Tamil_Nadu-Hotels.html',
              'https://www.tripadvisor.in/Hotels-g304554-Mumbai_Maharashtra-Vacations.html',
              'https://www.tripadvisor.in/Hotels-g297628-Bengaluru_Bangalore_District_Karnataka-Vacations.html',
              'https://www.tripadvisor.in/Hotels-g304551-New_Delhi_National_Capital_Territory_of_Delhi-Vacations.html',
              'https://www.tripadvisor.in/Hotels-g297654-Pune_Pune_District_Maharashtra-Hotels.html']

In [22]:

for orig_link in orig_links:
    scrapped.get_template_pages(orig_link)

Adding Hotel Number  1  in page
Adding Hotel Number  2  in page
Adding Hotel Number  3  in page
Adding Hotel Number  4  in page
Adding Hotel Number  5  in page
Adding Hotel Number  6  in page
Adding Hotel Number  7  in page
Adding Hotel Number  8  in page
Adding Hotel Number  9  in page
Adding Hotel Number  10  in page
Adding Hotel Number  11  in page
Adding Hotel Number  12  in page
Adding Hotel Number  13  in page
Adding Hotel Number  14  in page
Adding Hotel Number  15  in page
Adding Hotel Number  16  in page
Adding Hotel Number  17  in page
Adding Hotel Number  18  in page
Adding Hotel Number  19  in page
Adding Hotel Number  20  in page
Adding Hotel Number  21  in page
Adding Hotel Number  22  in page
Adding Hotel Number  23  in page
Adding Hotel Number  24  in page
Adding Hotel Number  25  in page
Adding Hotel Number  26  in page
Adding Hotel Number  27  in page
Adding Hotel Number  28  in page
Adding Hotel Number  29  in page
Adding Hotel Number  30  in page
Total Hotels review

Adding Hotel Number  5  in page
Adding Hotel Number  6  in page
Adding Hotel Number  7  in page
Adding Hotel Number  8  in page
Adding Hotel Number  9  in page
Adding Hotel Number  10  in page
Adding Hotel Number  11  in page
Adding Hotel Number  12  in page
Adding Hotel Number  13  in page
Adding Hotel Number  14  in page
Adding Hotel Number  15  in page
Adding Hotel Number  16  in page
Adding Hotel Number  17  in page
Adding Hotel Number  18  in page
Adding Hotel Number  19  in page
Adding Hotel Number  20  in page
Adding Hotel Number  21  in page
Adding Hotel Number  22  in page
Adding Hotel Number  23  in page
Adding Hotel Number  24  in page
Adding Hotel Number  25  in page
Adding Hotel Number  26  in page
Adding Hotel Number  27  in page
Adding Hotel Number  28  in page
Adding Hotel Number  29  in page
Adding Hotel Number  30  in page
Total Hotels reviewed:  270
Adding Hotel Number  1  in page
Adding Hotel Number  2  in page
Adding Hotel Number  3  in page
Adding Hotel Number  4 

In [23]:
results = collection.find({},{"link":1})

for result in results:
    print(result)

{'_id': ObjectId('60857a630ba2d13efefef152'), 'link': 'https://www.tripadvisor.in//Hotel_Review-g297586-d613223-Reviews-Novotel_Hyderabad_Convention_Centre-Hyderabad_Hyderabad_District_Telangana.html'}
{'_id': ObjectId('60857a660ba2d13efefef153'), 'link': 'https://www.tripadvisor.in//Hotel_Review-g297586-d1674886-Reviews-The_Park_Hyderabad-Hyderabad_Hyderabad_District_Telangana.html'}
{'_id': ObjectId('60857a690ba2d13efefef154'), 'link': 'https://www.tripadvisor.in//Hotel_Review-g297586-d729829-Reviews-The_Golkonda_Resorts_Spa-Hyderabad_Hyderabad_District_Telangana.html'}
{'_id': ObjectId('60857a6a0ba2d13efefef155'), 'link': 'https://www.tripadvisor.in//Hotel_Review-g297586-d1149721-Reviews-The_Manohar_Hyderabad-Hyderabad_Hyderabad_District_Telangana.html'}
{'_id': ObjectId('60857a6c0ba2d13efefef156'), 'link': 'https://www.tripadvisor.in//Hotel_Review-g297586-d1148813-Reviews-Novotel_Hyderabad_Airport-Hyderabad_Hyderabad_District_Telangana.html'}
{'_id': ObjectId('60857a6e0ba2d13efefef

<pymongo.cursor.Cursor at 0x24561f88cc0>