In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Scraping text data

## 1 Download the page using the Python requests library (crawl)

In [72]:
# Target URL to scrape: the Goibibo listing page for hotels in Shimla
url = 'https://www.goibibo.com/hotels/hotels-in-shimla-ct/'

In [73]:
# Download the page with an HTTP GET request.
# A timeout (in seconds) is passed explicitly: without one, requests waits
# indefinitely when the server stops responding.
response = requests.get(url, timeout = 30)

In [74]:
# HTTP status code of the response (200 means the request succeeded)
response.status_code

200

##  2 Parsing a page with BeautifulSoup

In [75]:
# Parse the downloaded HTML text into a BeautifulSoup tree using the 'html.parser' parser
data = BeautifulSoup(response.text, features = 'html.parser')
# print(data)  # uncomment to inspect the full parsed document

In [76]:
# The details of each hotel are shown on a separate card, so the next step is to pick the card elements out of the complete page source.

# In the web page: hover the mouse over the required element in the card --> right click --> Inspect, to see the source code of that particular element.


In [82]:
# Find all the sections with the specified class name

# All cards share the same class name, so we can get the list of cards
# by passing the tag name (<div>) together with the value of its class attribute.

In [83]:
#  data.find_all('p')[0].get_text()

In [87]:
# Collect every hotel card: <div> elements whose class attribute equals the card wrapper class string.
# attrs is a dict mapping an attribute name to the value it must match.
hotel_card = data.find_all('div', attrs = {'class': 'HotelCardstyles__WrapperSectionMetaDiv-sc-1s80tyk-3 fLGqLz'})
# print(hotel_card)

In [85]:
# Number of hotel cards found on the page
len(hotel_card)

30

In [124]:
# Print the hotel name, price, review count and room type found in each hotel card.
# find() returns None when a card has no matching element, so every field falls
# back to the placeholder 'N/A' instead of raising AttributeError on `.text`.

for card in hotel_card:
    # hotel name: text of the first <a> tag inside the card
    hotel_name = card.find('a')

    # current room price, held in a <p> tag
    price = card.find('p', attrs = {'class': 'HotelCardstyles__CurrentPrice-sc-1s80tyk-28 inUyrJ'})

    # total review count text (for example "2315 Ratings"), held in a <span> tag
    rating = card.find('span', attrs = {'class': 'ReviewAndRatingsstyles__TotalReviewCountText-sc-1nxmeoo-4 ckCNHU'})

    # room type, held in a <span> tag
    room_type = card.find('span', attrs = {'class': 'HotelCardstyles__RoomTypeTextWrapper-sc-1s80tyk-16 jYuA-DB'})

    # offer = card.find('div', attrs = {'class': 'OfferTagstyles__PercentageOffWrapperDiv-sc-16zl30v-10 bqOixo'})

    # Fields are separated by ' , ', one line per card
    print(' , '.join(
        tag.text if tag is not None else 'N/A'
        for tag in (hotel_name, price, rating, room_type)
    ))

Snow Valley Resorts , 3868 , 2315 Ratings , Premium Room (Centrally Heated)
Marina- Shimla First Designer Boutique Hotel , 10800 , 894 Ratings , Deluxe Room
Meena Bagh Shimla , 8479 , 56 Ratings , Two Room Unit - Khaddus Den
Rocky Knob (Explore World Art in One Property) , 3137 , 196 Ratings , Rocky Room
Radisson Hotel Shimla , 12000 , 514 Ratings , Superior Room with Free Breakfast
Hotel Baljees Regency , 3495 , 1339 Ratings , Premium Room
Hotel Dhroov , 5955 , 441 Ratings , Premium Room(Valley View)
Hotel Willow Banks , 6883 , 853 Ratings , Deluxe Room - Mall road facing
Hotel Shingar , 2727 , 1127 Ratings , Deluxe Room
OYO 1706 Hotel The Alpine Heritage Residency , 2269 , 167 Ratings , Classic (2X)
Clarkes Hotel, A grand heritage hotel since 1898 , 7500 , 48 Ratings , Superior Room
Goldenfern Resort Shimla , 3703 , 79 Ratings , Classic Room Non View
Hotel Combermere , 7717 , 917 Ratings , Luxury Room
The Oberoi Cecil , 12000 , 64 Ratings , Deluxe Room
Sanobar The Grand White , 2396 

In [129]:
# place_card = data.find_all('div', attrs = {'color','#2274E0'})
# len(place_card)

## 3 Store the data

In [109]:
# The final step is to store the extracted data in a CSV file. For each card, we extract the hotel name, price,
# rating and room type, and store them in a Python dictionary. Each dictionary is then appended to a list.

In [123]:
# Collect one record (a dict) per hotel card, then write all records to 'hotels_data.csv'.
scrapped_data = []

for card in hotel_card:
    hotel_name = card.find('a')

    price = card.find('p', attrs = {'class': 'HotelCardstyles__CurrentPrice-sc-1s80tyk-28 inUyrJ'})

    rating = card.find('span', attrs = {'class': 'ReviewAndRatingsstyles__TotalReviewCountText-sc-1nxmeoo-4 ckCNHU'})

    room_type = card.find('span', attrs = {'class': 'HotelCardstyles__RoomTypeTextWrapper-sc-1s80tyk-16 jYuA-DB'})

    # find() returns None when the element is missing from a card; store None
    # for that field rather than failing on `.text`.
    details = {
        field: (tag.text if tag is not None else None)
        for field, tag in (
            ('hotel_name', hotel_name),
            ('price', price),
            ('rating', rating),
            ('room_type', room_type),
        )
    }

    # scrapped_data is a list of dictionaries, one per card
    scrapped_data.append(details)

# Build the DataFrame and write the file once, after all cards have been processed
# (doing this inside the loop would rewrite the whole file for every card).
# Explicit columns keep the header row even when no cards were found.
df = pd.DataFrame(scrapped_data, columns = ['hotel_name', 'price', 'rating', 'room_type'])

# Save the scraped data as a CSV file, without the DataFrame index
df.to_csv('hotels_data.csv', index = False, sep = ',')
   

In [133]:
# Read the saved CSV back and preview its first rows to check what was written
df = pd.read_csv('hotels_data.csv')
df.head()

Unnamed: 0,hotel_name,price,rating,room_type
0,Snow Valley Resorts,3868,2315 Ratings,Premium Room (Centrally Heated)
1,Marina- Shimla First Designer Boutique Hotel,10800,894 Ratings,Deluxe Room
2,Meena Bagh Shimla,8479,56 Ratings,Two Room Unit - Khaddus Den
3,Rocky Knob (Explore World Art in One Property),3137,196 Ratings,Rocky Room
4,Radisson Hotel Shimla,12000,514 Ratings,Superior Room with Free Breakfast


## Scraping images - find all images with the image (img) tag

In [32]:
# Target URL for the image section: the same Goibibo Shimla hotels listing page
url = 'https://www.goibibo.com/hotels/hotels-in-shimla-ct/'

In [33]:
# Download the page with an HTTP GET request.
# A timeout (in seconds) is passed explicitly: without one, requests waits
# indefinitely when the server stops responding.
response = requests.get(url, timeout = 30)

In [34]:
# HTTP status code of the response (200 means the request succeeded)
response.status_code

200

In [35]:
# Parse the downloaded HTML text into a BeautifulSoup tree using the 'html.parser' parser
data = BeautifulSoup(response.text, features = 'html.parser')

In [64]:
# Find all images with the <img> tag
# The <img> tag is used to embed an image in an HTML page

# The <img> tag has two required attributes:

    # src - Specifies the path to the image
    # alt - Specifies an alternate text for the image, if the image for some reason cannot be displayed

# All <img> tags that carry a src attribute
images = data.find_all('img',src = True)


# All elements whose itemprop attribute is 'image'; displayed as the cell output
img_links = data.find_all(itemprop = 'image')
img_links


[<meta content="https://cdn1.goibibo.com/voy_ing/t_g/1de95162354211e5bbab001ec9b85d13.jfif" itemprop="image"/>,
 <meta content="https://cdn1.goibibo.com/voy_ing/t_g/e1566082499a11e689d90022195573b9.jpg" itemprop="image"/>,
 <meta content="https://cdn1.goibibo.com/voy_ing/t_g/73cf03ba08bc11e896ce0a9df65c8753.jpg" itemprop="image"/>,
 <meta content="https://cdn1.goibibo.com/voy_ing/t_g/b137aec64c0011eaaaa40242ac110002.jpg" itemprop="image"/>,
 <meta content="https://cdn1.goibibo.com/voy_ing/t_g/844a789ee6f911e7ab8902ed4d4e40dc.jpg" itemprop="image"/>,
 <meta content="https://cdn1.goibibo.com/voy_ing/t_g/3eef9f52c87811e8a99002fc98a94198.jpg" itemprop="image"/>,
 <meta content="https://cdn1.goibibo.com/voy_mmt/t_g/htl-imgs/200701120646243660-d326698cb60311e689b002bf5ac07431.jpg" itemprop="image"/>,
 <meta content="https://cdn1.goibibo.com/voy_ing/t_g/94b8771c018911e8b4f902755708f0b3.jfif" itemprop="image"/>,
 <meta content="https://cdn1.goibibo.com/voy_mmt/t_g/htl-imgs/201810312049042531-3

In [65]:
# Build the list of image URLs

# The itemprop="image" elements hold the image URL in their `content` attribute.
# Elements that have no `content` attribute are skipped, so indexing never raises KeyError.
img_src1 = [x['content'] for x in img_links if x.has_attr('content')]

# Keep only the URLs that end in .jpg
img_src = [x for x in img_src1 if x.endswith('.jpg')]

for img in img_src:
    print(img)
   

https://cdn1.goibibo.com/voy_ing/t_g/e1566082499a11e689d90022195573b9.jpg
https://cdn1.goibibo.com/voy_ing/t_g/73cf03ba08bc11e896ce0a9df65c8753.jpg
https://cdn1.goibibo.com/voy_ing/t_g/b137aec64c0011eaaaa40242ac110002.jpg
https://cdn1.goibibo.com/voy_ing/t_g/844a789ee6f911e7ab8902ed4d4e40dc.jpg
https://cdn1.goibibo.com/voy_ing/t_g/3eef9f52c87811e8a99002fc98a94198.jpg
https://cdn1.goibibo.com/voy_mmt/t_g/htl-imgs/200701120646243660-d326698cb60311e689b002bf5ac07431.jpg
https://cdn1.goibibo.com/voy_mmt/t_g/htl-imgs/201810312049042531-38ef306c142311e99aae0242ac110003.jpg
https://cdn1.goibibo.com/voy_mmt/t_g/htl-imgs/200701131553348226-06708104c3fa11e980d90242ac110002.jpg
https://cdn1.goibibo.com/voy_mmt/t_g/htl-imgs/201711301634567287-c4f2338ce66b11e991bf0242ac110002.jpg
https://cdn1.goibibo.com/voy_mmt/t_g/htl-imgs/20070124170346216-167a294c2d4011e5b10b0022195573b9.jpg
https://cdn1.goibibo.com/voy_mmt/t_g/htl-imgs/200701121106345886-c3ae59803fb711e88b030a1a5053b91a.jpg
https://cdn1.goibib

In [66]:
# Download every URL in img_src and save it as img_<n>.jpg, where n is the
# 1-based position of the URL in the list.
# Files are opened in 'wb' (write binary) mode because image content is raw bytes.

for img_count, img in enumerate(img_src, start = 1):
    # Fetch first and open the file only after a successful response, so a failed
    # request does not leave behind an empty file or an error page saved as .jpg.
    res = requests.get(img, timeout = 30)
    if not res.ok:
        print('Skipping', img, '- HTTP status', res.status_code)
        continue

    with open('img_' + str(img_count) + '.jpg', 'wb') as f:
        f.write(res.content)

# Images are saved in the current working directory