In [1]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

## Preparing a list of URLs to scrape

In [2]:
# Base listing URL; individual result pages are selected with a `page` query parameter
root_url = "https://www.zomato.com/kolkata/dinner"

In [3]:
# Spoof a desktop-browser User-Agent; this dict is sent as the request headers
# on every page download in this notebook
headers = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    )
}

In [4]:
# Number of listing pages to walk; the crawl starts at page 1
n_pages = 150

# Review-page URLs of every restaurant found on the listing pages
rest_urls = []
pbar = tqdm(range(1, n_pages + 1))
for i in pbar:
    try:
        # Downloading page containing list of restaurants.
        # The timeout (seconds) keeps one stalled connection from hanging the whole crawl.
        page = requests.get(f'{root_url}?page={i}', headers=headers, timeout=30)

        # Surface HTTP errors instead of silently parsing an error page
        page.raise_for_status()
    except requests.RequestException as exc:
        # Report the failed page and carry on with the next one
        pbar.write(f'Failed to retrieve page [{i}]: {exc}')
        continue

    # Cooking a soup for easy digestion
    soup = BeautifulSoup(page.text, 'html.parser')

    # Extracting a list of all restaurants
    res = soup.find_all('div', attrs={'class': 'search_left_featured clearfix'})

    for entry in res:
        # Only storing the URL of the restaurant
        link = entry.find('a', href=True)
        if link is None:
            # Card without a link: nothing to store
            continue

        # Replacing /info with /reviews so as to navigate to the reviews directly
        rest_urls.append(link['href'].replace('/info', '/reviews'))
    pbar.set_description(f'Page [{i}] scraped!')

Page [150] scraped!: 100%|██████████| 150/150 [06:13<00:00,  2.49s/it]


In [None]:
# Number of restaurant review-page URLs collected from the listing pages
len(rest_urls)

2250

## Scraping reviews and corresponding ratings

In [None]:
# Parallel lists: texts[k] is the review body, labels[k] its numeric rating
texts = []
labels = []
pbar = tqdm(rest_urls)
for url in pbar:
    try:
        # A little verbosity doesn't hurt
        pbar.set_description(f'Fetching {url}')

        # Downloading the restaurant page.
        # The timeout (seconds) keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=30)

        # Surface HTTP errors instead of silently parsing an error page
        response.raise_for_status()

        # Cooking a soup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Looking for reviews
        reviews = soup.find_all('div', attrs={'class': 'rev-text mbot0 '})

        for review in reviews:
            # The first nested <div> carries the rating in its aria-label
            rating_div = review.find('div')

            # The label is parsed from the last three characters of the aria-label
            # (assumes it ends in a value such as '4.5' -- TODO confirm against the page markup)
            label = float(rating_div['aria-label'][-3:])

            # Clearing out some junk: drop the rating div so only the review text remains
            rating_div.decompose()

            # Extracting the text
            text = review.text.strip()

            # Storing our treasures (appended together so both lists stay aligned)
            labels.append(label)
            texts.append(text)

    except KeyboardInterrupt:
        # Manually breaking the loop; whatever was collected so far is kept
        break
    except Exception as exc:
        # Something went wrong (network error, unexpected markup, ...) so
        # report the cause and skip the current url
        pbar.write(f'Failed to retrieve url: {url} ({exc!r})')

# Making a pandas DataFrame from our data
data = pd.DataFrame({'text': texts, 'label': labels})

Fetching https://www.zomato.com/kolkata/awesome-sector-3-salt-lake/reviews:  94%|█████████▍| 2123/2250 [57:26<03:26,  1.62s/it]                                         

## Glimpse of the data

In [None]:
# Preview the first rows of the scraped reviews (columns: text, label)
data.head()

In [None]:
# (number of scraped reviews, number of columns)
data.shape

## Saving the data to disk

In [None]:
# Destination file for the scraped dataset
out_path = Path('data/reviews.csv')

# to_csv does not create missing directories, so make sure the target folder exists first
out_path.parent.mkdir(parents=True, exist_ok=True)

# Writing the reviews without the DataFrame index column
data.to_csv(out_path, index=False)